Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH resend] netfilter: place in source hash after SNAT is done
From: Patrick McHardy @ 2011-01-20 14:51 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netfilter-devel, netdev
In-Reply-To: <AANLkTimPEaXiiFiyD7Anvg+x5zJUsF04P_yPbj8KzYaF@mail.gmail.com>

Am 19.01.2011 01:03, schrieb Changli Gao:
> On Tue, Jan 18, 2011 at 10:17 PM, Patrick McHardy <kaber@trash.net> wrote:
>>>  net/ipv4/netfilter/nf_nat_core.c |   18 +++++++++++-------
>>>  1 file changed, 11 insertions(+), 7 deletions(-)
>>> diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
>>> index c04787c..51ce55a 100644
>>> --- a/net/ipv4/netfilter/nf_nat_core.c
>>> +++ b/net/ipv4/netfilter/nf_nat_core.c
>>> @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
>>>          manips not an issue.  */
>>>       if (maniptype == IP_NAT_MANIP_SRC &&
>>>           !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
>>> -             if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
>>> +             /* try the original tuple first */
>>
>> This doesn't seem to be related to the hashing change. Please describe
>> the intention behind this change.
> 
> Currently, we add the ct at the head of the corresponding bucket of
> the source hash table after DNAT is done, so when we do SNAT, the
> original ct will be tried first. This change is used to keep this
> behavior.

Thanks for the explanation, applied.

^ permalink raw reply

* [PATCH] atl1e: remove private #define.
From: Francois Romieu @ 2011-01-20 14:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jay Cliburn, Chris Snook, Jie Yang

Either unused or duplicates from mii.h.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Cc: Jay Cliburn <jcliburn@gmail.com>
Cc: Chris Snook <chris.snook@gmail.com>
Cc: Jie Yang <jie.yang@atheros.com>
---
 drivers/net/atl1e/atl1e_ethtool.c |   12 ++--
 drivers/net/atl1e/atl1e_hw.c      |   34 ++++-------
 drivers/net/atl1e/atl1e_hw.h      |  111 +-----------------------------------
 drivers/net/atl1e/atl1e_main.c    |    4 +-
 4 files changed, 25 insertions(+), 136 deletions(-)

diff --git a/drivers/net/atl1e/atl1e_ethtool.c b/drivers/net/atl1e/atl1e_ethtool.c
index 6943a6c..1209297 100644
--- a/drivers/net/atl1e/atl1e_ethtool.c
+++ b/drivers/net/atl1e/atl1e_ethtool.c
@@ -95,18 +95,18 @@ static int atl1e_set_settings(struct net_device *netdev,
 		ecmd->advertising = hw->autoneg_advertised |
 				    ADVERTISED_TP | ADVERTISED_Autoneg;
 
-		adv4 = hw->mii_autoneg_adv_reg & ~MII_AR_SPEED_MASK;
+		adv4 = hw->mii_autoneg_adv_reg & ~ADVERTISE_ALL;
 		adv9 = hw->mii_1000t_ctrl_reg & ~MII_AT001_CR_1000T_SPEED_MASK;
 		if (hw->autoneg_advertised & ADVERTISE_10_HALF)
-			adv4 |= MII_AR_10T_HD_CAPS;
+			adv4 |= ADVERTISE_10HALF;
 		if (hw->autoneg_advertised & ADVERTISE_10_FULL)
-			adv4 |= MII_AR_10T_FD_CAPS;
+			adv4 |= ADVERTISE_10FULL;
 		if (hw->autoneg_advertised & ADVERTISE_100_HALF)
-			adv4 |= MII_AR_100TX_HD_CAPS;
+			adv4 |= ADVERTISE_100HALF;
 		if (hw->autoneg_advertised & ADVERTISE_100_FULL)
-			adv4 |= MII_AR_100TX_FD_CAPS;
+			adv4 |= ADVERTISE_100FULL;
 		if (hw->autoneg_advertised & ADVERTISE_1000_FULL)
-			adv9 |= MII_AT001_CR_1000T_FD_CAPS;
+			adv9 |= ADVERTISE_1000FULL;
 
 		if (adv4 != hw->mii_autoneg_adv_reg ||
 				adv9 != hw->mii_1000t_ctrl_reg) {
diff --git a/drivers/net/atl1e/atl1e_hw.c b/drivers/net/atl1e/atl1e_hw.c
index 76cc043..923063d 100644
--- a/drivers/net/atl1e/atl1e_hw.c
+++ b/drivers/net/atl1e/atl1e_hw.c
@@ -318,7 +318,7 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw)
 	 * Advertisement Register (Address 4) and the 1000 mb speed bits in
 	 * the  1000Base-T control Register (Address 9).
 	 */
-	mii_autoneg_adv_reg &= ~MII_AR_SPEED_MASK;
+	mii_autoneg_adv_reg &= ~ADVERTISE_ALL;
 	mii_1000t_ctrl_reg  &= ~MII_AT001_CR_1000T_SPEED_MASK;
 
 	/*
@@ -327,44 +327,37 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw)
 	 */
 	switch (hw->media_type) {
 	case MEDIA_TYPE_AUTO_SENSOR:
-		mii_autoneg_adv_reg |= (MII_AR_10T_HD_CAPS   |
-					MII_AR_10T_FD_CAPS   |
-					MII_AR_100TX_HD_CAPS |
-					MII_AR_100TX_FD_CAPS);
-		hw->autoneg_advertised = ADVERTISE_10_HALF  |
-					 ADVERTISE_10_FULL  |
-					 ADVERTISE_100_HALF |
-					 ADVERTISE_100_FULL;
+		mii_autoneg_adv_reg |= ADVERTISE_ALL;
+		hw->autoneg_advertised = ADVERTISE_ALL;
 		if (hw->nic_type == athr_l1e) {
-			mii_1000t_ctrl_reg |=
-				MII_AT001_CR_1000T_FD_CAPS;
+			mii_1000t_ctrl_reg |= ADVERTISE_1000FULL;
 			hw->autoneg_advertised |= ADVERTISE_1000_FULL;
 		}
 		break;
 
 	case MEDIA_TYPE_100M_FULL:
-		mii_autoneg_adv_reg   |= MII_AR_100TX_FD_CAPS;
+		mii_autoneg_adv_reg   |= ADVERTISE_100FULL;
 		hw->autoneg_advertised = ADVERTISE_100_FULL;
 		break;
 
 	case MEDIA_TYPE_100M_HALF:
-		mii_autoneg_adv_reg   |= MII_AR_100TX_HD_CAPS;
+		mii_autoneg_adv_reg   |= ADVERTISE_100_HALF;
 		hw->autoneg_advertised = ADVERTISE_100_HALF;
 		break;
 
 	case MEDIA_TYPE_10M_FULL:
-		mii_autoneg_adv_reg   |= MII_AR_10T_FD_CAPS;
+		mii_autoneg_adv_reg   |= ADVERTISE_10_FULL;
 		hw->autoneg_advertised = ADVERTISE_10_FULL;
 		break;
 
 	default:
-		mii_autoneg_adv_reg   |= MII_AR_10T_HD_CAPS;
+		mii_autoneg_adv_reg   |= ADVERTISE_10_HALF;
 		hw->autoneg_advertised = ADVERTISE_10_HALF;
 		break;
 	}
 
 	/* flow control fixed to enable all */
-	mii_autoneg_adv_reg |= (MII_AR_ASM_DIR | MII_AR_PAUSE);
+	mii_autoneg_adv_reg |= (ADVERTISE_PAUSE_ASYM | ADVERTISE_PAUSE_CAP);
 
 	hw->mii_autoneg_adv_reg = mii_autoneg_adv_reg;
 	hw->mii_1000t_ctrl_reg  = mii_1000t_ctrl_reg;
@@ -374,7 +367,7 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw)
 		return ret_val;
 
 	if (hw->nic_type == athr_l1e || hw->nic_type == athr_l2e_revA) {
-		ret_val = atl1e_write_phy_reg(hw, MII_AT001_CR,
+		ret_val = atl1e_write_phy_reg(hw, MII_CTRL1000,
 					   mii_1000t_ctrl_reg);
 		if (ret_val)
 			return ret_val;
@@ -397,7 +390,7 @@ int atl1e_phy_commit(struct atl1e_hw *hw)
 	int ret_val;
 	u16 phy_data;
 
-	phy_data = MII_CR_RESET | MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG;
+	phy_data = BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART;
 
 	ret_val = atl1e_write_phy_reg(hw, MII_BMCR, phy_data);
 	if (ret_val) {
@@ -645,15 +638,14 @@ int atl1e_restart_autoneg(struct atl1e_hw *hw)
 		return err;
 
 	if (hw->nic_type == athr_l1e || hw->nic_type == athr_l2e_revA) {
-		err = atl1e_write_phy_reg(hw, MII_AT001_CR,
+		err = atl1e_write_phy_reg(hw, MII_CTRL1000,
 				       hw->mii_1000t_ctrl_reg);
 		if (err)
 			return err;
 	}
 
 	err = atl1e_write_phy_reg(hw, MII_BMCR,
-			MII_CR_RESET | MII_CR_AUTO_NEG_EN |
-			MII_CR_RESTART_AUTO_NEG);
+			BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART);
 	return err;
 }
 
diff --git a/drivers/net/atl1e/atl1e_hw.h b/drivers/net/atl1e/atl1e_hw.h
index 5ea2f4d..74df16a 100644
--- a/drivers/net/atl1e/atl1e_hw.h
+++ b/drivers/net/atl1e/atl1e_hw.h
@@ -629,127 +629,24 @@ s32 atl1e_restart_autoneg(struct atl1e_hw *hw);
 
 /***************************** MII definition ***************************************/
 /* PHY Common Register */
-#define MII_BMCR                        0x00
-#define MII_BMSR                        0x01
-#define MII_PHYSID1                     0x02
-#define MII_PHYSID2                     0x03
-#define MII_ADVERTISE                   0x04
-#define MII_LPA                         0x05
-#define MII_EXPANSION                   0x06
-#define MII_AT001_CR                    0x09
-#define MII_AT001_SR                    0x0A
-#define MII_AT001_ESR                   0x0F
 #define MII_AT001_PSCR                  0x10
 #define MII_AT001_PSSR                  0x11
 #define MII_INT_CTRL                    0x12
 #define MII_INT_STATUS                  0x13
 #define MII_SMARTSPEED                  0x14
-#define MII_RERRCOUNTER                 0x15
-#define MII_SREVISION                   0x16
-#define MII_RESV1                       0x17
 #define MII_LBRERROR                    0x18
-#define MII_PHYADDR                     0x19
 #define MII_RESV2                       0x1a
-#define MII_TPISTATUS                   0x1b
-#define MII_NCONFIG                     0x1c
 
 #define MII_DBG_ADDR			0x1D
 #define MII_DBG_DATA			0x1E
 
-
-/* PHY Control Register */
-#define MII_CR_SPEED_SELECT_MSB                  0x0040  /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define MII_CR_COLL_TEST_ENABLE                  0x0080  /* Collision test enable */
-#define MII_CR_FULL_DUPLEX                       0x0100  /* FDX =1, half duplex =0 */
-#define MII_CR_RESTART_AUTO_NEG                  0x0200  /* Restart auto negotiation */
-#define MII_CR_ISOLATE                           0x0400  /* Isolate PHY from MII */
-#define MII_CR_POWER_DOWN                        0x0800  /* Power down */
-#define MII_CR_AUTO_NEG_EN                       0x1000  /* Auto Neg Enable */
-#define MII_CR_SPEED_SELECT_LSB                  0x2000  /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define MII_CR_LOOPBACK                          0x4000  /* 0 = normal, 1 = loopback */
-#define MII_CR_RESET                             0x8000  /* 0 = normal, 1 = PHY reset */
-#define MII_CR_SPEED_MASK                        0x2040
-#define MII_CR_SPEED_1000                        0x0040
-#define MII_CR_SPEED_100                         0x2000
-#define MII_CR_SPEED_10                          0x0000
-
-
-/* PHY Status Register */
-#define MII_SR_EXTENDED_CAPS                     0x0001  /* Extended register capabilities */
-#define MII_SR_JABBER_DETECT                     0x0002  /* Jabber Detected */
-#define MII_SR_LINK_STATUS                       0x0004  /* Link Status 1 = link */
-#define MII_SR_AUTONEG_CAPS                      0x0008  /* Auto Neg Capable */
-#define MII_SR_REMOTE_FAULT                      0x0010  /* Remote Fault Detect */
-#define MII_SR_AUTONEG_COMPLETE                  0x0020  /* Auto Neg Complete */
-#define MII_SR_PREAMBLE_SUPPRESS                 0x0040  /* Preamble may be suppressed */
-#define MII_SR_EXTENDED_STATUS                   0x0100  /* Ext. status info in Reg 0x0F */
-#define MII_SR_100T2_HD_CAPS                     0x0200  /* 100T2 Half Duplex Capable */
-#define MII_SR_100T2_FD_CAPS                     0x0400  /* 100T2 Full Duplex Capable */
-#define MII_SR_10T_HD_CAPS                       0x0800  /* 10T   Half Duplex Capable */
-#define MII_SR_10T_FD_CAPS                       0x1000  /* 10T   Full Duplex Capable */
-#define MII_SR_100X_HD_CAPS                      0x2000  /* 100X  Half Duplex Capable */
-#define MII_SR_100X_FD_CAPS                      0x4000  /* 100X  Full Duplex Capable */
-#define MII_SR_100T4_CAPS                        0x8000  /* 100T4 Capable */
-
-/* Link partner ability register. */
-#define MII_LPA_SLCT                             0x001f  /* Same as advertise selector  */
-#define MII_LPA_10HALF                           0x0020  /* Can do 10mbps half-duplex   */
-#define MII_LPA_10FULL                           0x0040  /* Can do 10mbps full-duplex   */
-#define MII_LPA_100HALF                          0x0080  /* Can do 100mbps half-duplex  */
-#define MII_LPA_100FULL                          0x0100  /* Can do 100mbps full-duplex  */
-#define MII_LPA_100BASE4                         0x0200  /* 100BASE-T4  */
-#define MII_LPA_PAUSE                            0x0400  /* PAUSE */
-#define MII_LPA_ASYPAUSE                         0x0800  /* Asymmetrical PAUSE */
-#define MII_LPA_RFAULT                           0x2000  /* Link partner faulted        */
-#define MII_LPA_LPACK                            0x4000  /* Link partner acked us       */
-#define MII_LPA_NPAGE                            0x8000  /* Next page bit               */
-
 /* Autoneg Advertisement Register */
-#define MII_AR_SELECTOR_FIELD                   0x0001  /* indicates IEEE 802.3 CSMA/CD */
-#define MII_AR_10T_HD_CAPS                      0x0020  /* 10T   Half Duplex Capable */
-#define MII_AR_10T_FD_CAPS                      0x0040  /* 10T   Full Duplex Capable */
-#define MII_AR_100TX_HD_CAPS                    0x0080  /* 100TX Half Duplex Capable */
-#define MII_AR_100TX_FD_CAPS                    0x0100  /* 100TX Full Duplex Capable */
-#define MII_AR_100T4_CAPS                       0x0200  /* 100T4 Capable */
-#define MII_AR_PAUSE                            0x0400  /* Pause operation desired */
-#define MII_AR_ASM_DIR                          0x0800  /* Asymmetric Pause Direction bit */
-#define MII_AR_REMOTE_FAULT                     0x2000  /* Remote Fault detected */
-#define MII_AR_NEXT_PAGE                        0x8000  /* Next Page ability supported */
-#define MII_AR_SPEED_MASK                       0x01E0
-#define MII_AR_DEFAULT_CAP_MASK                 0x0DE0
+#define MII_AR_DEFAULT_CAP_MASK                 0
 
 /* 1000BASE-T Control Register */
-#define MII_AT001_CR_1000T_HD_CAPS              0x0100  /* Advertise 1000T HD capability */
-#define MII_AT001_CR_1000T_FD_CAPS              0x0200  /* Advertise 1000T FD capability  */
-#define MII_AT001_CR_1000T_REPEATER_DTE         0x0400  /* 1=Repeater/switch device port */
-/* 0=DTE device */
-#define MII_AT001_CR_1000T_MS_VALUE             0x0800  /* 1=Configure PHY as Master */
-/* 0=Configure PHY as Slave */
-#define MII_AT001_CR_1000T_MS_ENABLE            0x1000  /* 1=Master/Slave manual config value */
-/* 0=Automatic Master/Slave config */
-#define MII_AT001_CR_1000T_TEST_MODE_NORMAL     0x0000  /* Normal Operation */
-#define MII_AT001_CR_1000T_TEST_MODE_1          0x2000  /* Transmit Waveform test */
-#define MII_AT001_CR_1000T_TEST_MODE_2          0x4000  /* Master Transmit Jitter test */
-#define MII_AT001_CR_1000T_TEST_MODE_3          0x6000  /* Slave Transmit Jitter test */
-#define MII_AT001_CR_1000T_TEST_MODE_4          0x8000  /* Transmitter Distortion test */
-#define MII_AT001_CR_1000T_SPEED_MASK           0x0300
-#define MII_AT001_CR_1000T_DEFAULT_CAP_MASK     0x0300
-
-/* 1000BASE-T Status Register */
-#define MII_AT001_SR_1000T_LP_HD_CAPS           0x0400  /* LP is 1000T HD capable */
-#define MII_AT001_SR_1000T_LP_FD_CAPS           0x0800  /* LP is 1000T FD capable */
-#define MII_AT001_SR_1000T_REMOTE_RX_STATUS     0x1000  /* Remote receiver OK */
-#define MII_AT001_SR_1000T_LOCAL_RX_STATUS      0x2000  /* Local receiver OK */
-#define MII_AT001_SR_1000T_MS_CONFIG_RES        0x4000  /* 1=Local TX is Master, 0=Slave */
-#define MII_AT001_SR_1000T_MS_CONFIG_FAULT      0x8000  /* Master/Slave config fault */
-#define MII_AT001_SR_1000T_REMOTE_RX_STATUS_SHIFT   12
-#define MII_AT001_SR_1000T_LOCAL_RX_STATUS_SHIFT    13
-
-/* Extended Status Register */
-#define MII_AT001_ESR_1000T_HD_CAPS             0x1000  /* 1000T HD capable */
-#define MII_AT001_ESR_1000T_FD_CAPS             0x2000  /* 1000T FD capable */
-#define MII_AT001_ESR_1000X_HD_CAPS             0x4000  /* 1000X HD capable */
-#define MII_AT001_ESR_1000X_FD_CAPS             0x8000  /* 1000X FD capable */
+#define MII_AT001_CR_1000T_SPEED_MASK \
+	(ADVERTISE_1000FULL | ADVERTISE_1000HALF)
+#define MII_AT001_CR_1000T_DEFAULT_CAP_MASK	MII_AT001_CR_1000T_SPEED_MASK
 
 /* AT001 PHY Specific Control Register */
 #define MII_AT001_PSCR_JABBER_DISABLE           0x0001  /* 1=Jabber Function disabled */
diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c
index e28f8ba..bf7500c 100644
--- a/drivers/net/atl1e/atl1e_main.c
+++ b/drivers/net/atl1e/atl1e_main.c
@@ -2051,9 +2051,9 @@ static int atl1e_suspend(struct pci_dev *pdev, pm_message_t state)
 		atl1e_read_phy_reg(hw, MII_BMSR, (u16 *)&mii_bmsr_data);
 		atl1e_read_phy_reg(hw, MII_BMSR, (u16 *)&mii_bmsr_data);
 
-		mii_advertise_data = MII_AR_10T_HD_CAPS;
+		mii_advertise_data = ADVERTISE_10HALF;
 
-		if ((atl1e_write_phy_reg(hw, MII_AT001_CR, 0) != 0) ||
+		if ((atl1e_write_phy_reg(hw, MII_CTRL1000, 0) != 0) ||
 		    (atl1e_write_phy_reg(hw,
 			   MII_ADVERTISE, mii_advertise_data) != 0) ||
 		    (atl1e_phy_commit(hw)) != 0) {
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH] via-velocity: fix the WOL bug on 1000M full duplex forced mode.
From: Francois Romieu @ 2011-01-20 14:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, David Lv

The VIA velocity card can't be waken up by WOL tool on 1000M full
duplex forced mode. This patch fixes the bug.

Signed-off-by: David Lv <DavidLv@viatech.com.cn>
Acked-by: Francois Romieu <romieu@fr.zoreil.com>
---
 drivers/net/via-velocity.c |    9 +++++++++
 drivers/net/via-velocity.h |    8 ++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index 09cac70..0d6fec6 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -2923,6 +2923,7 @@ static u16 wol_calc_crc(int size, u8 *pattern, u8 *mask_pattern)
 static int velocity_set_wol(struct velocity_info *vptr)
 {
 	struct mac_regs __iomem *regs = vptr->mac_regs;
+	enum speed_opt spd_dpx = vptr->options.spd_dpx;
 	static u8 buf[256];
 	int i;
 
@@ -2968,6 +2969,12 @@ static int velocity_set_wol(struct velocity_info *vptr)
 
 	writew(0x0FFF, &regs->WOLSRClr);
 
+	if (spd_dpx == SPD_DPX_1000_FULL)
+		goto mac_done;
+
+	if (spd_dpx != SPD_DPX_AUTO)
+		goto advertise_done;
+
 	if (vptr->mii_status & VELOCITY_AUTONEG_ENABLE) {
 		if (PHYID_GET_PHY_ID(vptr->phy_id) == PHYID_CICADA_CS8201)
 			MII_REG_BITS_ON(AUXCR_MDPPS, MII_NCONFIG, vptr->mac_regs);
@@ -2978,6 +2985,7 @@ static int velocity_set_wol(struct velocity_info *vptr)
 	if (vptr->mii_status & VELOCITY_SPEED_1000)
 		MII_REG_BITS_ON(BMCR_ANRESTART, MII_BMCR, vptr->mac_regs);
 
+advertise_done:
 	BYTE_REG_BITS_ON(CHIPGCR_FCMODE, &regs->CHIPGCR);
 
 	{
@@ -2987,6 +2995,7 @@ static int velocity_set_wol(struct velocity_info *vptr)
 		writeb(GCR, &regs->CHIPGCR);
 	}
 
+mac_done:
 	BYTE_REG_BITS_OFF(ISR_PWEI, &regs->ISR);
 	/* Turn on SWPTAG just before entering power mode */
 	BYTE_REG_BITS_ON(STICKHW_SWPTAG, &regs->STICKHW);
diff --git a/drivers/net/via-velocity.h b/drivers/net/via-velocity.h
index aa2e69b..d722753 100644
--- a/drivers/net/via-velocity.h
+++ b/drivers/net/via-velocity.h
@@ -361,7 +361,7 @@ enum  velocity_owner {
 #define MAC_REG_CHIPGSR     0x9C
 #define MAC_REG_TESTCFG     0x9D
 #define MAC_REG_DEBUG       0x9E
-#define MAC_REG_CHIPGCR     0x9F
+#define MAC_REG_CHIPGCR     0x9F	/* Chip Operation and Diagnostic Control */
 #define MAC_REG_WOLCR0_SET  0xA0
 #define MAC_REG_WOLCR1_SET  0xA1
 #define MAC_REG_PWCFG_SET   0xA2
@@ -848,10 +848,10 @@ enum  velocity_owner {
  *	Bits in CHIPGCR register
  */
 
-#define CHIPGCR_FCGMII      0x80	/* enable GMII mode */
-#define CHIPGCR_FCFDX       0x40
+#define CHIPGCR_FCGMII      0x80	/* force GMII (else MII only) */
+#define CHIPGCR_FCFDX       0x40	/* force full duplex */
 #define CHIPGCR_FCRESV      0x20
-#define CHIPGCR_FCMODE      0x10
+#define CHIPGCR_FCMODE      0x10	/* enable MAC forced mode */
 #define CHIPGCR_LPSOPT      0x08
 #define CHIPGCR_TM1US       0x04
 #define CHIPGCR_TM0US       0x02
-- 
1.7.3.4


^ permalink raw reply related

* [PATCH] atl1c: remove private #define.
From: Francois Romieu @ 2011-01-20 14:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jay Cliburn, Chris Snook, Jie Yang

Either unused or duplicates from mii.h.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
Cc: Jay Cliburn <jcliburn@gmail.com>
Cc: Chris Snook <chris.snook@gmail.com>
Cc: Jie Yang <jie.yang@atheros.com>
---
 drivers/net/atl1c/atl1c_hw.c |   15 ++++++-------
 drivers/net/atl1c/atl1c_hw.h |   43 +----------------------------------------
 2 files changed, 9 insertions(+), 49 deletions(-)

diff --git a/drivers/net/atl1c/atl1c_hw.c b/drivers/net/atl1c/atl1c_hw.c
index 1bf6720..23f2ab0 100644
--- a/drivers/net/atl1c/atl1c_hw.c
+++ b/drivers/net/atl1c/atl1c_hw.c
@@ -345,7 +345,7 @@ int atl1c_write_phy_reg(struct atl1c_hw *hw, u32 reg_addr, u16 phy_data)
  */
 static int atl1c_phy_setup_adv(struct atl1c_hw *hw)
 {
-	u16 mii_adv_data = ADVERTISE_DEFAULT_CAP & ~ADVERTISE_SPEED_MASK;
+	u16 mii_adv_data = ADVERTISE_DEFAULT_CAP & ~ADVERTISE_ALL;
 	u16 mii_giga_ctrl_data = GIGA_CR_1000T_DEFAULT_CAP &
 				~GIGA_CR_1000T_SPEED_MASK;
 
@@ -373,7 +373,7 @@ static int atl1c_phy_setup_adv(struct atl1c_hw *hw)
 	}
 
 	if (atl1c_write_phy_reg(hw, MII_ADVERTISE, mii_adv_data) != 0 ||
-	    atl1c_write_phy_reg(hw, MII_GIGA_CR, mii_giga_ctrl_data) != 0)
+	    atl1c_write_phy_reg(hw, MII_CTRL1000, mii_giga_ctrl_data) != 0)
 		return -1;
 	return 0;
 }
@@ -517,19 +517,18 @@ int atl1c_phy_init(struct atl1c_hw *hw)
 					"Error Setting up Auto-Negotiation\n");
 			return ret_val;
 		}
-		mii_bmcr_data |= BMCR_AUTO_NEG_EN | BMCR_RESTART_AUTO_NEG;
+		mii_bmcr_data |= BMCR_ANENABLE | BMCR_ANRESTART;
 		break;
 	case MEDIA_TYPE_100M_FULL:
-		mii_bmcr_data |= BMCR_SPEED_100 | BMCR_FULL_DUPLEX;
+		mii_bmcr_data |= BMCR_SPEED100 | BMCR_FULLDPLX;
 		break;
 	case MEDIA_TYPE_100M_HALF:
-		mii_bmcr_data |= BMCR_SPEED_100;
+		mii_bmcr_data |= BMCR_SPEED100;
 		break;
 	case MEDIA_TYPE_10M_FULL:
-		mii_bmcr_data |= BMCR_SPEED_10 | BMCR_FULL_DUPLEX;
+		mii_bmcr_data |= BMCR_FULLDPLX;
 		break;
 	case MEDIA_TYPE_10M_HALF:
-		mii_bmcr_data |= BMCR_SPEED_10;
 		break;
 	default:
 		if (netif_msg_link(adapter))
@@ -657,7 +656,7 @@ int atl1c_restart_autoneg(struct atl1c_hw *hw)
 	err = atl1c_phy_setup_adv(hw);
 	if (err)
 		return err;
-	mii_bmcr_data |= BMCR_AUTO_NEG_EN | BMCR_RESTART_AUTO_NEG;
+	mii_bmcr_data |= BMCR_ANENABLE | BMCR_ANRESTART;
 
 	return atl1c_write_phy_reg(hw, MII_BMCR, mii_bmcr_data);
 }
diff --git a/drivers/net/atl1c/atl1c_hw.h b/drivers/net/atl1c/atl1c_hw.h
index 3dd6759..655fc6c 100644
--- a/drivers/net/atl1c/atl1c_hw.h
+++ b/drivers/net/atl1c/atl1c_hw.h
@@ -736,55 +736,16 @@ int atl1c_phy_power_saving(struct atl1c_hw *hw);
 #define REG_DEBUG_DATA0 		0x1900
 #define REG_DEBUG_DATA1 		0x1904
 
-/* PHY Control Register */
-#define MII_BMCR			0x00
-#define BMCR_SPEED_SELECT_MSB		0x0040  /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define BMCR_COLL_TEST_ENABLE		0x0080  /* Collision test enable */
-#define BMCR_FULL_DUPLEX		0x0100  /* FDX =1, half duplex =0 */
-#define BMCR_RESTART_AUTO_NEG		0x0200  /* Restart auto negotiation */
-#define BMCR_ISOLATE			0x0400  /* Isolate PHY from MII */
-#define BMCR_POWER_DOWN			0x0800  /* Power down */
-#define BMCR_AUTO_NEG_EN		0x1000  /* Auto Neg Enable */
-#define BMCR_SPEED_SELECT_LSB		0x2000  /* bits 6,13: 10=1000, 01=100, 00=10 */
-#define BMCR_LOOPBACK			0x4000  /* 0 = normal, 1 = loopback */
-#define BMCR_RESET			0x8000  /* 0 = normal, 1 = PHY reset */
-#define BMCR_SPEED_MASK			0x2040
-#define BMCR_SPEED_1000			0x0040
-#define BMCR_SPEED_100			0x2000
-#define BMCR_SPEED_10			0x0000
-
-/* PHY Status Register */
-#define MII_BMSR			0x01
-#define BMMSR_EXTENDED_CAPS		0x0001  /* Extended register capabilities */
-#define BMSR_JABBER_DETECT		0x0002  /* Jabber Detected */
-#define BMSR_LINK_STATUS		0x0004  /* Link Status 1 = link */
-#define BMSR_AUTONEG_CAPS		0x0008  /* Auto Neg Capable */
-#define BMSR_REMOTE_FAULT		0x0010  /* Remote Fault Detect */
-#define BMSR_AUTONEG_COMPLETE		0x0020  /* Auto Neg Complete */
-#define BMSR_PREAMBLE_SUPPRESS		0x0040  /* Preamble may be suppressed */
-#define BMSR_EXTENDED_STATUS		0x0100  /* Ext. status info in Reg 0x0F */
-#define BMSR_100T2_HD_CAPS		0x0200  /* 100T2 Half Duplex Capable */
-#define BMSR_100T2_FD_CAPS		0x0400  /* 100T2 Full Duplex Capable */
-#define BMSR_10T_HD_CAPS		0x0800  /* 10T   Half Duplex Capable */
-#define BMSR_10T_FD_CAPS		0x1000  /* 10T   Full Duplex Capable */
-#define BMSR_100X_HD_CAPS		0x2000  /* 100X  Half Duplex Capable */
-#define BMMII_SR_100X_FD_CAPS		0x4000  /* 100X  Full Duplex Capable */
-#define BMMII_SR_100T4_CAPS		0x8000  /* 100T4 Capable */
-
-#define MII_PHYSID1			0x02
-#define MII_PHYSID2			0x03
 #define L1D_MPW_PHYID1			0xD01C  /* V7 */
 #define L1D_MPW_PHYID2			0xD01D  /* V1-V6 */
 #define L1D_MPW_PHYID3			0xD01E  /* V8 */
 
 
 /* Autoneg Advertisement Register */
-#define MII_ADVERTISE			0x04
-#define ADVERTISE_SPEED_MASK		0x01E0
-#define ADVERTISE_DEFAULT_CAP		0x0DE0
+#define ADVERTISE_DEFAULT_CAP \
+	(ADVERTISE_ALL | ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM)
 
 /* 1000BASE-T Control Register */
-#define MII_GIGA_CR			0x09
 #define GIGA_CR_1000T_REPEATER_DTE	0x0400  /* 1=Repeater/switch device port 0=DTE device */
 
 #define GIGA_CR_1000T_MS_VALUE		0x0800  /* 1=Configure PHY as Master 0=Configure PHY as Slave */
-- 
1.7.3.4


^ permalink raw reply related

* Re: [PATCH net-next-2.6] net_sched: RCU conversion of stab
From: Patrick McHardy @ 2011-01-20 15:01 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Jarek Poplawski,
	jamal
In-Reply-To: <1295531299.2825.175.camel@edumazet-laptop>

Am 20.01.2011 14:48, schrieb Eric Dumazet:
> This patch converts stab qdisc management to RCU, so that we can perform
> the qdisc_calculate_pkt_len() call before getting qdisc lock.
> 
> This shortens the lock's held time in __dev_xmit_skb().
> 
> This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding lot of
> cache misses and so reducing latencies.
> 

Looks good to me.


^ permalink raw reply

* Re: MultiPath TCP in the Linux Kernel
From: Christoph Paasch @ 2011-01-20 15:11 UTC (permalink / raw)
  To: Peter Chacko
  Cc: netdev, bonding-devel, linux-sctp, MS PRASAD, Lal Samuel Varghese,
	Gregory S. Tseytin
In-Reply-To: <AANLkTinkqCjQqRKDFnxTF4-uF27XyPzh0gY-vX3fnfSo@mail.gmail.com>

Hi,

MultiPath TCP is not a port of SCTP. It is based on regular TCP and presents a 
regular socket-api to the application. Thus applications do not have to be 
modified.

MPTCP opens several TCP-subflows across it's different IP-addresses, and lets 
the data go over these different TCP sessions. To synchronize the data-
transfer MPTCP uses TCP-options. Thus, on the wire it looks like regular TCP, 
with the only difference being that there are additional TCP-options.

MPTCP increases the throughput, because it uses the TCP-subflows 
simultaneously. With our implementation we got 2Gbps throughput for a single 
iperf-session on a machine having two 1Gb-interfaces (using jumbo-frames), 
whereas regular TCP could only go up to 1Gbps, as it only uses one interface.

To maintain bottleneck-fairness the Coupled Congestion Control controls the 
congestion window of the individual subflows (included in the implementation 
since the latest release).
http://datatracker.ietf.org/doc/draft-ietf-mptcp-congestion/

Cheers,
Christoph

P.S.: We have a public webserver running MPTCP at http://mptcp.info.ucl.ac.be
So you can directly try out the power of MPTCP... ;-)

On Thursday, January 20, 2011 wrote Peter Chacko:
> SCTP already provides that , and is TCP Multi-Path is going to be a port of
> it or any other difference ?
> 
> We are looking to use SCTP for this feature, but as we found it it has not
> kicked off , because of its firewall issues, we are trying add
> Multi-Pathing at application layer, sharing all the congestion states(like
> CM idea) as we are building a WAN optimized storage replication module as
> part of our cloud storage gateway development.
> 
> Curious to see more info on this.
> 
> Thanks
> 
> 2011/1/20 Christoph Paasch <christoph.paasch@uclouvain.be>
> 
> > Hi all,
> > 
> > The IETF is developing a new transport layer solution, MultiPath TCP,
> > which allows to efficiently exploit several Internet paths between a
> > pair of hosts,
> > while presenting a single TCP connection to the application layer.
> > 
> > At the UCLouvain in Belgium we are developping the support for MultiPath
> > TCP
> > in the Linux Kernel. The implementation is a major extension to the Linux
> > TCP-
> > stack.
> > 
> > For general information, access:
> > http://inl.info.ucl.ac.be/mptcp
> > https://scm.info.ucl.ac.be/trac/mptcp/
> > 
> > To access the git-repository:
> > git://scm.info.ucl.ac.be/mtcp.git
> > 
> > branches:
> >        mptcp_2.6.36 - based on Linux Kernel 2.6.36
> >        mtcp_no_subrcvqueue - based on Linux Kernel 2.6.28
> > 
> > For questions, feedback,... feel free to subscribe to the mptcp-dev
> > Mailing-
> > List:
> > https://listes-2.sipr.ucl.ac.be/sympa/info/mptcp-dev
> > 
> > 
> > Regards,
> > Christoph
> > 
> > --
> > Christoph Paasch
> > PhD Student
> > 
> > IP Networking Lab --- http://inl.info.ucl.ac.be
> > MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
> > Université Catholique de Louvain
> > 
> > www.rollerbulls.be
> > --
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Christoph Paasch
PhD Student

IP Networking Lab --- http://inl.info.ucl.ac.be
MultiPath TCP in the Linux Kernel --- http://inl.info.ucl.ac.be/mptcp
Université Catholique de Louvain

www.rollerbulls.be
--

^ permalink raw reply

* [PATCH net-next-2.6] net_sched:  move TCQ_F_THROTTLED flag
From: Eric Dumazet @ 2011-01-20 15:27 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Patrick McHardy, Jesper Dangaard Brouer, Jarek Poplawski,
	jamal
In-Reply-To: <1295531299.2825.175.camel@edumazet-laptop>

In commit 371121057607e (net: QDISC_STATE_RUNNING dont need atomic bit
ops) I moved QDISC_STATE_RUNNING flag to __state container, located in
the cache line containing qdisc lock and often dirtied fields.

I now move TCQ_F_THROTTLED bit too, so that we let first cache line read
mostly, and shared by all cpus. This should speedup HTB/CBQ for example.

Not using test_bit()/__clear_bit()/__test_and_set_bit allows to use an
"unsigned int" for __state container, reducing by 8 bytes Qdisc size.

Introduce helpers to hide implementation details.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Jesper Dangaard Brouer <hawk@diku.dk>
CC: Jarek Poplawski <jarkao2@gmail.com>
CC: Jamal Hadi Salim <hadi@cyberus.ca>
CC: Stephen Hemminger <shemminger@vyatta.com>
---
 include/net/sch_generic.h |   38 ++++++++++++++++++++++++++----------
 net/sched/sch_api.c       |    6 ++---
 net/sched/sch_cbq.c       |    6 ++---
 net/sched/sch_hfsc.c      |    2 -
 net/sched/sch_htb.c       |    4 +--
 net/sched/sch_netem.c     |    2 -
 net/sched/sch_tbf.c       |    2 -
 7 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e9eee99..f6345f5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -31,7 +31,8 @@ enum qdisc_state_t {
  * following bits are only changed while qdisc lock is held
  */
 enum qdisc___state_t {
-	__QDISC___STATE_RUNNING,
+	__QDISC___STATE_RUNNING = 1,
+	__QDISC___STATE_THROTTLED = 2,
 };
 
 struct qdisc_size_table {
@@ -46,10 +47,9 @@ struct Qdisc {
 	struct sk_buff *	(*dequeue)(struct Qdisc *dev);
 	unsigned		flags;
 #define TCQ_F_BUILTIN		1
-#define TCQ_F_THROTTLED		2
-#define TCQ_F_INGRESS		4
-#define TCQ_F_CAN_BYPASS	8
-#define TCQ_F_MQROOT		16
+#define TCQ_F_INGRESS		2
+#define TCQ_F_CAN_BYPASS	4
+#define TCQ_F_MQROOT		8
 #define TCQ_F_WARN_NONWC	(1 << 16)
 	int			padded;
 	struct Qdisc_ops	*ops;
@@ -78,25 +78,43 @@ struct Qdisc {
 	unsigned long		state;
 	struct sk_buff_head	q;
 	struct gnet_stats_basic_packed bstats;
-	unsigned long		__state;
+	unsigned int		__state;
 	struct gnet_stats_queue	qstats;
 	struct rcu_head		rcu_head;
 	spinlock_t		busylock;
 };
 
-static inline bool qdisc_is_running(struct Qdisc *qdisc)
+static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
-	return test_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	return (qdisc->__state & __QDISC___STATE_RUNNING) ? true : false;
 }
 
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
-	return !__test_and_set_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	if (qdisc_is_running(qdisc))
+		return false;
+	qdisc->__state |= __QDISC___STATE_RUNNING;
+	return true;
 }
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
-	__clear_bit(__QDISC___STATE_RUNNING, &qdisc->__state);
+	qdisc->__state &= ~__QDISC___STATE_RUNNING;
+}
+
+static inline bool qdisc_is_throttled(const struct Qdisc *qdisc)
+{
+	return (qdisc->__state & __QDISC___STATE_THROTTLED) ? true : false;
+}
+
+static inline void qdisc_throttled(struct Qdisc *qdisc)
+{
+	qdisc->__state |= __QDISC___STATE_THROTTLED;
+}
+
+static inline void qdisc_unthrottled(struct Qdisc *qdisc)
+{
+	qdisc->__state &= ~__QDISC___STATE_THROTTLED;
 }
 
 struct Qdisc_class_ops {
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 36ac0ec..374fcbe 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -473,7 +473,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
 						 timer);
 
-	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(wd->qdisc);
 	__netif_schedule(qdisc_root(wd->qdisc));
 
 	return HRTIMER_NORESTART;
@@ -495,7 +495,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
 		     &qdisc_root_sleeping(wd->qdisc)->state))
 		return;
 
-	wd->qdisc->flags |= TCQ_F_THROTTLED;
+	qdisc_throttled(wd->qdisc);
 	time = ktime_set(0, 0);
 	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
 	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -505,7 +505,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
 {
 	hrtimer_cancel(&wd->timer);
-	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(wd->qdisc);
 }
 EXPORT_SYMBOL(qdisc_watchdog_cancel);
 
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 4aaf44c..25ed522 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -351,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 {
 	int toplevel = q->toplevel;
 
-	if (toplevel > cl->level && !(cl->q->flags & TCQ_F_THROTTLED)) {
+	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
 		psched_time_t now;
 		psched_tdiff_t incr;
 
@@ -625,7 +625,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
 	}
 
-	sch->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(sch);
 	__netif_schedule(qdisc_root(sch));
 	return HRTIMER_NORESTART;
 }
@@ -974,7 +974,7 @@ cbq_dequeue(struct Qdisc *sch)
 		skb = cbq_dequeue_1(sch);
 		if (skb) {
 			sch->q.qlen--;
-			sch->flags &= ~TCQ_F_THROTTLED;
+			qdisc_unthrottled(sch);
 			return skb;
 		}
 
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index dea4009..b632d92 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1664,7 +1664,7 @@ hfsc_dequeue(struct Qdisc *sch)
 		set_passive(cl);
 	}
 
-	sch->flags &= ~TCQ_F_THROTTLED;
+	qdisc_unthrottled(sch);
 	sch->q.qlen--;
 
 	return skb;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 3e86fd3..39db75c 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -865,7 +865,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
 	skb = __skb_dequeue(&q->direct_queue);
 	if (skb != NULL) {
-		sch->flags &= ~TCQ_F_THROTTLED;
+		qdisc_unthrottled(sch);
 		sch->q.qlen--;
 		return skb;
 	}
@@ -901,7 +901,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
 			skb = htb_dequeue_tree(q, prio, level);
 			if (likely(skb != NULL)) {
 				sch->q.qlen--;
-				sch->flags &= ~TCQ_F_THROTTLED;
+				qdisc_unthrottled(sch);
 				goto fin;
 			}
 		}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index c2bbbe6..c26ef36 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -266,7 +266,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 
-	if (sch->flags & TCQ_F_THROTTLED)
+	if (qdisc_is_throttled(sch))
 		return NULL;
 
 	skb = q->qdisc->ops->peek(q->qdisc);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 475edfb..86c0166 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -185,7 +185,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 			q->tokens = toks;
 			q->ptokens = ptoks;
 			sch->q.qlen--;
-			sch->flags &= ~TCQ_F_THROTTLED;
+			qdisc_unthrottled(sch);
 			return skb;
 		}
 



^ permalink raw reply related

* [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-01-20 15:30 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, bhutchings, eric.dumazet, joe, dilinger,
	Po-Yu Chuang
In-Reply-To: <1295256060-2091-1-git-send-email-ratbert.chuang@gmail.com>

From: Po-Yu Chuang <ratbert@faraday-tech.com>

FTMAC100 Ethernet Media Access Controller supports 10/100 Mbps and
MII.  This driver has been working on some ARM/NDS32 SoC's including
Faraday A320 and Andes AG101.

Signed-off-by: Po-Yu Chuang <ratbert@faraday-tech.com>
---
v2:
always use NAPI
do not use our own net_device_stats structure
don't set trans_start and last_rx
stats.rx_packets and stats.rx_bytes include dropped packets
add missed netif_napi_del()
initialize spinlocks in probe function
remove rx_lock and hw_lock
use netdev_[err/info/dbg] instead of dev_* ones
use netdev_alloc_skb_ip_align()
remove ftmac100_get_stats()
use is_valid_ether_addr() instead of is_zero_ether_addr()
add const to ftmac100_ethtool_ops and ftmac100_netdev_ops
use net_ratelimit() instead of printk_ratelimit()
no explicit inline
use %pM to print MAC address
add comment before wmb
use napi poll() to handle all interrupts

v3:
undo "stats.rx_packets and stats.rx_bytes include dropped packets"
ftmac100_mdio_read() returns 0 if error
fix comment typos
use pr_fmt and pr_info
define INT_MASK_ALL_ENABLED
define MACCR_ENABLE_ALL
do not count length error many times
use bool/true/false
use cpu_to_le32/le32_to_cpu to access descriptors
indent style fix

 drivers/net/Kconfig    |    9 +
 drivers/net/Makefile   |    1 +
 drivers/net/ftmac100.c | 1243 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/ftmac100.h |  180 +++++++
 4 files changed, 1433 insertions(+), 0 deletions(-)
 create mode 100644 drivers/net/ftmac100.c
 create mode 100644 drivers/net/ftmac100.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 4f1755b..26da0ee 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2014,6 +2014,15 @@ config BCM63XX_ENET
 	  This driver supports the ethernet MACs in the Broadcom 63xx
 	  MIPS chipset family (BCM63XX).
 
+config FTMAC100
+	tristate "Faraday FTMAC100 10/100 Ethernet support"
+	depends on ARM
+	select MII
+	help
+	  This driver supports the FTMAC100 Ethernet controller from
+	  Faraday. It is used on Faraday A320, Andes AG101, AG101P
+	  and some other ARM/NDS32 SoC's.
+
 source "drivers/net/fs_enet/Kconfig"
 
 source "drivers/net/octeon/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..7c21711 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -147,6 +147,7 @@ obj-$(CONFIG_FORCEDETH) += forcedeth.o
 obj-$(CONFIG_NE_H8300) += ne-h8300.o 8390.o
 obj-$(CONFIG_AX88796) += ax88796.o
 obj-$(CONFIG_BCM63XX_ENET) += bcm63xx_enet.o
+obj-$(CONFIG_FTMAC100) += ftmac100.o
 
 obj-$(CONFIG_TSI108_ETH) += tsi108_eth.o
 obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o
diff --git a/drivers/net/ftmac100.c b/drivers/net/ftmac100.c
new file mode 100644
index 0000000..3d39fe1
--- /dev/null
+++ b/drivers/net/ftmac100.c
@@ -0,0 +1,1243 @@
+/*
+ * Faraday FTMAC100 10/100 Ethernet
+ *
+ * (C) Copyright 2009-2011 Faraday Technology
+ * Po-Yu Chuang <ratbert@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/mii.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+
+#include "ftmac100.h"
+
+#define DRV_NAME	"ftmac100"
+#define DRV_VERSION	"0.2"
+
+#define RX_QUEUE_ENTRIES	128	/* must be power of 2 */
+#define TX_QUEUE_ENTRIES	16	/* must be power of 2 */
+
+#define MAX_PKT_SIZE		1518
+#define RX_BUF_SIZE		2044	/* must be smaller than 0x7ff */
+
+/******************************************************************************
+ * private data
+ *****************************************************************************/
+struct ftmac100_descs {
+	struct ftmac100_rxdes	rxdes[RX_QUEUE_ENTRIES];
+	struct ftmac100_txdes	txdes[TX_QUEUE_ENTRIES];
+};
+
+struct ftmac100 {
+	struct resource		*res;
+	void			*base;
+	int			irq;
+
+	struct ftmac100_descs	*descs;
+	dma_addr_t		descs_dma_addr;
+
+	unsigned int		rx_pointer;
+	unsigned int		tx_clean_pointer;
+	unsigned int		tx_pointer;
+	unsigned int		tx_pending;
+
+	spinlock_t		tx_lock;
+
+	struct net_device	*netdev;
+	struct device		*dev;
+	struct napi_struct	napi;
+
+	struct mii_if_info	mii;
+};
+
+/******************************************************************************
+ * internal functions (hardware register access)
+ *****************************************************************************/
+#define INT_MASK_ALL_ENABLED	(FTMAC100_INT_RPKT_FINISH	| \
+				 FTMAC100_INT_NORXBUF		| \
+				 FTMAC100_INT_XPKT_OK		| \
+				 FTMAC100_INT_XPKT_LOST		| \
+				 FTMAC100_INT_RPKT_LOST		| \
+				 FTMAC100_INT_AHB_ERR		| \
+				 FTMAC100_INT_PHYSTS_CHG)
+
+static void ftmac100_enable_all_int(struct ftmac100 *priv)
+{
+	iowrite32(INT_MASK_ALL_ENABLED, priv->base + FTMAC100_OFFSET_IMR);
+}
+
+static void ftmac100_disable_all_int(struct ftmac100 *priv)
+{
+	iowrite32(0, priv->base + FTMAC100_OFFSET_IMR);
+}
+
+static void ftmac100_set_rx_ring_base(struct ftmac100 *priv, dma_addr_t addr)
+{
+	iowrite32(addr, priv->base + FTMAC100_OFFSET_RXR_BADR);
+}
+
+static void ftmac100_set_tx_ring_base(struct ftmac100 *priv, dma_addr_t addr)
+{
+	iowrite32(addr, priv->base + FTMAC100_OFFSET_TXR_BADR);
+}
+
+static void ftmac100_txdma_start_polling(struct ftmac100 *priv)
+{
+	iowrite32(1, priv->base + FTMAC100_OFFSET_TXPD);
+}
+
+static int ftmac100_reset(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	int i;
+
+	/* NOTE: reset clears all registers */
+	iowrite32(FTMAC100_MACCR_SW_RST, priv->base + FTMAC100_OFFSET_MACCR);
+
+	for (i = 0; i < 5; i++) {
+		unsigned int maccr;
+
+		maccr = ioread32(priv->base + FTMAC100_OFFSET_MACCR);
+		if (!(maccr & FTMAC100_MACCR_SW_RST)) {
+			/*
+			 * FTMAC100_MACCR_SW_RST cleared does not indicate
+			 * that hardware reset completed (what the f*ck).
+			 * We still need to wait for a while.
+			 */
+			usleep_range(500, 1000);
+			return 0;
+		}
+
+		usleep_range(1000, 10000);
+	}
+
+	netdev_err(netdev, "software reset failed\n");
+	return -EIO;
+}
+
+static void ftmac100_set_mac(struct ftmac100 *priv, const unsigned char *mac)
+{
+	unsigned int maddr = mac[0] << 8 | mac[1];
+	unsigned int laddr = mac[2] << 24 | mac[3] << 16 | mac[4] << 8 | mac[5];
+
+	iowrite32(maddr, priv->base + FTMAC100_OFFSET_MAC_MADR);
+	iowrite32(laddr, priv->base + FTMAC100_OFFSET_MAC_LADR);
+}
+
+#define MACCR_ENABLE_ALL	(FTMAC100_MACCR_XMT_EN	| \
+				 FTMAC100_MACCR_RCV_EN	| \
+				 FTMAC100_MACCR_XDMA_EN	| \
+				 FTMAC100_MACCR_RDMA_EN	| \
+				 FTMAC100_MACCR_CRC_APD	| \
+				 FTMAC100_MACCR_FULLDUP	| \
+				 FTMAC100_MACCR_RX_RUNT	| \
+				 FTMAC100_MACCR_RX_BROADPKT)
+
+static int ftmac100_start_hw(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+
+	if (ftmac100_reset(priv))
+		return -EIO;
+
+	/* setup ring buffer base registers */
+
+	ftmac100_set_rx_ring_base(priv,
+				  priv->descs_dma_addr +
+				  offsetof(struct ftmac100_descs, rxdes));
+	ftmac100_set_tx_ring_base(priv,
+				  priv->descs_dma_addr +
+				  offsetof(struct ftmac100_descs, txdes));
+
+	iowrite32(FTMAC100_APTC_RXPOLL_CNT(1), priv->base + FTMAC100_OFFSET_APTC);
+
+	ftmac100_set_mac(priv, netdev->dev_addr);
+
+	iowrite32(MACCR_ENABLE_ALL, priv->base + FTMAC100_OFFSET_MACCR);
+	return 0;
+}
+
+static void ftmac100_stop_hw(struct ftmac100 *priv)
+{
+	iowrite32(0, priv->base + FTMAC100_OFFSET_MACCR);
+}
+
+/******************************************************************************
+ * internal functions (receive descriptor)
+ *****************************************************************************/
+static bool ftmac100_rxdes_first_segment(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_FRS;
+}
+
+static bool ftmac100_rxdes_last_segment(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_LRS;
+}
+
+static bool ftmac100_rxdes_owned_by_dma(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_RXDMA_OWN;
+}
+
+static void ftmac100_rxdes_set_dma_own(struct ftmac100_rxdes *rxdes)
+{
+	/* clear status bits */
+	rxdes->rxdes0 = cpu_to_le32(FTMAC100_RXDES0_RXDMA_OWN);
+}
+
+static bool ftmac100_rxdes_rx_error(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_RX_ERR;
+}
+
+static bool ftmac100_rxdes_crc_error(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_CRC_ERR;
+}
+
+static bool ftmac100_rxdes_frame_too_long(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_FTL;
+}
+
+static bool ftmac100_rxdes_runt(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_RUNT;
+}
+
+static bool ftmac100_rxdes_odd_nibble(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_RX_ODD_NB;
+}
+
+static unsigned int ftmac100_rxdes_frame_length(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_RFL;
+}
+
+static bool ftmac100_rxdes_multicast(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes0) & FTMAC100_RXDES0_MULTICAST;
+}
+
+static void ftmac100_rxdes_set_buffer_size(struct ftmac100_rxdes *rxdes,
+					   unsigned int size)
+{
+	unsigned int tmp = le32_to_cpu(rxdes->rxdes1);
+
+	tmp &= FTMAC100_RXDES1_EDORR;
+	tmp |= FTMAC100_RXDES1_RXBUF_SIZE(size);
+	rxdes->rxdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_rxdes_set_end_of_ring(struct ftmac100_rxdes *rxdes)
+{
+	unsigned int tmp = le32_to_cpu(rxdes->rxdes1);
+
+	tmp |= FTMAC100_RXDES1_EDORR;
+	rxdes->rxdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_rxdes_set_dma_addr(struct ftmac100_rxdes *rxdes,
+					dma_addr_t addr)
+{
+	rxdes->rxdes2 = cpu_to_le32(addr);
+}
+
+static dma_addr_t ftmac100_rxdes_get_dma_addr(struct ftmac100_rxdes *rxdes)
+{
+	return le32_to_cpu(rxdes->rxdes2);
+}
+
+/* rxdes3 is not used by hardware, we use it to keep track of buffer */
+static void ftmac100_rxdes_set_va(struct ftmac100_rxdes *rxdes, void *addr)
+{
+	rxdes->rxdes3 = cpu_to_le32(addr);
+}
+
+static void *ftmac100_rxdes_get_va(struct ftmac100_rxdes *rxdes)
+{
+	return (void *)le32_to_cpu(rxdes->rxdes3);
+}
+
+/******************************************************************************
+ * internal functions (receive)
+ *****************************************************************************/
+static int ftmac100_next_rx_pointer(int pointer)
+{
+	return (pointer + 1) & (RX_QUEUE_ENTRIES - 1);
+}
+
+static void ftmac100_rx_pointer_advance(struct ftmac100 *priv)
+{
+	priv->rx_pointer = ftmac100_next_rx_pointer(priv->rx_pointer);
+}
+
+static struct ftmac100_rxdes *ftmac100_current_rxdes(struct ftmac100 *priv)
+{
+	return &priv->descs->rxdes[priv->rx_pointer];
+}
+
+static struct ftmac100_rxdes *
+ftmac100_rx_locate_first_segment(struct ftmac100 *priv)
+{
+	struct ftmac100_rxdes *rxdes = ftmac100_current_rxdes(priv);
+
+	while (!ftmac100_rxdes_owned_by_dma(rxdes)) {
+		if (ftmac100_rxdes_first_segment(rxdes))
+			return rxdes;
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	}
+
+	return NULL;
+}
+
+static bool ftmac100_rx_packet_error(struct ftmac100 *priv,
+				     struct ftmac100_rxdes *rxdes)
+{
+	struct net_device *netdev = priv->netdev;
+	bool error = false;
+
+	if (unlikely(ftmac100_rxdes_rx_error(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx err\n");
+
+		netdev->stats.rx_errors++;
+		error = true;
+	}
+
+	if (unlikely(ftmac100_rxdes_crc_error(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx crc err\n");
+
+		netdev->stats.rx_crc_errors++;
+		error = true;
+	}
+
+	if (unlikely(ftmac100_rxdes_frame_too_long(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx frame too long\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	} else if (unlikely(ftmac100_rxdes_runt(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx runt\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	} else if (unlikely(ftmac100_rxdes_odd_nibble(rxdes))) {
+		if (net_ratelimit())
+			netdev_info(netdev, "rx odd nibble\n");
+
+		netdev->stats.rx_length_errors++;
+		error = true;
+	}
+
+	return error;
+}
+
+static void ftmac100_rx_drop_packet(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_rxdes *rxdes = ftmac100_current_rxdes(priv);
+	bool done = false;
+
+	if (net_ratelimit())
+		netdev_dbg(netdev, "drop packet %p\n", rxdes);
+
+	do {
+		if (ftmac100_rxdes_last_segment(rxdes))
+			done = true;
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	} while (!done && !ftmac100_rxdes_owned_by_dma(rxdes));
+
+	netdev->stats.rx_dropped++;
+}
+
+static bool ftmac100_rx_packet(struct ftmac100 *priv, int *processed)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_rxdes *rxdes;
+	struct sk_buff *skb;
+	int length;
+	bool copied = false;
+	bool done = false;
+
+	rxdes = ftmac100_rx_locate_first_segment(priv);
+	if (!rxdes)
+		return false;
+
+	if (unlikely(ftmac100_rx_packet_error(priv, rxdes))) {
+		ftmac100_rx_drop_packet(priv);
+		return true;
+	}
+
+	/* start processing */
+
+	length = ftmac100_rxdes_frame_length(rxdes);
+
+	skb = netdev_alloc_skb_ip_align(netdev, length);
+	if (unlikely(!skb)) {
+		if (net_ratelimit())
+			netdev_err(netdev, "rx skb alloc failed\n");
+
+		ftmac100_rx_drop_packet(priv);
+		return true;
+	}
+
+	if (unlikely(ftmac100_rxdes_multicast(rxdes)))
+		netdev->stats.multicast++;
+
+	do {
+		dma_addr_t d = ftmac100_rxdes_get_dma_addr(rxdes);
+		void *buf = ftmac100_rxdes_get_va(rxdes);
+		int size;
+
+		size = min(length - copied, RX_BUF_SIZE);
+
+		dma_sync_single_for_cpu(priv->dev, d, RX_BUF_SIZE,
+					DMA_FROM_DEVICE);
+		memcpy(skb_put(skb, size), buf, size);
+
+		copied += size;
+
+		if (ftmac100_rxdes_last_segment(rxdes))
+			done = true;
+
+		dma_sync_single_for_device(priv->dev, d, RX_BUF_SIZE,
+					   DMA_FROM_DEVICE);
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+
+		ftmac100_rx_pointer_advance(priv);
+		rxdes = ftmac100_current_rxdes(priv);
+	} while (!done && copied < length);
+
+	skb->protocol = eth_type_trans(skb, netdev);
+
+	/* push packet to protocol stack */
+	netif_receive_skb(skb);
+
+	netdev->stats.rx_packets++;
+	netdev->stats.rx_bytes += skb->len;
+
+	(*processed)++;
+
+	return true;
+}
+
+/******************************************************************************
+ * internal functions (transmit descriptor)
+ *****************************************************************************/
+static void ftmac100_txdes_reset(struct ftmac100_txdes *txdes)
+{
+	/* clear all except end of ring bit */
+	txdes->txdes0 = 0;
+	txdes->txdes1 &= FTMAC100_TXDES1_EDOTR;
+	txdes->txdes2 = 0;
+	txdes->txdes3 = 0;
+}
+
+static bool ftmac100_txdes_owned_by_dma(struct ftmac100_txdes *txdes)
+{
+	return le32_to_cpu(txdes->txdes0) & FTMAC100_TXDES0_TXDMA_OWN;
+}
+
+static void ftmac100_txdes_set_dma_own(struct ftmac100_txdes *txdes)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes0);
+
+	tmp |= FTMAC100_TXDES0_TXDMA_OWN;
+
+	/*
+	 * Make sure dma own bit will not be set before any other
+	 * descriptor fields.
+	 */
+	wmb();
+	txdes->txdes0 = cpu_to_le32(tmp);
+}
+
+static bool ftmac100_txdes_excessive_collision(struct ftmac100_txdes *txdes)
+{
+	return le32_to_cpu(txdes->txdes0) & FTMAC100_TXDES0_TXPKT_EXSCOL;
+}
+
+static bool ftmac100_txdes_late_collision(struct ftmac100_txdes *txdes)
+{
+	return le32_to_cpu(txdes->txdes0) & FTMAC100_TXDES0_TXPKT_LATECOL;
+}
+
+static void ftmac100_txdes_set_end_of_ring(struct ftmac100_txdes *txdes)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes1);
+
+	tmp |= FTMAC100_TXDES1_EDOTR;
+	txdes->txdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_txdes_set_first_segment(struct ftmac100_txdes *txdes)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes1);
+
+	tmp |= FTMAC100_TXDES1_FTS;
+	txdes->txdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_txdes_set_last_segment(struct ftmac100_txdes *txdes)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes1);
+
+	tmp |= FTMAC100_TXDES1_LTS;
+	txdes->txdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_txdes_set_txint(struct ftmac100_txdes *txdes)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes1);
+
+	tmp |= FTMAC100_TXDES1_TXIC;
+	txdes->txdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_txdes_set_buffer_size(struct ftmac100_txdes *txdes,
+					   unsigned int len)
+{
+	unsigned int tmp = le32_to_cpu(txdes->txdes1);
+
+	tmp |= FTMAC100_TXDES1_TXBUF_SIZE(len);
+	txdes->txdes1 = cpu_to_le32(tmp);
+}
+
+static void ftmac100_txdes_set_dma_addr(struct ftmac100_txdes *txdes,
+					dma_addr_t addr)
+{
+	txdes->txdes2 = cpu_to_le32(addr);
+}
+
+static dma_addr_t ftmac100_txdes_get_dma_addr(struct ftmac100_txdes *txdes)
+{
+	return le32_to_cpu(txdes->txdes2);
+}
+
+/* txdes3 is not used by hardware, we use it to keep track of socket buffer */
+static void ftmac100_txdes_set_skb(struct ftmac100_txdes *txdes,
+				   struct sk_buff *skb)
+{
+	txdes->txdes3 = cpu_to_le32(skb);
+}
+
+static struct sk_buff *ftmac100_txdes_get_skb(struct ftmac100_txdes *txdes)
+{
+	return (struct sk_buff *)le32_to_cpu(txdes->txdes3);
+}
+
+/******************************************************************************
+ * internal functions (transmit)
+ *****************************************************************************/
+static int ftmac100_next_tx_pointer(int pointer)
+{
+	return (pointer + 1) & (TX_QUEUE_ENTRIES - 1);
+}
+
+static void ftmac100_tx_pointer_advance(struct ftmac100 *priv)
+{
+	priv->tx_pointer = ftmac100_next_tx_pointer(priv->tx_pointer);
+}
+
+static void ftmac100_tx_clean_pointer_advance(struct ftmac100 *priv)
+{
+	priv->tx_clean_pointer = ftmac100_next_tx_pointer(priv->tx_clean_pointer);
+}
+
+static struct ftmac100_txdes *ftmac100_current_txdes(struct ftmac100 *priv)
+{
+	return &priv->descs->txdes[priv->tx_pointer];
+}
+
+static struct ftmac100_txdes *
+ftmac100_current_clean_txdes(struct ftmac100 *priv)
+{
+	return &priv->descs->txdes[priv->tx_clean_pointer];
+}
+
+static bool ftmac100_tx_complete_packet(struct ftmac100 *priv)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_txdes *txdes;
+	struct sk_buff *skb;
+	dma_addr_t map;
+
+	if (priv->tx_pending == 0)
+		return false;
+
+	txdes = ftmac100_current_clean_txdes(priv);
+
+	if (ftmac100_txdes_owned_by_dma(txdes))
+		return false;
+
+	skb = ftmac100_txdes_get_skb(txdes);
+	map = ftmac100_txdes_get_dma_addr(txdes);
+
+	if (unlikely(ftmac100_txdes_excessive_collision(txdes) ||
+		     ftmac100_txdes_late_collision(txdes))) {
+		/*
+		 * packet transmitted to ethernet lost due to late collision
+		 * or excessive collision
+		 */
+		netdev->stats.tx_aborted_errors++;
+	} else {
+		netdev->stats.tx_packets++;
+		netdev->stats.tx_bytes += skb->len;
+	}
+
+	dma_unmap_single(priv->dev, map, skb_headlen(skb), DMA_TO_DEVICE);
+
+	dev_kfree_skb_irq(skb);
+
+	ftmac100_txdes_reset(txdes);
+
+	ftmac100_tx_clean_pointer_advance(priv);
+
+	priv->tx_pending--;
+	netif_wake_queue(netdev);
+
+	return true;
+}
+
+static void ftmac100_tx_complete(struct ftmac100 *priv)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	while (ftmac100_tx_complete_packet(priv))
+		;
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+static int ftmac100_xmit(struct ftmac100 *priv, struct sk_buff *skb,
+			 dma_addr_t map)
+{
+	struct net_device *netdev = priv->netdev;
+	struct ftmac100_txdes *txdes;
+	unsigned int len = (skb->len < ETH_ZLEN) ? ETH_ZLEN : skb->len;
+	unsigned long flags;
+
+	txdes = ftmac100_current_txdes(priv);
+	ftmac100_tx_pointer_advance(priv);
+
+	/* setup TX descriptor */
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	ftmac100_txdes_set_skb(txdes, skb);
+	ftmac100_txdes_set_dma_addr(txdes, map);
+
+	ftmac100_txdes_set_first_segment(txdes);
+	ftmac100_txdes_set_last_segment(txdes);
+	ftmac100_txdes_set_txint(txdes);
+	ftmac100_txdes_set_buffer_size(txdes, len);
+
+	priv->tx_pending++;
+	if (priv->tx_pending == TX_QUEUE_ENTRIES) {
+		if (net_ratelimit())
+			netdev_info(netdev, "tx queue full\n");
+
+		netif_stop_queue(netdev);
+	}
+
+	/* start transmit */
+	ftmac100_txdes_set_dma_own(txdes);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	ftmac100_txdma_start_polling(priv);
+
+	return NETDEV_TX_OK;
+}
+
+/******************************************************************************
+ * internal functions (buffer)
+ *****************************************************************************/
+static void ftmac100_free_buffers(struct ftmac100 *priv)
+{
+	int i;
+
+	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
+		struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
+		dma_addr_t d = ftmac100_rxdes_get_dma_addr(rxdes);
+		void *page = ftmac100_rxdes_get_va(rxdes);
+
+		if (d)
+			dma_unmap_single(priv->dev, d, PAGE_SIZE,
+					 DMA_FROM_DEVICE);
+
+		if (page != NULL)
+			free_page((unsigned long)page);
+	}
+
+	for (i = 0; i < TX_QUEUE_ENTRIES; i++) {
+		struct ftmac100_txdes *txdes = &priv->descs->txdes[i];
+		struct sk_buff *skb = ftmac100_txdes_get_skb(txdes);
+
+		if (skb) {
+			dma_addr_t map;
+
+			map = ftmac100_txdes_get_dma_addr(txdes);
+			dma_unmap_single(priv->dev, map, skb_headlen(skb),
+					 DMA_TO_DEVICE);
+			dev_kfree_skb(skb);
+		}
+	}
+
+	dma_free_coherent(priv->dev, sizeof(struct ftmac100_descs),
+			  priv->descs, priv->descs_dma_addr);
+}
+
+static int ftmac100_alloc_buffers(struct ftmac100 *priv)
+{
+	int i;
+
+	priv->descs = dma_alloc_coherent(priv->dev,
+					 sizeof(struct ftmac100_descs),
+					 &priv->descs_dma_addr,
+					 GFP_KERNEL | GFP_DMA);
+	if (priv->descs == NULL)
+		return -ENOMEM;
+
+	memset(priv->descs, 0, sizeof(struct ftmac100_descs));
+
+	/* initialize RX ring */
+
+	ftmac100_rxdes_set_end_of_ring(&priv->descs->rxdes[RX_QUEUE_ENTRIES - 1]);
+
+	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
+		struct ftmac100_rxdes *rxdes = &priv->descs->rxdes[i];
+		void *page;
+		dma_addr_t d;
+
+		page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+		if (page == NULL)
+			goto err;
+
+		d = dma_map_single(priv->dev, page, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(priv->dev, d))) {
+			free_page((unsigned long)page);
+			goto err;
+		}
+
+		/*
+		 * The hardware enforces a sub-2K maximum packet size, so we
+		 * put two buffers on every hardware page.
+		 */
+		ftmac100_rxdes_set_va(rxdes, page);
+		ftmac100_rxdes_set_va(rxdes + 1, page + PAGE_SIZE / 2);
+
+		ftmac100_rxdes_set_dma_addr(rxdes, d);
+		ftmac100_rxdes_set_dma_addr(rxdes + 1, d + PAGE_SIZE / 2);
+
+		ftmac100_rxdes_set_buffer_size(rxdes, RX_BUF_SIZE);
+		ftmac100_rxdes_set_buffer_size(rxdes + 1, RX_BUF_SIZE);
+
+		ftmac100_rxdes_set_dma_own(rxdes);
+		ftmac100_rxdes_set_dma_own(rxdes + 1);
+	}
+
+	/* initialize TX ring */
+
+	ftmac100_txdes_set_end_of_ring(&priv->descs->txdes[TX_QUEUE_ENTRIES - 1]);
+	return 0;
+
+err:
+	ftmac100_free_buffers(priv);
+	return -ENOMEM;
+}
+
+/******************************************************************************
+ * struct mii_if_info functions
+ *****************************************************************************/
+static int ftmac100_mdio_read(struct net_device *netdev, int phy_id, int reg)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	unsigned int phycr;
+	int i;
+
+	phycr = FTMAC100_PHYCR_PHYAD(phy_id) |
+		FTMAC100_PHYCR_REGAD(reg) |
+		FTMAC100_PHYCR_MIIRD;
+
+	iowrite32(phycr, priv->base + FTMAC100_OFFSET_PHYCR);
+	for (i = 0; i < 10; i++) {
+		phycr = ioread32(priv->base + FTMAC100_OFFSET_PHYCR);
+
+		if ((phycr & FTMAC100_PHYCR_MIIRD) == 0)
+			return phycr & FTMAC100_PHYCR_MIIRDATA;
+
+		usleep_range(100, 1000);
+	}
+
+	netdev_err(netdev, "mdio read timed out\n");
+	return 0;
+}
+
+static void ftmac100_mdio_write(struct net_device *netdev, int phy_id, int reg,
+				int data)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	unsigned int phycr;
+	int i;
+
+	phycr = FTMAC100_PHYCR_PHYAD(phy_id) |
+		FTMAC100_PHYCR_REGAD(reg) |
+		FTMAC100_PHYCR_MIIWR;
+
+	data = FTMAC100_PHYWDATA_MIIWDATA(data);
+
+	iowrite32(data, priv->base + FTMAC100_OFFSET_PHYWDATA);
+	iowrite32(phycr, priv->base + FTMAC100_OFFSET_PHYCR);
+
+	for (i = 0; i < 10; i++) {
+		phycr = ioread32(priv->base + FTMAC100_OFFSET_PHYCR);
+
+		if ((phycr & FTMAC100_PHYCR_MIIWR) == 0)
+			return;
+
+		usleep_range(100, 1000);
+	}
+
+	netdev_err(netdev, "mdio write timed out\n");
+}
+
+/******************************************************************************
+ * struct ethtool_ops functions
+ *****************************************************************************/
+static void ftmac100_get_drvinfo(struct net_device *netdev,
+				 struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRV_NAME);
+	strcpy(info->version, DRV_VERSION);
+	strcpy(info->bus_info, dev_name(&netdev->dev));
+}
+
+static int ftmac100_get_settings(struct net_device *netdev,
+				 struct ethtool_cmd *cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_ethtool_gset(&priv->mii, cmd);
+}
+
+static int ftmac100_set_settings(struct net_device *netdev,
+				 struct ethtool_cmd *cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_ethtool_sset(&priv->mii, cmd);
+}
+
+static int ftmac100_nway_reset(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_nway_restart(&priv->mii);
+}
+
+static u32 ftmac100_get_link(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	return mii_link_ok(&priv->mii);
+}
+
+static const struct ethtool_ops ftmac100_ethtool_ops = {
+	.set_settings		= ftmac100_set_settings,
+	.get_settings		= ftmac100_get_settings,
+	.get_drvinfo		= ftmac100_get_drvinfo,
+	.nway_reset		= ftmac100_nway_reset,
+	.get_link		= ftmac100_get_link,
+};
+
+/******************************************************************************
+ * interrupt handler
+ *****************************************************************************/
+static irqreturn_t ftmac100_interrupt(int irq, void *dev_id)
+{
+	struct net_device *netdev = dev_id;
+	struct ftmac100 *priv = netdev_priv(netdev);
+
+	if (likely(netif_running(netdev))) {
+		/* Disable interrupts for polling */
+		ftmac100_disable_all_int(priv);
+		napi_schedule(&priv->napi);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/******************************************************************************
+ * struct napi_struct functions
+ *****************************************************************************/
+static int ftmac100_poll(struct napi_struct *napi, int budget)
+{
+	struct ftmac100 *priv = container_of(napi, struct ftmac100, napi);
+	struct net_device *netdev = priv->netdev;
+	unsigned int status;
+	bool completed = true;
+	int rx = 0;
+
+	status = ioread32(priv->base + FTMAC100_OFFSET_ISR);
+
+	if (status & (FTMAC100_INT_RPKT_FINISH | FTMAC100_INT_NORXBUF)) {
+		/*
+		 * FTMAC100_INT_RPKT_FINISH:
+		 *	RX DMA has received packets into RX buffer successfully
+		 *
+		 * FTMAC100_INT_NORXBUF:
+		 *	RX buffer unavailable
+		 */
+		bool retry;
+
+		do {
+			retry = ftmac100_rx_packet(priv, &rx);
+		} while (retry && rx < budget);
+
+		if (retry && rx == budget)
+			completed = false;
+	}
+
+	if (status & FTMAC100_INT_NORXBUF) {
+		/* RX buffer unavailable */
+		if (net_ratelimit())
+			netdev_info(netdev, "INT_NORXBUF\n");
+
+		netdev->stats.rx_over_errors++;
+	}
+
+	if (status & (FTMAC100_INT_XPKT_OK | FTMAC100_INT_XPKT_LOST)) {
+		/*
+		 * FTMAC100_INT_XPKT_OK:
+		 *	 packet transmitted to ethernet successfully
+		 *
+		 * FTMAC100_INT_XPKT_LOST:
+		 *	packet transmitted to ethernet lost due to late
+		 *	collision or excessive collision
+		 */
+		ftmac100_tx_complete(priv);
+	}
+
+	if (status & FTMAC100_INT_RPKT_LOST) {
+		/* received packet lost due to RX FIFO full */
+		if (net_ratelimit())
+			netdev_info(netdev, "INT_RPKT_LOST\n");
+
+		netdev->stats.rx_fifo_errors++;
+	}
+
+	if (status & FTMAC100_INT_AHB_ERR) {
+		/* AHB error */
+		if (net_ratelimit())
+			netdev_info(netdev, "INT_AHB_ERR\n");
+
+		/* do nothing */
+	}
+
+	if (status & FTMAC100_INT_PHYSTS_CHG) {
+		/* PHY link status change */
+		if (net_ratelimit())
+			netdev_info(netdev, "INT_PHYSTS_CHG\n");
+
+		mii_check_link(&priv->mii);
+	}
+
+	if (completed) {
+		/* stop polling */
+		napi_complete(napi);
+		ftmac100_enable_all_int(priv);
+	}
+
+	return rx;
+}
+
+/******************************************************************************
+ * struct net_device_ops functions
+ *****************************************************************************/
+static int ftmac100_open(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	int err;
+
+	err = ftmac100_alloc_buffers(priv);
+	if (err) {
+		netdev_err(netdev, "failed to allocate buffers\n");
+		goto err_alloc;
+	}
+
+	err = request_irq(priv->irq, ftmac100_interrupt, 0, netdev->name,
+		netdev);
+	if (err) {
+		netdev_err(netdev, "failed to request irq %d\n", priv->irq);
+		goto err_irq;
+	}
+
+	priv->rx_pointer = 0;
+	priv->tx_clean_pointer = 0;
+	priv->tx_pointer = 0;
+	priv->tx_pending = 0;
+
+	err = ftmac100_start_hw(priv);
+	if (err)
+		goto err_hw;
+
+	napi_enable(&priv->napi);
+	netif_start_queue(netdev);
+
+	ftmac100_enable_all_int(priv);
+	return 0;
+
+err_hw:
+	free_irq(priv->irq, netdev);
+err_irq:
+	ftmac100_free_buffers(priv);
+err_alloc:
+	return err;
+}
+
+static int ftmac100_stop(struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+
+	ftmac100_disable_all_int(priv);
+	netif_stop_queue(netdev);
+	napi_disable(&priv->napi);
+	ftmac100_stop_hw(priv);
+	free_irq(priv->irq, netdev);
+	ftmac100_free_buffers(priv);
+
+	return 0;
+}
+
+static int ftmac100_hard_start_xmit(struct sk_buff *skb,
+				    struct net_device *netdev)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	dma_addr_t map;
+
+	if (unlikely(skb->len > MAX_PKT_SIZE)) {
+		if (net_ratelimit())
+			netdev_dbg(netdev, "tx packet too big\n");
+
+		netdev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	map = dma_map_single(priv->dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(priv->dev, map))) {
+		/* drop packet */
+		if (net_ratelimit())
+			netdev_err(netdev, "map socket buffer failed\n");
+
+		netdev->stats.tx_dropped++;
+		dev_kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+
+	return ftmac100_xmit(priv, skb, map);
+}
+
+/* optional */
+static int ftmac100_do_ioctl(struct net_device *netdev, struct ifreq *ifr,
+			     int cmd)
+{
+	struct ftmac100 *priv = netdev_priv(netdev);
+	struct mii_ioctl_data *data = if_mii(ifr);
+
+	return generic_mii_ioctl(&priv->mii, data, cmd, NULL);
+}
+
+static const struct net_device_ops ftmac100_netdev_ops = {
+	.ndo_open		= ftmac100_open,
+	.ndo_stop		= ftmac100_stop,
+	.ndo_start_xmit		= ftmac100_hard_start_xmit,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_do_ioctl		= ftmac100_do_ioctl,
+};
+
+/******************************************************************************
+ * struct platform_driver functions
+ *****************************************************************************/
+static int ftmac100_remove(struct platform_device *pdev)
+{
+	struct net_device *netdev;
+	struct ftmac100 *priv;
+
+	netdev = platform_get_drvdata(pdev);
+	if (netdev == NULL)
+		return 0;
+
+	platform_set_drvdata(pdev, NULL);
+
+	priv = netdev_priv(netdev);
+
+	netif_napi_del(&priv->napi);
+	unregister_netdev(netdev);
+
+	if (priv->base != NULL)
+		iounmap(priv->base);
+
+	if (priv->res != NULL)
+		release_resource(priv->res);
+
+	free_netdev(netdev);
+	return 0;
+}
+
+static int ftmac100_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int irq;
+	struct net_device *netdev;
+	struct ftmac100 *priv;
+	int err;
+
+	if (!pdev)
+		return -ENODEV;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -ENXIO;
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0)
+		return irq;
+
+	/* setup net_device */
+
+	netdev = alloc_etherdev(sizeof(struct ftmac100));
+	if (netdev == NULL) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	SET_NETDEV_DEV(netdev, &pdev->dev);
+	SET_ETHTOOL_OPS(netdev, &ftmac100_ethtool_ops);
+	netdev->netdev_ops = &ftmac100_netdev_ops;
+
+	platform_set_drvdata(pdev, netdev);
+
+	/* setup private data */
+
+	priv = netdev_priv(netdev);
+	priv->netdev = netdev;
+	priv->dev = &pdev->dev;
+
+	spin_lock_init(&priv->tx_lock);
+
+	/* initialize NAPI */
+	netif_napi_add(netdev, &priv->napi, ftmac100_poll, 64);
+
+	/* map io memory */
+	priv->res = request_mem_region(res->start, res->end - res->start,
+				       dev_name(&pdev->dev));
+	if (priv->res == NULL) {
+		dev_err(&pdev->dev, "Could not reserve memory region\n");
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	priv->base = ioremap(res->start, res->end - res->start);
+	if (priv->base == NULL) {
+		dev_err(&pdev->dev, "Failed to ioremap ethernet registers\n");
+		err = -EIO;
+		goto err_out;
+	}
+
+	priv->irq = irq;
+
+	/* initialize struct mii_if_info */
+
+	priv->mii.phy_id	= 0;
+	priv->mii.phy_id_mask	= 0x1f;
+	priv->mii.reg_num_mask	= 0x1f;
+	priv->mii.dev		= netdev;
+	priv->mii.mdio_read	= ftmac100_mdio_read;
+	priv->mii.mdio_write	= ftmac100_mdio_write;
+
+	/* register network device */
+
+	err = register_netdev(netdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to register netdev\n");
+		goto err_out;
+	}
+
+	netdev_info(netdev, "irq %d, mapped at %p\n", priv->irq, priv->base);
+
+	if (!is_valid_ether_addr(netdev->dev_addr)) {
+		random_ether_addr(netdev->dev_addr);
+		netdev_info(netdev, "generated random MAC address %pM\n",
+			    netdev->dev_addr);
+	}
+
+	return 0;
+
+err_out:
+	ftmac100_remove(pdev);
+	return err;
+}
+
+static struct platform_driver ftmac100_driver = {
+	.probe		= ftmac100_probe,
+	.remove		= ftmac100_remove,
+	.driver		= {
+		.name	= DRV_NAME,
+		.owner	= THIS_MODULE,
+	},
+};
+
+/******************************************************************************
+ * initialization / finalization
+ *****************************************************************************/
+static int __init ftmac100_init(void)
+{
+	pr_info("Loading version " DRV_VERSION " ...\n");
+	return platform_driver_register(&ftmac100_driver);
+}
+
+static void __exit ftmac100_exit(void)
+{
+	platform_driver_unregister(&ftmac100_driver);
+}
+
+module_init(ftmac100_init);
+module_exit(ftmac100_exit);
+
+MODULE_AUTHOR("Po-Yu Chuang <ratbert@faraday-tech.com>");
+MODULE_DESCRIPTION("FTMAC100 driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ftmac100.h b/drivers/net/ftmac100.h
new file mode 100644
index 0000000..46a0c47
--- /dev/null
+++ b/drivers/net/ftmac100.h
@@ -0,0 +1,180 @@
+/*
+ * Faraday FTMAC100 10/100 Ethernet
+ *
+ * (C) Copyright 2009-2011 Faraday Technology
+ * Po-Yu Chuang <ratbert@faraday-tech.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __FTMAC100_H
+#define __FTMAC100_H
+
+#define	FTMAC100_OFFSET_ISR		0x00
+#define	FTMAC100_OFFSET_IMR		0x04
+#define	FTMAC100_OFFSET_MAC_MADR	0x08
+#define	FTMAC100_OFFSET_MAC_LADR	0x0c
+#define	FTMAC100_OFFSET_MAHT0		0x10
+#define	FTMAC100_OFFSET_MAHT1		0x14
+#define	FTMAC100_OFFSET_TXPD		0x18
+#define	FTMAC100_OFFSET_RXPD		0x1c
+#define	FTMAC100_OFFSET_TXR_BADR	0x20
+#define	FTMAC100_OFFSET_RXR_BADR	0x24
+#define	FTMAC100_OFFSET_ITC		0x28
+#define	FTMAC100_OFFSET_APTC		0x2c
+#define	FTMAC100_OFFSET_DBLAC		0x30
+#define	FTMAC100_OFFSET_MACCR		0x88
+#define	FTMAC100_OFFSET_MACSR		0x8c
+#define	FTMAC100_OFFSET_PHYCR		0x90
+#define	FTMAC100_OFFSET_PHYWDATA	0x94
+#define	FTMAC100_OFFSET_FCR		0x98
+#define	FTMAC100_OFFSET_BPR		0x9c
+#define	FTMAC100_OFFSET_TS		0xc4
+#define	FTMAC100_OFFSET_DMAFIFOS	0xc8
+#define	FTMAC100_OFFSET_TM		0xcc
+#define	FTMAC100_OFFSET_TX_MCOL_SCOL	0xd4
+#define	FTMAC100_OFFSET_RPF_AEP		0xd8
+#define	FTMAC100_OFFSET_XM_PG		0xdc
+#define	FTMAC100_OFFSET_RUNT_TLCC	0xe0
+#define	FTMAC100_OFFSET_CRCER_FTL	0xe4
+#define	FTMAC100_OFFSET_RLC_RCC		0xe8
+#define	FTMAC100_OFFSET_BROC		0xec
+#define	FTMAC100_OFFSET_MULCA		0xf0
+#define	FTMAC100_OFFSET_RP		0xf4
+#define	FTMAC100_OFFSET_XP		0xf8
+
+/*
+ * Interrupt status register & interrupt mask register
+ */
+#define	FTMAC100_INT_RPKT_FINISH	(1 << 0)
+#define	FTMAC100_INT_NORXBUF		(1 << 1)
+#define	FTMAC100_INT_XPKT_FINISH	(1 << 2)
+#define	FTMAC100_INT_NOTXBUF		(1 << 3)
+#define	FTMAC100_INT_XPKT_OK		(1 << 4)
+#define	FTMAC100_INT_XPKT_LOST		(1 << 5)
+#define	FTMAC100_INT_RPKT_SAV		(1 << 6)
+#define	FTMAC100_INT_RPKT_LOST		(1 << 7)
+#define	FTMAC100_INT_AHB_ERR		(1 << 8)
+#define	FTMAC100_INT_PHYSTS_CHG		(1 << 9)
+
+/*
+ * Interrupt timer control register
+ */
+#define FTMAC100_ITC_RXINT_CNT(x)	(((x) & 0xf) << 0)
+#define FTMAC100_ITC_RXINT_THR(x)	(((x) & 0x7) << 4)
+#define FTMAC100_ITC_RXINT_TIME_SEL	(1 << 7)
+#define FTMAC100_ITC_TXINT_CNT(x)	(((x) & 0xf) << 8)
+#define FTMAC100_ITC_TXINT_THR(x)	(((x) & 0x7) << 12)
+#define FTMAC100_ITC_TXINT_TIME_SEL	(1 << 15)
+
+/*
+ * Automatic polling timer control register
+ */
+#define	FTMAC100_APTC_RXPOLL_CNT(x)	(((x) & 0xf) << 0)
+#define	FTMAC100_APTC_RXPOLL_TIME_SEL	(1 << 4)
+#define	FTMAC100_APTC_TXPOLL_CNT(x)	(((x) & 0xf) << 8)
+#define	FTMAC100_APTC_TXPOLL_TIME_SEL	(1 << 12)
+
+/*
+ * DMA burst length and arbitration control register
+ */
+#define FTMAC100_DBLAC_INCR4_EN		(1 << 0)
+#define FTMAC100_DBLAC_INCR8_EN		(1 << 1)
+#define FTMAC100_DBLAC_INCR16_EN	(1 << 2)
+#define FTMAC100_DBLAC_RXFIFO_LTHR(x)	(((x) & 0x7) << 3)
+#define FTMAC100_DBLAC_RXFIFO_HTHR(x)	(((x) & 0x7) << 6)
+#define FTMAC100_DBLAC_RX_THR_EN	(1 << 9)
+
+/*
+ * MAC control register
+ */
+#define	FTMAC100_MACCR_XDMA_EN		(1 << 0)
+#define	FTMAC100_MACCR_RDMA_EN		(1 << 1)
+#define	FTMAC100_MACCR_SW_RST		(1 << 2)
+#define	FTMAC100_MACCR_LOOP_EN		(1 << 3)
+#define	FTMAC100_MACCR_CRC_DIS		(1 << 4)
+#define	FTMAC100_MACCR_XMT_EN		(1 << 5)
+#define	FTMAC100_MACCR_ENRX_IN_HALFTX	(1 << 6)
+#define	FTMAC100_MACCR_RCV_EN		(1 << 8)
+#define	FTMAC100_MACCR_HT_MULTI_EN	(1 << 9)
+#define	FTMAC100_MACCR_RX_RUNT		(1 << 10)
+#define	FTMAC100_MACCR_RX_FTL		(1 << 11)
+#define	FTMAC100_MACCR_RCV_ALL		(1 << 12)
+#define	FTMAC100_MACCR_CRC_APD		(1 << 14)
+#define	FTMAC100_MACCR_FULLDUP		(1 << 15)
+#define	FTMAC100_MACCR_RX_MULTIPKT	(1 << 16)
+#define	FTMAC100_MACCR_RX_BROADPKT	(1 << 17)
+
+/*
+ * PHY control register
+ */
+#define FTMAC100_PHYCR_MIIRDATA		0xffff
+#define FTMAC100_PHYCR_PHYAD(x)		(((x) & 0x1f) << 16)
+#define FTMAC100_PHYCR_REGAD(x)		(((x) & 0x1f) << 21)
+#define FTMAC100_PHYCR_MIIRD		(1 << 26)
+#define FTMAC100_PHYCR_MIIWR		(1 << 27)
+
+/*
+ * PHY write data register
+ */
+#define FTMAC100_PHYWDATA_MIIWDATA(x)	((x) & 0xffff)
+
+/*
+ * Transmit descriptor, aligned to 16 bytes
+ */
+struct ftmac100_txdes {
+	unsigned int	txdes0;
+	unsigned int	txdes1;
+	unsigned int	txdes2;	/* TXBUF_BADR */
+	unsigned int	txdes3;	/* not used by HW */
+} __attribute__ ((aligned(16)));
+
+#define	FTMAC100_TXDES0_TXPKT_LATECOL	(1 << 0)
+#define	FTMAC100_TXDES0_TXPKT_EXSCOL	(1 << 1)
+#define	FTMAC100_TXDES0_TXDMA_OWN	(1 << 31)
+
+#define	FTMAC100_TXDES1_TXBUF_SIZE(x)	((x) & 0x7ff)
+#define	FTMAC100_TXDES1_LTS		(1 << 27)
+#define	FTMAC100_TXDES1_FTS		(1 << 28)
+#define	FTMAC100_TXDES1_TX2FIC		(1 << 29)
+#define	FTMAC100_TXDES1_TXIC		(1 << 30)
+#define	FTMAC100_TXDES1_EDOTR		(1 << 31)
+
+/*
+ * Receive descriptor, aligned to 16 bytes
+ */
+struct ftmac100_rxdes {
+	unsigned int	rxdes0;
+	unsigned int	rxdes1;
+	unsigned int	rxdes2;	/* RXBUF_BADR */
+	unsigned int	rxdes3;	/* not used by HW */
+} __attribute__ ((aligned(16)));
+
+#define	FTMAC100_RXDES0_RFL		0x7ff
+#define	FTMAC100_RXDES0_MULTICAST	(1 << 16)
+#define	FTMAC100_RXDES0_BROADCAST	(1 << 17)
+#define	FTMAC100_RXDES0_RX_ERR		(1 << 18)
+#define	FTMAC100_RXDES0_CRC_ERR		(1 << 19)
+#define	FTMAC100_RXDES0_FTL		(1 << 20)
+#define	FTMAC100_RXDES0_RUNT		(1 << 21)
+#define	FTMAC100_RXDES0_RX_ODD_NB	(1 << 22)
+#define	FTMAC100_RXDES0_LRS		(1 << 28)
+#define	FTMAC100_RXDES0_FRS		(1 << 29)
+#define	FTMAC100_RXDES0_RXDMA_OWN	(1 << 31)
+
+#define	FTMAC100_RXDES1_RXBUF_SIZE(x)	((x) & 0x7ff)
+#define	FTMAC100_RXDES1_EDORR		(1 << 31)
+
+#endif /* __FTMAC100_H */
-- 
1.6.3.3


^ permalink raw reply related

* Re: Bonding on bond
From: Jiri Bohac @ 2011-01-20 15:31 UTC (permalink / raw)
  To: Nicolas de Pesloüan
  Cc: Jiri Bohac, Jay Vosburgh, bonding-devel@lists.sourceforge.net,
	netdev@vger.kernel.org
In-Reply-To: <4D374A8F.2020303@gmail.com>

On Wed, Jan 19, 2011 at 09:33:19PM +0100, Nicolas de Pesloüan wrote:
> Even if it is possible to test for slave and for master with a
> single condition (IFF_BONDING), I suggest to split the tests and the
> error messages, to give end user the best possible diagnostic.

OK, why not. The below patch still uses IFF_BONDING to detect a
master is being enslaved, because IFF_MASTER is also used by the
eql driver. No idea if it works / someone ever uses it with
bonding, but it might collide.

bonding: prohibit enslaving of bonding masters

Nested bonding is not supported and will result in strange problems, e.g.:
- netif_receive_skb() will not properly change skb->dev to point to the
  uppoer-most bonding master
- arp monitor will not work (dev->last_rx is only updated by hardware drivers)
- accidentally enslaving a bonding master to itself will cause an infinite
  recursion in the TX path

This patch prevents this by prohibiting a bonding master from being further enslaved.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b1025b8..b117dd8 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1453,6 +1453,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 		return -EBUSY;
 	}
 
+	/* cannot enslave a master */
+	if (slave_dev->priv_flags & IFF_BONDING) {
+		pr_debug("Error, cannot enslave a bonding master\n");
+		return -EBUSY;
+	}
+
 	/* vlan challenged mutual exclusion */
 	/* no need to lock since we're protected by rtnl_lock */
 	if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply related

* Re: [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Eric Dumazet @ 2011-01-20 15:35 UTC (permalink / raw)
  To: Po-Yu Chuang
  Cc: netdev, linux-kernel, bhutchings, joe, dilinger, Po-Yu Chuang
In-Reply-To: <1295537418-2057-1-git-send-email-ratbert.chuang@gmail.com>

Le jeudi 20 janvier 2011 à 23:30 +0800, Po-Yu Chuang a écrit :

> +	/* push packet to protocol stack */
> +	netif_receive_skb(skb);
> +
> +	netdev->stats.rx_packets++;
> +	netdev->stats.rx_bytes += skb->len;
> +
> +	(*processed)++;
> +
> +	return true;
> +}
> +

Hmm, after call to netif_receive_skb(skb), you are not allowed to access
skb anymore (maybe it was freed)

netdev->stats.rx_packets++;
netdev->stats.rx_bytes += skb->len;
netif_receive_skb(skb);

^ permalink raw reply

* Re: MultiPath TCP in the Linux Kernel
From: Hagen Paul Pfeifer @ 2011-01-20 15:38 UTC (permalink / raw)
  To: christoph.paasch; +Cc: netdev, bonding-devel, linux-sctp
In-Reply-To: <201101201509.34536.christoph.paasch@uclouvain.be>


On Thu, 20 Jan 2011 15:09:34 +0100, Christoph Paasch wrote:

> Hi all,

> 

> The IETF is developing a new transport layer solution, MultiPath TCP,

> which 

> allows to efficiently exploit several Internet paths between a pair of

> hosts, 

> while presenting a single TCP connection to the application layer.

> 

> At the UCLouvain in Belgium we are developping the support for MultiPath

> TCP 

> in the Linux Kernel. The implementation is a major extension to the

Linux

> TCP-

> stack.



Hello Christoph,



if you want that your work becomes part of the official network stack you

should align your effort on the official Linux way. This means you should

split your work and publish patches on this maillinglist.



Cheers, Hagen

^ permalink raw reply

* Re: [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Eric Dumazet @ 2011-01-20 15:41 UTC (permalink / raw)
  To: Po-Yu Chuang
  Cc: netdev, linux-kernel, bhutchings, joe, dilinger, Po-Yu Chuang
In-Reply-To: <1295537418-2057-1-git-send-email-ratbert.chuang@gmail.com>

Le jeudi 20 janvier 2011 à 23:30 +0800, Po-Yu Chuang a écrit :

> +static bool ftmac100_tx_complete_packet(struct ftmac100 *priv)
> +{
> +	struct net_device *netdev = priv->netdev;
> +	struct ftmac100_txdes *txdes;
> +	struct sk_buff *skb;
> +	dma_addr_t map;
> +
> +	if (priv->tx_pending == 0)
> +		return false;
> +
> +	txdes = ftmac100_current_clean_txdes(priv);
> +
> +	if (ftmac100_txdes_owned_by_dma(txdes))
> +		return false;
> +
> +	skb = ftmac100_txdes_get_skb(txdes);
> +	map = ftmac100_txdes_get_dma_addr(txdes);
> +
> +	if (unlikely(ftmac100_txdes_excessive_collision(txdes) ||
> +		     ftmac100_txdes_late_collision(txdes))) {
> +		/*
> +		 * packet transmitted to ethernet lost due to late collision
> +		 * or excessive collision
> +		 */
> +		netdev->stats.tx_aborted_errors++;
> +	} else {
> +		netdev->stats.tx_packets++;
> +		netdev->stats.tx_bytes += skb->len;
> +	}
> +
> +	dma_unmap_single(priv->dev, map, skb_headlen(skb), DMA_TO_DEVICE);
> +
> +	dev_kfree_skb_irq(skb);
> +
> +	ftmac100_txdes_reset(txdes);
> +
> +	ftmac100_tx_clean_pointer_advance(priv);
> +
> +	priv->tx_pending--;
> +	netif_wake_queue(netdev);
> +
> +	return true;
> +}
> +
> +static void ftmac100_tx_complete(struct ftmac100 *priv)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&priv->tx_lock, flags);
> +	while (ftmac100_tx_complete_packet(priv))
> +		;
> +	spin_unlock_irqrestore(&priv->tx_lock, flags);
> +}
> +

I dont understand why you still block hard IRQS, after full NAPI
conversion.

Now you run from NAPI, and softirq handler, are you sure you still need
to block hard IRQ and tx_lock ?

It seems to me ftmac100_xmit() could only block softirqs (but they
already are blocked by caller), so you could use spin_lock() from
ftmac100_xmit()

^ permalink raw reply

* Re: MultiPath TCP in the Linux Kernel
From: Eric Dumazet @ 2011-01-20 15:42 UTC (permalink / raw)
  To: Hagen Paul Pfeifer; +Cc: christoph.paasch, netdev, bonding-devel, linux-sctp
In-Reply-To: <7e28a5ddc16848dcd98c05351113304f@localhost>

Le jeudi 20 janvier 2011 à 16:38 +0100, Hagen Paul Pfeifer a écrit :

> Hello Christoph,
> 
> if you want that your work becomes part of the official network stack you
> should align your effort on the official Linux way. This means you should
> split your work and publish patches on this maillinglist.

Hmm, they probably know that, and prefer to wait MTCP stuff is mature
before patch submission :)



^ permalink raw reply

* Re: [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-01-20 15:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, linux-kernel, bhutchings, joe, dilinger, Po-Yu Chuang
In-Reply-To: <1295537746.2825.298.camel@edumazet-laptop>

Dear Eric,

On Thu, Jan 20, 2011 at 11:35 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le jeudi 20 janvier 2011 à 23:30 +0800, Po-Yu Chuang a écrit :
>
>> +     /* push packet to protocol stack */
>> +     netif_receive_skb(skb);
>> +
>> +     netdev->stats.rx_packets++;
>> +     netdev->stats.rx_bytes += skb->len;
>> +
>> +     (*processed)++;
>> +
>> +     return true;
>> +}
>> +
>
> Hmm, after call to netif_receive_skb(skb), you are not allowed to access
> skb anymore (maybe it was freed)
>
> netdev->stats.rx_packets++;
> netdev->stats.rx_bytes += skb->len;
> netif_receive_skb(skb);

Wow, you are totally right.

Thanks,
Po-Yu Chuang

^ permalink raw reply

* Re: [PATCH v3 00/16] make rpc_pipefs be mountable multiple time
From: J. Bruce Fields @ 2011-01-20 15:52 UTC (permalink / raw)
  To: Rob Landley
  Cc: Kirill A. Shutemov, Trond Myklebust, Neil Brown, Pavel Emelyanov,
	linux-nfs, David S. Miller, Al Viro, containers, netdev,
	linux-kernel
In-Reply-To: <4D383F60.4080907@parallels.com>

On Thu, Jan 20, 2011 at 07:57:52AM -0600, Rob Landley wrote:
> I've also mounted NFS shares in test environments I never mounted
> rpc_pipefs in.  (It's possible that the nfs.mount command was doing so
> for me, and apparently unmounting it again afterwards.)  My test
> environment is exporting with the userspace NFS daemon so it's not using
> the kernel cacheing infrastructure.

If you grep for rpc_mkpipe() you'll see it's only used in the idmapping
and rpcsec_gss code; which means you wouldn't need it unless using NFSv4
(which uses idmapping) or rpcsec_gss.

> > There is client dir (nfs/clntX) in rpc_pipefs for every sunrpc client.
> > Both client and server (see fs/nfsd/nfs4callback.c) can create sunrpc
> > client. So we rpc_pipefs on both side.
> 
> Ok, both client and server can create instances.  And use them to do what?
> 
> I understand that the kernel needs to handle RPC calls to service NFS
> requests, what I'm not figuring out is how userspace is involved after
> the initial mount except on the other side of filesystem-agnostic system
> calls.

If you want to see the code on the userland side of the interfaces, see
utils/gssd and utils/idmapd in the nfs-utils code.

--b.

^ permalink raw reply

* Re: [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-01-20 15:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, linux-kernel, bhutchings, joe, dilinger, Po-Yu Chuang
In-Reply-To: <1295538105.2825.308.camel@edumazet-laptop>

Dear Eric,

On Thu, Jan 20, 2011 at 11:41 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le jeudi 20 janvier 2011 à 23:30 +0800, Po-Yu Chuang a écrit :
>
>> +static bool ftmac100_tx_complete_packet(struct ftmac100 *priv)
>> +{
>> +     struct net_device *netdev = priv->netdev;
>> +     struct ftmac100_txdes *txdes;
>> +     struct sk_buff *skb;
>> +     dma_addr_t map;
>> +
>> +     if (priv->tx_pending == 0)
>> +             return false;
>> +
>> +     txdes = ftmac100_current_clean_txdes(priv);
>> +
>> +     if (ftmac100_txdes_owned_by_dma(txdes))
>> +             return false;
>> +
>> +     skb = ftmac100_txdes_get_skb(txdes);
>> +     map = ftmac100_txdes_get_dma_addr(txdes);
>> +
>> +     if (unlikely(ftmac100_txdes_excessive_collision(txdes) ||
>> +                  ftmac100_txdes_late_collision(txdes))) {
>> +             /*
>> +              * packet transmitted to ethernet lost due to late collision
>> +              * or excessive collision
>> +              */
>> +             netdev->stats.tx_aborted_errors++;
>> +     } else {
>> +             netdev->stats.tx_packets++;
>> +             netdev->stats.tx_bytes += skb->len;
>> +     }
>> +
>> +     dma_unmap_single(priv->dev, map, skb_headlen(skb), DMA_TO_DEVICE);
>> +
>> +     dev_kfree_skb_irq(skb);
>> +
>> +     ftmac100_txdes_reset(txdes);
>> +
>> +     ftmac100_tx_clean_pointer_advance(priv);
>> +
>> +     priv->tx_pending--;
>> +     netif_wake_queue(netdev);
>> +
>> +     return true;
>> +}
>> +
>> +static void ftmac100_tx_complete(struct ftmac100 *priv)
>> +{
>> +     unsigned long flags;
>> +
>> +     spin_lock_irqsave(&priv->tx_lock, flags);
>> +     while (ftmac100_tx_complete_packet(priv))
>> +             ;
>> +     spin_unlock_irqrestore(&priv->tx_lock, flags);
>> +}
>> +
>
> I dont understand why you still block hard IRQS, after full NAPI
> conversion.
>
> Now you run from NAPI, and softirq handler, are you sure you still need
> to block hard IRQ and tx_lock ?
>
> It seems to me ftmac100_xmit() could only block softirqs (but they
> already are blocked by caller), so you could use spin_lock() from
> ftmac100_xmit()

I was not quite clear about when to use what kinds of locking.
After your explanation, now I understand.
I will submit v4 tomorrow.

really appreciate,
Po-Yu Chuang

^ permalink raw reply

* Re: Bonding on bond
From: Nicolas de Pesloüan @ 2011-01-20 16:12 UTC (permalink / raw)
  To: Jiri Bohac
  Cc: Jay Vosburgh, bonding-devel@lists.sourceforge.net,
	netdev@vger.kernel.org
In-Reply-To: <20110120153110.GA3931@midget.suse.cz>

Le 20/01/2011 16:31, Jiri Bohac a écrit :
> On Wed, Jan 19, 2011 at 09:33:19PM +0100, Nicolas de Pesloüan wrote:
>> Even if it is possible to test for slave and for master with a
>> single condition (IFF_BONDING), I suggest to split the tests and the
>> error messages, to give end user the best possible diagnostic.
>
> OK, why not. The below patch still uses IFF_BONDING to detect a
> master is being enslaved, because IFF_MASTER is also used by the
> eql driver. No idea if it works / someone ever uses it with
> bonding, but it might collide.

Thanks Jiri.

> bonding: prohibit enslaving of bonding masters
>
> Nested bonding is not supported and will result in strange problems, e.g.:
> - netif_receive_skb() will not properly change skb->dev to point to the
>    uppoer-most bonding master
> - arp monitor will not work (dev->last_rx is only updated by hardware drivers)
> - accidentally enslaving a bonding master to itself will cause an infinite
>    recursion in the TX path
>
> This patch prevents this by prohibiting a bonding master from being further enslaved.
>
> Signed-off-by: Jiri Bohac<jbohac@suse.cz>

Reviewed-by: Nicolas de Pesloüan <nicolas.2p.debian@free.fr>

> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index b1025b8..b117dd8 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1453,6 +1453,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
>   		return -EBUSY;
>   	}
>
> +	/* cannot enslave a master */
> +	if (slave_dev->priv_flags&  IFF_BONDING) {
> +		pr_debug("Error, cannot enslave a bonding master\n");
> +		return -EBUSY;
> +	}
> +
>   	/* vlan challenged mutual exclusion */
>   	/* no need to lock since we're protected by rtnl_lock */
>   	if (slave_dev->features&  NETIF_F_VLAN_CHALLENGED) {
>


^ permalink raw reply

* Re: [PATCH net-next-2.6] net_sched: RCU conversion of stab
From: Eric Dumazet @ 2011-01-20 16:39 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: David Miller, netdev, Jesper Dangaard Brouer, Jarek Poplawski,
	jamal
In-Reply-To: <4D384E2D.5050001@trash.net>

Le jeudi 20 janvier 2011 à 16:01 +0100, Patrick McHardy a écrit :
> Am 20.01.2011 14:48, schrieb Eric Dumazet:
> > This patch converts stab qdisc management to RCU, so that we can perform
> > the qdisc_calculate_pkt_len() call before getting qdisc lock.
> > 
> > This shortens the lock's held time in __dev_xmit_skb().
> > 
> > This permits more qdiscs to get TCQ_F_CAN_BYPASS status, avoiding lot of
> > cache misses and so reducing latencies.
> > 
> 
> Looks good to me.
> 

Thanks for reviewing Patrick !



^ permalink raw reply

* [PATCH] net: ipv6: sit: fix rcu annotations
From: Eric Dumazet @ 2011-01-20 17:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Fix minor __rcu annotations and remove sparse warnings

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv6/sit.c |   23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 8ce38f1..b1599a3 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -412,7 +412,7 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
 
 	p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
 	do {
-		n = p->next;
+		n = rcu_dereference_protected(p->next, 1);
 		kfree(p);
 		p = n;
 	} while (p);
@@ -421,15 +421,17 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
 static int
 ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
 {
-	struct ip_tunnel_prl_entry *x, **p;
+	struct ip_tunnel_prl_entry *x;
+	struct ip_tunnel_prl_entry __rcu **p;
 	int err = 0;
 
 	ASSERT_RTNL();
 
 	if (a && a->addr != htonl(INADDR_ANY)) {
-		for (p = &t->prl; *p; p = &(*p)->next) {
-			if ((*p)->addr == a->addr) {
-				x = *p;
+		for (p = &t->prl;
+		     (x = rtnl_dereference(*p)) != NULL;
+		     p = &x->next) {
+			if (x->addr == a->addr) {
 				*p = x->next;
 				call_rcu(&x->rcu_head, prl_entry_destroy_rcu);
 				t->prl_count--;
@@ -438,9 +440,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
 		}
 		err = -ENXIO;
 	} else {
-		if (t->prl) {
+		x = rtnl_dereference(t->prl);
+		if (x) {
 			t->prl_count = 0;
-			x = t->prl;
 			call_rcu(&x->rcu_head, prl_list_destroy_rcu);
 			t->prl = NULL;
 		}
@@ -1179,7 +1181,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 	dev_hold(dev);
-	sitn->tunnels_wc[0]	= tunnel;
+	rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
 	return 0;
 }
 
@@ -1196,11 +1198,12 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t = sitn->tunnels[prio][h];
+			struct ip_tunnel *t;
 
+			t = rtnl_dereference(sitn->tunnels[prio][h]);
 			while (t != NULL) {
 				unregister_netdevice_queue(t->dev, head);
-				t = t->next;
+				t = rtnl_dereference(t->next);
 			}
 		}
 	}



^ permalink raw reply related

* Re: Question on RX packet classification in ethtool userspace
From: Alexander H Duyck @ 2011-01-20 17:24 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: David Miller, santwona.behera@sun.com, jeff@garzik.org,
	netdev@vger.kernel.org
In-Reply-To: <1295479169.2906.53.camel@localhost>

On Wed, 2011-01-19 at 15:19 -0800, Ben Hutchings wrote:
> On Wed, 2011-01-19 at 14:52 -0800, Duyck, Alexander H wrote:
> > So I was looking into the option of replacing the ETHTOOL_GRXNTUPLE
> > call with something like ETHTOOL_GRXCLSRLALL and it seems like I
> > wasn't having much luck finding the userspace implementation in the
> > ethtool.
> 
> I wasn't aware until now that there was a public implementation!
> 
> > I eventually found a patch in patchwork at
> > http://patchwork.ozlabs.org/patch/23223/ which is supposed to add
> > support but the current ethtool git tree doesn't appear to contain
> > this code.  I was wondering if I am missing something and it is on a
> > branch somewhere, or is this something that was not applied to the
> > userspace for some specific reason?
> 
> Thanks for digging this up.  It's not in any branch that I know of.  I'm
> about to go on vacation, but I'll look at it when I get back.  I would
> appreciate it if someone would refresh the patch against current ethtool
> (preferably reusing some of the functions added for the RX n-tuple
> operations).
> 
> Ben.
> 

I'll look into updating the patch.

Thanks,

Alex


^ permalink raw reply

* [PATCH] ipv6: raw: rcu annotations
From: Eric Dumazet @ 2011-01-20 17:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Remove sparse warnings, using a function typedef to be able to use __rcu
annotation on mh_filter pointer.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv6/raw.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c3952..2bc6cd7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -123,18 +123,18 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
 }
 
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
-static int (*mh_filter)(struct sock *sock, struct sk_buff *skb);
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
 
-int rawv6_mh_filter_register(int (*filter)(struct sock *sock,
-					   struct sk_buff *skb))
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+int rawv6_mh_filter_register(mh_filter_t filter)
 {
 	rcu_assign_pointer(mh_filter, filter);
 	return 0;
 }
 EXPORT_SYMBOL(rawv6_mh_filter_register);
 
-int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock,
-					     struct sk_buff *skb))
+int rawv6_mh_filter_unregister(mh_filter_t filter)
 {
 	rcu_assign_pointer(mh_filter, NULL);
 	synchronize_rcu();
@@ -192,10 +192,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 			 * policy is placed in rawv6_rcv() because it is
 			 * required for each socket.
 			 */
-			int (*filter)(struct sock *sock, struct sk_buff *skb);
+			mh_filter_t *filter;
 
 			filter = rcu_dereference(mh_filter);
-			filtered = filter ? filter(sk, skb) : 0;
+			filtered = filter ? (*filter)(sk, skb) : 0;
 			break;
 		}
 #endif



^ permalink raw reply related

* [PATCH] CHOKe flow scheduler (0.10)
From: Stephen Hemminger @ 2011-01-20 17:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Patrick McHardy, David Miller, netdev
In-Reply-To: <1295379257.9097.9.camel@edumazet-laptop>

CHOKe ("CHOose and Kill" or "CHOose and Keep") is an alternative
packet scheduler based on the Random Exponential Drop (RED) algorithm.

The core idea is:
  For every packet arrival:
  	Calculate Qave
	if (Qave < minth) 
	     Queue the new packet
	else 
	     Select randomly a packet from the queue 
	     if (both packets from same flow)
	     then Drop both the packets
	     else if (Qave > maxth)
	          Drop packet
	     else
	       	  Admit packet with proability p (same as RED)

See also:
  Rong Pan, Balaji Prabhakar, Konstantinos Psounis, "CHOKe: a stateless active
   queue management scheme for approximating fair bandwidth allocation", 
  Proceeding of INFOCOM'2000, March 2000.

Help from:
     Eric Dumazet <eric.dumazet@gmail.com>
     Patrick McHardy <kaber@trash.net>

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
0.10 The internal flow classifier now does full compare of IPv4/IPv6 headers
  to avoid possible DoS attack from hash collision. Internal flow match code
  needs review/testing.

 include/linux/pkt_sched.h |   29 +
 net/sched/Kconfig         |   11 
 net/sched/Makefile        |    1 
 net/sched/sch_choke.c     |  713 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 754 insertions(+)

--- a/net/sched/Kconfig	2011-01-18 09:27:32.224666104 -0800
+++ b/net/sched/Kconfig	2011-01-18 09:27:45.522274178 -0800
@@ -205,6 +205,17 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_CHOKE
+	tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
+	help
+	  Say Y here if you want to use the CHOKe packet scheduler (CHOose
+	  and Keep for responsive flows, CHOose and Kill for unresponsive
+	  flows). This is a variation of RED which trys to penalize flows
+	  that monopolize the queue.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_choke.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
--- a/net/sched/Makefile	2011-01-18 09:27:32.232666939 -0800
+++ b/net/sched/Makefile	2011-01-18 09:27:45.526274731 -0800
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_mult
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/net/sched/sch_choke.c	2011-01-20 08:33:06.021631871 -0800
@@ -0,0 +1,713 @@
+/*
+ * net/sched/sch_choke.c	CHOKE scheduler
+ *
+ * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
+ * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/reciprocal_div.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/red.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+
+/*
+   CHOKe stateless AQM for fair bandwidth allocation
+   =================================================
+
+   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
+   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
+   maintains no flow state. The difference from RED is an additional step
+   during the enqueuing process. If average queue size is over the
+   low threshold (qmin), a packet is chosen at random from the queue.
+   If both the new and chosen packet are from the same flow, both
+   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
+   needs to access packets in queue randomly. It has a minimal class
+   interface to allow overriding the builtin flow classifier with
+   filters.
+
+   Source:
+   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
+   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
+   IEEE INFOCOM, 2000.
+
+   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
+   Characteristics", IEEE/ACM Transactions on Networking, 2004
+
+ */
+
+/* Upper bound on size of sk_buff table (packets) */
+#define CHOKE_MAX_QUEUE	(128*1024 - 1)
+
+struct choke_sched_data {
+/* Parameters */
+	u32		 limit;
+	unsigned char	 flags;
+
+	struct red_parms parms;
+
+/* Variables */
+	struct tcf_proto *filter_list;
+	struct {
+		u32	prob_drop;	/* Early probability drops */
+		u32	prob_mark;	/* Early probability marks */
+		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
+		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
+		u32	pdrop;          /* Drops due to queue limits */
+		u32	other;          /* Drops due to drop() calls */
+		u32	matched;	/* Drops to flow match */
+	} stats;
+
+	unsigned int	 head;
+	unsigned int	 tail;
+
+	unsigned int	 tab_mask; /* size - 1 */
+
+	struct sk_buff **tab;
+};
+
+/* deliver a random number between 0 and N - 1 */
+static u32 random_N(unsigned int N)
+{
+	return reciprocal_divide(random32(), N);
+}
+
+/* number of elements in queue including holes */
+static unsigned int choke_len(const struct choke_sched_data *q)
+{
+	return (q->tail - q->head) & q->tab_mask;
+}
+
+/* Is ECN parameter configured */
+static int use_ecn(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_ECN;
+}
+
+/* Should packets over max just be dropped (versus marked) */
+static int use_harddrop(const struct choke_sched_data *q)
+{
+	return q->flags & TC_RED_HARDDROP;
+}
+
+/* Move head pointer forward to skip over holes */
+static void choke_zap_head_holes(struct choke_sched_data *q)
+{
+	do {
+		q->head = (q->head + 1) & q->tab_mask;
+		if (q->head == q->tail)
+			break;
+	} while (q->tab[q->head] == NULL);
+}
+
+/* Move tail pointer backwards to reuse holes */
+static void choke_zap_tail_holes(struct choke_sched_data *q)
+{
+	do {
+		q->tail = (q->tail - 1) & q->tab_mask;
+		if (q->head == q->tail)
+			break;
+	} while (q->tab[q->tail] == NULL);
+}
+
+/* Drop packet from queue array by creating a "hole" */
+static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb = q->tab[idx];
+
+	q->tab[idx] = NULL;
+
+	if (idx == q->head)
+		choke_zap_head_holes(q);
+	if (idx == q->tail)
+		choke_zap_tail_holes(q);
+
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_drop(skb, sch);
+	qdisc_tree_decrease_qlen(sch, 1);
+	--sch->q.qlen;
+}
+
+/* Make sure network and transport headers can be dereferenced */
+static bool choke_linearize_header(struct sk_buff *skb)
+{
+	int nhoff, poff;
+	struct ipv6hdr *ip6;
+	struct iphdr *ip;
+	u8 ip_proto;
+	u32 ihl;
+
+	nhoff = skb_network_offset(skb);
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
+			return false;
+
+		ip = (struct iphdr *) (skb->data + nhoff);
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			return false;
+
+		ip_proto = ip->protocol;
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
+			return false;
+
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
+		ip_proto = ip6->nexthdr;
+		ihl = (40 >> 2);
+		break;
+	default:
+		return true;
+	}
+
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (!pskb_may_pull(skb, nhoff + 4))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Compare flow of two packets
+ *  Returns true only if source and destination address and port match.
+ *          false for special cases
+ */
+static bool choke_match_flow(const struct sk_buff *skb1,
+			     const struct sk_buff *skb2)
+{
+	int off1, off2, poff;
+
+	if (skb1->protocol != skb2->protocol)
+		return false;
+
+	off1 = skb_network_offset(skb1);
+	off2 = skb_network_offset(skb2);
+
+	switch (skb1->protocol) {
+	case __constant_htons(ETH_P_IP): {
+		const struct iphdr *ip1, *ip2;
+
+		ip1 = (struct iphdr *) (skb1->data + off1);
+		ip2 = (struct iphdr *) (skb2->data + off2);
+
+		if (ip1->protocol != ip2->protocol ||
+		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
+			return false;
+
+		if (ip1->frag_off & htons(IP_MF | IP_OFFSET) ||
+		    ip2->frag_off & htons(IP_MF | IP_OFFSET))
+			return ip1->id == ip2->id;
+
+		poff = proto_ports_offset(ip1->protocol);
+		if (poff < 0)
+			return true;
+		else {
+			const u32 *ports1, *ports2;
+
+			ports1 = (__force u32 *) (skb1->data + off1
+						  + ip1->ihl * 4 + poff);
+			ports2 = (__force u32 *) (skb2->data + off2
+						  + ip2->ihl * 4 + poff);
+
+			return *ports1 == *ports2;
+		}
+		break;
+	}
+
+	case __constant_htons(ETH_P_IPV6): {
+		const struct ipv6hdr *ip1, *ip2;
+
+		ip1 = (struct ipv6hdr *) (skb1->data + off1);
+		ip2 = (struct ipv6hdr *) (skb2->data + off2);
+
+		return (ip1->nexthdr == ip2->nexthdr &&
+			ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) == 0 &&
+			ipv6_addr_cmp(&ip1->daddr, &ip2->daddr) == 0);
+	}
+
+	default: /* Maybe compare MAC header here? */
+		return false;
+	}
+	/* not reached */
+}
+
+static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
+{
+	*(unsigned int *)(qdisc_skb_cb(skb)->data) = classid;
+}
+
+static u16 choke_get_classid(const struct sk_buff *skb)
+{
+	return *(unsigned int *)(qdisc_skb_cb(skb)->data);
+}
+
+/*
+ * Classify flow using either:
+ *  1. pre-existing classification result in skb
+ *  2. fast internal classification
+ *  3. use TC filter based classification
+ */
+static bool choke_classify(struct sk_buff *skb,
+			   struct Qdisc *sch, int *qerr)
+
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	int result;
+
+	result = tc_classify(skb, q->filter_list, &res);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return false;
+		}
+#endif
+		choke_set_classid(skb, TC_H_MIN(res.classid));
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Select a packet at random from queue
+ * HACK: since queue can have holes from previous deletion; retry several
+ *   times to find a random skb but then just give up and return the head
+ * Will return NULL if queue is empty (q->head == q->tail)
+ */
+static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
+					 unsigned int *pidx)
+{
+	struct sk_buff *skb;
+	int retrys = 3;
+
+	do {
+		*pidx = (q->head + random_N(choke_len(q))) & q->tab_mask;
+		skb = q->tab[*pidx];
+		if (skb)
+			return skb;
+	} while (--retrys > 0);
+
+	return q->tab[*pidx = q->head];
+}
+
+/*
+ * Compare new packet with random packet in queue
+ * returns true if matched and sets *pidx
+ */
+static bool choke_match_random(const struct choke_sched_data *q,
+			       const struct sk_buff *nskb,
+			       unsigned int *pidx)
+{
+	const struct sk_buff *oskb;
+
+	if (q->head == q->tail)
+		return false;
+
+	oskb = choke_peek_random(q, pidx);
+	if (q->filter_list)
+		return choke_get_classid(nskb) == choke_get_classid(oskb);
+
+
+	return choke_match_flow(oskb, nskb);
+}
+
+static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct red_parms *p = &q->parms;
+	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+
+	if (q->filter_list) {
+		/* If using external classifiers, get result and record it. */
+		if (!choke_classify(skb, sch, &ret))
+			goto other_drop;	/* Packet was eaten by filter */
+	} else {
+		/* for internal classifier, make header accessible */
+		if (!choke_linearize_header(skb))
+			goto other_drop;
+	}
+
+	/* Compute average queue usage (see RED) */
+	p->qavg = red_calc_qavg(p, sch->q.qlen);
+	if (red_is_idling(p))
+		red_end_of_idle_period(p);
+
+	/* Is queue small? */
+	if (p->qavg <= p->qth_min)
+		p->qcount = -1;
+	else {
+		unsigned int idx;
+
+		/* Draw a packet at random from queue and compare flow */
+		if (choke_match_random(q, skb, &idx)) {
+			q->stats.matched++;
+			choke_drop_by_idx(sch, idx);
+			goto congestion_drop;
+		}
+
+		/* Queue is large, always mark/drop */
+		if (p->qavg > p->qth_max) {
+			p->qcount = -1;
+
+			sch->qstats.overlimits++;
+			if (use_harddrop(q) || !use_ecn(q) ||
+			    !INET_ECN_set_ce(skb)) {
+				q->stats.forced_drop++;
+				goto congestion_drop;
+			}
+
+			q->stats.forced_mark++;
+		} else if (++p->qcount) {
+			if (red_mark_probability(p, p->qavg)) {
+				p->qcount = 0;
+				p->qR = red_random(p);
+
+				sch->qstats.overlimits++;
+				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
+					q->stats.prob_drop++;
+					goto congestion_drop;
+				}
+
+				q->stats.prob_mark++;
+			}
+		} else
+			p->qR = red_random(p);
+	}
+
+	/* Admit new packet */
+	if (sch->q.qlen < q->limit) {
+		q->tab[q->tail] = skb;
+		q->tail = (q->tail + 1) & q->tab_mask;
+		++sch->q.qlen;
+		sch->qstats.backlog += qdisc_pkt_len(skb);
+		return NET_XMIT_SUCCESS;
+	}
+
+	q->stats.pdrop++;
+	sch->qstats.drops++;
+	kfree_skb(skb);
+	return NET_XMIT_DROP;
+
+ congestion_drop:
+	qdisc_drop(skb, sch);
+	return NET_XMIT_CN;
+
+ other_drop:
+	if (ret & __NET_XMIT_BYPASS)
+		sch->qstats.drops++;
+	kfree_skb(skb);
+	return ret;
+}
+
+static struct sk_buff *choke_dequeue(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	if (q->head == q->tail) {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+		return NULL;
+	}
+
+	skb = q->tab[q->head];
+	q->tab[q->head] = NULL;
+	choke_zap_head_holes(q);
+	--sch->q.qlen;
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
+
+	return skb;
+}
+
+static unsigned int choke_drop(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	unsigned int len;
+
+	len = qdisc_queue_drop(sch);
+	if (len > 0)
+		q->stats.other++;
+	else {
+		if (!red_is_idling(&q->parms))
+			red_start_of_idle_period(&q->parms);
+	}
+
+	return len;
+}
+
+static void choke_reset(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	red_restart(&q->parms);
+}
+
+static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
+	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
+	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
+};
+
+
+static void choke_free(void *addr)
+{
+	if (addr) {
+		if (is_vmalloc_addr(addr))
+			vfree(addr);
+		else
+			kfree(addr);
+	}
+}
+
+static int choke_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_CHOKE_MAX + 1];
+	const struct tc_red_qopt *ctl;
+	int err;
+	struct sk_buff **old = NULL;
+	unsigned int mask;
+
+	if (opt == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_CHOKE_PARMS] == NULL ||
+	    tb[TCA_CHOKE_STAB] == NULL)
+		return -EINVAL;
+
+	ctl = nla_data(tb[TCA_CHOKE_PARMS]);
+
+	if (ctl->limit > CHOKE_MAX_QUEUE)
+		return -EINVAL;
+
+	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
+	if (mask != q->tab_mask) {
+		struct sk_buff **ntab;
+
+		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), GFP_KERNEL);
+		if (!ntab)
+			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+		if (!ntab)
+			return -ENOMEM;
+
+		sch_tree_lock(sch);
+		old = q->tab;
+		if (old) {
+			unsigned int oqlen = sch->q.qlen, tail = 0;
+
+			while (q->head != q->tail) {
+				struct sk_buff *skb = q->tab[q->head];
+
+				q->head = (q->head + 1) & q->tab_mask;
+				if (!skb)
+					continue;
+				if (tail < mask) {
+					ntab[tail++] = skb;
+					continue;
+				}
+				sch->qstats.backlog -= qdisc_pkt_len(skb);
+				--sch->q.qlen;
+				qdisc_drop(skb, sch);
+			}
+			qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+			q->head = 0;
+			q->tail = tail;
+		}
+
+		q->tab_mask = mask;
+		q->tab = ntab;
+	} else
+		sch_tree_lock(sch);
+
+	q->flags = ctl->flags;
+	q->limit = ctl->limit;
+
+	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
+		      ctl->Plog, ctl->Scell_log,
+		      nla_data(tb[TCA_CHOKE_STAB]));
+
+	if (q->head == q->tail)
+		red_end_of_idle_period(&q->parms);
+
+	sch_tree_unlock(sch);
+	choke_free(old);
+	return 0;
+}
+
+static int choke_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	return choke_change(sch, opt);
+}
+
+static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts = NULL;
+	struct tc_red_qopt opt = {
+		.limit		= q->limit,
+		.flags		= q->flags,
+		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
+		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
+		.Wlog		= q->parms.Wlog,
+		.Plog		= q->parms.Plog,
+		.Scell_log	= q->parms.Scell_log,
+	};
+
+	opts = nla_nest_start(skb, TCA_OPTIONS);
+	if (opts == NULL)
+		goto nla_put_failure;
+
+	NLA_PUT(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt);
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -EMSGSIZE;
+}
+
+static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+	struct tc_choke_xstats st = {
+		.early	= q->stats.prob_drop + q->stats.forced_drop,
+		.marked	= q->stats.prob_mark + q->stats.forced_mark,
+		.pdrop	= q->stats.pdrop,
+		.other	= q->stats.other,
+		.matched = q->stats.matched,
+	};
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static void choke_destroy(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	tcf_destroy_chain(&q->filter_list);
+	choke_free(q->tab);
+}
+
+static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long choke_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static void choke_put(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
+				u32 classid)
+{
+	return 0;
+}
+
+static struct tcf_proto **choke_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return &q->filter_list;
+}
+
+static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	if (!arg->stop) {
+		if (arg->fn(sch, 1, arg) < 0) {
+			arg->stop = 1;
+			return;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops choke_class_ops = {
+	.leaf		=	choke_leaf,
+	.get		=	choke_get,
+	.put		=	choke_put,
+	.tcf_chain	=	choke_find_tcf,
+	.bind_tcf	=	choke_bind,
+	.unbind_tcf	=	choke_put,
+	.dump		=	choke_dump_class,
+	.walk		=	choke_walk,
+};
+
+static struct sk_buff *choke_peek_head(struct Qdisc *sch)
+{
+	struct choke_sched_data *q = qdisc_priv(sch);
+
+	return (q->head != q->tail) ? q->tab[q->head] : NULL;
+}
+
+static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
+	.id		=	"choke",
+	.priv_size	=	sizeof(struct choke_sched_data),
+
+	.enqueue	=	choke_enqueue,
+	.dequeue	=	choke_dequeue,
+	.peek		=	choke_peek_head,
+	.drop		=	choke_drop,
+	.init		=	choke_init,
+	.destroy	=	choke_destroy,
+	.reset		=	choke_reset,
+	.change		=	choke_change,
+	.dump		=	choke_dump,
+	.dump_stats	=	choke_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init choke_module_init(void)
+{
+	return register_qdisc(&choke_qdisc_ops);
+}
+
+static void __exit choke_module_exit(void)
+{
+	unregister_qdisc(&choke_qdisc_ops);
+}
+
+module_init(choke_module_init)
+module_exit(choke_module_exit)
+
+MODULE_LICENSE("GPL");
--- a/include/linux/pkt_sched.h	2011-01-18 09:27:32.252669307 -0800
+++ b/include/linux/pkt_sched.h	2011-01-18 09:27:45.526274731 -0800
@@ -247,6 +247,35 @@ struct tc_gred_sopt {
 	__u16		pad1;
 };
 
+/* CHOKe section */
+
+enum {
+	TCA_CHOKE_UNSPEC,
+	TCA_CHOKE_PARMS,
+	TCA_CHOKE_STAB,
+	__TCA_CHOKE_MAX,
+};
+
+#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
+
+struct tc_choke_qopt {
+	__u32		limit;		/* Hard queue length (packets)	*/
+	__u32		qth_min;	/* Min average threshold (packets) */
+	__u32		qth_max;	/* Max average threshold (packets) */
+	unsigned char   Wlog;		/* log(W)		*/
+	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/
+	unsigned char   Scell_log;	/* cell size for idle damping */
+	unsigned char	flags;		/* see RED flags */
+};
+
+struct tc_choke_xstats {
+	__u32		early;          /* Early drops */
+	__u32		pdrop;          /* Drops due to queue limits */
+	__u32		other;          /* Drops due to drop() calls */
+	__u32		marked;         /* Marked packets */
+	__u32		matched;	/* Drops due to flow match */
+};
+
 /* HTB section */
 #define TC_HTB_NUMPRIO		8
 #define TC_HTB_MAXDEPTH		8

^ permalink raw reply

* [PATCH] netfilter: add a missing include in nf_conntrack_reasm.c
From: Eric Dumazet @ 2011-01-20 17:53 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Netfilter Development Mailinglist, netdev, KOVACS Krisztian

After commit ae90bdeaeac6b (netfilter: fix compilation when conntrack is
disabled but tproxy is enabled) we have following warnings :

net/ipv6/netfilter/nf_conntrack_reasm.c:520:16: warning: symbol
'nf_ct_frag6_gather' was not declared. Should it be static?
net/ipv6/netfilter/nf_conntrack_reasm.c:591:6: warning: symbol
'nf_ct_frag6_output' was not declared. Should it be static?
net/ipv6/netfilter/nf_conntrack_reasm.c:612:5: warning: symbol
'nf_ct_frag6_init' was not declared. Should it be static?
net/ipv6/netfilter/nf_conntrack_reasm.c:640:6: warning: symbol
'nf_ct_frag6_cleanup' was not declared. Should it be static?

Fix this including net/netfilter/ipv6/nf_defrag_ipv6.h

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: KOVACS Krisztian <hidden@balabit.hu>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c |    1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 66e003e..0857272 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -45,6 +45,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

 struct nf_ct_frag6_skb_cb

^ permalink raw reply related

* Re: [PATCH v3] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Joe Perches @ 2011-01-20 17:56 UTC (permalink / raw)
  To: Po-Yu Chuang
  Cc: netdev, linux-kernel, bhutchings, eric.dumazet, dilinger,
	Po-Yu Chuang
In-Reply-To: <1295537418-2057-1-git-send-email-ratbert.chuang@gmail.com>

On Thu, 2011-01-20 at 23:30 +0800, Po-Yu Chuang wrote:
> From: Po-Yu Chuang <ratbert@faraday-tech.com>
[]
> +	/* map io memory */
> +	priv->res = request_mem_region(res->start, res->end - res->start,
> +				       dev_name(&pdev->dev));

Off by one?

	priv->res = request_mem_region(res->start, resource_size(res),
				       dev_name(&pdev->dev));

^ permalink raw reply

* Re: [PATCH] CHOKe flow scheduler (0.10)
From: Eric Dumazet @ 2011-01-20 18:19 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Patrick McHardy, David Miller, netdev
In-Reply-To: <20110120093838.43f9f9b1@nehalam>

Le jeudi 20 janvier 2011 à 09:38 -0800, Stephen Hemminger a écrit :

...

Hi Stephen

I am contemplating choke_linearize_header() (yet another hash
function... oh well...) and say :

Instead of this boolean return, just use skb_get_rxhash(). It performs
what you want, and return 32bits of information, instead of one bit.

(if result (rxhash) is zero, it means you can drop packet because it is
not linearized)

So choke_match_flow() can be faster if rxhash differs (no cache miss on
skb->data on an old skb)
(Do the full check only if rxhash is equal on skb1 / skb2)

More over, the skb_get_rxhash() call is _really_ needed only if CHOKe
triggers :
if (p->qavg <= p->qth_min)
	we dont have to compute rxhash at all.

> +/* Make sure network and transport headers can be dereferenced */
> +static bool choke_linearize_header(struct sk_buff *skb)
> +{
> +	int nhoff, poff;
> +	struct ipv6hdr *ip6;
> +	struct iphdr *ip;
> +	u8 ip_proto;
> +	u32 ihl;
> +
> +	nhoff = skb_network_offset(skb);
> +
> +	switch (skb->protocol) {
> +	case __constant_htons(ETH_P_IP):
> +		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
> +			return false;
> +
> +		ip = (struct iphdr *) (skb->data + nhoff);
> +		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
> +			return false;
> +
> +		ip_proto = ip->protocol;
> +		ihl = ip->ihl;
> +		break;
> +	case __constant_htons(ETH_P_IPV6):
> +		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
> +			return false;
> +
> +		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
> +		ip_proto = ip6->nexthdr;
> +		ihl = (40 >> 2);
> +		break;
> +	default:
> +		return true;
> +	}
> +
> +	poff = proto_ports_offset(ip_proto);
> +	if (poff >= 0) {
> +		nhoff += ihl * 4 + poff;
> +		if (!pskb_may_pull(skb, nhoff + 4))
> +			return false;
> +	}
> +
> +	return true;
> +}
> +


> +/*
> + * Compare flow of two packets
> + *  Returns true only if source and destination address and port match.
> + *          false for special cases
> + */
> +static bool choke_match_flow(const struct sk_buff *skb1,
> +			     const struct sk_buff *skb2)
> +{
> +	int off1, off2, poff;
> +
> +	if (skb1->protocol != skb2->protocol)
> +		return false;
> +
> +	off1 = skb_network_offset(skb1);
> +	off2 = skb_network_offset(skb2);
> +
> +	switch (skb1->protocol) {
> +	case __constant_htons(ETH_P_IP): {
> +		const struct iphdr *ip1, *ip2;
> +
> +		ip1 = (struct iphdr *) (skb1->data + off1);
> +		ip2 = (struct iphdr *) (skb2->data + off2);
> +
> +		if (ip1->protocol != ip2->protocol ||
> +		    ip1->saddr != ip2->saddr || ip1->daddr != ip2->daddr)
> +			return false;
> +
> +		if (ip1->frag_off & htons(IP_MF | IP_OFFSET) ||
> +		    ip2->frag_off & htons(IP_MF | IP_OFFSET))
> +			return ip1->id == ip2->id;
> +
> +		poff = proto_ports_offset(ip1->protocol);
> +		if (poff < 0)
> +			return true;
> +		else {
> +			const u32 *ports1, *ports2;
> +
> +			ports1 = (__force u32 *) (skb1->data + off1
> +						  + ip1->ihl * 4 + poff);
> +			ports2 = (__force u32 *) (skb2->data + off2
> +						  + ip2->ihl * 4 + poff);
> +
> +			return *ports1 == *ports2;
> +		}
> +		break;
> +	}
> +
> +	case __constant_htons(ETH_P_IPV6): {
> +		const struct ipv6hdr *ip1, *ip2;
> +
> +		ip1 = (struct ipv6hdr *) (skb1->data + off1);
> +		ip2 = (struct ipv6hdr *) (skb2->data + off2);
> +
> +		return (ip1->nexthdr == ip2->nexthdr &&
> +			ipv6_addr_cmp(&ip1->saddr, &ip2->saddr) == 0 &&
> +			ipv6_addr_cmp(&ip1->daddr, &ip2->daddr) == 0);

Hmm, we also need to check ports, you might just move the check from
case __constant_htons(ETH_P_IP) after the switch(skb1->protocol)


> +	}
> +
> +	default: /* Maybe compare MAC header here? */
> +		return false;
> +	}
> +	/* not reached */
> +}
> +

If you want, I can send a diff on your v0.10, just tell me !

Thanks !



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox