Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH] include listenq max backlog in /proc/net/tcp and include in tcp_info
From: Eric Dumazet @ 2007-09-11  6:09 UTC (permalink / raw)
  To: Sridhar Samudrala, Rick Jones; +Cc: netdev
In-Reply-To: <1189471180.11066.19.camel@w-sridhar2.beaverton.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 714 bytes --]

Sridhar Samudrala a écrit :
> On Mon, 2007-09-10 at 16:13 -0700, Rick Jones wrote:
>> Return some useful information such as the maximum listen backlog and
>> the current listen backlog in the tcp_info structure and have that 
>> match what one can see in /proc/net/tcp and /proc/net/tcp6.
> 
> If we are also exporting max listen backlog, another place to
> consider adding this is to tcp_diag_get_info() called via INET_DIAG_INFO.
> Current listen backlog is returned in inet_diag_msg->idiag_rqueue.
> max listen backlog can be returned in inet_diag_msg->idiag_wqueue.
> 

I agree, /proc/net/tcp is deprecated nowadays...

Rick, could you add this part in your patch, and add my Sign-off-by ?

Thank you
Eric




[-- Attachment #2: diff --]
[-- Type: text/plain, Size: 682 bytes --]

diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 57c5f0b..f5b6275 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -25,11 +25,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_info *info = _info;
 
-	if (sk->sk_state == TCP_LISTEN)
+	if (sk->sk_state == TCP_LISTEN) {
 		r->idiag_rqueue = sk->sk_ack_backlog;
-	else
+		r->idiag_wqueue = sk->sk_max_ack_backlog;
+	else {
 		r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
-	r->idiag_wqueue = tp->write_seq - tp->snd_una;
+		r->idiag_wqueue = tp->write_seq - tp->snd_una;
+	}
 	if (info != NULL)
 		tcp_get_info(sk, info);
 }

^ permalink raw reply related

* [PATCH] Add IP1000A Driver
From: Jesse Huang @ 2007-09-11 15:30 UTC (permalink / raw)
  To: jeff, akpm, netdev, jesse

From: Jesse Huang <jesse@icplus.com.tw>

Change Logs: Add IP1000A Driver to kernel tree.

Signed-off-by: Jesse Huang <jesse@icplus.com.tw>
---

 drivers/net/ipg.c | 2331 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/ipg.h |  856 +++++++++++++++++++
 2 files changed, 3187 insertions(+), 0 deletions(-)
 create mode 100755 drivers/net/ipg.c
 create mode 100755 drivers/net/ipg.h

e804d1c265bf1d843f845457f925a1728bbfdff7
diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c
new file mode 100755
index 0000000..bdc2b8d
--- /dev/null
+++ b/drivers/net/ipg.c
@@ -0,0 +1,2331 @@
+/*
+ * ipg.c: Device Driver for the IP1000 Gigabit Ethernet Adapter
+ *
+ * Copyright (C) 2003, 2006  IC Plus Corp.
+ *
+ * Original Author:
+ *
+ *   Craig Rich
+ *   Sundance Technology, Inc.
+ *   1485 Saratoga Avenue
+ *   Suite 200
+ *   San Jose, CA 95129
+ *   408 873 4117
+ *   www.sundanceti.com
+ *   craig_rich@sundanceti.com
+ *
+ * Current Maintainer:
+ *
+ *   Sorbica Shieh.
+ *   10F, No.47, Lane 2, Kwang-Fu RD.
+ *   Sec. 2, Hsin-Chu, Taiwan, R.O.C.
+ *   http://www.icplus.com.tw
+ *   sorbica@icplus.com.tw
+ */
+#include <linux/crc32.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/mutex.h>
+
+#define IPG_RX_RING_BYTES	(sizeof(struct ipg_rx) * IPG_RFDLIST_LENGTH)
+#define IPG_TX_RING_BYTES	(sizeof(struct ipg_tx) * IPG_TFDLIST_LENGTH)
+#define IPG_RESET_MASK \
+	(IPG_AC_GLOBAL_RESET | IPG_AC_RX_RESET | IPG_AC_TX_RESET | \
+	 IPG_AC_DMA | IPG_AC_FIFO | IPG_AC_NETWORK | IPG_AC_HOST | \
+	 IPG_AC_AUTO_INIT)
+
+#define ipg_w32(val32,reg)	iowrite32((val32), ioaddr + (reg))
+#define ipg_w16(val16,reg)	iowrite16((val16), ioaddr + (reg))
+#define ipg_w8(val8,reg)	iowrite8((val8), ioaddr + (reg))
+
+#define ipg_r32(reg)		ioread32(ioaddr + (reg))
+#define ipg_r16(reg)		ioread16(ioaddr + (reg))
+#define ipg_r8(reg)		ioread8(ioaddr + (reg))
+
+#define JUMBO_FRAME_4k_ONLY
+enum {
+	netdev_io_size = 128
+};
+
+#include "ipg.h"
+#define DRV_NAME	"ipg"
+
+MODULE_AUTHOR("IC Plus Corp. 2003");
+MODULE_DESCRIPTION("IC Plus IP1000 Gigabit Ethernet Adapter Linux Driver "
+		   DrvVer);
+MODULE_LICENSE("GPL");
+
+static const char *ipg_brand_name[] = {
+	"IC PLUS IP1000 1000/100/10 based NIC",
+	"Sundance Technology ST2021 based NIC",
+	"Tamarack Microelectronics TC9020/9021 based NIC",
+	"Tamarack Microelectronics TC9020/9021 based NIC",
+	"D-Link NIC",
+	"D-Link NIC IP1000A"
+};
+
+static struct pci_device_id ipg_pci_tbl[] __devinitdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x1023), 0, 0, 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x2021), 0, 0, 1 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x1021), 0, 0, 2 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x9021), 0, 0, 3 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x4000), 0, 0, 4 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x4020), 0, 0, 5 },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipg_pci_tbl);
+
+static inline void __iomem *ipg_ioaddr(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	return sp->ioaddr;
+}
+
+#ifdef IPG_DEBUG
+static void ipg_dump_rfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	u32 offset;
+
+	IPG_DEBUG_MSG("_dump_rfdlist\n");
+
+	printk(KERN_INFO "rx_current = %2.2x\n", sp->rx_current);
+	printk(KERN_INFO "rx_dirty   = %2.2x\n", sp->rx_dirty);
+	printk(KERN_INFO "RFDList start address = %16.16lx\n",
+	       (unsigned long) sp->rxd_map);
+	printk(KERN_INFO "RFDListPtr register   = %8.8x%8.8x\n",
+	       ipg_r32(IPG_RFDLISTPTR1), ipg_r32(IPG_RFDLISTPTR0));
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		offset = (u32) &sp->rxd[i].next_desc - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x RFDNextPtr = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].next_desc);
+		offset = (u32) &sp->rxd[i].rfs - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x RFS        = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].rfs);
+		offset = (u32) &sp->rxd[i].frag_info - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x frag_info   = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].frag_info);
+	}
+}
+
+static void ipg_dump_tfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	u32 offset;
+
+	IPG_DEBUG_MSG("_dump_tfdlist\n");
+
+	printk(KERN_INFO "tx_current         = %2.2x\n", sp->tx_current);
+	printk(KERN_INFO "tx_dirty = %2.2x\n", sp->tx_dirty);
+	printk(KERN_INFO "TFDList start address = %16.16lx\n",
+	       (unsigned long) sp->txd_map);
+	printk(KERN_INFO "TFDListPtr register   = %8.8x%8.8x\n",
+	       ipg_r32(IPG_TFDLISTPTR1), ipg_r32(IPG_TFDLISTPTR0));
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		offset = (u32) &sp->txd[i].next_desc - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x TFDNextPtr = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].next_desc);
+
+		offset = (u32) &sp->txd[i].tfc - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x TFC        = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].tfc);
+		offset = (u32) &sp->txd[i].frag_info - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x frag_info   = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].frag_info);
+	}
+}
+#endif
+
+static void ipg_write_phy_ctl(void __iomem *ioaddr, u8 data)
+{
+	ipg_w8(IPG_PC_RSVD_MASK & data, PHY_CTRL);
+	ndelay(IPG_PC_PHYCTRLWAIT_NS);
+}
+
+static void ipg_drive_phy_ctl_low_high(void __iomem *ioaddr, u8 data)
+{
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | data);
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | data);
+}
+
+static void send_three_state(void __iomem *ioaddr, u8 phyctrlpolarity)
+{
+	phyctrlpolarity |= (IPG_PC_MGMTDATA & 0) | IPG_PC_MGMTDIR;
+
+	ipg_drive_phy_ctl_low_high(ioaddr, phyctrlpolarity);
+}
+
+static void send_end(void __iomem *ioaddr, u8 phyctrlpolarity)
+{
+	ipg_w8((IPG_PC_MGMTCLK_LO | (IPG_PC_MGMTDATA & 0) | IPG_PC_MGMTDIR |
+		phyctrlpolarity) & IPG_PC_RSVD_MASK, PHY_CTRL);
+}
+
+static u16 read_phy_bit(void __iomem * ioaddr, u8 phyctrlpolarity)
+{
+	u16 bit_data;
+
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | phyctrlpolarity);
+
+	bit_data = ((ipg_r8(PHY_CTRL) & IPG_PC_MGMTDATA) >> 1) & 1;
+
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | phyctrlpolarity);
+
+	return bit_data;
+}
+
+/*
+ * Read a register from the Physical Layer device located
+ * on the IPG NIC, using the IPG PHYCTRL register.
+ */
+static int mdio_read(struct net_device * dev, int phy_id, int phy_reg)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	/*
+	 * The GMII mangement frame structure for a read is as follows:
+	 *
+	 * |Preamble|st|op|phyad|regad|ta|      data      |idle|
+	 * |< 32 1s>|01|10|AAAAA|RRRRR|z0|DDDDDDDDDDDDDDDD|z   |
+	 *
+	 * <32 1s> = 32 consecutive logic 1 values
+	 * A = bit of Physical Layer device address (MSB first)
+	 * R = bit of register address (MSB first)
+	 * z = High impedance state
+	 * D = bit of read data (MSB first)
+	 *
+	 * Transmission order is 'Preamble' field first, bits transmitted
+	 * left to right (first to last).
+	 */
+	struct {
+		u32 field;
+		unsigned int len;
+	} p[] = {
+		{ GMII_PREAMBLE,	32 },	/* Preamble */
+		{ GMII_ST,		2  },	/* ST */
+		{ GMII_READ,		2  },	/* OP */
+		{ phy_id,		5  },	/* PHYAD */
+		{ phy_reg,		5  },	/* REGAD */
+		{ 0x0000,		2  },	/* TA */
+		{ 0x0000,		16 },	/* DATA */
+		{ 0x0000,		1  }	/* IDLE */
+	};
+	unsigned int i, j;
+	u8 polarity, data;
+
+	polarity  = ipg_r8(PHY_CTRL);
+	polarity &= (IPG_PC_DUPLEX_POLARITY | IPG_PC_LINK_POLARITY);
+
+	/* Create the Preamble, ST, OP, PHYAD, and REGAD field. */
+	for (j = 0; j < 5; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			/* For each variable length field, the MSB must be
+			 * transmitted first. Rotate through the field bits,
+			 * starting with the MSB, and move each bit into the
+			 * the 1st (2^1) bit position (this is the bit position
+			 * corresponding to the MgmtData bit of the PhyCtrl
+			 * register for the IPG).
+			 *
+			 * Example: ST = 01;
+			 *
+			 *          First write a '0' to bit 1 of the PhyCtrl
+			 *          register, then write a '1' to bit 1 of the
+			 *          PhyCtrl register.
+			 *
+			 * To do this, right shift the MSB of ST by the value:
+			 * [field length - 1 - #ST bits already written]
+			 * then left shift this result by 1.
+			 */
+			data  = (p[j].field >> (p[j].len - 1 - i)) << 1;
+			data &= IPG_PC_MGMTDATA;
+			data |= polarity | IPG_PC_MGMTDIR;
+
+			ipg_drive_phy_ctl_low_high(ioaddr, data);
+		}
+	}
+
+	send_three_state(ioaddr, polarity);
+
+	read_phy_bit(ioaddr, polarity);
+
+	/*
+	 * For a read cycle, the bits for the next two fields (TA and
+	 * DATA) are driven by the PHY (the IPG reads these bits).
+	 */
+	for (i = 0; i < p[6].len; i++) {
+		p[6].field |=
+		    (read_phy_bit(ioaddr, polarity) << (p[6].len - 1 - i));
+	}
+
+	send_three_state(ioaddr, polarity);
+	send_three_state(ioaddr, polarity);
+	send_three_state(ioaddr, polarity);
+	send_end(ioaddr, polarity);
+
+	/* Return the value of the DATA field. */
+	return p[6].field;
+}
+
+/*
+ * Write to a register from the Physical Layer device located
+ * on the IPG NIC, using the IPG PHYCTRL register.
+ */
+static void mdio_write(struct net_device *dev, int phy_id, int phy_reg, int val)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	/*
+	 * The GMII mangement frame structure for a read is as follows:
+	 *
+	 * |Preamble|st|op|phyad|regad|ta|      data      |idle|
+	 * |< 32 1s>|01|10|AAAAA|RRRRR|z0|DDDDDDDDDDDDDDDD|z   |
+	 *
+	 * <32 1s> = 32 consecutive logic 1 values
+	 * A = bit of Physical Layer device address (MSB first)
+	 * R = bit of register address (MSB first)
+	 * z = High impedance state
+	 * D = bit of write data (MSB first)
+	 *
+	 * Transmission order is 'Preamble' field first, bits transmitted
+	 * left to right (first to last).
+	 */
+	struct {
+		u32 field;
+		unsigned int len;
+	} p[] = {
+		{ GMII_PREAMBLE,	32 },	/* Preamble */
+		{ GMII_ST,		2  },	/* ST */
+		{ GMII_WRITE,		2  },	/* OP */
+		{ phy_id,		5  },	/* PHYAD */
+		{ phy_reg,		5  },	/* REGAD */
+		{ 0x0002,		2  },	/* TA */
+		{ val & 0xffff,		16 },	/* DATA */
+		{ 0x0000,		1  }	/* IDLE */
+	};
+	unsigned int i, j;
+	u8 polarity, data;
+
+	polarity  = ipg_r8(PHY_CTRL);
+	polarity &= (IPG_PC_DUPLEX_POLARITY | IPG_PC_LINK_POLARITY);
+
+	/* Create the Preamble, ST, OP, PHYAD, and REGAD field. */
+	for (j = 0; j < 7; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			/* For each variable length field, the MSB must be
+			 * transmitted first. Rotate through the field bits,
+			 * starting with the MSB, and move each bit into the
+			 * the 1st (2^1) bit position (this is the bit position
+			 * corresponding to the MgmtData bit of the PhyCtrl
+			 * register for the IPG).
+			 *
+			 * Example: ST = 01;
+			 *
+			 *          First write a '0' to bit 1 of the PhyCtrl
+			 *          register, then write a '1' to bit 1 of the
+			 *          PhyCtrl register.
+			 *
+			 * To do this, right shift the MSB of ST by the value:
+			 * [field length - 1 - #ST bits already written]
+			 * then left shift this result by 1.
+			 */
+			data  = (p[j].field >> (p[j].len - 1 - i)) << 1;
+			data &= IPG_PC_MGMTDATA;
+			data |= polarity | IPG_PC_MGMTDIR;
+
+			ipg_drive_phy_ctl_low_high(ioaddr, data);
+		}
+	}
+
+	/* The last cycle is a tri-state, so read from the PHY. */
+	for (j = 7; j < 8; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | polarity);
+
+			p[j].field |= ((ipg_r8(PHY_CTRL) &
+				IPG_PC_MGMTDATA) >> 1) << (p[j].len - 1 - i);
+
+			ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | polarity);
+		}
+	}
+}
+
+/* Set LED_Mode JES20040127EEPROM */
+static void ipg_set_led_mode(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	u32 mode;
+
+	mode = ipg_r32(ASIC_CTRL);
+	mode &= ~(IPG_AC_LED_MODE_BIT_1 | IPG_AC_LED_MODE | IPG_AC_LED_SPEED);
+
+	if ((sp->LED_Mode & 0x03) > 1)
+		mode |= IPG_AC_LED_MODE_BIT_1;	/* Write Asic Control Bit 29 */
+
+	if ((sp->LED_Mode & 0x01) == 1)
+		mode |= IPG_AC_LED_MODE;	/* Write Asic Control Bit 14 */
+
+	if ((sp->LED_Mode & 0x08) == 8)
+		mode |= IPG_AC_LED_SPEED;	/* Write Asic Control Bit 27 */
+
+	ipg_w32(mode, ASIC_CTRL);
+}
+
+/* Set PHYSet JES20040127EEPROM */
+static void ipg_set_phy_set(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	int physet;
+
+	physet = ipg_r8(PHY_SET);
+	physet &= ~(IPG_PS_MEM_LENB9B | IPG_PS_MEM_LEN9 | IPG_PS_NON_COMPDET);
+	physet |= ((sp->LED_Mode & 0x70) >> 4);
+	ipg_w8(physet, PHY_SET);
+}
+
+static int ipg_reset(struct net_device *dev, u32 resetflags)
+{
+	/* Assert functional resets via the IPG AsicCtrl
+	 * register as specified by the 'resetflags' input
+	 * parameter.
+	 */
+	void __iomem *ioaddr = ipg_ioaddr(dev);	//JES20040127EEPROM:
+	unsigned int timeout_count = 0;
+
+	IPG_DEBUG_MSG("_reset\n");
+
+	ipg_w32(ipg_r32(ASIC_CTRL) | resetflags, ASIC_CTRL);
+
+	/* Delay added to account for problem with 10Mbps reset. */
+	mdelay(IPG_AC_RESETWAIT);
+
+	while (IPG_AC_RESET_BUSY & ipg_r32(ASIC_CTRL)) {
+		mdelay(IPG_AC_RESETWAIT);
+		if (++timeout_count > IPG_AC_RESET_TIMEOUT)
+			return -ETIME;
+	}
+	/* Set LED Mode in Asic Control JES20040127EEPROM */
+	ipg_set_led_mode(dev);
+
+	/* Set PHYSet Register Value JES20040127EEPROM */
+	ipg_set_phy_set(dev);
+	return 0;
+}
+
+/* Find the GMII PHY address. */
+static int ipg_find_phyaddr(struct net_device *dev)
+{
+	unsigned int phyaddr, i;
+
+	for (i = 0; i < 32; i++) {
+		u32 status;
+
+		/* Search for the correct PHY address among 32 possible. */
+		phyaddr = (IPG_NIC_PHY_ADDRESS + i) % 32;
+
+		/* 10/22/03 Grace change verify from GMII_PHY_STATUS to
+		   GMII_PHY_ID1
+		 */
+
+		status = mdio_read(dev, phyaddr, MII_BMSR);
+
+		if ((status != 0xFFFF) && (status != 0))
+			return phyaddr;
+	}
+
+	return 0x1f;
+}
+
+/*
+ * Configure IPG based on result of IEEE 802.3 PHY
+ * auto-negotiation.
+ */
+static int ipg_config_autoneg(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int txflowcontrol;
+	unsigned int rxflowcontrol;
+	unsigned int fullduplex;
+	unsigned int gig;
+	u32 mac_ctrl_val;
+	u32 asicctrl;
+	u8 phyctrl;
+
+	IPG_DEBUG_MSG("_config_autoneg\n");
+
+	asicctrl = ipg_r32(ASIC_CTRL);
+	phyctrl = ipg_r8(PHY_CTRL);
+	mac_ctrl_val = ipg_r32(MAC_CTRL);
+
+	/* Set flags for use in resolving auto-negotation, assuming
+	 * non-1000Mbps, half duplex, no flow control.
+	 */
+	fullduplex = 0;
+	txflowcontrol = 0;
+	rxflowcontrol = 0;
+	gig = 0;
+
+	/* To accomodate a problem in 10Mbps operation,
+	 * set a global flag if PHY running in 10Mbps mode.
+	 */
+	sp->tenmbpsmode = 0;
+
+	printk(KERN_INFO "%s: Link speed = ", dev->name);
+
+	/* Determine actual speed of operation. */
+	switch (phyctrl & IPG_PC_LINK_SPEED) {
+	case IPG_PC_LINK_SPEED_10MBPS:
+		printk("10Mbps.\n");
+		printk(KERN_INFO "%s: 10Mbps operational mode enabled.\n",
+		       dev->name);
+		sp->tenmbpsmode = 1;
+		break;
+	case IPG_PC_LINK_SPEED_100MBPS:
+		printk("100Mbps.\n");
+		break;
+	case IPG_PC_LINK_SPEED_1000MBPS:
+		printk("1000Mbps.\n");
+		gig = 1;
+		break;
+	default:
+		printk("undefined!\n");
+		return 0;
+	}
+
+	if (phyctrl & IPG_PC_DUPLEX_STATUS) {
+		fullduplex = 1;
+		txflowcontrol = 1;
+		rxflowcontrol = 1;
+	}
+
+	/* Configure full duplex, and flow control. */
+	if (fullduplex == 1) {
+		/* Configure IPG for full duplex operation. */
+		printk(KERN_INFO "%s: setting full duplex, ", dev->name);
+
+		mac_ctrl_val |= IPG_MC_DUPLEX_SELECT_FD;
+
+		if (txflowcontrol == 1) {
+			printk("TX flow control");
+			mac_ctrl_val |= IPG_MC_TX_FLOW_CONTROL_ENABLE;
+		} else {
+			printk("no TX flow control");
+			mac_ctrl_val &= ~IPG_MC_TX_FLOW_CONTROL_ENABLE;
+		}
+
+		if (rxflowcontrol == 1) {
+			printk(", RX flow control.");
+			mac_ctrl_val |= IPG_MC_RX_FLOW_CONTROL_ENABLE;
+		} else {
+			printk(", no RX flow control.");
+			mac_ctrl_val &= ~IPG_MC_RX_FLOW_CONTROL_ENABLE;
+		}
+
+		printk("\n");
+	} else {
+		/* Configure IPG for half duplex operation. */
+	        printk(KERN_INFO "%s: setting half duplex, "
+		       "no TX flow control, no RX flow control.\n", dev->name);
+
+		mac_ctrl_val &= ~IPG_MC_DUPLEX_SELECT_FD &
+			~IPG_MC_TX_FLOW_CONTROL_ENABLE &
+			~IPG_MC_RX_FLOW_CONTROL_ENABLE;
+	}
+	ipg_w32(mac_ctrl_val, MAC_CTRL);
+	return 0;
+}
+
+/* Determine and configure multicast operation and set
+ * receive mode for IPG.
+ */
+static void ipg_nic_set_multicast_list(struct net_device *dev)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	struct dev_mc_list *mc_list_ptr;
+	unsigned int hashindex;
+	u32 hashtable[2];
+	u8 receivemode;
+
+	IPG_DEBUG_MSG("_nic_set_multicast_list\n");
+
+	receivemode = IPG_RM_RECEIVEUNICAST | IPG_RM_RECEIVEBROADCAST;
+
+	if (dev->flags & IFF_PROMISC) {
+		/* NIC to be configured in promiscuous mode. */
+		receivemode = IPG_RM_RECEIVEALLFRAMES;
+	} else if ((dev->flags & IFF_ALLMULTI) ||
+		   (dev->flags & IFF_MULTICAST &
+		    (dev->mc_count > IPG_MULTICAST_HASHTABLE_SIZE))) {
+		/* NIC to be configured to receive all multicast
+		 * frames. */
+		receivemode |= IPG_RM_RECEIVEMULTICAST;
+	} else if (dev->flags & IFF_MULTICAST & (dev->mc_count > 0)) {
+		/* NIC to be configured to receive selected
+		 * multicast addresses. */
+		receivemode |= IPG_RM_RECEIVEMULTICASTHASH;
+	}
+
+	/* Calculate the bits to set for the 64 bit, IPG HASHTABLE.
+	 * The IPG applies a cyclic-redundancy-check (the same CRC
+	 * used to calculate the frame data FCS) to the destination
+	 * address all incoming multicast frames whose destination
+	 * address has the multicast bit set. The least significant
+	 * 6 bits of the CRC result are used as an addressing index
+	 * into the hash table. If the value of the bit addressed by
+	 * this index is a 1, the frame is passed to the host system.
+	 */
+
+	/* Clear hashtable. */
+	hashtable[0] = 0x00000000;
+	hashtable[1] = 0x00000000;
+
+	/* Cycle through all multicast addresses to filter. */
+	for (mc_list_ptr = dev->mc_list;
+	     mc_list_ptr != NULL; mc_list_ptr = mc_list_ptr->next) {
+		/* Calculate CRC result for each multicast address. */
+		hashindex = crc32_le(0xffffffff, mc_list_ptr->dmi_addr,
+				     ETH_ALEN);
+
+		/* Use only the least significant 6 bits. */
+		hashindex = hashindex & 0x3F;
+
+		/* Within "hashtable", set bit number "hashindex"
+		 * to a logic 1.
+		 */
+		set_bit(hashindex, (void *)hashtable);
+	}
+
+	/* Write the value of the hashtable, to the 4, 16 bit
+	 * HASHTABLE IPG registers.
+	 */
+	ipg_w32(hashtable[0], HASHTABLE_0);
+	ipg_w32(hashtable[1], HASHTABLE_1);
+
+	ipg_w8(IPG_RM_RSVD_MASK & receivemode, RECEIVE_MODE);
+
+	IPG_DEBUG_MSG("ReceiveMode = %x\n", ipg_r8(RECEIVE_MODE));
+}
+
+static int ipg_io_config(struct net_device *dev)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	u32 origmacctrl;
+	u32 restoremacctrl;
+
+	IPG_DEBUG_MSG("_io_config\n");
+
+	origmacctrl = ipg_r32(MAC_CTRL);
+
+	restoremacctrl = origmacctrl | IPG_MC_STATISTICS_ENABLE;
+
+	/* Based on compilation option, determine if FCS is to be
+	 * stripped on receive frames by IPG.
+	 */
+	if (!IPG_STRIP_FCS_ON_RX)
+		restoremacctrl |= IPG_MC_RCV_FCS;
+
+	/* Determine if transmitter and/or receiver are
+	 * enabled so we may restore MACCTRL correctly.
+	 */
+	if (origmacctrl & IPG_MC_TX_ENABLED)
+		restoremacctrl |= IPG_MC_TX_ENABLE;
+
+	if (origmacctrl & IPG_MC_RX_ENABLED)
+		restoremacctrl |= IPG_MC_RX_ENABLE;
+
+	/* Transmitter and receiver must be disabled before setting
+	 * IFSSelect.
+	 */
+	ipg_w32((origmacctrl & (IPG_MC_RX_DISABLE | IPG_MC_TX_DISABLE)) &
+		IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	/* Now that transmitter and receiver are disabled, write
+	 * to IFSSelect.
+	 */
+	ipg_w32((origmacctrl & IPG_MC_IFS_96BIT) & IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	/* Set RECEIVEMODE register. */
+	ipg_nic_set_multicast_list(dev);
+
+	ipg_w16(IPG_MAX_RXFRAME_SIZE, MAX_FRAME_SIZE);
+
+	ipg_w8(IPG_RXDMAPOLLPERIOD_VALUE,   RX_DMA_POLL_PERIOD);
+	ipg_w8(IPG_RXDMAURGENTTHRESH_VALUE, RX_DMA_URGENT_THRESH);
+	ipg_w8(IPG_RXDMABURSTTHRESH_VALUE,  RX_DMA_BURST_THRESH);
+	ipg_w8(IPG_TXDMAPOLLPERIOD_VALUE,   TX_DMA_POLL_PERIOD);
+	ipg_w8(IPG_TXDMAURGENTTHRESH_VALUE, TX_DMA_URGENT_THRESH);
+	ipg_w8(IPG_TXDMABURSTTHRESH_VALUE,  TX_DMA_BURST_THRESH);
+	ipg_w16((IPG_IE_HOST_ERROR | IPG_IE_TX_DMA_COMPLETE |
+		 IPG_IE_TX_COMPLETE | IPG_IE_INT_REQUESTED |
+		 IPG_IE_UPDATE_STATS | IPG_IE_LINK_EVENT |
+		 IPG_IE_RX_DMA_COMPLETE | IPG_IE_RX_DMA_PRIORITY), INT_ENABLE);
+	ipg_w16(IPG_FLOWONTHRESH_VALUE,  FLOW_ON_THRESH);
+	ipg_w16(IPG_FLOWOFFTHRESH_VALUE, FLOW_OFF_THRESH);
+
+	/* IPG multi-frag frame bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0200, DEBUG_CTRL);
+
+	/* IPG TX poll now bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0010, DEBUG_CTRL);
+
+	/* IPG RX poll now bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0020, DEBUG_CTRL);
+
+	/* Now restore MACCTRL to original setting. */
+	ipg_w32(IPG_MC_RSVD_MASK & restoremacctrl, MAC_CTRL);
+
+	/* Disable unused RMON statistics. */
+	ipg_w32(IPG_RZ_ALL, RMON_STATISTICS_MASK);
+
+	/* Disable unused MIB statistics. */
+	ipg_w32(IPG_SM_MACCONTROLFRAMESXMTD | IPG_SM_MACCONTROLFRAMESRCVD |
+		IPG_SM_BCSTOCTETXMTOK_BCSTFRAMESXMTDOK | IPG_SM_TXJUMBOFRAMES |
+		IPG_SM_MCSTOCTETXMTOK_MCSTFRAMESXMTDOK | IPG_SM_RXJUMBOFRAMES |
+		IPG_SM_BCSTOCTETRCVDOK_BCSTFRAMESRCVDOK |
+		IPG_SM_UDPCHECKSUMERRORS | IPG_SM_TCPCHECKSUMERRORS |
+		IPG_SM_IPCHECKSUMERRORS, STATISTICS_MASK);
+
+	return 0;
+}
+
+/*
+ * Create a receive buffer within system memory and update
+ * NIC private structure appropriately.
+ */
+static int ipg_get_rxbuff(struct net_device *dev, int entry)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct ipg_rx *rxfd = sp->rxd + entry;
+	struct sk_buff *skb;
+	u64 rxfragsize;
+
+	IPG_DEBUG_MSG("_get_rxbuff\n");
+
+	skb = netdev_alloc_skb(dev, IPG_RXSUPPORT_SIZE + NET_IP_ALIGN);
+	if (!skb) {
+		sp->RxBuff[entry] = NULL;
+		return -ENOMEM;
+	}
+
+	/* Adjust the data start location within the buffer to
+	 * align IP address field to a 16 byte boundary.
+	 */
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	/* Associate the receive buffer with the IPG NIC. */
+	skb->dev = dev;
+
+	/* Save the address of the sk_buff structure. */
+	sp->RxBuff[entry] = skb;
+
+	rxfd->frag_info = cpu_to_le64(pci_map_single(sp->pdev, skb->data,
+		sp->rx_buf_sz, PCI_DMA_FROMDEVICE));
+
+	/* Set the RFD fragment length. */
+	rxfragsize = IPG_RXFRAG_SIZE;
+	rxfd->frag_info |= cpu_to_le64((rxfragsize << 48) & IPG_RFI_FRAGLEN);
+
+	return 0;
+}
+
+static int init_rfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_init_rfdlist\n");
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		struct ipg_rx *rxfd = sp->rxd + i;
+
+		if (sp->RxBuff[i]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+			IPG_DEV_KFREE_SKB(sp->RxBuff[i]);
+			sp->RxBuff[i] = NULL;
+		}
+
+		/* Clear out the RFS field. */
+		rxfd->rfs = 0x0000000000000000;
+
+		if (ipg_get_rxbuff(dev, i) < 0) {
+			/*
+			 * A receive buffer was not ready, break the
+			 * RFD list here.
+			 */
+			IPG_DEBUG_MSG("Cannot allocate Rx buffer.\n");
+
+			/* Just in case we cannot allocate a single RFD.
+			 * Should not occur.
+			 */
+			if (i == 0) {
+				printk(KERN_ERR "%s: No memory available"
+					" for RFD list.\n", dev->name);
+				return -ENOMEM;
+			}
+		}
+
+		rxfd->next_desc = cpu_to_le64(sp->rxd_map +
+			sizeof(struct ipg_rx)*(i + 1));
+	}
+	sp->rxd[i - 1].next_desc = cpu_to_le64(sp->rxd_map);
+
+	sp->rx_current = 0;
+	sp->rx_dirty = 0;
+
+	/* Write the location of the RFDList to the IPG. */
+	ipg_w32((u32) sp->rxd_map, RFD_LIST_PTR_0);
+	ipg_w32(0x00000000, RFD_LIST_PTR_1);
+
+	return 0;
+}
+
+static void init_tfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_init_tfdlist\n");
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		struct ipg_tx *txfd = sp->txd + i;
+
+		txfd->tfc = cpu_to_le64(IPG_TFC_TFDDONE);
+
+		if (sp->TxBuff[i]) {
+			IPG_DEV_KFREE_SKB(sp->TxBuff[i]);
+			sp->TxBuff[i] = NULL;
+		}
+
+		txfd->next_desc = cpu_to_le64(sp->txd_map +
+			sizeof(struct ipg_tx)*(i + 1));
+	}
+	sp->txd[i - 1].next_desc = cpu_to_le64(sp->txd_map);
+
+	sp->tx_current = 0;
+	sp->tx_dirty = 0;
+
+	/* Write the location of the TFDList to the IPG. */
+	IPG_DDEBUG_MSG("Starting TFDListPtr = %8.8x\n",
+		       (u32) sp->txd_map);
+	ipg_w32((u32) sp->txd_map, TFD_LIST_PTR_0);
+	ipg_w32(0x00000000, TFD_LIST_PTR_1);
+
+	sp->ResetCurrentTFD = 1;
+}
+
+/*
+ * Free all transmit buffers which have already been transfered
+ * via DMA to the IPG.
+ */
+static void ipg_nic_txfree(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	const unsigned int curr = ipg_r32(TFD_LIST_PTR_0) -
+		(sp->txd_map / sizeof(struct ipg_tx)) - 1;
+	unsigned int released, pending;
+
+	IPG_DEBUG_MSG("_nic_txfree\n");
+
+	pending = sp->tx_current - sp->tx_dirty;
+
+	for (released = 0; released < pending; released++) {
+		unsigned int dirty = sp->tx_dirty % IPG_TFDLIST_LENGTH;
+		struct sk_buff *skb = sp->TxBuff[dirty];
+		struct ipg_tx *txfd = sp->txd + dirty;
+
+		IPG_DEBUG_MSG("TFC = %16.16lx\n", (unsigned long) txfd->tfc);
+
+		/* Look at each TFD's TFC field beginning
+		 * at the last freed TFD up to the current TFD.
+		 * If the TFDDone bit is set, free the associated
+		 * buffer.
+		 */
+		if (dirty == curr)
+			break;
+
+		/* Setup TFDDONE for compatible issue. */
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TFDDONE);
+
+		/* Free the transmit buffer. */
+		if (skb) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(txfd->frag_info & ~IPG_TFI_FRAGLEN),
+				skb->len, PCI_DMA_TODEVICE);
+
+			IPG_DEV_KFREE_SKB(skb);
+
+			sp->TxBuff[dirty] = NULL;
+		}
+	}
+
+	sp->tx_dirty += released;
+
+	if (netif_queue_stopped(dev) &&
+	    (sp->tx_current != (sp->tx_dirty + IPG_TFDLIST_LENGTH))) {
+		netif_wake_queue(dev);
+	}
+}
+
+static void ipg_tx_timeout(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+
+	ipg_reset(dev, IPG_AC_TX_RESET | IPG_AC_DMA | IPG_AC_NETWORK |
+		  IPG_AC_FIFO);
+
+	spin_lock_irq(&sp->lock);
+
+	/* Re-configure after DMA reset. */
+	if (ipg_io_config(dev) < 0) {
+		printk(KERN_INFO "%s: Error during re-configuration.\n",
+		       dev->name);
+	}
+
+	init_tfdlist(dev);
+
+	spin_unlock_irq(&sp->lock);
+
+	ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) & IPG_MC_RSVD_MASK,
+		MAC_CTRL);
+}
+
+/*
+ * For TxComplete interrupts, free all transmit
+ * buffers which have already been transfered via DMA
+ * to the IPG.
+ */
+static void ipg_nic_txcleanup(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_txcleanup\n");
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		/* Reading the TXSTATUS register clears the
+		 * TX_COMPLETE interrupt.
+		 */
+		u32 txstatusdword = ipg_r32(TX_STATUS);
+
+		IPG_DEBUG_MSG("TxStatus = %8.8x\n", txstatusdword);
+
+		/* Check for Transmit errors. Error bits only valid if
+		 * TX_COMPLETE bit in the TXSTATUS register is a 1.
+		 */
+		if (!(txstatusdword & IPG_TS_TX_COMPLETE))
+			break;
+
+		/* If in 10Mbps mode, indicate transmit is ready. */
+		if (sp->tenmbpsmode) {
+			netif_wake_queue(dev);
+		}
+
+		/* Transmit error, increment stat counters. */
+		if (txstatusdword & IPG_TS_TX_ERROR) {
+			IPG_DEBUG_MSG("Transmit error.\n");
+			sp->stats.tx_errors++;
+		}
+
+		/* Late collision, re-enable transmitter. */
+		if (txstatusdword & IPG_TS_LATE_COLLISION) {
+			IPG_DEBUG_MSG("Late collision on transmit.\n");
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+
+		/* Maximum collisions, re-enable transmitter. */
+		if (txstatusdword & IPG_TS_TX_MAX_COLL) {
+			IPG_DEBUG_MSG("Maximum collisions on transmit.\n");
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+
+		/* Transmit underrun, reset and re-enable
+		 * transmitter.
+		 */
+		if (txstatusdword & IPG_TS_TX_UNDERRUN) {
+			IPG_DEBUG_MSG("Transmitter underrun.\n");
+			sp->stats.tx_fifo_errors++;
+			ipg_reset(dev, IPG_AC_TX_RESET | IPG_AC_DMA |
+				  IPG_AC_NETWORK | IPG_AC_FIFO);
+
+			/* Re-configure after DMA reset. */
+			if (ipg_io_config(dev) < 0) {
+				printk(KERN_INFO
+				       "%s: Error during re-configuration.\n",
+				       dev->name);
+			}
+			init_tfdlist(dev);
+
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+	}
+
+	ipg_nic_txfree(dev);
+}
+
+/* Provides statistical information about the IPG NIC. */
+struct net_device_stats *ipg_nic_get_stats(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	u16 temp1;
+	u16 temp2;
+
+	IPG_DEBUG_MSG("_nic_get_stats\n");
+
+	/* Check to see if the NIC has been initialized via nic_open,
+	 * before trying to read statistic registers.
+	 */
+	if (!test_bit(__LINK_STATE_START, &dev->state))
+		return &sp->stats;
+
+	sp->stats.rx_packets += ipg_r32(IPG_FRAMESRCVDOK);
+	sp->stats.tx_packets += ipg_r32(IPG_FRAMESXMTDOK);
+	sp->stats.rx_bytes += ipg_r32(IPG_OCTETRCVOK);
+	sp->stats.tx_bytes += ipg_r32(IPG_OCTETXMTOK);
+	temp1 = ipg_r16(IPG_FRAMESLOSTRXERRORS);
+	sp->stats.rx_errors += temp1;
+	sp->stats.rx_missed_errors += temp1;
+	temp1 = ipg_r32(IPG_SINGLECOLFRAMES) + ipg_r32(IPG_MULTICOLFRAMES) +
+		ipg_r32(IPG_LATECOLLISIONS);
+	temp2 = ipg_r16(IPG_CARRIERSENSEERRORS);
+	sp->stats.collisions += temp1;
+	sp->stats.tx_dropped += ipg_r16(IPG_FRAMESABORTXSCOLLS);
+	sp->stats.tx_errors += ipg_r16(IPG_FRAMESWEXDEFERRAL) +
+		ipg_r32(IPG_FRAMESWDEFERREDXMT) + temp1 + temp2;
+	sp->stats.multicast += ipg_r32(IPG_MCSTOCTETRCVDOK);
+
+	/* detailed tx_errors */
+	sp->stats.tx_carrier_errors += temp2;
+
+	/* detailed rx_errors */
+	sp->stats.rx_length_errors += ipg_r16(IPG_INRANGELENGTHERRORS) +
+		ipg_r16(IPG_FRAMETOOLONGERRRORS);
+	sp->stats.rx_crc_errors += ipg_r16(IPG_FRAMECHECKSEQERRORS);
+
+	/* Unutilized IPG statistic registers. */
+	ipg_r32(IPG_MCSTFRAMESRCVDOK);
+
+	return &sp->stats;
+}
+
+/* Restore used receive buffers. */
+static int ipg_nic_rxrestore(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	const unsigned int curr = sp->rx_current;
+	unsigned int dirty = sp->rx_dirty;
+
+	IPG_DEBUG_MSG("_nic_rxrestore\n");
+
+	for (dirty = sp->rx_dirty; curr - dirty > 0; dirty++) {
+		unsigned int entry = dirty % IPG_RFDLIST_LENGTH;
+
+		/* rx_copybreak may poke hole here and there. */
+		if (sp->RxBuff[entry])
+			continue;
+
+		/* Generate a new receive buffer to replace the
+		 * current buffer (which will be released by the
+		 * Linux system).
+		 */
+		if (ipg_get_rxbuff(dev, entry) < 0) {
+			IPG_DEBUG_MSG("Cannot allocate new Rx buffer.\n");
+
+			break;
+		}
+
+		/* Reset the RFS field. */
+		sp->rxd[entry].rfs = 0x0000000000000000;
+	}
+	sp->rx_dirty = dirty;
+
+	return 0;
+}
+
+#ifdef JUMBO_FRAME
+
+/* use jumboindex and jumbosize to control jumbo frame status
+   initial status is jumboindex=-1 and jumbosize=0
+   1. jumboindex = -1 and jumbosize=0 : previous jumbo frame has been done.
+   2. jumboindex != -1 and jumbosize != 0 : jumbo frame is not over size and receiving
+   3. jumboindex = -1 and jumbosize != 0 : jumbo frame is over size, already dump
+                previous receiving and need to continue dumping the current one
+*/
+enum {
+	NormalPacket,
+	ErrorPacket
+};
+
+enum {
+	Frame_NoStart_NoEnd	= 0,
+	Frame_WithStart		= 1,
+	Frame_WithEnd		= 10,
+	Frame_WithStart_WithEnd = 11
+};
+
+inline void ipg_nic_rx_free_skb(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int entry = sp->rx_current % IPG_RFDLIST_LENGTH;
+
+	if (sp->RxBuff[entry]) {
+		struct ipg_rx *rxfd = sp->rxd + entry;
+
+		pci_unmap_single(sp->pdev,
+			le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+			sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+		IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+		sp->RxBuff[entry] = NULL;
+	}
+}
+
+inline int ipg_nic_rx_check_frame_type(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct ipg_rx *rxfd = sp->rxd + (sp->rx_current % IPG_RFDLIST_LENGTH);
+	int type = Frame_NoStart_NoEnd;
+
+	if (le64_to_cpu(rxfd->rfs) & IPG_RFS_FRAMESTART)
+		type += Frame_WithStart;
+	if (le64_to_cpu(rxfd->rfs) & IPG_RFS_FRAMEEND)
+		type += Frame_WithEnd;
+	return type;
+}
+
+inline int ipg_nic_rx_check_error(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int entry = sp->rx_current % IPG_RFDLIST_LENGTH;
+	struct ipg_rx *rxfd = sp->rxd + entry;
+
+	if (IPG_DROP_ON_RX_ETH_ERRORS && (le64_to_cpu(rxfd->rfs) &
+	     (IPG_RFS_RXFIFOOVERRUN | IPG_RFS_RXRUNTFRAME |
+	      IPG_RFS_RXALIGNMENTERROR | IPG_RFS_RXFCSERROR |
+	      IPG_RFS_RXOVERSIZEDFRAME | IPG_RFS_RXLENGTHERROR))) {
+		IPG_DEBUG_MSG("Rx error, RFS = %16.16lx\n",
+			      (unsigned long) rxfd->rfs);
+
+		/* Increment general receive error statistic. */
+		sp->stats.rx_errors++;
+
+		/* Increment detailed receive error statistics. */
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFIFOOVERRUN) {
+			IPG_DEBUG_MSG("RX FIFO overrun occured.\n");
+
+			sp->stats.rx_fifo_errors++;
+		}
+
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXRUNTFRAME) {
+			IPG_DEBUG_MSG("RX runt occured.\n");
+			sp->stats.rx_length_errors++;
+		}
+
+		/* Do nothing for IPG_RFS_RXOVERSIZEDFRAME,
+		 * error count handled by a IPG statistic register.
+		 */
+
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXALIGNMENTERROR) {
+			IPG_DEBUG_MSG("RX alignment error occured.\n");
+			sp->stats.rx_frame_errors++;
+		}
+
+		/* Do nothing for IPG_RFS_RXFCSERROR, error count
+		 * handled by a IPG statistic register.
+		 */
+
+		/* Free the memory associated with the RX
+		 * buffer since it is erroneous and we will
+		 * not pass it to higher layer processes.
+		 */
+		if (sp->RxBuff[entry]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+			IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+			sp->RxBuff[entry] = NULL;
+		}
+		return ErrorPacket;
+	}
+	return NormalPacket;
+}
+
+static void ipg_nic_rx_with_start_and_end(struct net_device *dev,
+					  struct ipg_nic_private *sp,
+					  struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+	struct sk_buff *skb;
+	int framelen;
+
+	if (jumbo->FoundStart) {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+
+	// 1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) != NormalPacket)
+		return;
+
+	skb = sp->RxBuff[entry];
+	if (!skb)
+		return;
+
+	// accept this frame and send to upper layer
+	framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+	if (framelen > IPG_RXFRAG_SIZE)
+		framelen = IPG_RXFRAG_SIZE;
+
+	skb_put(skb, framelen);
+	skb->protocol = eth_type_trans(skb, dev);
+	skb->ip_summed = CHECKSUM_NONE;
+	netif_rx(skb);
+	dev->last_rx = jiffies;
+	sp->RxBuff[entry] = NULL;
+}
+
+static void ipg_nic_rx_with_start(struct net_device *dev,
+				  struct ipg_nic_private *sp,
+				  struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+	struct pci_dev *pdev = sp->pdev;
+	struct sk_buff *skb;
+
+	// 1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) != NormalPacket)
+		return;
+
+	// accept this frame and send to upper layer
+	skb = sp->RxBuff[entry];
+	if (!skb)
+		return;
+
+	if (jumbo->FoundStart)
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+
+	pci_unmap_single(pdev, le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+			 sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+	skb_put(skb, IPG_RXFRAG_SIZE);
+
+	jumbo->FoundStart = 1;
+	jumbo->CurrentSize = IPG_RXFRAG_SIZE;
+	jumbo->skb = skb;
+
+	sp->RxBuff[entry] = NULL;
+	dev->last_rx = jiffies;
+}
+
+static void ipg_nic_rx_with_end(struct net_device *dev,
+				struct ipg_nic_private *sp,
+				struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+
+	//1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) == NormalPacket) {
+		struct sk_buff *skb = sp->RxBuff[entry];
+
+		if (!skb)
+			return;
+
+		if (jumbo->FoundStart) {
+			int framelen, endframelen;
+
+			framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+
+			endframeLen = framelen - jumbo->CurrentSize;
+			/*
+			if (framelen > IPG_RXFRAG_SIZE)
+				framelen=IPG_RXFRAG_SIZE;
+			 */
+			if (framelen > IPG_RXSUPPORT_SIZE)
+				IPG_DEV_KFREE_SKB(jumbo->skb);
+			else {
+				memcpy(skb_put(jumbo->skb, endframeLen),
+				       skb->data, endframeLen);
+
+				jumbo->skb->protocol =
+				    eth_type_trans(jumbo->skb, dev);
+
+				jumbo->skb->ip_summed = CHECKSUM_NONE;
+				netif_rx(jumbo->skb);
+			}
+		}
+
+		dev->last_rx = jiffies;
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+
+		ipg_nic_rx_free_skb(dev);
+	} else {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+}
+
+static void ipg_nic_rx_no_start_no_end(struct net_device *dev,
+				       struct ipg_nic_private *sp,
+				       struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+
+	//1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) == NormalPacket) {
+		struct sk_buff *skb = sp->RxBuff[entry];
+
+		if (skb) {
+			if (jumbo->FoundStart) {
+				jumbo->CurrentSize += IPG_RXFRAG_SIZE;
+				if (jumbo->CurrentSize <= IPG_RXSUPPORT_SIZE) {
+					memcpy(skb_put(jumbo->skb,
+						       IPG_RXFRAG_SIZE),
+					       skb->data, IPG_RXFRAG_SIZE);
+				}
+			}
+			dev->last_rx = jiffies;
+			ipg_nic_rx_free_skb(dev);
+		}
+	} else {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+}
+
+static int ipg_nic_rx(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int curr = sp->rx_current;
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_rx\n");
+
+	for (i = 0; i < IPG_MAXRFDPROCESS_COUNT; i++, curr++) {
+		unsigned int entry = curr % IPG_RFDLIST_LENGTH;
+		struct ipg_rx *rxfd = sp->rxd + entry;
+
+		if (!(rxfd->rfs & le64_to_cpu(IPG_RFS_RFDDONE)))
+			break;
+
+		switch (ipg_nic_rx_check_frame_type(dev)) {
+		case Frame_WithStart_WithEnd:
+			ipg_nic_rx_with_start_and_end(dev, tp, rxfd, entry);
+			break;
+		case Frame_WithStart:
+			ipg_nic_rx_with_start(dev, tp, rxfd, entry);
+			break;
+		case Frame_WithEnd:
+			ipg_nic_rx_with_end(dev, tp, rxfd, entry);
+			break;
+		case Frame_NoStart_NoEnd:
+			ipg_nic_rx_no_start_no_end(dev, tp, rxfd, entry);
+			break;
+		}
+	}
+
+	sp->rx_current = curr;
+
+	if (i == IPG_MAXRFDPROCESS_COUNT) {
+		/* There are more RFDs to process, however the
+		 * allocated amount of RFD processing time has
+		 * expired. Assert Interrupt Requested to make
+		 * sure we come back to process the remaining RFDs.
+		 */
+		ipg_w32(ipg_r32(ASIC_CTRL) | IPG_AC_INT_REQUEST, ASIC_CTRL);
+	}
+
+	ipg_nic_rxrestore(dev);
+
+	return 0;
+}
+
+#else
+static int ipg_nic_rx(struct net_device *dev)
+{
+	/* Transfer received Ethernet frames to higher network layers. */
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int curr = sp->rx_current;
+	void __iomem *ioaddr = sp->ioaddr;
+	struct ipg_rx *rxfd;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_rx\n");
+
+#define __RFS_MASK \
+	cpu_to_le64(IPG_RFS_RFDDONE | IPG_RFS_FRAMESTART | IPG_RFS_FRAMEEND)
+
+	for (i = 0; i < IPG_MAXRFDPROCESS_COUNT; i++, curr++) {
+		unsigned int entry = curr % IPG_RFDLIST_LENGTH;
+		struct sk_buff *skb = sp->RxBuff[entry];
+		unsigned int framelen;
+
+		rxfd = sp->rxd + entry;
+
+		if (((rxfd->rfs & __RFS_MASK) != __RFS_MASK) || !skb)
+			break;
+
+		/* Get received frame length. */
+		framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+
+		/* Check for jumbo frame arrival with too small
+		 * RXFRAG_SIZE.
+		 */
+		if (framelen > IPG_RXFRAG_SIZE) {
+			IPG_DEBUG_MSG
+			    ("RFS FrameLen > allocated fragment size.\n");
+
+			framelen = IPG_RXFRAG_SIZE;
+		}
+
+		if ((IPG_DROP_ON_RX_ETH_ERRORS && (le64_to_cpu(rxfd->rfs &
+		       (IPG_RFS_RXFIFOOVERRUN | IPG_RFS_RXRUNTFRAME |
+			IPG_RFS_RXALIGNMENTERROR | IPG_RFS_RXFCSERROR |
+			IPG_RFS_RXOVERSIZEDFRAME | IPG_RFS_RXLENGTHERROR))))) {
+
+			IPG_DEBUG_MSG("Rx error, RFS = %16.16lx\n",
+				      (unsigned long int) rxfd->rfs);
+
+			/* Increment general receive error statistic. */
+			sp->stats.rx_errors++;
+
+			/* Increment detailed receive error statistics. */
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXFIFOOVERRUN)) {
+				IPG_DEBUG_MSG("RX FIFO overrun occured.\n");
+				sp->stats.rx_fifo_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXRUNTFRAME)) {
+				IPG_DEBUG_MSG("RX runt occured.\n");
+				sp->stats.rx_length_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXOVERSIZEDFRAME)) ;
+			/* Do nothing, error count handled by a IPG
+			 * statistic register.
+			 */
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXALIGNMENTERROR)) {
+				IPG_DEBUG_MSG("RX alignment error occured.\n");
+				sp->stats.rx_frame_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXFCSERROR)) ;
+			/* Do nothing, error count handled by a IPG
+			 * statistic register.
+			 */
+
+			/* Free the memory associated with the RX
+			 * buffer since it is erroneous and we will
+			 * not pass it to higher layer processes.
+			 */
+			if (skb) {
+				u64 info = rxfd->frag_info;
+
+				pci_unmap_single(sp->pdev,
+					le64_to_cpu(info & ~IPG_RFI_FRAGLEN),
+					sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+				IPG_DEV_KFREE_SKB(skb);
+			}
+		} else {
+
+			/* Adjust the new buffer length to accomodate the size
+			 * of the received frame.
+			 */
+			skb_put(skb, framelen);
+
+			/* Set the buffer's protocol field to Ethernet. */
+			skb->protocol = eth_type_trans(skb, dev);
+
+			/* If the frame contains an IP/TCP/UDP frame,
+			 * determine if upper layer must check IP/TCP/UDP
+			 * checksums.
+			 *
+			 * NOTE: DO NOT RELY ON THE TCP/UDP CHECKSUM
+			 *       VERIFICATION FOR SILICON REVISIONS B3
+			 *       AND EARLIER!
+			 *
+			 if ((le64_to_cpu(rxfd->rfs &
+			 (IPG_RFS_TCPDETECTED | IPG_RFS_UDPDETECTED |
+			 IPG_RFS_IPDETECTED))) &&
+			 !(le64_to_cpu(rxfd->rfs &
+			 (IPG_RFS_TCPERROR | IPG_RFS_UDPERROR |
+			 IPG_RFS_IPERROR))))
+			 {
+			 * Indicate IP checksums were performed
+			 * by the IPG.
+			 *
+			 skb->ip_summed = CHECKSUM_UNNECESSARY;
+			 }
+			 else
+			 */
+			if (1 == 1) {
+				/* The IPG encountered an error with (or
+				 * there were no) IP/TCP/UDP checksums.
+				 * This may or may not indicate an invalid
+				 * IP/TCP/UDP frame was received. Let the
+				 * upper layer decide.
+				 */
+				skb->ip_summed = CHECKSUM_NONE;
+			}
+
+			/* Hand off frame for higher layer processing.
+			 * The function netif_rx() releases the sk_buff
+			 * when processing completes.
+			 */
+			netif_rx(skb);
+
+			/* Record frame receive time (jiffies = Linux
+			 * kernel current time stamp).
+			 */
+			dev->last_rx = jiffies;
+		}
+
+		/* Assure RX buffer is not reused by IPG. */
+		sp->RxBuff[entry] = NULL;
+	}
+
+	/*
+	 * If there are more RFDs to proces and the allocated amount of RFD
+	 * processing time has expired, assert Interrupt Requested to make
+	 * sure we come back to process the remaining RFDs.
+	 */
+	if (i == IPG_MAXRFDPROCESS_COUNT)
+		ipg_w32(ipg_r32(ASIC_CTRL) | IPG_AC_INT_REQUEST, ASIC_CTRL);
+
+#ifdef IPG_DEBUG
+	/* Check if the RFD list contained no receive frame data. */
+	if (!i)
+		sp->EmptyRFDListCount++;
+#endif
+	while ((le64_to_cpu(rxfd->rfs & IPG_RFS_RFDDONE)) &&
+	       !((le64_to_cpu(rxfd->rfs & IPG_RFS_FRAMESTART)) &&
+		 (le64_to_cpu(rxfd->rfs & IPG_RFS_FRAMEEND)))) {
+		unsigned int entry = curr++ % IPG_RFDLIST_LENGTH;
+
+		rxfd = sp->rxd + entry;
+
+		IPG_DEBUG_MSG("Frame requires multiple RFDs.\n");
+
+		/* An unexpected event, additional code needed to handle
+		 * properly. So for the time being, just disregard the
+		 * frame.
+		 */
+
+		/* Free the memory associated with the RX
+		 * buffer since it is erroneous and we will
+		 * not pass it to higher layer processes.
+		 */
+		if (sp->RxBuff[entry]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+			IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+		}
+
+		/* Assure RX buffer is not reused by IPG. */
+		sp->RxBuff[entry] = NULL;
+	}
+
+	sp->rx_current = curr;
+
+	/* Check to see if there are a minimum number of used
+	 * RFDs before restoring any (should improve performance.)
+	 */
+	if ((curr - sp->rx_dirty) >= IPG_MINUSEDRFDSTOFREE)
+		ipg_nic_rxrestore(dev);
+
+	return 0;
+}
+#endif
+
+static void ipg_reset_after_host_error(struct work_struct *work)
+{
+	struct ipg_nic_private *sp =
+		container_of(work, struct ipg_nic_private, task.work);
+	struct net_device *dev = sp->dev;
+
+	IPG_DDEBUG_MSG("DMACtrl = %8.8x\n", ioread32(sp->ioaddr + IPG_DMACTRL));
+
+	/*
+	 * Acknowledge HostError interrupt by resetting
+	 * IPG DMA and HOST.
+	 */
+	ipg_reset(dev, IPG_AC_GLOBAL_RESET | IPG_AC_HOST | IPG_AC_DMA);
+
+	init_rfdlist(dev);
+	init_tfdlist(dev);
+
+	if (ipg_io_config(dev) < 0) {
+		printk(KERN_INFO "%s: Cannot recover from PCI error.\n",
+		       dev->name);
+		schedule_delayed_work(&sp->task, HZ);
+	}
+}
+
+static irqreturn_t ipg_interrupt_handler(int irq, void *dev_inst)
+{
+	struct net_device *dev = dev_inst;
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int handled = 0;
+	u16 status;
+
+	IPG_DEBUG_MSG("_interrupt_handler\n");
+
+#ifdef JUMBO_FRAME
+	ipg_nic_rxrestore(dev);
+#endif
+	/* Get interrupt source information, and acknowledge
+	 * some (i.e. TxDMAComplete, RxDMAComplete, RxEarly,
+	 * IntRequested, MacControlFrame, LinkEvent) interrupts
+	 * if issued. Also, all IPG interrupts are disabled by
+	 * reading IntStatusAck.
+	 */
+	status = ipg_r16(INT_STATUS_ACK);
+
+	IPG_DEBUG_MSG("IntStatusAck = %4.4x\n", status);
+
+	/* Shared IRQ of remove event. */
+	if (!(status & IPG_IS_RSVD_MASK))
+		goto out_enable;
+
+	handled = 1;
+
+	if (unlikely(!netif_running(dev)))
+		goto out;
+
+	spin_lock(&sp->lock);
+
+	/* If RFDListEnd interrupt, restore all used RFDs. */
+	if (status & IPG_IS_RFD_LIST_END) {
+		IPG_DEBUG_MSG("RFDListEnd Interrupt.\n");
+
+		/* The RFD list end indicates an RFD was encountered
+		 * with a 0 NextPtr, or with an RFDDone bit set to 1
+		 * (indicating the RFD is not read for use by the
+		 * IPG.) Try to restore all RFDs.
+		 */
+		ipg_nic_rxrestore(dev);
+
+#ifdef IPG_DEBUG
+		/* Increment the RFDlistendCount counter. */
+		sp->RFDlistendCount++;
+#endif
+	}
+
+	/* If RFDListEnd, RxDMAPriority, RxDMAComplete, or
+	 * IntRequested interrupt, process received frames. */
+	if ((status & IPG_IS_RX_DMA_PRIORITY) ||
+	    (status & IPG_IS_RFD_LIST_END) ||
+	    (status & IPG_IS_RX_DMA_COMPLETE) ||
+	    (status & IPG_IS_INT_REQUESTED)) {
+#ifdef IPG_DEBUG
+		/* Increment the RFD list checked counter if interrupted
+		 * only to check the RFD list. */
+		if (status & (~(IPG_IS_RX_DMA_PRIORITY | IPG_IS_RFD_LIST_END |
+				IPG_IS_RX_DMA_COMPLETE | IPG_IS_INT_REQUESTED) &
+			       (IPG_IS_HOST_ERROR | IPG_IS_TX_DMA_COMPLETE |
+				IPG_IS_LINK_EVENT | IPG_IS_TX_COMPLETE |
+				IPG_IS_UPDATE_STATS)))
+			sp->RFDListCheckedCount++;
+#endif
+
+		ipg_nic_rx(dev);
+	}
+
+	/* If TxDMAComplete interrupt, free used TFDs. */
+	if (status & IPG_IS_TX_DMA_COMPLETE)
+		ipg_nic_txfree(dev);
+
+	/* TxComplete interrupts indicate one of numerous actions.
+	 * Determine what action to take based on TXSTATUS register.
+	 */
+	if (status & IPG_IS_TX_COMPLETE)
+		ipg_nic_txcleanup(dev);
+
+	/* If UpdateStats interrupt, update Linux Ethernet statistics */
+	if (status & IPG_IS_UPDATE_STATS)
+		ipg_nic_get_stats(dev);
+
+	/* If HostError interrupt, reset IPG. */
+	if (status & IPG_IS_HOST_ERROR) {
+		IPG_DDEBUG_MSG("HostError Interrupt\n");
+
+		schedule_delayed_work(&sp->task, 0);
+	}
+
+	/* If LinkEvent interrupt, resolve autonegotiation. */
+	if (status & IPG_IS_LINK_EVENT) {
+		if (ipg_config_autoneg(dev) < 0)
+			printk(KERN_INFO "%s: Auto-negotiation error.\n",
+			       dev->name);
+	}
+
+	/* If MACCtrlFrame interrupt, do nothing. */
+	if (status & IPG_IS_MAC_CTRL_FRAME)
+		IPG_DEBUG_MSG("MACCtrlFrame interrupt.\n");
+
+	/* If RxComplete interrupt, do nothing. */
+	if (status & IPG_IS_RX_COMPLETE)
+		IPG_DEBUG_MSG("RxComplete interrupt.\n");
+
+	/* If RxEarly interrupt, do nothing. */
+	if (status & IPG_IS_RX_EARLY)
+		IPG_DEBUG_MSG("RxEarly interrupt.\n");
+
+out_enable:
+	/* Re-enable IPG interrupts. */
+	ipg_w16(IPG_IE_TX_DMA_COMPLETE | IPG_IE_RX_DMA_COMPLETE |
+		IPG_IE_HOST_ERROR | IPG_IE_INT_REQUESTED | IPG_IE_TX_COMPLETE |
+		IPG_IE_LINK_EVENT | IPG_IE_UPDATE_STATS, INT_ENABLE);
+
+	spin_unlock(&sp->lock);
+out:
+	return IRQ_RETVAL(handled);
+}
+
+static void ipg_rx_clear(struct ipg_nic_private *sp)
+{
+	unsigned int i;
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		if (sp->RxBuff[i]) {
+			struct ipg_rx *rxfd = sp->rxd + i;
+
+			IPG_DEV_KFREE_SKB(sp->RxBuff[i]);
+			sp->RxBuff[i] = NULL;
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+		}
+	}
+}
+
+static void ipg_tx_clear(struct ipg_nic_private *sp)
+{
+	unsigned int i;
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		if (sp->TxBuff[i]) {
+			struct ipg_tx *txfd = sp->txd + i;
+
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(txfd->frag_info & ~IPG_TFI_FRAGLEN),
+				sp->TxBuff[i]->len, PCI_DMA_TODEVICE);
+
+			IPG_DEV_KFREE_SKB(sp->TxBuff[i]);
+
+			sp->TxBuff[i] = NULL;
+		}
+	}
+}
+
+static int ipg_nic_open(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	struct pci_dev *pdev = sp->pdev;
+	int rc;
+
+	IPG_DEBUG_MSG("_nic_open\n");
+
+	sp->rx_buf_sz = IPG_RXSUPPORT_SIZE;
+
+	/* Check for interrupt line conflicts, and request interrupt
+	 * line for IPG.
+	 *
+	 * IMPORTANT: Disable IPG interrupts prior to registering
+	 *            IRQ.
+	 */
+	ipg_w16(0x0000, INT_ENABLE);
+
+	/* Register the interrupt line to be used by the IPG within
+	 * the Linux system.
+	 */
+	rc = request_irq(pdev->irq, &ipg_interrupt_handler, IRQF_SHARED,
+			 dev->name, dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error when requesting interrupt.\n",
+		       dev->name);
+		goto out;
+	}
+
+	dev->irq = pdev->irq;
+
+	rc = -ENOMEM;
+
+	sp->rxd = dma_alloc_coherent(&pdev->dev, IPG_RX_RING_BYTES,
+				     &sp->rxd_map, GFP_KERNEL);
+	if (!sp->rxd)
+		goto err_free_irq_0;
+
+	sp->txd = dma_alloc_coherent(&pdev->dev, IPG_TX_RING_BYTES,
+				     &sp->txd_map, GFP_KERNEL);
+	if (!sp->txd)
+		goto err_free_rx_1;
+
+	rc = init_rfdlist(dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error during configuration.\n",
+		       dev->name);
+		goto err_free_tx_2;
+	}
+
+	init_tfdlist(dev);
+
+	rc = ipg_io_config(dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error during configuration.\n",
+		       dev->name);
+		goto err_release_tfdlist_3;
+	}
+
+	/* Resolve autonegotiation. */
+	if (ipg_config_autoneg(dev) < 0)
+		printk(KERN_INFO "%s: Auto-negotiation error.\n", dev->name);
+
+#ifdef JUMBO_FRAME
+	/* initialize JUMBO Frame control variable */
+	sp->Jumbo.FoundStart = 0;
+	sp->Jumbo.CurrentSize = 0;
+	sp->Jumbo.skb = 0;
+	dev->mtu = IPG_TXFRAG_SIZE;
+#endif
+
+	/* Enable transmit and receive operation of the IPG. */
+	ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_RX_ENABLE | IPG_MC_TX_ENABLE) &
+		 IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	netif_start_queue(dev);
+out:
+	return rc;
+
+err_release_tfdlist_3:
+	ipg_tx_clear(sp);
+	ipg_rx_clear(sp);
+err_free_tx_2:
+	dma_free_coherent(&pdev->dev, IPG_TX_RING_BYTES, sp->txd, sp->txd_map);
+err_free_rx_1:
+	dma_free_coherent(&pdev->dev, IPG_RX_RING_BYTES, sp->rxd, sp->rxd_map);
+err_free_irq_0:
+	free_irq(pdev->irq, dev);
+	goto out;
+}
+
+static int ipg_nic_stop(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	struct pci_dev *pdev = sp->pdev;
+
+	IPG_DEBUG_MSG("_nic_stop\n");
+
+	netif_stop_queue(dev);
+
+	IPG_DDEBUG_MSG("RFDlistendCount = %i\n", sp->RFDlistendCount);
+	IPG_DDEBUG_MSG("RFDListCheckedCount = %i\n", sp->rxdCheckedCount);
+	IPG_DDEBUG_MSG("EmptyRFDListCount = %i\n", sp->EmptyRFDListCount);
+	IPG_DUMPTFDLIST(dev);
+
+	do {
+		(void) ipg_r16(INT_STATUS_ACK);
+
+		ipg_reset(dev, IPG_AC_GLOBAL_RESET | IPG_AC_HOST | IPG_AC_DMA);
+
+		synchronize_irq(pdev->irq);
+	} while (ipg_r16(INT_ENABLE) & IPG_IE_RSVD_MASK);
+
+	ipg_rx_clear(sp);
+
+	ipg_tx_clear(sp);
+
+	pci_free_consistent(pdev, IPG_RX_RING_BYTES, sp->rxd, sp->rxd_map);
+	pci_free_consistent(pdev, IPG_TX_RING_BYTES, sp->txd, sp->txd_map);
+
+	free_irq(pdev->irq, dev);
+
+	return 0;
+}
+
+static int ipg_nic_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int entry = sp->tx_current % IPG_TFDLIST_LENGTH;
+	unsigned long flags;
+	struct ipg_tx *txfd;
+
+	IPG_DDEBUG_MSG("_nic_hard_start_xmit\n");
+
+	/* If in 10Mbps mode, stop the transmit queue so
+	 * no more transmit frames are accepted.
+	 */
+	if (sp->tenmbpsmode)
+		netif_stop_queue(dev);
+
+	if (sp->ResetCurrentTFD) {
+		sp->ResetCurrentTFD = 0;
+		entry = 0;
+	}
+
+	txfd = sp->txd + entry;
+
+	sp->TxBuff[entry] = skb;
+
+	/* Clear all TFC fields, except TFDDONE. */
+	txfd->tfc = cpu_to_le64(IPG_TFC_TFDDONE);
+
+	/* Specify the TFC field within the TFD. */
+	txfd->tfc |= cpu_to_le64(IPG_TFC_WORDALIGNDISABLED |
+		(IPG_TFC_FRAMEID & cpu_to_le64(sp->tx_current)) |
+		(IPG_TFC_FRAGCOUNT & (1 << 24)));
+
+	/* Request TxComplete interrupts at an interval defined
+	 * by the constant IPG_FRAMESBETWEENTXCOMPLETES.
+	 * Request TxComplete interrupt for every frame
+	 * if in 10Mbps mode to accomodate problem with 10Mbps
+	 * processing.
+	 */
+	if (sp->tenmbpsmode)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TXINDICATE);
+	else if (!((sp->tx_current - sp->tx_dirty + 1) >
+	    IPG_FRAMESBETWEENTXDMACOMPLETES)) {
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TXDMAINDICATE);
+	}
+	/* Based on compilation option, determine if FCS is to be
+	 * appended to transmit frame by IPG.
+	 */
+	if (!(IPG_APPEND_FCS_ON_TX))
+		txfd->tfc |= cpu_to_le64(IPG_TFC_FCSAPPENDDISABLE);
+
+	/* Based on compilation option, determine if IP, TCP and/or
+	 * UDP checksums are to be added to transmit frame by IPG.
+	 */
+	if (IPG_ADD_IPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_IPCHECKSUMENABLE);
+
+	if (IPG_ADD_TCPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TCPCHECKSUMENABLE);
+
+	if (IPG_ADD_UDPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_UDPCHECKSUMENABLE);
+
+	/* Based on compilation option, determine if VLAN tag info is to be
+	 * inserted into transmit frame by IPG.
+	 */
+	if (IPG_INSERT_MANUAL_VLAN_TAG) {
+		txfd->tfc |= cpu_to_le64(IPG_TFC_VLANTAGINSERT |
+			((u64) IPG_MANUAL_VLAN_VID << 32) |
+			((u64) IPG_MANUAL_VLAN_CFI << 44) |
+			((u64) IPG_MANUAL_VLAN_USERPRIORITY << 45));
+	}
+
+	/* The fragment start location within system memory is defined
+	 * by the sk_buff structure's data field. The physical address
+	 * of this location within the system's virtual memory space
+	 * is determined using the IPG_HOST2BUS_MAP function.
+	 */
+	txfd->frag_info = cpu_to_le64(pci_map_single(sp->pdev, skb->data,
+		skb->len, PCI_DMA_TODEVICE));
+
+	/* The length of the fragment within system memory is defined by
+	 * the sk_buff structure's len field.
+	 */
+	txfd->frag_info |= cpu_to_le64(IPG_TFI_FRAGLEN &
+		((u64) (skb->len & 0xffff) << 48));
+
+	/* Clear the TFDDone bit last to indicate the TFD is ready
+	 * for transfer to the IPG.
+	 */
+	txfd->tfc &= cpu_to_le64(~IPG_TFC_TFDDONE);
+
+	spin_lock_irqsave(&sp->lock, flags);
+
+	sp->tx_current++;
+
+	mmiowb();
+
+	ipg_w32(IPG_DC_TX_DMA_POLL_NOW, DMA_CTRL);
+
+	if (sp->tx_current == (sp->tx_dirty + IPG_TFDLIST_LENGTH))
+		netif_wake_queue(dev);
+
+	spin_unlock_irqrestore(&sp->lock, flags);
+
+	return NETDEV_TX_OK;
+}
+
+static void ipg_set_phy_default_param(unsigned char rev,
+				      struct net_device *dev, int phy_address)
+{
+	unsigned short length;
+	unsigned char revision;
+	unsigned short *phy_param;
+	unsigned short address, value;
+
+	phy_param = &DefaultPhyParam[0];
+	length = *phy_param & 0x00FF;
+	revision = (unsigned char)((*phy_param) >> 8);
+	phy_param++;
+	while (length != 0) {
+		if (rev == revision) {
+			while (length > 1) {
+				address = *phy_param;
+				value = *(phy_param + 1);
+				phy_param += 2;
+				mdio_write(dev, phy_address, address, value);
+				length -= 4;
+			}
+			break;
+		} else {
+			phy_param += length / 2;
+			length = *phy_param & 0x00FF;
+			revision = (unsigned char)((*phy_param) >> 8);
+			phy_param++;
+		}
+	}
+}
+
+/* JES20040127EEPROM */
+static int read_eeprom(struct net_device *dev, int eep_addr)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	unsigned int i;
+	int ret = 0;
+	u16 value;
+
+	value = IPG_EC_EEPROM_READOPCODE | (eep_addr & 0xff);
+	ipg_w16(value, EEPROM_CTRL);
+
+	for (i = 0; i < 1000; i++) {
+		u16 data;
+
+		mdelay(10);
+		data = ipg_r16(EEPROM_CTRL);
+		if (!(data & IPG_EC_EEPROM_BUSY)) {
+			ret = ipg_r16(EEPROM_DATA);
+			break;
+		}
+	}
+	return ret;
+}
+
+static void ipg_init_mii(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct mii_if_info *mii_if = &sp->mii_if;
+	int phyaddr;
+
+	mii_if->dev          = dev;
+	mii_if->mdio_read    = mdio_read;
+	mii_if->mdio_write   = mdio_write;
+	mii_if->phy_id_mask  = 0x1f;
+	mii_if->reg_num_mask = 0x1f;
+
+	mii_if->phy_id = phyaddr = ipg_find_phyaddr(dev);
+
+	if (phyaddr != 0x1f) {
+		u16 mii_phyctrl, mii_1000cr;
+		u8 revisionid = 0;
+
+		mii_1000cr  = mdio_read(dev, phyaddr, MII_CTRL1000);
+		mii_1000cr |= ADVERTISE_1000FULL | ADVERTISE_1000HALF |
+			GMII_PHY_1000BASETCONTROL_PreferMaster;
+		mdio_write(dev, phyaddr, MII_CTRL1000, mii_1000cr);
+
+		mii_phyctrl = mdio_read(dev, phyaddr, MII_BMCR);
+
+		/* Set default phyparam */
+		pci_read_config_byte(sp->pdev, PCI_REVISION_ID, &revisionid);
+		ipg_set_phy_default_param(revisionid, dev, phyaddr);
+
+		/* Reset PHY */
+		mii_phyctrl |= BMCR_RESET | BMCR_ANRESTART;
+		mdio_write(dev, phyaddr, MII_BMCR, mii_phyctrl);
+
+	}
+}
+
+static int ipg_hw_init(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	int rc;
+
+	/* Read/Write and Reset EEPROM Value Jesse20040128EEPROM_VALUE */
+	/* Read LED Mode Configuration from EEPROM */
+	sp->LED_Mode = read_eeprom(dev, 6);
+
+	/* Reset all functions within the IPG. Do not assert
+	 * RST_OUT as not compatible with some PHYs.
+	 */
+	rc = ipg_reset(dev, IPG_RESET_MASK);
+	if (rc < 0)
+		goto out;
+
+	ipg_init_mii(dev);
+
+	/* Read MAC Address from EEPROM */
+	for (i = 0; i < 3; i++)
+		sp->station_addr[i] = read_eeprom(dev, 16 + i);
+
+	for (i = 0; i < 3; i++)
+		ipg_w16(sp->station_addr[i], STATION_ADDRESS_0 + 2*i);
+
+	/* Set station address in ethernet_device structure. */
+	dev->dev_addr[0] =  ipg_r16(STATION_ADDRESS_0) & 0x00ff;
+	dev->dev_addr[1] = (ipg_r16(STATION_ADDRESS_0) & 0xff00) >> 8;
+	dev->dev_addr[2] =  ipg_r16(STATION_ADDRESS_1) & 0x00ff;
+	dev->dev_addr[3] = (ipg_r16(STATION_ADDRESS_1) & 0xff00) >> 8;
+	dev->dev_addr[4] =  ipg_r16(STATION_ADDRESS_2) & 0x00ff;
+	dev->dev_addr[5] = (ipg_r16(STATION_ADDRESS_2) & 0xff00) >> 8;
+out:
+	return rc;
+}
+
+static int ipg_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = generic_mii_ioctl(&sp->mii_if, if_mii(ifr), cmd, NULL);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_nic_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Function to accomodate changes to Maximum Transfer Unit
+	 * (or MTU) of IPG NIC. Cannot use default function since
+	 * the default will not allow for MTU > 1500 bytes.
+	 */
+
+	IPG_DEBUG_MSG("_nic_change_mtu\n");
+
+	/* Check that the new MTU value is between 68 (14 byte header, 46
+	 * byte payload, 4 byte FCS) and IPG_MAX_RXFRAME_SIZE, which
+	 * corresponds to the MAXFRAMESIZE register in the IPG.
+	 */
+	if ((new_mtu < 68) || (new_mtu > IPG_MAX_RXFRAME_SIZE))
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static int ipg_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_ethtool_gset(&sp->mii_if, cmd);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_ethtool_sset(&sp->mii_if, cmd);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_nway_reset(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_nway_restart(&sp->mii_if);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static struct ethtool_ops ipg_ethtool_ops = {
+	.get_settings = ipg_get_settings,
+	.set_settings = ipg_set_settings,
+	.nway_reset   = ipg_nway_reset,
+};
+
+static void ipg_remove(struct pci_dev *pdev)
+{
+	struct net_device *dev = pci_get_drvdata(pdev);
+	struct ipg_nic_private *sp = netdev_priv(dev);
+
+	IPG_DEBUG_MSG("_remove\n");
+
+	/* Un-register Ethernet device. */
+	unregister_netdev(dev);
+
+	pci_iounmap(pdev, sp->ioaddr);
+
+	pci_release_regions(pdev);
+
+	free_netdev(dev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static int __devinit ipg_probe(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	unsigned int i = id->driver_data;
+	struct ipg_nic_private *sp;
+	struct net_device *dev;
+	void __iomem *ioaddr;
+	int rc;
+
+	rc = pci_enable_device(pdev);
+	if (rc < 0)
+		goto out;
+
+	printk(KERN_INFO "%s: %s\n", pci_name(pdev), ipg_brand_name[i]);
+
+	pci_set_master(pdev);
+
+	rc = pci_set_dma_mask(pdev, DMA_40BIT_MASK);
+	if (rc < 0) {
+		rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		if (rc < 0) {
+			printk(KERN_ERR "%s: DMA config failed.\n",
+			       pci_name(pdev));
+			goto err_disable_0;
+		}
+	}
+
+	/*
+	 * Initialize net device.
+	 */
+	dev = alloc_etherdev(sizeof(struct ipg_nic_private));
+	if (!dev) {
+		printk(KERN_ERR "%s: alloc_etherdev failed\n", pci_name(pdev));
+		rc = -ENOMEM;
+		goto err_disable_0;
+	}
+
+	sp = netdev_priv(dev);
+	spin_lock_init(&sp->lock);
+	mutex_init(&sp->mii_mutex);
+
+	/* Declare IPG NIC functions for Ethernet device methods.
+	 */
+	dev->open = &ipg_nic_open;
+	dev->stop = &ipg_nic_stop;
+	dev->hard_start_xmit = &ipg_nic_hard_start_xmit;
+	dev->get_stats = &ipg_nic_get_stats;
+	dev->set_multicast_list = &ipg_nic_set_multicast_list;
+	dev->do_ioctl = ipg_ioctl;
+	dev->tx_timeout = ipg_tx_timeout;
+	dev->change_mtu = &ipg_nic_change_mtu;
+
+	SET_MODULE_OWNER(dev);
+	SET_NETDEV_DEV(dev, &pdev->dev);
+	SET_ETHTOOL_OPS(dev, &ipg_ethtool_ops);
+
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_free_dev_1;
+
+	ioaddr = pci_iomap(pdev, 1, pci_resource_len(pdev, 1));
+	if (!ioaddr) {
+		printk(KERN_ERR "%s cannot map MMIO\n", pci_name(pdev));
+		rc = -EIO;
+		goto err_release_regions_2;
+	}
+
+	/* Save the pointer to the PCI device information. */
+	sp->ioaddr = ioaddr;
+	sp->pdev = pdev;
+	sp->dev = dev;
+
+	INIT_DELAYED_WORK(&sp->task, ipg_reset_after_host_error);
+
+	pci_set_drvdata(pdev, dev);
+
+	rc = ipg_hw_init(dev);
+	if (rc < 0)
+		goto err_unmap_3;
+
+	rc = register_netdev(dev);
+	if (rc < 0)
+		goto err_unmap_3;
+
+	printk(KERN_INFO "Ethernet device registered as: %s\n", dev->name);
+out:
+	return rc;
+
+err_unmap_3:
+	pci_iounmap(pdev, ioaddr);
+err_release_regions_2:
+	pci_release_regions(pdev);
+err_free_dev_1:
+	free_netdev(dev);
+err_disable_0:
+	pci_disable_device(pdev);
+	goto out;
+}
+
+static struct pci_driver ipg_pci_driver = {
+	.name		= IPG_DRIVER_NAME,
+	.id_table	= ipg_pci_tbl,
+	.probe		= ipg_probe,
+	.remove		= __devexit_p(ipg_remove),
+};
+
+static int __init ipg_init_module(void)
+{
+	return pci_register_driver(&ipg_pci_driver);
+}
+
+static void __exit ipg_exit_module(void)
+{
+	pci_unregister_driver(&ipg_pci_driver);
+}
+
+module_init(ipg_init_module);
+module_exit(ipg_exit_module);
diff --git a/drivers/net/ipg.h b/drivers/net/ipg.h
new file mode 100755
index 0000000..9b8e3bb
--- /dev/null
+++ b/drivers/net/ipg.h
@@ -0,0 +1,856 @@
+/*
+ *
+ * ipg.h
+ *
+ * Include file for Gigabit Ethernet device driver for Network
+ * Interface Cards (NICs) utilizing the Tamarack Microelectronics
+ * Inc. IPG Gigabit or Triple Speed Ethernet Media Access
+ * Controller.
+ *
+ * Craig Rich
+ * Sundance Technology, Inc.
+ * 1485 Saratoga Avenue
+ * Suite 200
+ * San Jose, CA 95129
+ * 408 873 4117
+ * www.sundanceti.com
+ * craig_rich@sundanceti.com
+ */
+#ifndef __LINUX_IPG_H
+#define __LINUX_IPG_H
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+#include <asm/bitops.h>
+/*#include <asm/spinlock.h>*/
+
+#define DrvVer "2.09d"
+
+#define IPG_DEV_KFREE_SKB(skb) dev_kfree_skb_irq(skb)
+
+/*
+ *	Constants
+ */
+
+/* GMII based PHY IDs */
+#define		NS				0x2000
+#define		MARVELL				0x0141
+#define		ICPLUS_PHY		0x243
+
+/* NIC Physical Layer Device MII register fields. */
+#define         MII_PHY_SELECTOR_IEEE8023       0x0001
+#define         MII_PHY_TECHABILITYFIELD        0x1FE0
+
+/* GMII_PHY_1000 need to set to prefer master */
+#define         GMII_PHY_1000BASETCONTROL_PreferMaster 0x0400
+
+/* NIC Physical Layer Device GMII constants. */
+#define         GMII_PREAMBLE                    0xFFFFFFFF
+#define         GMII_ST                          0x1
+#define         GMII_READ                        0x2
+#define         GMII_WRITE                       0x1
+#define         GMII_TA_READ_MASK                0x1
+#define         GMII_TA_WRITE                    0x2
+
+/* I/O register offsets. */
+enum ipg_regs {
+	DMA_CTRL		= 0x00,
+	RX_DMA_STATUS		= 0x08, // Unused + reserved
+	TFD_LIST_PTR_0		= 0x10,
+	TFD_LIST_PTR_1		= 0x14,
+	TX_DMA_BURST_THRESH	= 0x18,
+	TX_DMA_URGENT_THRESH	= 0x19,
+	TX_DMA_POLL_PERIOD	= 0x1a,
+	RFD_LIST_PTR_0		= 0x1c,
+	RFD_LIST_PTR_1		= 0x20,
+	RX_DMA_BURST_THRESH	= 0x24,
+	RX_DMA_URGENT_THRESH	= 0x25,
+	RX_DMA_POLL_PERIOD	= 0x26,
+	DEBUG_CTRL		= 0x2c,
+	ASIC_CTRL		= 0x30,
+	FIFO_CTRL		= 0x38, // Unused
+	FLOW_OFF_THRESH		= 0x3c,
+	FLOW_ON_THRESH		= 0x3e,
+	EEPROM_DATA		= 0x48,
+	EEPROM_CTRL		= 0x4a,
+	EXPROM_ADDR		= 0x4c, // Unused
+	EXPROM_DATA		= 0x50, // Unused
+	WAKE_EVENT		= 0x51, // Unused
+	COUNTDOWN		= 0x54, // Unused
+	INT_STATUS_ACK		= 0x5a,
+	INT_ENABLE		= 0x5c,
+	INT_STATUS		= 0x5e, // Unused
+	TX_STATUS		= 0x60,
+	MAC_CTRL		= 0x6c,
+	VLAN_TAG		= 0x70, // Unused
+	PHY_SET			= 0x75,	// JES20040127EEPROM
+	PHY_CTRL		= 0x76,
+	STATION_ADDRESS_0	= 0x78,
+	STATION_ADDRESS_1	= 0x7a,
+	STATION_ADDRESS_2	= 0x7c,
+	MAX_FRAME_SIZE		= 0x86,
+	RECEIVE_MODE		= 0x88,
+	HASHTABLE_0		= 0x8c,
+	HASHTABLE_1		= 0x90,
+	RMON_STATISTICS_MASK	= 0x98,
+	STATISTICS_MASK		= 0x9c,
+	RX_JUMBO_FRAMES		= 0xbc, // Unused
+	TCP_CHECKSUM_ERRORS	= 0xc0, // Unused
+	IP_CHECKSUM_ERRORS	= 0xc2, // Unused
+	UDP_CHECKSUM_ERRORS	= 0xc4, // Unused
+	TX_JUMBO_FRAMES		= 0xf4  // Unused
+};
+
+/* Ethernet MIB statistic register offsets. */
+#define	IPG_OCTETRCVOK		0xA8
+#define	IPG_MCSTOCTETRCVDOK		0xAC
+#define	IPG_BCSTOCTETRCVOK		0xB0
+#define	IPG_FRAMESRCVDOK		0xB4
+#define	IPG_MCSTFRAMESRCVDOK		0xB8
+#define	IPG_BCSTFRAMESRCVDOK		0xBE
+#define	IPG_MACCONTROLFRAMESRCVD	0xC6
+#define	IPG_FRAMETOOLONGERRRORS	0xC8
+#define	IPG_INRANGELENGTHERRORS	0xCA
+#define	IPG_FRAMECHECKSEQERRORS	0xCC
+#define	IPG_FRAMESLOSTRXERRORS	0xCE
+#define	IPG_OCTETXMTOK		0xD0
+#define	IPG_MCSTOCTETXMTOK		0xD4
+#define	IPG_BCSTOCTETXMTOK		0xD8
+#define	IPG_FRAMESXMTDOK		0xDC
+#define	IPG_MCSTFRAMESXMTDOK		0xE0
+#define	IPG_FRAMESWDEFERREDXMT	0xE4
+#define	IPG_LATECOLLISIONS		0xE8
+#define	IPG_MULTICOLFRAMES		0xEC
+#define	IPG_SINGLECOLFRAMES		0xF0
+#define	IPG_BCSTFRAMESXMTDOK		0xF6
+#define	IPG_CARRIERSENSEERRORS	0xF8
+#define	IPG_MACCONTROLFRAMESXMTDOK	0xFA
+#define	IPG_FRAMESABORTXSCOLLS	0xFC
+#define	IPG_FRAMESWEXDEFERRAL	0xFE
+
+/* RMON statistic register offsets. */
+#define	IPG_ETHERSTATSCOLLISIONS			0x100
+#define	IPG_ETHERSTATSOCTETSTRANSMIT			0x104
+#define	IPG_ETHERSTATSPKTSTRANSMIT			0x108
+#define	IPG_ETHERSTATSPKTS64OCTESTSTRANSMIT		0x10C
+#define	IPG_ETHERSTATSPKTS65TO127OCTESTSTRANSMIT	0x110
+#define	IPG_ETHERSTATSPKTS128TO255OCTESTSTRANSMIT	0x114
+#define	IPG_ETHERSTATSPKTS256TO511OCTESTSTRANSMIT	0x118
+#define	IPG_ETHERSTATSPKTS512TO1023OCTESTSTRANSMIT	0x11C
+#define	IPG_ETHERSTATSPKTS1024TO1518OCTESTSTRANSMIT	0x120
+#define	IPG_ETHERSTATSCRCALIGNERRORS			0x124
+#define	IPG_ETHERSTATSUNDERSIZEPKTS			0x128
+#define	IPG_ETHERSTATSFRAGMENTS			0x12C
+#define	IPG_ETHERSTATSJABBERS			0x130
+#define	IPG_ETHERSTATSOCTETS				0x134
+#define	IPG_ETHERSTATSPKTS				0x138
+#define	IPG_ETHERSTATSPKTS64OCTESTS			0x13C
+#define	IPG_ETHERSTATSPKTS65TO127OCTESTS		0x140
+#define	IPG_ETHERSTATSPKTS128TO255OCTESTS		0x144
+#define	IPG_ETHERSTATSPKTS256TO511OCTESTS		0x148
+#define	IPG_ETHERSTATSPKTS512TO1023OCTESTS		0x14C
+#define	IPG_ETHERSTATSPKTS1024TO1518OCTESTS		0x150
+
+/* RMON statistic register equivalents. */
+#define	IPG_ETHERSTATSMULTICASTPKTSTRANSMIT		0xE0
+#define	IPG_ETHERSTATSBROADCASTPKTSTRANSMIT		0xF6
+#define	IPG_ETHERSTATSMULTICASTPKTS			0xB8
+#define	IPG_ETHERSTATSBROADCASTPKTS			0xBE
+#define	IPG_ETHERSTATSOVERSIZEPKTS			0xC8
+#define	IPG_ETHERSTATSDROPEVENTS			0xCE
+
+/* Serial EEPROM offsets */
+#define	IPG_EEPROM_CONFIGPARAM	0x00
+#define	IPG_EEPROM_ASICCTRL		0x01
+#define	IPG_EEPROM_SUBSYSTEMVENDORID	0x02
+#define	IPG_EEPROM_SUBSYSTEMID	0x03
+#define	IPG_EEPROM_STATIONADDRESS0	0x10
+#define	IPG_EEPROM_STATIONADDRESS1	0x11
+#define	IPG_EEPROM_STATIONADDRESS2	0x12
+
+/* Register & data structure bit masks */
+
+/* PCI register masks. */
+
+/* IOBaseAddress */
+#define         IPG_PIB_RSVD_MASK		0xFFFFFE01
+#define         IPG_PIB_IOBASEADDRESS	0xFFFFFF00
+#define         IPG_PIB_IOBASEADDRIND	0x00000001
+
+/* MemBaseAddress */
+#define         IPG_PMB_RSVD_MASK		0xFFFFFE07
+#define         IPG_PMB_MEMBASEADDRIND	0x00000001
+#define         IPG_PMB_MEMMAPTYPE		0x00000006
+#define         IPG_PMB_MEMMAPTYPE0		0x00000002
+#define         IPG_PMB_MEMMAPTYPE1		0x00000004
+#define         IPG_PMB_MEMBASEADDRESS	0xFFFFFE00
+
+/* ConfigStatus */
+#define IPG_CS_RSVD_MASK                0xFFB0
+#define IPG_CS_CAPABILITIES             0x0010
+#define IPG_CS_66MHZCAPABLE             0x0020
+#define IPG_CS_FASTBACK2BACK            0x0080
+#define IPG_CS_DATAPARITYREPORTED       0x0100
+#define IPG_CS_DEVSELTIMING             0x0600
+#define IPG_CS_SIGNALEDTARGETABORT      0x0800
+#define IPG_CS_RECEIVEDTARGETABORT      0x1000
+#define IPG_CS_RECEIVEDMASTERABORT      0x2000
+#define IPG_CS_SIGNALEDSYSTEMERROR      0x4000
+#define IPG_CS_DETECTEDPARITYERROR      0x8000
+
+/* TFD data structure masks. */
+
+/* TFDList, TFC */
+#define	IPG_TFC_RSVD_MASK			0x0000FFFF9FFFFFFF
+#define	IPG_TFC_FRAMEID			0x000000000000FFFF
+#define	IPG_TFC_WORDALIGN			0x0000000000030000
+#define	IPG_TFC_WORDALIGNTODWORD		0x0000000000000000
+#define	IPG_TFC_WORDALIGNTOWORD		0x0000000000020000
+#define	IPG_TFC_WORDALIGNDISABLED		0x0000000000030000
+#define	IPG_TFC_TCPCHECKSUMENABLE		0x0000000000040000
+#define	IPG_TFC_UDPCHECKSUMENABLE		0x0000000000080000
+#define	IPG_TFC_IPCHECKSUMENABLE		0x0000000000100000
+#define	IPG_TFC_FCSAPPENDDISABLE		0x0000000000200000
+#define	IPG_TFC_TXINDICATE			0x0000000000400000
+#define	IPG_TFC_TXDMAINDICATE		0x0000000000800000
+#define	IPG_TFC_FRAGCOUNT			0x000000000F000000
+#define	IPG_TFC_VLANTAGINSERT		0x0000000010000000
+#define	IPG_TFC_TFDDONE			0x0000000080000000
+#define	IPG_TFC_VID				0x00000FFF00000000
+#define	IPG_TFC_CFI				0x0000100000000000
+#define	IPG_TFC_USERPRIORITY			0x0000E00000000000
+
+/* TFDList, FragInfo */
+#define	IPG_TFI_RSVD_MASK			0xFFFF00FFFFFFFFFF
+#define	IPG_TFI_FRAGADDR			0x000000FFFFFFFFFF
+#define	IPG_TFI_FRAGLEN			0xFFFF000000000000LL
+
+/* RFD data structure masks. */
+
+/* RFDList, RFS */
+#define	IPG_RFS_RSVD_MASK			0x0000FFFFFFFFFFFF
+#define	IPG_RFS_RXFRAMELEN			0x000000000000FFFF
+#define	IPG_RFS_RXFIFOOVERRUN		0x0000000000010000
+#define	IPG_RFS_RXRUNTFRAME			0x0000000000020000
+#define	IPG_RFS_RXALIGNMENTERROR		0x0000000000040000
+#define	IPG_RFS_RXFCSERROR			0x0000000000080000
+#define	IPG_RFS_RXOVERSIZEDFRAME		0x0000000000100000
+#define	IPG_RFS_RXLENGTHERROR		0x0000000000200000
+#define	IPG_RFS_VLANDETECTED			0x0000000000400000
+#define	IPG_RFS_TCPDETECTED			0x0000000000800000
+#define	IPG_RFS_TCPERROR			0x0000000001000000
+#define	IPG_RFS_UDPDETECTED			0x0000000002000000
+#define	IPG_RFS_UDPERROR			0x0000000004000000
+#define	IPG_RFS_IPDETECTED			0x0000000008000000
+#define	IPG_RFS_IPERROR			0x0000000010000000
+#define	IPG_RFS_FRAMESTART			0x0000000020000000
+#define	IPG_RFS_FRAMEEND			0x0000000040000000
+#define	IPG_RFS_RFDDONE			0x0000000080000000
+#define	IPG_RFS_TCI				0x0000FFFF00000000
+
+/* RFDList, FragInfo */
+#define	IPG_RFI_RSVD_MASK			0xFFFF00FFFFFFFFFF
+#define	IPG_RFI_FRAGADDR			0x000000FFFFFFFFFF
+#define	IPG_RFI_FRAGLEN			0xFFFF000000000000LL
+
+/* I/O Register masks. */
+
+/* RMON Statistics Mask */
+#define	IPG_RZ_ALL					0x0FFFFFFF
+
+/* Statistics Mask */
+#define	IPG_SM_ALL					0x0FFFFFFF
+#define	IPG_SM_OCTETRCVOK_FRAMESRCVDOK		0x00000001
+#define	IPG_SM_MCSTOCTETRCVDOK_MCSTFRAMESRCVDOK	0x00000002
+#define	IPG_SM_BCSTOCTETRCVDOK_BCSTFRAMESRCVDOK	0x00000004
+#define	IPG_SM_RXJUMBOFRAMES				0x00000008
+#define	IPG_SM_TCPCHECKSUMERRORS			0x00000010
+#define	IPG_SM_IPCHECKSUMERRORS			0x00000020
+#define	IPG_SM_UDPCHECKSUMERRORS			0x00000040
+#define	IPG_SM_MACCONTROLFRAMESRCVD			0x00000080
+#define	IPG_SM_FRAMESTOOLONGERRORS			0x00000100
+#define	IPG_SM_INRANGELENGTHERRORS			0x00000200
+#define	IPG_SM_FRAMECHECKSEQERRORS			0x00000400
+#define	IPG_SM_FRAMESLOSTRXERRORS			0x00000800
+#define	IPG_SM_OCTETXMTOK_FRAMESXMTOK		0x00001000
+#define	IPG_SM_MCSTOCTETXMTOK_MCSTFRAMESXMTDOK	0x00002000
+#define	IPG_SM_BCSTOCTETXMTOK_BCSTFRAMESXMTDOK	0x00004000
+#define	IPG_SM_FRAMESWDEFERREDXMT			0x00008000
+#define	IPG_SM_LATECOLLISIONS			0x00010000
+#define	IPG_SM_MULTICOLFRAMES			0x00020000
+#define	IPG_SM_SINGLECOLFRAMES			0x00040000
+#define	IPG_SM_TXJUMBOFRAMES				0x00080000
+#define	IPG_SM_CARRIERSENSEERRORS			0x00100000
+#define	IPG_SM_MACCONTROLFRAMESXMTD			0x00200000
+#define	IPG_SM_FRAMESABORTXSCOLLS			0x00400000
+#define	IPG_SM_FRAMESWEXDEFERAL			0x00800000
+
+/* Countdown */
+#define	IPG_CD_RSVD_MASK		0x0700FFFF
+#define	IPG_CD_COUNT			0x0000FFFF
+#define	IPG_CD_COUNTDOWNSPEED	0x01000000
+#define	IPG_CD_COUNTDOWNMODE		0x02000000
+#define	IPG_CD_COUNTINTENABLED	0x04000000
+
+/* TxDMABurstThresh */
+#define IPG_TB_RSVD_MASK                0xFF
+
+/* TxDMAUrgentThresh */
+#define IPG_TU_RSVD_MASK                0xFF
+
+/* TxDMAPollPeriod */
+#define IPG_TP_RSVD_MASK                0xFF
+
+/* RxDMAUrgentThresh */
+#define IPG_RU_RSVD_MASK                0xFF
+
+/* RxDMAPollPeriod */
+#define IPG_RP_RSVD_MASK                0xFF
+
+/* ReceiveMode */
+#define IPG_RM_RSVD_MASK                0x3F
+#define IPG_RM_RECEIVEUNICAST           0x01
+#define IPG_RM_RECEIVEMULTICAST         0x02
+#define IPG_RM_RECEIVEBROADCAST         0x04
+#define IPG_RM_RECEIVEALLFRAMES         0x08
+#define IPG_RM_RECEIVEMULTICASTHASH     0x10
+#define IPG_RM_RECEIVEIPMULTICAST       0x20
+
+/* PhySet JES20040127EEPROM*/
+#define IPG_PS_MEM_LENB9B               0x01
+#define IPG_PS_MEM_LEN9                 0x02
+#define IPG_PS_NON_COMPDET              0x04
+
+/* PhyCtrl */
+#define IPG_PC_RSVD_MASK                0xFF
+#define IPG_PC_MGMTCLK_LO               0x00
+#define IPG_PC_MGMTCLK_HI               0x01
+#define IPG_PC_MGMTCLK                  0x01
+#define IPG_PC_MGMTDATA                 0x02
+#define IPG_PC_MGMTDIR                  0x04
+#define IPG_PC_DUPLEX_POLARITY          0x08
+#define IPG_PC_DUPLEX_STATUS            0x10
+#define IPG_PC_LINK_POLARITY            0x20
+#define IPG_PC_LINK_SPEED               0xC0
+#define IPG_PC_LINK_SPEED_10MBPS        0x40
+#define IPG_PC_LINK_SPEED_100MBPS       0x80
+#define IPG_PC_LINK_SPEED_1000MBPS      0xC0
+
+/* DMACtrl */
+#define IPG_DC_RSVD_MASK                0xC07D9818
+#define IPG_DC_RX_DMA_COMPLETE          0x00000008
+#define IPG_DC_RX_DMA_POLL_NOW          0x00000010
+#define IPG_DC_TX_DMA_COMPLETE          0x00000800
+#define IPG_DC_TX_DMA_POLL_NOW          0x00001000
+#define IPG_DC_TX_DMA_IN_PROG           0x00008000
+#define IPG_DC_RX_EARLY_DISABLE         0x00010000
+#define IPG_DC_MWI_DISABLE              0x00040000
+#define IPG_DC_TX_WRITE_BACK_DISABLE    0x00080000
+#define IPG_DC_TX_BURST_LIMIT           0x00700000
+#define IPG_DC_TARGET_ABORT             0x40000000
+#define IPG_DC_MASTER_ABORT             0x80000000
+
+/* ASICCtrl */
+#define IPG_AC_RSVD_MASK                0x07FFEFF2
+#define IPG_AC_EXP_ROM_SIZE             0x00000002
+#define IPG_AC_PHY_SPEED10              0x00000010
+#define IPG_AC_PHY_SPEED100             0x00000020
+#define IPG_AC_PHY_SPEED1000            0x00000040
+#define IPG_AC_PHY_MEDIA                0x00000080
+#define IPG_AC_FORCED_CFG               0x00000700
+#define IPG_AC_D3RESETDISABLE           0x00000800
+#define IPG_AC_SPEED_UP_MODE            0x00002000
+#define IPG_AC_LED_MODE                 0x00004000
+#define IPG_AC_RST_OUT_POLARITY         0x00008000
+#define IPG_AC_GLOBAL_RESET             0x00010000
+#define IPG_AC_RX_RESET                 0x00020000
+#define IPG_AC_TX_RESET                 0x00040000
+#define IPG_AC_DMA                      0x00080000
+#define IPG_AC_FIFO                     0x00100000
+#define IPG_AC_NETWORK                  0x00200000
+#define IPG_AC_HOST                     0x00400000
+#define IPG_AC_AUTO_INIT                0x00800000
+#define IPG_AC_RST_OUT                  0x01000000
+#define IPG_AC_INT_REQUEST              0x02000000
+#define IPG_AC_RESET_BUSY               0x04000000
+#define IPG_AC_LED_SPEED                0x08000000	//JES20040127EEPROM
+#define IPG_AC_LED_MODE_BIT_1           0x20000000	//JES20040127EEPROM
+
+/* EepromCtrl */
+#define IPG_EC_RSVD_MASK                0x83FF
+#define IPG_EC_EEPROM_ADDR              0x00FF
+#define IPG_EC_EEPROM_OPCODE            0x0300
+#define IPG_EC_EEPROM_SUBCOMMAD         0x0000
+#define IPG_EC_EEPROM_WRITEOPCODE       0x0100
+#define IPG_EC_EEPROM_READOPCODE        0x0200
+#define IPG_EC_EEPROM_ERASEOPCODE       0x0300
+#define IPG_EC_EEPROM_BUSY              0x8000
+
+/* FIFOCtrl */
+#define IPG_FC_RSVD_MASK                0xC001
+#define IPG_FC_RAM_TEST_MODE            0x0001
+#define IPG_FC_TRANSMITTING             0x4000
+#define IPG_FC_RECEIVING                0x8000
+
+/* TxStatus */
+#define IPG_TS_RSVD_MASK                0xFFFF00DD
+#define IPG_TS_TX_ERROR                 0x00000001
+#define IPG_TS_LATE_COLLISION           0x00000004
+#define IPG_TS_TX_MAX_COLL              0x00000008
+#define IPG_TS_TX_UNDERRUN              0x00000010
+#define IPG_TS_TX_IND_REQD              0x00000040
+#define IPG_TS_TX_COMPLETE              0x00000080
+#define IPG_TS_TX_FRAMEID               0xFFFF0000
+
+/* WakeEvent */
+#define IPG_WE_WAKE_PKT_ENABLE          0x01
+#define IPG_WE_MAGIC_PKT_ENABLE         0x02
+#define IPG_WE_LINK_EVT_ENABLE          0x04
+#define IPG_WE_WAKE_POLARITY            0x08
+#define IPG_WE_WAKE_PKT_EVT             0x10
+#define IPG_WE_MAGIC_PKT_EVT            0x20
+#define IPG_WE_LINK_EVT                 0x40
+#define IPG_WE_WOL_ENABLE               0x80
+
+/* IntEnable */
+#define IPG_IE_RSVD_MASK                0x1FFE
+#define IPG_IE_HOST_ERROR               0x0002
+#define IPG_IE_TX_COMPLETE              0x0004
+#define IPG_IE_MAC_CTRL_FRAME           0x0008
+#define IPG_IE_RX_COMPLETE              0x0010
+#define IPG_IE_RX_EARLY                 0x0020
+#define IPG_IE_INT_REQUESTED            0x0040
+#define IPG_IE_UPDATE_STATS             0x0080
+#define IPG_IE_LINK_EVENT               0x0100
+#define IPG_IE_TX_DMA_COMPLETE          0x0200
+#define IPG_IE_RX_DMA_COMPLETE          0x0400
+#define IPG_IE_RFD_LIST_END             0x0800
+#define IPG_IE_RX_DMA_PRIORITY          0x1000
+
+/* IntStatus */
+#define IPG_IS_RSVD_MASK                0x1FFF
+#define IPG_IS_INTERRUPT_STATUS         0x0001
+#define IPG_IS_HOST_ERROR               0x0002
+#define IPG_IS_TX_COMPLETE              0x0004
+#define IPG_IS_MAC_CTRL_FRAME           0x0008
+#define IPG_IS_RX_COMPLETE              0x0010
+#define IPG_IS_RX_EARLY                 0x0020
+#define IPG_IS_INT_REQUESTED            0x0040
+#define IPG_IS_UPDATE_STATS             0x0080
+#define IPG_IS_LINK_EVENT               0x0100
+#define IPG_IS_TX_DMA_COMPLETE          0x0200
+#define IPG_IS_RX_DMA_COMPLETE          0x0400
+#define IPG_IS_RFD_LIST_END             0x0800
+#define IPG_IS_RX_DMA_PRIORITY          0x1000
+
+/* MACCtrl */
+#define IPG_MC_RSVD_MASK                0x7FE33FA3
+#define IPG_MC_IFS_SELECT               0x00000003
+#define IPG_MC_IFS_4352BIT              0x00000003
+#define IPG_MC_IFS_1792BIT              0x00000002
+#define IPG_MC_IFS_1024BIT              0x00000001
+#define IPG_MC_IFS_96BIT                0x00000000
+#define IPG_MC_DUPLEX_SELECT            0x00000020
+#define IPG_MC_DUPLEX_SELECT_FD         0x00000020
+#define IPG_MC_DUPLEX_SELECT_HD         0x00000000
+#define IPG_MC_TX_FLOW_CONTROL_ENABLE   0x00000080
+#define IPG_MC_RX_FLOW_CONTROL_ENABLE   0x00000100
+#define IPG_MC_RCV_FCS                  0x00000200
+#define IPG_MC_FIFO_LOOPBACK            0x00000400
+#define IPG_MC_MAC_LOOPBACK             0x00000800
+#define IPG_MC_AUTO_VLAN_TAGGING        0x00001000
+#define IPG_MC_AUTO_VLAN_UNTAGGING      0x00002000
+#define IPG_MC_COLLISION_DETECT         0x00010000
+#define IPG_MC_CARRIER_SENSE            0x00020000
+#define IPG_MC_STATISTICS_ENABLE        0x00200000
+#define IPG_MC_STATISTICS_DISABLE       0x00400000
+#define IPG_MC_STATISTICS_ENABLED       0x00800000
+#define IPG_MC_TX_ENABLE                0x01000000
+#define IPG_MC_TX_DISABLE               0x02000000
+#define IPG_MC_TX_ENABLED               0x04000000
+#define IPG_MC_RX_ENABLE                0x08000000
+#define IPG_MC_RX_DISABLE               0x10000000
+#define IPG_MC_RX_ENABLED               0x20000000
+#define IPG_MC_PAUSED                   0x40000000
+
+/*
+ *	Tune
+ */
+
+/* Miscellaneous Constants. */
+#define   TRUE  1
+#define   FALSE 0
+
+/* Assign IPG_APPEND_FCS_ON_TX > 0 for auto FCS append on TX. */
+#define         IPG_APPEND_FCS_ON_TX         TRUE
+
+/* Assign IPG_APPEND_FCS_ON_TX > 0 for auto FCS strip on RX. */
+#define         IPG_STRIP_FCS_ON_RX          TRUE
+
+/* Assign IPG_DROP_ON_RX_ETH_ERRORS > 0 to drop RX frames with
+ * Ethernet errors.
+ */
+#define         IPG_DROP_ON_RX_ETH_ERRORS    TRUE
+
+/* Assign IPG_INSERT_MANUAL_VLAN_TAG > 0 to insert VLAN tags manually
+ * (via TFC).
+ */
+#define		IPG_INSERT_MANUAL_VLAN_TAG   FALSE
+
+/* Assign IPG_ADD_IPCHECKSUM_ON_TX > 0 for auto IP checksum on TX. */
+#define         IPG_ADD_IPCHECKSUM_ON_TX     FALSE
+
+/* Assign IPG_ADD_TCPCHECKSUM_ON_TX > 0 for auto TCP checksum on TX.
+ * DO NOT USE FOR SILICON REVISIONS B3 AND EARLIER.
+ */
+#define         IPG_ADD_TCPCHECKSUM_ON_TX    FALSE
+
+/* Assign IPG_ADD_UDPCHECKSUM_ON_TX > 0 for auto UDP checksum on TX.
+ * DO NOT USE FOR SILICON REVISIONS B3 AND EARLIER.
+ */
+#define         IPG_ADD_UDPCHECKSUM_ON_TX    FALSE
+
+/* If inserting VLAN tags manually, assign the IPG_MANUAL_VLAN_xx
+ * constants as desired.
+ */
+#define		IPG_MANUAL_VLAN_VID		0xABC
+#define		IPG_MANUAL_VLAN_CFI		0x1
+#define		IPG_MANUAL_VLAN_USERPRIORITY 0x5
+
+#define         IPG_IO_REG_RANGE		0xFF
+#define         IPG_MEM_REG_RANGE		0x154
+#define         IPG_DRIVER_NAME		"Sundance Technology IPG Triple-Speed Ethernet"
+#define         IPG_NIC_PHY_ADDRESS          0x01
+#define		IPG_DMALIST_ALIGN_PAD	0x07
+#define		IPG_MULTICAST_HASHTABLE_SIZE	0x40
+
+/* Number of miliseconds to wait after issuing a software reset.
+ * 0x05 <= IPG_AC_RESETWAIT to account for proper 10Mbps operation.
+ */
+#define         IPG_AC_RESETWAIT             0x05
+
+/* Number of IPG_AC_RESETWAIT timeperiods before declaring timeout. */
+#define         IPG_AC_RESET_TIMEOUT         0x0A
+
+/* Minimum number of nanoseconds used to toggle MDC clock during
+ * MII/GMII register access.
+ */
+#define		IPG_PC_PHYCTRLWAIT_NS		200
+
+#define		IPG_TFDLIST_LENGTH		0x100
+
+/* Number of frames between TxDMAComplete interrupt.
+ * 0 < IPG_FRAMESBETWEENTXDMACOMPLETES <= IPG_TFDLIST_LENGTH
+ */
+#define		IPG_FRAMESBETWEENTXDMACOMPLETES 0x1
+
+#ifdef JUMBO_FRAME
+
+# ifdef JUMBO_FRAME_SIZE_2K
+# define JUMBO_FRAME_SIZE 2048
+# define __IPG_RXFRAG_SIZE 2048
+# else
+#  ifdef JUMBO_FRAME_SIZE_3K
+#  define JUMBO_FRAME_SIZE 3072
+#  define __IPG_RXFRAG_SIZE 3072
+#  else
+#   ifdef JUMBO_FRAME_SIZE_4K
+#   define JUMBO_FRAME_SIZE 4096
+#   define __IPG_RXFRAG_SIZE 4088
+#   else
+#    ifdef JUMBO_FRAME_SIZE_5K
+#    define JUMBO_FRAME_SIZE 5120
+#    define __IPG_RXFRAG_SIZE 4088
+#    else
+#     ifdef JUMBO_FRAME_SIZE_6K
+#     define JUMBO_FRAME_SIZE 6144
+#     define __IPG_RXFRAG_SIZE 4088
+#     else
+#      ifdef JUMBO_FRAME_SIZE_7K
+#      define JUMBO_FRAME_SIZE 7168
+#      define __IPG_RXFRAG_SIZE 4088
+#      else
+#       ifdef JUMBO_FRAME_SIZE_8K
+#       define JUMBO_FRAME_SIZE 8192
+#       define __IPG_RXFRAG_SIZE 4088
+#       else
+#        ifdef JUMBO_FRAME_SIZE_9K
+#        define JUMBO_FRAME_SIZE 9216
+#        define __IPG_RXFRAG_SIZE 4088
+#        else
+#         ifdef JUMBO_FRAME_SIZE_10K
+#         define JUMBO_FRAME_SIZE 10240
+#         define __IPG_RXFRAG_SIZE 4088
+#         else
+#         define JUMBO_FRAME_SIZE 4096
+#         endif
+#        endif
+#       endif
+#      endif
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+/* Size of allocated received buffers. Nominally 0x0600.
+ * Define larger if expecting jumbo frames.
+ */
+#ifdef JUMBO_FRAME
+//IPG_TXFRAG_SIZE must <= 0x2b00, or TX will crash
+#define		IPG_TXFRAG_SIZE		JUMBO_FRAME_SIZE
+#endif
+
+/* Size of allocated received buffers. Nominally 0x0600.
+ * Define larger if expecting jumbo frames.
+ */
+#ifdef JUMBO_FRAME
+//4088=4096-8
+#define		IPG_RXFRAG_SIZE		__IPG_RXFRAG_SIZE
+#define     IPG_RXSUPPORT_SIZE   IPG_MAX_RXFRAME_SIZE
+#else
+#define		IPG_RXFRAG_SIZE		0x0600
+#define     IPG_RXSUPPORT_SIZE   IPG_RXFRAG_SIZE
+#endif
+
+/* IPG_MAX_RXFRAME_SIZE <= IPG_RXFRAG_SIZE */
+#ifdef JUMBO_FRAME
+#define		IPG_MAX_RXFRAME_SIZE		JUMBO_FRAME_SIZE
+#else
+#define		IPG_MAX_RXFRAME_SIZE		0x0600
+#endif
+
+#define		IPG_RFDLIST_LENGTH		0x100
+
+/* Maximum number of RFDs to process per interrupt.
+ * 1 < IPG_MAXRFDPROCESS_COUNT < IPG_RFDLIST_LENGTH
+ */
+#define		IPG_MAXRFDPROCESS_COUNT	0x80
+
+/* Minimum margin between last freed RFD, and current RFD.
+ * 1 < IPG_MINUSEDRFDSTOFREE < IPG_RFDLIST_LENGTH
+ */
+#define		IPG_MINUSEDRFDSTOFREE	0x80
+
+/* specify the jumbo frame maximum size
+ * per unit is 0x600 (the RxBuffer size that one RFD can carry)
+ */
+#define     MAX_JUMBOSIZE	        0x8	// max is 12K
+
+/* Key register values loaded at driver start up. */
+
+/* TXDMAPollPeriod is specified in 320ns increments.
+ *
+ * Value	Time
+ * ---------------------
+ * 0x00-0x01	320ns
+ * 0x03		~1us
+ * 0x1F		~10us
+ * 0xFF		~82us
+ */
+#define		IPG_TXDMAPOLLPERIOD_VALUE	0x26
+
+/* TxDMAUrgentThresh specifies the minimum amount of
+ * data in the transmit FIFO before asserting an
+ * urgent transmit DMA request.
+ *
+ * Value	Min TxFIFO occupied space before urgent TX request
+ * ---------------------------------------------------------------
+ * 0x00-0x04	128 bytes (1024 bits)
+ * 0x27		1248 bytes (~10000 bits)
+ * 0x30		1536 bytes (12288 bits)
+ * 0xFF		8192 bytes (65535 bits)
+ */
+#define		IPG_TXDMAURGENTTHRESH_VALUE	0x04
+
+/* TxDMABurstThresh specifies the minimum amount of
+ * free space in the transmit FIFO before asserting an
+ * transmit DMA request.
+ *
+ * Value	Min TxFIFO free space before TX request
+ * ----------------------------------------------------
+ * 0x00-0x08	256 bytes
+ * 0x30		1536 bytes
+ * 0xFF		8192 bytes
+ */
+#define		IPG_TXDMABURSTTHRESH_VALUE	0x30
+
+/* RXDMAPollPeriod is specified in 320ns increments.
+ *
+ * Value	Time
+ * ---------------------
+ * 0x00-0x01	320ns
+ * 0x03		~1us
+ * 0x1F		~10us
+ * 0xFF		~82us
+ */
+#define		IPG_RXDMAPOLLPERIOD_VALUE	0x01
+
+/* RxDMAUrgentThresh specifies the minimum amount of
+ * free space within the receive FIFO before asserting
+ * a urgent receive DMA request.
+ *
+ * Value	Min RxFIFO free space before urgent RX request
+ * ---------------------------------------------------------------
+ * 0x00-0x04	128 bytes (1024 bits)
+ * 0x27		1248 bytes (~10000 bits)
+ * 0x30		1536 bytes (12288 bits)
+ * 0xFF		8192 bytes (65535 bits)
+ */
+#define		IPG_RXDMAURGENTTHRESH_VALUE	0x30
+
+/* RxDMABurstThresh specifies the minimum amount of
+ * occupied space within the receive FIFO before asserting
+ * a receive DMA request.
+ *
+ * Value	Min TxFIFO free space before TX request
+ * ----------------------------------------------------
+ * 0x00-0x08	256 bytes
+ * 0x30		1536 bytes
+ * 0xFF		8192 bytes
+ */
+#define		IPG_RXDMABURSTTHRESH_VALUE	0x30
+
+/* FlowOnThresh specifies the maximum amount of occupied
+ * space in the receive FIFO before a PAUSE frame with
+ * maximum pause time transmitted.
+ *
+ * Value	Max RxFIFO occupied space before PAUSE
+ * ---------------------------------------------------
+ * 0x0000	0 bytes
+ * 0x0740	29,696 bytes
+ * 0x07FF	32,752 bytes
+ */
+#define		IPG_FLOWONTHRESH_VALUE	0x0740
+
+/* FlowOffThresh specifies the minimum amount of occupied
+ * space in the receive FIFO before a PAUSE frame with
+ * zero pause time is transmitted.
+ *
+ * Value	Max RxFIFO occupied space before PAUSE
+ * ---------------------------------------------------
+ * 0x0000	0 bytes
+ * 0x00BF	3056 bytes
+ * 0x07FF	32,752 bytes
+ */
+#define		IPG_FLOWOFFTHRESH_VALUE	0x00BF
+
+/*
+ * Miscellaneous macros.
+ */
+
+/* Marco for printing debug statements.
+#  define IPG_DDEBUG_MSG(args...) printk(KERN_DEBUG "IPG: " ## args) */
+#ifdef IPG_DEBUG
+#  define IPG_DEBUG_MSG(args...)
+#  define IPG_DDEBUG_MSG(args...) printk(KERN_DEBUG "IPG: " args)
+#  define IPG_DUMPRFDLIST(args) ipg_dump_rfdlist(args)
+#  define IPG_DUMPTFDLIST(args) ipg_dump_tfdlist(args)
+#else
+#  define IPG_DEBUG_MSG(args...)
+#  define IPG_DDEBUG_MSG(args...)
+#  define IPG_DUMPRFDLIST(args)
+#  define IPG_DUMPTFDLIST(args)
+#endif
+
+/*
+ * End miscellaneous macros.
+ */
+
+/* Transmit Frame Descriptor. The IPG supports 15 fragments,
+ * however Linux requires only a single fragment. Note, each
+ * TFD field is 64 bits wide.
+ */
+struct ipg_tx {
+	u64 next_desc;
+	u64 tfc;
+	u64 frag_info;
+};
+
+/* Receive Frame Descriptor. Note, each RFD field is 64 bits wide.
+ */
+struct ipg_rx {
+	u64 next_desc;
+	u64 rfs;
+	u64 frag_info;
+};
+
+struct SJumbo {
+	int FoundStart;
+	int CurrentSize;
+	struct sk_buff *skb;
+};
+/* Structure of IPG NIC specific data. */
+struct ipg_nic_private {
+	void __iomem *ioaddr;
+	struct ipg_tx *txd;
+	struct ipg_rx *rxd;
+	dma_addr_t txd_map;
+	dma_addr_t rxd_map;
+	struct sk_buff *TxBuff[IPG_TFDLIST_LENGTH];
+	struct sk_buff *RxBuff[IPG_RFDLIST_LENGTH];
+	unsigned int tx_current;
+	unsigned int tx_dirty;
+	unsigned int rx_current;
+	unsigned int rx_dirty;
+// Add by Grace 2005/05/19
+#ifdef JUMBO_FRAME
+	struct SJumbo Jumbo;
+#endif
+	unsigned int rx_buf_sz;
+	struct pci_dev *pdev;
+	struct net_device *dev;
+	struct net_device_stats stats;
+	spinlock_t lock;
+	int tenmbpsmode;
+
+	/*Jesse20040128EEPROM_VALUE */
+	u16 LED_Mode;
+	u16 station_addr[3];	/* Station Address in EEPROM Reg 0x10..0x12 */
+
+	struct mutex		mii_mutex;
+	struct mii_if_info	mii_if;
+	int ResetCurrentTFD;
+#ifdef IPG_DEBUG
+	int RFDlistendCount;
+	int RFDListCheckedCount;
+	int EmptyRFDListCount;
+#endif
+	struct delayed_work task;
+};
+
+//variable record -- index by leading revision/length
+//Revision/Length(=N*4), Address1, Data1, Address2, Data2,...,AddressN,DataN
+unsigned short DefaultPhyParam[] = {
+	// 11/12/03 IP1000A v1-3 rev=0x40
+	/*--------------------------------------------------------------------------
+	(0x4000|(15*4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 22, 0x85bd, 24, 0xfff2,
+		    		 27, 0x0c10, 28, 0x0c10, 29, 0x2c10, 31, 0x0003, 23, 0x92f6,
+		    		 31, 0x0000, 23, 0x003d, 30, 0x00de, 20, 0x20e7,  9, 0x0700,
+	  --------------------------------------------------------------------------*/
+	// 12/17/03 IP1000A v1-4 rev=0x40
+	(0x4000 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
+	    0x0000,
+	30, 0x005e, 9, 0x0700,
+	// 01/09/04 IP1000A v1-5 rev=0x41
+	(0x4100 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
+	    0x0000,
+	30, 0x005e, 9, 0x0700,
+	0x0000
+};
+
+#endif				/* __LINUX_IPG_H */
-- 
1.3.GIT




^ permalink raw reply related

* [PATCH] Add IP1000A Driver
From: Jesse Huang @ 2007-09-11 15:24 UTC (permalink / raw)
  To: "Jeff Garzik [jeff", akpm, netdev, jesse

From: Jesse Huang <jesse@icplus.com.tw>

Change Logs: Add IP1000A Driver to kernel tree.

Signed-off-by: Jesse Huang <jesse@icplus.com.tw>
---

 drivers/net/ipg.c | 2331 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/ipg.h |  856 +++++++++++++++++++
 2 files changed, 3187 insertions(+), 0 deletions(-)
 create mode 100755 drivers/net/ipg.c
 create mode 100755 drivers/net/ipg.h

e804d1c265bf1d843f845457f925a1728bbfdff7
diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c
new file mode 100755
index 0000000..bdc2b8d
--- /dev/null
+++ b/drivers/net/ipg.c
@@ -0,0 +1,2331 @@
+/*
+ * ipg.c: Device Driver for the IP1000 Gigabit Ethernet Adapter
+ *
+ * Copyright (C) 2003, 2006  IC Plus Corp.
+ *
+ * Original Author:
+ *
+ *   Craig Rich
+ *   Sundance Technology, Inc.
+ *   1485 Saratoga Avenue
+ *   Suite 200
+ *   San Jose, CA 95129
+ *   408 873 4117
+ *   www.sundanceti.com
+ *   craig_rich@sundanceti.com
+ *
+ * Current Maintainer:
+ *
+ *   Sorbica Shieh.
+ *   10F, No.47, Lane 2, Kwang-Fu RD.
+ *   Sec. 2, Hsin-Chu, Taiwan, R.O.C.
+ *   http://www.icplus.com.tw
+ *   sorbica@icplus.com.tw
+ */
+#include <linux/crc32.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/mutex.h>
+
+#define IPG_RX_RING_BYTES	(sizeof(struct ipg_rx) * IPG_RFDLIST_LENGTH)
+#define IPG_TX_RING_BYTES	(sizeof(struct ipg_tx) * IPG_TFDLIST_LENGTH)
+#define IPG_RESET_MASK \
+	(IPG_AC_GLOBAL_RESET | IPG_AC_RX_RESET | IPG_AC_TX_RESET | \
+	 IPG_AC_DMA | IPG_AC_FIFO | IPG_AC_NETWORK | IPG_AC_HOST | \
+	 IPG_AC_AUTO_INIT)
+
+#define ipg_w32(val32,reg)	iowrite32((val32), ioaddr + (reg))
+#define ipg_w16(val16,reg)	iowrite16((val16), ioaddr + (reg))
+#define ipg_w8(val8,reg)	iowrite8((val8), ioaddr + (reg))
+
+#define ipg_r32(reg)		ioread32(ioaddr + (reg))
+#define ipg_r16(reg)		ioread16(ioaddr + (reg))
+#define ipg_r8(reg)		ioread8(ioaddr + (reg))
+
+#define JUMBO_FRAME_4k_ONLY
+enum {
+	netdev_io_size = 128
+};
+
+#include "ipg.h"
+#define DRV_NAME	"ipg"
+
+MODULE_AUTHOR("IC Plus Corp. 2003");
+MODULE_DESCRIPTION("IC Plus IP1000 Gigabit Ethernet Adapter Linux Driver "
+		   DrvVer);
+MODULE_LICENSE("GPL");
+
+static const char *ipg_brand_name[] = {
+	"IC PLUS IP1000 1000/100/10 based NIC",
+	"Sundance Technology ST2021 based NIC",
+	"Tamarack Microelectronics TC9020/9021 based NIC",
+	"Tamarack Microelectronics TC9020/9021 based NIC",
+	"D-Link NIC",
+	"D-Link NIC IP1000A"
+};
+
+static struct pci_device_id ipg_pci_tbl[] __devinitdata = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x1023), 0, 0, 0 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x2021), 0, 0, 1 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_SUNDANCE,	0x1021), 0, 0, 2 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x9021), 0, 0, 3 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x4000), 0, 0, 4 },
+	{ PCI_DEVICE(PCI_VENDOR_ID_DLINK,	0x4020), 0, 0, 5 },
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipg_pci_tbl);
+
+static inline void __iomem *ipg_ioaddr(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	return sp->ioaddr;
+}
+
+#ifdef IPG_DEBUG
+static void ipg_dump_rfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	u32 offset;
+
+	IPG_DEBUG_MSG("_dump_rfdlist\n");
+
+	printk(KERN_INFO "rx_current = %2.2x\n", sp->rx_current);
+	printk(KERN_INFO "rx_dirty   = %2.2x\n", sp->rx_dirty);
+	printk(KERN_INFO "RFDList start address = %16.16lx\n",
+	       (unsigned long) sp->rxd_map);
+	printk(KERN_INFO "RFDListPtr register   = %8.8x%8.8x\n",
+	       ipg_r32(IPG_RFDLISTPTR1), ipg_r32(IPG_RFDLISTPTR0));
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		offset = (u32) &sp->rxd[i].next_desc - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x RFDNextPtr = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].next_desc);
+		offset = (u32) &sp->rxd[i].rfs - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x RFS        = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].rfs);
+		offset = (u32) &sp->rxd[i].frag_info - (u32) sp->rxd;
+		printk(KERN_INFO "%2.2x %4.4x frag_info   = %16.16lx\n", i,
+		       offset, (unsigned long) sp->rxd[i].frag_info);
+	}
+}
+
+static void ipg_dump_tfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	u32 offset;
+
+	IPG_DEBUG_MSG("_dump_tfdlist\n");
+
+	printk(KERN_INFO "tx_current         = %2.2x\n", sp->tx_current);
+	printk(KERN_INFO "tx_dirty = %2.2x\n", sp->tx_dirty);
+	printk(KERN_INFO "TFDList start address = %16.16lx\n",
+	       (unsigned long) sp->txd_map);
+	printk(KERN_INFO "TFDListPtr register   = %8.8x%8.8x\n",
+	       ipg_r32(IPG_TFDLISTPTR1), ipg_r32(IPG_TFDLISTPTR0));
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		offset = (u32) &sp->txd[i].next_desc - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x TFDNextPtr = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].next_desc);
+
+		offset = (u32) &sp->txd[i].tfc - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x TFC        = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].tfc);
+		offset = (u32) &sp->txd[i].frag_info - (u32) sp->txd;
+		printk(KERN_INFO "%2.2x %4.4x frag_info   = %16.16lx\n", i,
+		       offset, (unsigned long) sp->txd[i].frag_info);
+	}
+}
+#endif
+
+static void ipg_write_phy_ctl(void __iomem *ioaddr, u8 data)
+{
+	ipg_w8(IPG_PC_RSVD_MASK & data, PHY_CTRL);
+	ndelay(IPG_PC_PHYCTRLWAIT_NS);
+}
+
+static void ipg_drive_phy_ctl_low_high(void __iomem *ioaddr, u8 data)
+{
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | data);
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | data);
+}
+
+static void send_three_state(void __iomem *ioaddr, u8 phyctrlpolarity)
+{
+	phyctrlpolarity |= (IPG_PC_MGMTDATA & 0) | IPG_PC_MGMTDIR;
+
+	ipg_drive_phy_ctl_low_high(ioaddr, phyctrlpolarity);
+}
+
+static void send_end(void __iomem *ioaddr, u8 phyctrlpolarity)
+{
+	ipg_w8((IPG_PC_MGMTCLK_LO | (IPG_PC_MGMTDATA & 0) | IPG_PC_MGMTDIR |
+		phyctrlpolarity) & IPG_PC_RSVD_MASK, PHY_CTRL);
+}
+
+static u16 read_phy_bit(void __iomem * ioaddr, u8 phyctrlpolarity)
+{
+	u16 bit_data;
+
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | phyctrlpolarity);
+
+	bit_data = ((ipg_r8(PHY_CTRL) & IPG_PC_MGMTDATA) >> 1) & 1;
+
+	ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | phyctrlpolarity);
+
+	return bit_data;
+}
+
+/*
+ * Read a register from the Physical Layer device located
+ * on the IPG NIC, using the IPG PHYCTRL register.
+ */
+static int mdio_read(struct net_device * dev, int phy_id, int phy_reg)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	/*
+	 * The GMII mangement frame structure for a read is as follows:
+	 *
+	 * |Preamble|st|op|phyad|regad|ta|      data      |idle|
+	 * |< 32 1s>|01|10|AAAAA|RRRRR|z0|DDDDDDDDDDDDDDDD|z   |
+	 *
+	 * <32 1s> = 32 consecutive logic 1 values
+	 * A = bit of Physical Layer device address (MSB first)
+	 * R = bit of register address (MSB first)
+	 * z = High impedance state
+	 * D = bit of read data (MSB first)
+	 *
+	 * Transmission order is 'Preamble' field first, bits transmitted
+	 * left to right (first to last).
+	 */
+	struct {
+		u32 field;
+		unsigned int len;
+	} p[] = {
+		{ GMII_PREAMBLE,	32 },	/* Preamble */
+		{ GMII_ST,		2  },	/* ST */
+		{ GMII_READ,		2  },	/* OP */
+		{ phy_id,		5  },	/* PHYAD */
+		{ phy_reg,		5  },	/* REGAD */
+		{ 0x0000,		2  },	/* TA */
+		{ 0x0000,		16 },	/* DATA */
+		{ 0x0000,		1  }	/* IDLE */
+	};
+	unsigned int i, j;
+	u8 polarity, data;
+
+	polarity  = ipg_r8(PHY_CTRL);
+	polarity &= (IPG_PC_DUPLEX_POLARITY | IPG_PC_LINK_POLARITY);
+
+	/* Create the Preamble, ST, OP, PHYAD, and REGAD field. */
+	for (j = 0; j < 5; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			/* For each variable length field, the MSB must be
+			 * transmitted first. Rotate through the field bits,
+			 * starting with the MSB, and move each bit into the
+			 * the 1st (2^1) bit position (this is the bit position
+			 * corresponding to the MgmtData bit of the PhyCtrl
+			 * register for the IPG).
+			 *
+			 * Example: ST = 01;
+			 *
+			 *          First write a '0' to bit 1 of the PhyCtrl
+			 *          register, then write a '1' to bit 1 of the
+			 *          PhyCtrl register.
+			 *
+			 * To do this, right shift the MSB of ST by the value:
+			 * [field length - 1 - #ST bits already written]
+			 * then left shift this result by 1.
+			 */
+			data  = (p[j].field >> (p[j].len - 1 - i)) << 1;
+			data &= IPG_PC_MGMTDATA;
+			data |= polarity | IPG_PC_MGMTDIR;
+
+			ipg_drive_phy_ctl_low_high(ioaddr, data);
+		}
+	}
+
+	send_three_state(ioaddr, polarity);
+
+	read_phy_bit(ioaddr, polarity);
+
+	/*
+	 * For a read cycle, the bits for the next two fields (TA and
+	 * DATA) are driven by the PHY (the IPG reads these bits).
+	 */
+	for (i = 0; i < p[6].len; i++) {
+		p[6].field |=
+		    (read_phy_bit(ioaddr, polarity) << (p[6].len - 1 - i));
+	}
+
+	send_three_state(ioaddr, polarity);
+	send_three_state(ioaddr, polarity);
+	send_three_state(ioaddr, polarity);
+	send_end(ioaddr, polarity);
+
+	/* Return the value of the DATA field. */
+	return p[6].field;
+}
+
+/*
+ * Write to a register from the Physical Layer device located
+ * on the IPG NIC, using the IPG PHYCTRL register.
+ */
+static void mdio_write(struct net_device *dev, int phy_id, int phy_reg, int val)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	/*
+	 * The GMII mangement frame structure for a read is as follows:
+	 *
+	 * |Preamble|st|op|phyad|regad|ta|      data      |idle|
+	 * |< 32 1s>|01|10|AAAAA|RRRRR|z0|DDDDDDDDDDDDDDDD|z   |
+	 *
+	 * <32 1s> = 32 consecutive logic 1 values
+	 * A = bit of Physical Layer device address (MSB first)
+	 * R = bit of register address (MSB first)
+	 * z = High impedance state
+	 * D = bit of write data (MSB first)
+	 *
+	 * Transmission order is 'Preamble' field first, bits transmitted
+	 * left to right (first to last).
+	 */
+	struct {
+		u32 field;
+		unsigned int len;
+	} p[] = {
+		{ GMII_PREAMBLE,	32 },	/* Preamble */
+		{ GMII_ST,		2  },	/* ST */
+		{ GMII_WRITE,		2  },	/* OP */
+		{ phy_id,		5  },	/* PHYAD */
+		{ phy_reg,		5  },	/* REGAD */
+		{ 0x0002,		2  },	/* TA */
+		{ val & 0xffff,		16 },	/* DATA */
+		{ 0x0000,		1  }	/* IDLE */
+	};
+	unsigned int i, j;
+	u8 polarity, data;
+
+	polarity  = ipg_r8(PHY_CTRL);
+	polarity &= (IPG_PC_DUPLEX_POLARITY | IPG_PC_LINK_POLARITY);
+
+	/* Create the Preamble, ST, OP, PHYAD, and REGAD field. */
+	for (j = 0; j < 7; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			/* For each variable length field, the MSB must be
+			 * transmitted first. Rotate through the field bits,
+			 * starting with the MSB, and move each bit into the
+			 * the 1st (2^1) bit position (this is the bit position
+			 * corresponding to the MgmtData bit of the PhyCtrl
+			 * register for the IPG).
+			 *
+			 * Example: ST = 01;
+			 *
+			 *          First write a '0' to bit 1 of the PhyCtrl
+			 *          register, then write a '1' to bit 1 of the
+			 *          PhyCtrl register.
+			 *
+			 * To do this, right shift the MSB of ST by the value:
+			 * [field length - 1 - #ST bits already written]
+			 * then left shift this result by 1.
+			 */
+			data  = (p[j].field >> (p[j].len - 1 - i)) << 1;
+			data &= IPG_PC_MGMTDATA;
+			data |= polarity | IPG_PC_MGMTDIR;
+
+			ipg_drive_phy_ctl_low_high(ioaddr, data);
+		}
+	}
+
+	/* The last cycle is a tri-state, so read from the PHY. */
+	for (j = 7; j < 8; j++) {
+		for (i = 0; i < p[j].len; i++) {
+			ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_LO | polarity);
+
+			p[j].field |= ((ipg_r8(PHY_CTRL) &
+				IPG_PC_MGMTDATA) >> 1) << (p[j].len - 1 - i);
+
+			ipg_write_phy_ctl(ioaddr, IPG_PC_MGMTCLK_HI | polarity);
+		}
+	}
+}
+
+/* Set LED_Mode JES20040127EEPROM */
+static void ipg_set_led_mode(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	u32 mode;
+
+	mode = ipg_r32(ASIC_CTRL);
+	mode &= ~(IPG_AC_LED_MODE_BIT_1 | IPG_AC_LED_MODE | IPG_AC_LED_SPEED);
+
+	if ((sp->LED_Mode & 0x03) > 1)
+		mode |= IPG_AC_LED_MODE_BIT_1;	/* Write Asic Control Bit 29 */
+
+	if ((sp->LED_Mode & 0x01) == 1)
+		mode |= IPG_AC_LED_MODE;	/* Write Asic Control Bit 14 */
+
+	if ((sp->LED_Mode & 0x08) == 8)
+		mode |= IPG_AC_LED_SPEED;	/* Write Asic Control Bit 27 */
+
+	ipg_w32(mode, ASIC_CTRL);
+}
+
+/* Set PHYSet JES20040127EEPROM */
+static void ipg_set_phy_set(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	int physet;
+
+	physet = ipg_r8(PHY_SET);
+	physet &= ~(IPG_PS_MEM_LENB9B | IPG_PS_MEM_LEN9 | IPG_PS_NON_COMPDET);
+	physet |= ((sp->LED_Mode & 0x70) >> 4);
+	ipg_w8(physet, PHY_SET);
+}
+
+static int ipg_reset(struct net_device *dev, u32 resetflags)
+{
+	/* Assert functional resets via the IPG AsicCtrl
+	 * register as specified by the 'resetflags' input
+	 * parameter.
+	 */
+	void __iomem *ioaddr = ipg_ioaddr(dev);	//JES20040127EEPROM:
+	unsigned int timeout_count = 0;
+
+	IPG_DEBUG_MSG("_reset\n");
+
+	ipg_w32(ipg_r32(ASIC_CTRL) | resetflags, ASIC_CTRL);
+
+	/* Delay added to account for problem with 10Mbps reset. */
+	mdelay(IPG_AC_RESETWAIT);
+
+	while (IPG_AC_RESET_BUSY & ipg_r32(ASIC_CTRL)) {
+		mdelay(IPG_AC_RESETWAIT);
+		if (++timeout_count > IPG_AC_RESET_TIMEOUT)
+			return -ETIME;
+	}
+	/* Set LED Mode in Asic Control JES20040127EEPROM */
+	ipg_set_led_mode(dev);
+
+	/* Set PHYSet Register Value JES20040127EEPROM */
+	ipg_set_phy_set(dev);
+	return 0;
+}
+
+/* Find the GMII PHY address. */
+static int ipg_find_phyaddr(struct net_device *dev)
+{
+	unsigned int phyaddr, i;
+
+	for (i = 0; i < 32; i++) {
+		u32 status;
+
+		/* Search for the correct PHY address among 32 possible. */
+		phyaddr = (IPG_NIC_PHY_ADDRESS + i) % 32;
+
+		/* 10/22/03 Grace change verify from GMII_PHY_STATUS to
+		   GMII_PHY_ID1
+		 */
+
+		status = mdio_read(dev, phyaddr, MII_BMSR);
+
+		if ((status != 0xFFFF) && (status != 0))
+			return phyaddr;
+	}
+
+	return 0x1f;
+}
+
+/*
+ * Configure IPG based on result of IEEE 802.3 PHY
+ * auto-negotiation.
+ */
+static int ipg_config_autoneg(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int txflowcontrol;
+	unsigned int rxflowcontrol;
+	unsigned int fullduplex;
+	unsigned int gig;
+	u32 mac_ctrl_val;
+	u32 asicctrl;
+	u8 phyctrl;
+
+	IPG_DEBUG_MSG("_config_autoneg\n");
+
+	asicctrl = ipg_r32(ASIC_CTRL);
+	phyctrl = ipg_r8(PHY_CTRL);
+	mac_ctrl_val = ipg_r32(MAC_CTRL);
+
+	/* Set flags for use in resolving auto-negotation, assuming
+	 * non-1000Mbps, half duplex, no flow control.
+	 */
+	fullduplex = 0;
+	txflowcontrol = 0;
+	rxflowcontrol = 0;
+	gig = 0;
+
+	/* To accomodate a problem in 10Mbps operation,
+	 * set a global flag if PHY running in 10Mbps mode.
+	 */
+	sp->tenmbpsmode = 0;
+
+	printk(KERN_INFO "%s: Link speed = ", dev->name);
+
+	/* Determine actual speed of operation. */
+	switch (phyctrl & IPG_PC_LINK_SPEED) {
+	case IPG_PC_LINK_SPEED_10MBPS:
+		printk("10Mbps.\n");
+		printk(KERN_INFO "%s: 10Mbps operational mode enabled.\n",
+		       dev->name);
+		sp->tenmbpsmode = 1;
+		break;
+	case IPG_PC_LINK_SPEED_100MBPS:
+		printk("100Mbps.\n");
+		break;
+	case IPG_PC_LINK_SPEED_1000MBPS:
+		printk("1000Mbps.\n");
+		gig = 1;
+		break;
+	default:
+		printk("undefined!\n");
+		return 0;
+	}
+
+	if (phyctrl & IPG_PC_DUPLEX_STATUS) {
+		fullduplex = 1;
+		txflowcontrol = 1;
+		rxflowcontrol = 1;
+	}
+
+	/* Configure full duplex, and flow control. */
+	if (fullduplex == 1) {
+		/* Configure IPG for full duplex operation. */
+		printk(KERN_INFO "%s: setting full duplex, ", dev->name);
+
+		mac_ctrl_val |= IPG_MC_DUPLEX_SELECT_FD;
+
+		if (txflowcontrol == 1) {
+			printk("TX flow control");
+			mac_ctrl_val |= IPG_MC_TX_FLOW_CONTROL_ENABLE;
+		} else {
+			printk("no TX flow control");
+			mac_ctrl_val &= ~IPG_MC_TX_FLOW_CONTROL_ENABLE;
+		}
+
+		if (rxflowcontrol == 1) {
+			printk(", RX flow control.");
+			mac_ctrl_val |= IPG_MC_RX_FLOW_CONTROL_ENABLE;
+		} else {
+			printk(", no RX flow control.");
+			mac_ctrl_val &= ~IPG_MC_RX_FLOW_CONTROL_ENABLE;
+		}
+
+		printk("\n");
+	} else {
+		/* Configure IPG for half duplex operation. */
+	        printk(KERN_INFO "%s: setting half duplex, "
+		       "no TX flow control, no RX flow control.\n", dev->name);
+
+		mac_ctrl_val &= ~IPG_MC_DUPLEX_SELECT_FD &
+			~IPG_MC_TX_FLOW_CONTROL_ENABLE &
+			~IPG_MC_RX_FLOW_CONTROL_ENABLE;
+	}
+	ipg_w32(mac_ctrl_val, MAC_CTRL);
+	return 0;
+}
+
+/* Determine and configure multicast operation and set
+ * receive mode for IPG.
+ */
+static void ipg_nic_set_multicast_list(struct net_device *dev)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	struct dev_mc_list *mc_list_ptr;
+	unsigned int hashindex;
+	u32 hashtable[2];
+	u8 receivemode;
+
+	IPG_DEBUG_MSG("_nic_set_multicast_list\n");
+
+	receivemode = IPG_RM_RECEIVEUNICAST | IPG_RM_RECEIVEBROADCAST;
+
+	if (dev->flags & IFF_PROMISC) {
+		/* NIC to be configured in promiscuous mode. */
+		receivemode = IPG_RM_RECEIVEALLFRAMES;
+	} else if ((dev->flags & IFF_ALLMULTI) ||
+		   (dev->flags & IFF_MULTICAST &
+		    (dev->mc_count > IPG_MULTICAST_HASHTABLE_SIZE))) {
+		/* NIC to be configured to receive all multicast
+		 * frames. */
+		receivemode |= IPG_RM_RECEIVEMULTICAST;
+	} else if (dev->flags & IFF_MULTICAST & (dev->mc_count > 0)) {
+		/* NIC to be configured to receive selected
+		 * multicast addresses. */
+		receivemode |= IPG_RM_RECEIVEMULTICASTHASH;
+	}
+
+	/* Calculate the bits to set for the 64 bit, IPG HASHTABLE.
+	 * The IPG applies a cyclic-redundancy-check (the same CRC
+	 * used to calculate the frame data FCS) to the destination
+	 * address all incoming multicast frames whose destination
+	 * address has the multicast bit set. The least significant
+	 * 6 bits of the CRC result are used as an addressing index
+	 * into the hash table. If the value of the bit addressed by
+	 * this index is a 1, the frame is passed to the host system.
+	 */
+
+	/* Clear hashtable. */
+	hashtable[0] = 0x00000000;
+	hashtable[1] = 0x00000000;
+
+	/* Cycle through all multicast addresses to filter. */
+	for (mc_list_ptr = dev->mc_list;
+	     mc_list_ptr != NULL; mc_list_ptr = mc_list_ptr->next) {
+		/* Calculate CRC result for each multicast address. */
+		hashindex = crc32_le(0xffffffff, mc_list_ptr->dmi_addr,
+				     ETH_ALEN);
+
+		/* Use only the least significant 6 bits. */
+		hashindex = hashindex & 0x3F;
+
+		/* Within "hashtable", set bit number "hashindex"
+		 * to a logic 1.
+		 */
+		set_bit(hashindex, (void *)hashtable);
+	}
+
+	/* Write the value of the hashtable, to the 4, 16 bit
+	 * HASHTABLE IPG registers.
+	 */
+	ipg_w32(hashtable[0], HASHTABLE_0);
+	ipg_w32(hashtable[1], HASHTABLE_1);
+
+	ipg_w8(IPG_RM_RSVD_MASK & receivemode, RECEIVE_MODE);
+
+	IPG_DEBUG_MSG("ReceiveMode = %x\n", ipg_r8(RECEIVE_MODE));
+}
+
+static int ipg_io_config(struct net_device *dev)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	u32 origmacctrl;
+	u32 restoremacctrl;
+
+	IPG_DEBUG_MSG("_io_config\n");
+
+	origmacctrl = ipg_r32(MAC_CTRL);
+
+	restoremacctrl = origmacctrl | IPG_MC_STATISTICS_ENABLE;
+
+	/* Based on compilation option, determine if FCS is to be
+	 * stripped on receive frames by IPG.
+	 */
+	if (!IPG_STRIP_FCS_ON_RX)
+		restoremacctrl |= IPG_MC_RCV_FCS;
+
+	/* Determine if transmitter and/or receiver are
+	 * enabled so we may restore MACCTRL correctly.
+	 */
+	if (origmacctrl & IPG_MC_TX_ENABLED)
+		restoremacctrl |= IPG_MC_TX_ENABLE;
+
+	if (origmacctrl & IPG_MC_RX_ENABLED)
+		restoremacctrl |= IPG_MC_RX_ENABLE;
+
+	/* Transmitter and receiver must be disabled before setting
+	 * IFSSelect.
+	 */
+	ipg_w32((origmacctrl & (IPG_MC_RX_DISABLE | IPG_MC_TX_DISABLE)) &
+		IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	/* Now that transmitter and receiver are disabled, write
+	 * to IFSSelect.
+	 */
+	ipg_w32((origmacctrl & IPG_MC_IFS_96BIT) & IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	/* Set RECEIVEMODE register. */
+	ipg_nic_set_multicast_list(dev);
+
+	ipg_w16(IPG_MAX_RXFRAME_SIZE, MAX_FRAME_SIZE);
+
+	ipg_w8(IPG_RXDMAPOLLPERIOD_VALUE,   RX_DMA_POLL_PERIOD);
+	ipg_w8(IPG_RXDMAURGENTTHRESH_VALUE, RX_DMA_URGENT_THRESH);
+	ipg_w8(IPG_RXDMABURSTTHRESH_VALUE,  RX_DMA_BURST_THRESH);
+	ipg_w8(IPG_TXDMAPOLLPERIOD_VALUE,   TX_DMA_POLL_PERIOD);
+	ipg_w8(IPG_TXDMAURGENTTHRESH_VALUE, TX_DMA_URGENT_THRESH);
+	ipg_w8(IPG_TXDMABURSTTHRESH_VALUE,  TX_DMA_BURST_THRESH);
+	ipg_w16((IPG_IE_HOST_ERROR | IPG_IE_TX_DMA_COMPLETE |
+		 IPG_IE_TX_COMPLETE | IPG_IE_INT_REQUESTED |
+		 IPG_IE_UPDATE_STATS | IPG_IE_LINK_EVENT |
+		 IPG_IE_RX_DMA_COMPLETE | IPG_IE_RX_DMA_PRIORITY), INT_ENABLE);
+	ipg_w16(IPG_FLOWONTHRESH_VALUE,  FLOW_ON_THRESH);
+	ipg_w16(IPG_FLOWOFFTHRESH_VALUE, FLOW_OFF_THRESH);
+
+	/* IPG multi-frag frame bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0200, DEBUG_CTRL);
+
+	/* IPG TX poll now bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0010, DEBUG_CTRL);
+
+	/* IPG RX poll now bug workaround.
+	 * Per silicon revision B3 eratta.
+	 */
+	ipg_w16(ipg_r16(DEBUG_CTRL) | 0x0020, DEBUG_CTRL);
+
+	/* Now restore MACCTRL to original setting. */
+	ipg_w32(IPG_MC_RSVD_MASK & restoremacctrl, MAC_CTRL);
+
+	/* Disable unused RMON statistics. */
+	ipg_w32(IPG_RZ_ALL, RMON_STATISTICS_MASK);
+
+	/* Disable unused MIB statistics. */
+	ipg_w32(IPG_SM_MACCONTROLFRAMESXMTD | IPG_SM_MACCONTROLFRAMESRCVD |
+		IPG_SM_BCSTOCTETXMTOK_BCSTFRAMESXMTDOK | IPG_SM_TXJUMBOFRAMES |
+		IPG_SM_MCSTOCTETXMTOK_MCSTFRAMESXMTDOK | IPG_SM_RXJUMBOFRAMES |
+		IPG_SM_BCSTOCTETRCVDOK_BCSTFRAMESRCVDOK |
+		IPG_SM_UDPCHECKSUMERRORS | IPG_SM_TCPCHECKSUMERRORS |
+		IPG_SM_IPCHECKSUMERRORS, STATISTICS_MASK);
+
+	return 0;
+}
+
+/*
+ * Create a receive buffer within system memory and update
+ * NIC private structure appropriately.
+ */
+static int ipg_get_rxbuff(struct net_device *dev, int entry)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct ipg_rx *rxfd = sp->rxd + entry;
+	struct sk_buff *skb;
+	u64 rxfragsize;
+
+	IPG_DEBUG_MSG("_get_rxbuff\n");
+
+	skb = netdev_alloc_skb(dev, IPG_RXSUPPORT_SIZE + NET_IP_ALIGN);
+	if (!skb) {
+		sp->RxBuff[entry] = NULL;
+		return -ENOMEM;
+	}
+
+	/* Adjust the data start location within the buffer to
+	 * align IP address field to a 16 byte boundary.
+	 */
+	skb_reserve(skb, NET_IP_ALIGN);
+
+	/* Associate the receive buffer with the IPG NIC. */
+	skb->dev = dev;
+
+	/* Save the address of the sk_buff structure. */
+	sp->RxBuff[entry] = skb;
+
+	rxfd->frag_info = cpu_to_le64(pci_map_single(sp->pdev, skb->data,
+		sp->rx_buf_sz, PCI_DMA_FROMDEVICE));
+
+	/* Set the RFD fragment length. */
+	rxfragsize = IPG_RXFRAG_SIZE;
+	rxfd->frag_info |= cpu_to_le64((rxfragsize << 48) & IPG_RFI_FRAGLEN);
+
+	return 0;
+}
+
+static int init_rfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_init_rfdlist\n");
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		struct ipg_rx *rxfd = sp->rxd + i;
+
+		if (sp->RxBuff[i]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+			IPG_DEV_KFREE_SKB(sp->RxBuff[i]);
+			sp->RxBuff[i] = NULL;
+		}
+
+		/* Clear out the RFS field. */
+		rxfd->rfs = 0x0000000000000000;
+
+		if (ipg_get_rxbuff(dev, i) < 0) {
+			/*
+			 * A receive buffer was not ready, break the
+			 * RFD list here.
+			 */
+			IPG_DEBUG_MSG("Cannot allocate Rx buffer.\n");
+
+			/* Just in case we cannot allocate a single RFD.
+			 * Should not occur.
+			 */
+			if (i == 0) {
+				printk(KERN_ERR "%s: No memory available"
+					" for RFD list.\n", dev->name);
+				return -ENOMEM;
+			}
+		}
+
+		rxfd->next_desc = cpu_to_le64(sp->rxd_map +
+			sizeof(struct ipg_rx)*(i + 1));
+	}
+	sp->rxd[i - 1].next_desc = cpu_to_le64(sp->rxd_map);
+
+	sp->rx_current = 0;
+	sp->rx_dirty = 0;
+
+	/* Write the location of the RFDList to the IPG. */
+	ipg_w32((u32) sp->rxd_map, RFD_LIST_PTR_0);
+	ipg_w32(0x00000000, RFD_LIST_PTR_1);
+
+	return 0;
+}
+
+static void init_tfdlist(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_init_tfdlist\n");
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		struct ipg_tx *txfd = sp->txd + i;
+
+		txfd->tfc = cpu_to_le64(IPG_TFC_TFDDONE);
+
+		if (sp->TxBuff[i]) {
+			IPG_DEV_KFREE_SKB(sp->TxBuff[i]);
+			sp->TxBuff[i] = NULL;
+		}
+
+		txfd->next_desc = cpu_to_le64(sp->txd_map +
+			sizeof(struct ipg_tx)*(i + 1));
+	}
+	sp->txd[i - 1].next_desc = cpu_to_le64(sp->txd_map);
+
+	sp->tx_current = 0;
+	sp->tx_dirty = 0;
+
+	/* Write the location of the TFDList to the IPG. */
+	IPG_DDEBUG_MSG("Starting TFDListPtr = %8.8x\n",
+		       (u32) sp->txd_map);
+	ipg_w32((u32) sp->txd_map, TFD_LIST_PTR_0);
+	ipg_w32(0x00000000, TFD_LIST_PTR_1);
+
+	sp->ResetCurrentTFD = 1;
+}
+
+/*
+ * Free all transmit buffers which have already been transfered
+ * via DMA to the IPG.
+ */
+static void ipg_nic_txfree(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	const unsigned int curr = ipg_r32(TFD_LIST_PTR_0) -
+		(sp->txd_map / sizeof(struct ipg_tx)) - 1;
+	unsigned int released, pending;
+
+	IPG_DEBUG_MSG("_nic_txfree\n");
+
+	pending = sp->tx_current - sp->tx_dirty;
+
+	for (released = 0; released < pending; released++) {
+		unsigned int dirty = sp->tx_dirty % IPG_TFDLIST_LENGTH;
+		struct sk_buff *skb = sp->TxBuff[dirty];
+		struct ipg_tx *txfd = sp->txd + dirty;
+
+		IPG_DEBUG_MSG("TFC = %16.16lx\n", (unsigned long) txfd->tfc);
+
+		/* Look at each TFD's TFC field beginning
+		 * at the last freed TFD up to the current TFD.
+		 * If the TFDDone bit is set, free the associated
+		 * buffer.
+		 */
+		if (dirty == curr)
+			break;
+
+		/* Setup TFDDONE for compatible issue. */
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TFDDONE);
+
+		/* Free the transmit buffer. */
+		if (skb) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(txfd->frag_info & ~IPG_TFI_FRAGLEN),
+				skb->len, PCI_DMA_TODEVICE);
+
+			IPG_DEV_KFREE_SKB(skb);
+
+			sp->TxBuff[dirty] = NULL;
+		}
+	}
+
+	sp->tx_dirty += released;
+
+	if (netif_queue_stopped(dev) &&
+	    (sp->tx_current != (sp->tx_dirty + IPG_TFDLIST_LENGTH))) {
+		netif_wake_queue(dev);
+	}
+}
+
+static void ipg_tx_timeout(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+
+	ipg_reset(dev, IPG_AC_TX_RESET | IPG_AC_DMA | IPG_AC_NETWORK |
+		  IPG_AC_FIFO);
+
+	spin_lock_irq(&sp->lock);
+
+	/* Re-configure after DMA reset. */
+	if (ipg_io_config(dev) < 0) {
+		printk(KERN_INFO "%s: Error during re-configuration.\n",
+		       dev->name);
+	}
+
+	init_tfdlist(dev);
+
+	spin_unlock_irq(&sp->lock);
+
+	ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) & IPG_MC_RSVD_MASK,
+		MAC_CTRL);
+}
+
+/*
+ * For TxComplete interrupts, free all transmit
+ * buffers which have already been transfered via DMA
+ * to the IPG.
+ */
+static void ipg_nic_txcleanup(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_txcleanup\n");
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		/* Reading the TXSTATUS register clears the
+		 * TX_COMPLETE interrupt.
+		 */
+		u32 txstatusdword = ipg_r32(TX_STATUS);
+
+		IPG_DEBUG_MSG("TxStatus = %8.8x\n", txstatusdword);
+
+		/* Check for Transmit errors. Error bits only valid if
+		 * TX_COMPLETE bit in the TXSTATUS register is a 1.
+		 */
+		if (!(txstatusdword & IPG_TS_TX_COMPLETE))
+			break;
+
+		/* If in 10Mbps mode, indicate transmit is ready. */
+		if (sp->tenmbpsmode) {
+			netif_wake_queue(dev);
+		}
+
+		/* Transmit error, increment stat counters. */
+		if (txstatusdword & IPG_TS_TX_ERROR) {
+			IPG_DEBUG_MSG("Transmit error.\n");
+			sp->stats.tx_errors++;
+		}
+
+		/* Late collision, re-enable transmitter. */
+		if (txstatusdword & IPG_TS_LATE_COLLISION) {
+			IPG_DEBUG_MSG("Late collision on transmit.\n");
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+
+		/* Maximum collisions, re-enable transmitter. */
+		if (txstatusdword & IPG_TS_TX_MAX_COLL) {
+			IPG_DEBUG_MSG("Maximum collisions on transmit.\n");
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+
+		/* Transmit underrun, reset and re-enable
+		 * transmitter.
+		 */
+		if (txstatusdword & IPG_TS_TX_UNDERRUN) {
+			IPG_DEBUG_MSG("Transmitter underrun.\n");
+			sp->stats.tx_fifo_errors++;
+			ipg_reset(dev, IPG_AC_TX_RESET | IPG_AC_DMA |
+				  IPG_AC_NETWORK | IPG_AC_FIFO);
+
+			/* Re-configure after DMA reset. */
+			if (ipg_io_config(dev) < 0) {
+				printk(KERN_INFO
+				       "%s: Error during re-configuration.\n",
+				       dev->name);
+			}
+			init_tfdlist(dev);
+
+			ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_TX_ENABLE) &
+				IPG_MC_RSVD_MASK, MAC_CTRL);
+		}
+	}
+
+	ipg_nic_txfree(dev);
+}
+
+/* Provides statistical information about the IPG NIC. */
+struct net_device_stats *ipg_nic_get_stats(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	u16 temp1;
+	u16 temp2;
+
+	IPG_DEBUG_MSG("_nic_get_stats\n");
+
+	/* Check to see if the NIC has been initialized via nic_open,
+	 * before trying to read statistic registers.
+	 */
+	if (!test_bit(__LINK_STATE_START, &dev->state))
+		return &sp->stats;
+
+	sp->stats.rx_packets += ipg_r32(IPG_FRAMESRCVDOK);
+	sp->stats.tx_packets += ipg_r32(IPG_FRAMESXMTDOK);
+	sp->stats.rx_bytes += ipg_r32(IPG_OCTETRCVOK);
+	sp->stats.tx_bytes += ipg_r32(IPG_OCTETXMTOK);
+	temp1 = ipg_r16(IPG_FRAMESLOSTRXERRORS);
+	sp->stats.rx_errors += temp1;
+	sp->stats.rx_missed_errors += temp1;
+	temp1 = ipg_r32(IPG_SINGLECOLFRAMES) + ipg_r32(IPG_MULTICOLFRAMES) +
+		ipg_r32(IPG_LATECOLLISIONS);
+	temp2 = ipg_r16(IPG_CARRIERSENSEERRORS);
+	sp->stats.collisions += temp1;
+	sp->stats.tx_dropped += ipg_r16(IPG_FRAMESABORTXSCOLLS);
+	sp->stats.tx_errors += ipg_r16(IPG_FRAMESWEXDEFERRAL) +
+		ipg_r32(IPG_FRAMESWDEFERREDXMT) + temp1 + temp2;
+	sp->stats.multicast += ipg_r32(IPG_MCSTOCTETRCVDOK);
+
+	/* detailed tx_errors */
+	sp->stats.tx_carrier_errors += temp2;
+
+	/* detailed rx_errors */
+	sp->stats.rx_length_errors += ipg_r16(IPG_INRANGELENGTHERRORS) +
+		ipg_r16(IPG_FRAMETOOLONGERRRORS);
+	sp->stats.rx_crc_errors += ipg_r16(IPG_FRAMECHECKSEQERRORS);
+
+	/* Unutilized IPG statistic registers. */
+	ipg_r32(IPG_MCSTFRAMESRCVDOK);
+
+	return &sp->stats;
+}
+
+/* Restore used receive buffers. */
+static int ipg_nic_rxrestore(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	const unsigned int curr = sp->rx_current;
+	unsigned int dirty = sp->rx_dirty;
+
+	IPG_DEBUG_MSG("_nic_rxrestore\n");
+
+	for (dirty = sp->rx_dirty; curr - dirty > 0; dirty++) {
+		unsigned int entry = dirty % IPG_RFDLIST_LENGTH;
+
+		/* rx_copybreak may poke hole here and there. */
+		if (sp->RxBuff[entry])
+			continue;
+
+		/* Generate a new receive buffer to replace the
+		 * current buffer (which will be released by the
+		 * Linux system).
+		 */
+		if (ipg_get_rxbuff(dev, entry) < 0) {
+			IPG_DEBUG_MSG("Cannot allocate new Rx buffer.\n");
+
+			break;
+		}
+
+		/* Reset the RFS field. */
+		sp->rxd[entry].rfs = 0x0000000000000000;
+	}
+	sp->rx_dirty = dirty;
+
+	return 0;
+}
+
+#ifdef JUMBO_FRAME
+
+/* use jumboindex and jumbosize to control jumbo frame status
+   initial status is jumboindex=-1 and jumbosize=0
+   1. jumboindex = -1 and jumbosize=0 : previous jumbo frame has been done.
+   2. jumboindex != -1 and jumbosize != 0 : jumbo frame is not over size and receiving
+   3. jumboindex = -1 and jumbosize != 0 : jumbo frame is over size, already dump
+                previous receiving and need to continue dumping the current one
+*/
+enum {
+	NormalPacket,
+	ErrorPacket
+};
+
+enum {
+	Frame_NoStart_NoEnd	= 0,
+	Frame_WithStart		= 1,
+	Frame_WithEnd		= 10,
+	Frame_WithStart_WithEnd = 11
+};
+
+inline void ipg_nic_rx_free_skb(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int entry = sp->rx_current % IPG_RFDLIST_LENGTH;
+
+	if (sp->RxBuff[entry]) {
+		struct ipg_rx *rxfd = sp->rxd + entry;
+
+		pci_unmap_single(sp->pdev,
+			le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+			sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+		IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+		sp->RxBuff[entry] = NULL;
+	}
+}
+
+inline int ipg_nic_rx_check_frame_type(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct ipg_rx *rxfd = sp->rxd + (sp->rx_current % IPG_RFDLIST_LENGTH);
+	int type = Frame_NoStart_NoEnd;
+
+	if (le64_to_cpu(rxfd->rfs) & IPG_RFS_FRAMESTART)
+		type += Frame_WithStart;
+	if (le64_to_cpu(rxfd->rfs) & IPG_RFS_FRAMEEND)
+		type += Frame_WithEnd;
+	return type;
+}
+
+inline int ipg_nic_rx_check_error(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int entry = sp->rx_current % IPG_RFDLIST_LENGTH;
+	struct ipg_rx *rxfd = sp->rxd + entry;
+
+	if (IPG_DROP_ON_RX_ETH_ERRORS && (le64_to_cpu(rxfd->rfs) &
+	     (IPG_RFS_RXFIFOOVERRUN | IPG_RFS_RXRUNTFRAME |
+	      IPG_RFS_RXALIGNMENTERROR | IPG_RFS_RXFCSERROR |
+	      IPG_RFS_RXOVERSIZEDFRAME | IPG_RFS_RXLENGTHERROR))) {
+		IPG_DEBUG_MSG("Rx error, RFS = %16.16lx\n",
+			      (unsigned long) rxfd->rfs);
+
+		/* Increment general receive error statistic. */
+		sp->stats.rx_errors++;
+
+		/* Increment detailed receive error statistics. */
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFIFOOVERRUN) {
+			IPG_DEBUG_MSG("RX FIFO overrun occured.\n");
+
+			sp->stats.rx_fifo_errors++;
+		}
+
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXRUNTFRAME) {
+			IPG_DEBUG_MSG("RX runt occured.\n");
+			sp->stats.rx_length_errors++;
+		}
+
+		/* Do nothing for IPG_RFS_RXOVERSIZEDFRAME,
+		 * error count handled by a IPG statistic register.
+		 */
+
+		if (le64_to_cpu(rxfd->rfs) & IPG_RFS_RXALIGNMENTERROR) {
+			IPG_DEBUG_MSG("RX alignment error occured.\n");
+			sp->stats.rx_frame_errors++;
+		}
+
+		/* Do nothing for IPG_RFS_RXFCSERROR, error count
+		 * handled by a IPG statistic register.
+		 */
+
+		/* Free the memory associated with the RX
+		 * buffer since it is erroneous and we will
+		 * not pass it to higher layer processes.
+		 */
+		if (sp->RxBuff[entry]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+			IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+			sp->RxBuff[entry] = NULL;
+		}
+		return ErrorPacket;
+	}
+	return NormalPacket;
+}
+
+static void ipg_nic_rx_with_start_and_end(struct net_device *dev,
+					  struct ipg_nic_private *sp,
+					  struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+	struct sk_buff *skb;
+	int framelen;
+
+	if (jumbo->FoundStart) {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+
+	// 1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) != NormalPacket)
+		return;
+
+	skb = sp->RxBuff[entry];
+	if (!skb)
+		return;
+
+	// accept this frame and send to upper layer
+	framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+	if (framelen > IPG_RXFRAG_SIZE)
+		framelen = IPG_RXFRAG_SIZE;
+
+	skb_put(skb, framelen);
+	skb->protocol = eth_type_trans(skb, dev);
+	skb->ip_summed = CHECKSUM_NONE;
+	netif_rx(skb);
+	dev->last_rx = jiffies;
+	sp->RxBuff[entry] = NULL;
+}
+
+static void ipg_nic_rx_with_start(struct net_device *dev,
+				  struct ipg_nic_private *sp,
+				  struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+	struct pci_dev *pdev = sp->pdev;
+	struct sk_buff *skb;
+
+	// 1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) != NormalPacket)
+		return;
+
+	// accept this frame and send to upper layer
+	skb = sp->RxBuff[entry];
+	if (!skb)
+		return;
+
+	if (jumbo->FoundStart)
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+
+	pci_unmap_single(pdev, le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+			 sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+	skb_put(skb, IPG_RXFRAG_SIZE);
+
+	jumbo->FoundStart = 1;
+	jumbo->CurrentSize = IPG_RXFRAG_SIZE;
+	jumbo->skb = skb;
+
+	sp->RxBuff[entry] = NULL;
+	dev->last_rx = jiffies;
+}
+
+static void ipg_nic_rx_with_end(struct net_device *dev,
+				struct ipg_nic_private *sp,
+				struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+
+	//1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) == NormalPacket) {
+		struct sk_buff *skb = sp->RxBuff[entry];
+
+		if (!skb)
+			return;
+
+		if (jumbo->FoundStart) {
+			int framelen, endframelen;
+
+			framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+
+			endframeLen = framelen - jumbo->CurrentSize;
+			/*
+			if (framelen > IPG_RXFRAG_SIZE)
+				framelen=IPG_RXFRAG_SIZE;
+			 */
+			if (framelen > IPG_RXSUPPORT_SIZE)
+				IPG_DEV_KFREE_SKB(jumbo->skb);
+			else {
+				memcpy(skb_put(jumbo->skb, endframeLen),
+				       skb->data, endframeLen);
+
+				jumbo->skb->protocol =
+				    eth_type_trans(jumbo->skb, dev);
+
+				jumbo->skb->ip_summed = CHECKSUM_NONE;
+				netif_rx(jumbo->skb);
+			}
+		}
+
+		dev->last_rx = jiffies;
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+
+		ipg_nic_rx_free_skb(dev);
+	} else {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+}
+
+static void ipg_nic_rx_no_start_no_end(struct net_device *dev,
+				       struct ipg_nic_private *sp,
+				       struct ipg_rx *rxfd, unsigned entry)
+{
+	struct SJumbo *jumbo = &sp->Jumbo;
+
+	//1: found error, 0 no error
+	if (ipg_nic_rx_check_error(dev) == NormalPacket) {
+		struct sk_buff *skb = sp->RxBuff[entry];
+
+		if (skb) {
+			if (jumbo->FoundStart) {
+				jumbo->CurrentSize += IPG_RXFRAG_SIZE;
+				if (jumbo->CurrentSize <= IPG_RXSUPPORT_SIZE) {
+					memcpy(skb_put(jumbo->skb,
+						       IPG_RXFRAG_SIZE),
+					       skb->data, IPG_RXFRAG_SIZE);
+				}
+			}
+			dev->last_rx = jiffies;
+			ipg_nic_rx_free_skb(dev);
+		}
+	} else {
+		IPG_DEV_KFREE_SKB(jumbo->skb);
+		jumbo->FoundStart = 0;
+		jumbo->CurrentSize = 0;
+		jumbo->skb = NULL;
+	}
+}
+
+static int ipg_nic_rx(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int curr = sp->rx_current;
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_rx\n");
+
+	for (i = 0; i < IPG_MAXRFDPROCESS_COUNT; i++, curr++) {
+		unsigned int entry = curr % IPG_RFDLIST_LENGTH;
+		struct ipg_rx *rxfd = sp->rxd + entry;
+
+		if (!(rxfd->rfs & le64_to_cpu(IPG_RFS_RFDDONE)))
+			break;
+
+		switch (ipg_nic_rx_check_frame_type(dev)) {
+		case Frame_WithStart_WithEnd:
+			ipg_nic_rx_with_start_and_end(dev, tp, rxfd, entry);
+			break;
+		case Frame_WithStart:
+			ipg_nic_rx_with_start(dev, tp, rxfd, entry);
+			break;
+		case Frame_WithEnd:
+			ipg_nic_rx_with_end(dev, tp, rxfd, entry);
+			break;
+		case Frame_NoStart_NoEnd:
+			ipg_nic_rx_no_start_no_end(dev, tp, rxfd, entry);
+			break;
+		}
+	}
+
+	sp->rx_current = curr;
+
+	if (i == IPG_MAXRFDPROCESS_COUNT) {
+		/* There are more RFDs to process, however the
+		 * allocated amount of RFD processing time has
+		 * expired. Assert Interrupt Requested to make
+		 * sure we come back to process the remaining RFDs.
+		 */
+		ipg_w32(ipg_r32(ASIC_CTRL) | IPG_AC_INT_REQUEST, ASIC_CTRL);
+	}
+
+	ipg_nic_rxrestore(dev);
+
+	return 0;
+}
+
+#else
+static int ipg_nic_rx(struct net_device *dev)
+{
+	/* Transfer received Ethernet frames to higher network layers. */
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	unsigned int curr = sp->rx_current;
+	void __iomem *ioaddr = sp->ioaddr;
+	struct ipg_rx *rxfd;
+	unsigned int i;
+
+	IPG_DEBUG_MSG("_nic_rx\n");
+
+#define __RFS_MASK \
+	cpu_to_le64(IPG_RFS_RFDDONE | IPG_RFS_FRAMESTART | IPG_RFS_FRAMEEND)
+
+	for (i = 0; i < IPG_MAXRFDPROCESS_COUNT; i++, curr++) {
+		unsigned int entry = curr % IPG_RFDLIST_LENGTH;
+		struct sk_buff *skb = sp->RxBuff[entry];
+		unsigned int framelen;
+
+		rxfd = sp->rxd + entry;
+
+		if (((rxfd->rfs & __RFS_MASK) != __RFS_MASK) || !skb)
+			break;
+
+		/* Get received frame length. */
+		framelen = le64_to_cpu(rxfd->rfs) & IPG_RFS_RXFRAMELEN;
+
+		/* Check for jumbo frame arrival with too small
+		 * RXFRAG_SIZE.
+		 */
+		if (framelen > IPG_RXFRAG_SIZE) {
+			IPG_DEBUG_MSG
+			    ("RFS FrameLen > allocated fragment size.\n");
+
+			framelen = IPG_RXFRAG_SIZE;
+		}
+
+		if ((IPG_DROP_ON_RX_ETH_ERRORS && (le64_to_cpu(rxfd->rfs &
+		       (IPG_RFS_RXFIFOOVERRUN | IPG_RFS_RXRUNTFRAME |
+			IPG_RFS_RXALIGNMENTERROR | IPG_RFS_RXFCSERROR |
+			IPG_RFS_RXOVERSIZEDFRAME | IPG_RFS_RXLENGTHERROR))))) {
+
+			IPG_DEBUG_MSG("Rx error, RFS = %16.16lx\n",
+				      (unsigned long int) rxfd->rfs);
+
+			/* Increment general receive error statistic. */
+			sp->stats.rx_errors++;
+
+			/* Increment detailed receive error statistics. */
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXFIFOOVERRUN)) {
+				IPG_DEBUG_MSG("RX FIFO overrun occured.\n");
+				sp->stats.rx_fifo_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXRUNTFRAME)) {
+				IPG_DEBUG_MSG("RX runt occured.\n");
+				sp->stats.rx_length_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXOVERSIZEDFRAME)) ;
+			/* Do nothing, error count handled by a IPG
+			 * statistic register.
+			 */
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXALIGNMENTERROR)) {
+				IPG_DEBUG_MSG("RX alignment error occured.\n");
+				sp->stats.rx_frame_errors++;
+			}
+
+			if (le64_to_cpu(rxfd->rfs & IPG_RFS_RXFCSERROR)) ;
+			/* Do nothing, error count handled by a IPG
+			 * statistic register.
+			 */
+
+			/* Free the memory associated with the RX
+			 * buffer since it is erroneous and we will
+			 * not pass it to higher layer processes.
+			 */
+			if (skb) {
+				u64 info = rxfd->frag_info;
+
+				pci_unmap_single(sp->pdev,
+					le64_to_cpu(info & ~IPG_RFI_FRAGLEN),
+					sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+
+				IPG_DEV_KFREE_SKB(skb);
+			}
+		} else {
+
+			/* Adjust the new buffer length to accomodate the size
+			 * of the received frame.
+			 */
+			skb_put(skb, framelen);
+
+			/* Set the buffer's protocol field to Ethernet. */
+			skb->protocol = eth_type_trans(skb, dev);
+
+			/* If the frame contains an IP/TCP/UDP frame,
+			 * determine if upper layer must check IP/TCP/UDP
+			 * checksums.
+			 *
+			 * NOTE: DO NOT RELY ON THE TCP/UDP CHECKSUM
+			 *       VERIFICATION FOR SILICON REVISIONS B3
+			 *       AND EARLIER!
+			 *
+			 if ((le64_to_cpu(rxfd->rfs &
+			 (IPG_RFS_TCPDETECTED | IPG_RFS_UDPDETECTED |
+			 IPG_RFS_IPDETECTED))) &&
+			 !(le64_to_cpu(rxfd->rfs &
+			 (IPG_RFS_TCPERROR | IPG_RFS_UDPERROR |
+			 IPG_RFS_IPERROR))))
+			 {
+			 * Indicate IP checksums were performed
+			 * by the IPG.
+			 *
+			 skb->ip_summed = CHECKSUM_UNNECESSARY;
+			 }
+			 else
+			 */
+			if (1 == 1) {
+				/* The IPG encountered an error with (or
+				 * there were no) IP/TCP/UDP checksums.
+				 * This may or may not indicate an invalid
+				 * IP/TCP/UDP frame was received. Let the
+				 * upper layer decide.
+				 */
+				skb->ip_summed = CHECKSUM_NONE;
+			}
+
+			/* Hand off frame for higher layer processing.
+			 * The function netif_rx() releases the sk_buff
+			 * when processing completes.
+			 */
+			netif_rx(skb);
+
+			/* Record frame receive time (jiffies = Linux
+			 * kernel current time stamp).
+			 */
+			dev->last_rx = jiffies;
+		}
+
+		/* Assure RX buffer is not reused by IPG. */
+		sp->RxBuff[entry] = NULL;
+	}
+
+	/*
+	 * If there are more RFDs to proces and the allocated amount of RFD
+	 * processing time has expired, assert Interrupt Requested to make
+	 * sure we come back to process the remaining RFDs.
+	 */
+	if (i == IPG_MAXRFDPROCESS_COUNT)
+		ipg_w32(ipg_r32(ASIC_CTRL) | IPG_AC_INT_REQUEST, ASIC_CTRL);
+
+#ifdef IPG_DEBUG
+	/* Check if the RFD list contained no receive frame data. */
+	if (!i)
+		sp->EmptyRFDListCount++;
+#endif
+	while ((le64_to_cpu(rxfd->rfs & IPG_RFS_RFDDONE)) &&
+	       !((le64_to_cpu(rxfd->rfs & IPG_RFS_FRAMESTART)) &&
+		 (le64_to_cpu(rxfd->rfs & IPG_RFS_FRAMEEND)))) {
+		unsigned int entry = curr++ % IPG_RFDLIST_LENGTH;
+
+		rxfd = sp->rxd + entry;
+
+		IPG_DEBUG_MSG("Frame requires multiple RFDs.\n");
+
+		/* An unexpected event, additional code needed to handle
+		 * properly. So for the time being, just disregard the
+		 * frame.
+		 */
+
+		/* Free the memory associated with the RX
+		 * buffer since it is erroneous and we will
+		 * not pass it to higher layer processes.
+		 */
+		if (sp->RxBuff[entry]) {
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+			IPG_DEV_KFREE_SKB(sp->RxBuff[entry]);
+		}
+
+		/* Assure RX buffer is not reused by IPG. */
+		sp->RxBuff[entry] = NULL;
+	}
+
+	sp->rx_current = curr;
+
+	/* Check to see if there are a minimum number of used
+	 * RFDs before restoring any (should improve performance.)
+	 */
+	if ((curr - sp->rx_dirty) >= IPG_MINUSEDRFDSTOFREE)
+		ipg_nic_rxrestore(dev);
+
+	return 0;
+}
+#endif
+
+static void ipg_reset_after_host_error(struct work_struct *work)
+{
+	struct ipg_nic_private *sp =
+		container_of(work, struct ipg_nic_private, task.work);
+	struct net_device *dev = sp->dev;
+
+	IPG_DDEBUG_MSG("DMACtrl = %8.8x\n", ioread32(sp->ioaddr + IPG_DMACTRL));
+
+	/*
+	 * Acknowledge HostError interrupt by resetting
+	 * IPG DMA and HOST.
+	 */
+	ipg_reset(dev, IPG_AC_GLOBAL_RESET | IPG_AC_HOST | IPG_AC_DMA);
+
+	init_rfdlist(dev);
+	init_tfdlist(dev);
+
+	if (ipg_io_config(dev) < 0) {
+		printk(KERN_INFO "%s: Cannot recover from PCI error.\n",
+		       dev->name);
+		schedule_delayed_work(&sp->task, HZ);
+	}
+}
+
+static irqreturn_t ipg_interrupt_handler(int irq, void *dev_inst)
+{
+	struct net_device *dev = dev_inst;
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int handled = 0;
+	u16 status;
+
+	IPG_DEBUG_MSG("_interrupt_handler\n");
+
+#ifdef JUMBO_FRAME
+	ipg_nic_rxrestore(dev);
+#endif
+	/* Get interrupt source information, and acknowledge
+	 * some (i.e. TxDMAComplete, RxDMAComplete, RxEarly,
+	 * IntRequested, MacControlFrame, LinkEvent) interrupts
+	 * if issued. Also, all IPG interrupts are disabled by
+	 * reading IntStatusAck.
+	 */
+	status = ipg_r16(INT_STATUS_ACK);
+
+	IPG_DEBUG_MSG("IntStatusAck = %4.4x\n", status);
+
+	/* Shared IRQ of remove event. */
+	if (!(status & IPG_IS_RSVD_MASK))
+		goto out_enable;
+
+	handled = 1;
+
+	if (unlikely(!netif_running(dev)))
+		goto out;
+
+	spin_lock(&sp->lock);
+
+	/* If RFDListEnd interrupt, restore all used RFDs. */
+	if (status & IPG_IS_RFD_LIST_END) {
+		IPG_DEBUG_MSG("RFDListEnd Interrupt.\n");
+
+		/* The RFD list end indicates an RFD was encountered
+		 * with a 0 NextPtr, or with an RFDDone bit set to 1
+		 * (indicating the RFD is not read for use by the
+		 * IPG.) Try to restore all RFDs.
+		 */
+		ipg_nic_rxrestore(dev);
+
+#ifdef IPG_DEBUG
+		/* Increment the RFDlistendCount counter. */
+		sp->RFDlistendCount++;
+#endif
+	}
+
+	/* If RFDListEnd, RxDMAPriority, RxDMAComplete, or
+	 * IntRequested interrupt, process received frames. */
+	if ((status & IPG_IS_RX_DMA_PRIORITY) ||
+	    (status & IPG_IS_RFD_LIST_END) ||
+	    (status & IPG_IS_RX_DMA_COMPLETE) ||
+	    (status & IPG_IS_INT_REQUESTED)) {
+#ifdef IPG_DEBUG
+		/* Increment the RFD list checked counter if interrupted
+		 * only to check the RFD list. */
+		if (status & (~(IPG_IS_RX_DMA_PRIORITY | IPG_IS_RFD_LIST_END |
+				IPG_IS_RX_DMA_COMPLETE | IPG_IS_INT_REQUESTED) &
+			       (IPG_IS_HOST_ERROR | IPG_IS_TX_DMA_COMPLETE |
+				IPG_IS_LINK_EVENT | IPG_IS_TX_COMPLETE |
+				IPG_IS_UPDATE_STATS)))
+			sp->RFDListCheckedCount++;
+#endif
+
+		ipg_nic_rx(dev);
+	}
+
+	/* If TxDMAComplete interrupt, free used TFDs. */
+	if (status & IPG_IS_TX_DMA_COMPLETE)
+		ipg_nic_txfree(dev);
+
+	/* TxComplete interrupts indicate one of numerous actions.
+	 * Determine what action to take based on TXSTATUS register.
+	 */
+	if (status & IPG_IS_TX_COMPLETE)
+		ipg_nic_txcleanup(dev);
+
+	/* If UpdateStats interrupt, update Linux Ethernet statistics */
+	if (status & IPG_IS_UPDATE_STATS)
+		ipg_nic_get_stats(dev);
+
+	/* If HostError interrupt, reset IPG. */
+	if (status & IPG_IS_HOST_ERROR) {
+		IPG_DDEBUG_MSG("HostError Interrupt\n");
+
+		schedule_delayed_work(&sp->task, 0);
+	}
+
+	/* If LinkEvent interrupt, resolve autonegotiation. */
+	if (status & IPG_IS_LINK_EVENT) {
+		if (ipg_config_autoneg(dev) < 0)
+			printk(KERN_INFO "%s: Auto-negotiation error.\n",
+			       dev->name);
+	}
+
+	/* If MACCtrlFrame interrupt, do nothing. */
+	if (status & IPG_IS_MAC_CTRL_FRAME)
+		IPG_DEBUG_MSG("MACCtrlFrame interrupt.\n");
+
+	/* If RxComplete interrupt, do nothing. */
+	if (status & IPG_IS_RX_COMPLETE)
+		IPG_DEBUG_MSG("RxComplete interrupt.\n");
+
+	/* If RxEarly interrupt, do nothing. */
+	if (status & IPG_IS_RX_EARLY)
+		IPG_DEBUG_MSG("RxEarly interrupt.\n");
+
+out_enable:
+	/* Re-enable IPG interrupts. */
+	ipg_w16(IPG_IE_TX_DMA_COMPLETE | IPG_IE_RX_DMA_COMPLETE |
+		IPG_IE_HOST_ERROR | IPG_IE_INT_REQUESTED | IPG_IE_TX_COMPLETE |
+		IPG_IE_LINK_EVENT | IPG_IE_UPDATE_STATS, INT_ENABLE);
+
+	spin_unlock(&sp->lock);
+out:
+	return IRQ_RETVAL(handled);
+}
+
+static void ipg_rx_clear(struct ipg_nic_private *sp)
+{
+	unsigned int i;
+
+	for (i = 0; i < IPG_RFDLIST_LENGTH; i++) {
+		if (sp->RxBuff[i]) {
+			struct ipg_rx *rxfd = sp->rxd + i;
+
+			IPG_DEV_KFREE_SKB(sp->RxBuff[i]);
+			sp->RxBuff[i] = NULL;
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(rxfd->frag_info & ~IPG_RFI_FRAGLEN),
+				sp->rx_buf_sz, PCI_DMA_FROMDEVICE);
+		}
+	}
+}
+
+static void ipg_tx_clear(struct ipg_nic_private *sp)
+{
+	unsigned int i;
+
+	for (i = 0; i < IPG_TFDLIST_LENGTH; i++) {
+		if (sp->TxBuff[i]) {
+			struct ipg_tx *txfd = sp->txd + i;
+
+			pci_unmap_single(sp->pdev,
+				le64_to_cpu(txfd->frag_info & ~IPG_TFI_FRAGLEN),
+				sp->TxBuff[i]->len, PCI_DMA_TODEVICE);
+
+			IPG_DEV_KFREE_SKB(sp->TxBuff[i]);
+
+			sp->TxBuff[i] = NULL;
+		}
+	}
+}
+
+static int ipg_nic_open(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	struct pci_dev *pdev = sp->pdev;
+	int rc;
+
+	IPG_DEBUG_MSG("_nic_open\n");
+
+	sp->rx_buf_sz = IPG_RXSUPPORT_SIZE;
+
+	/* Check for interrupt line conflicts, and request interrupt
+	 * line for IPG.
+	 *
+	 * IMPORTANT: Disable IPG interrupts prior to registering
+	 *            IRQ.
+	 */
+	ipg_w16(0x0000, INT_ENABLE);
+
+	/* Register the interrupt line to be used by the IPG within
+	 * the Linux system.
+	 */
+	rc = request_irq(pdev->irq, &ipg_interrupt_handler, IRQF_SHARED,
+			 dev->name, dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error when requesting interrupt.\n",
+		       dev->name);
+		goto out;
+	}
+
+	dev->irq = pdev->irq;
+
+	rc = -ENOMEM;
+
+	sp->rxd = dma_alloc_coherent(&pdev->dev, IPG_RX_RING_BYTES,
+				     &sp->rxd_map, GFP_KERNEL);
+	if (!sp->rxd)
+		goto err_free_irq_0;
+
+	sp->txd = dma_alloc_coherent(&pdev->dev, IPG_TX_RING_BYTES,
+				     &sp->txd_map, GFP_KERNEL);
+	if (!sp->txd)
+		goto err_free_rx_1;
+
+	rc = init_rfdlist(dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error during configuration.\n",
+		       dev->name);
+		goto err_free_tx_2;
+	}
+
+	init_tfdlist(dev);
+
+	rc = ipg_io_config(dev);
+	if (rc < 0) {
+		printk(KERN_INFO "%s: Error during configuration.\n",
+		       dev->name);
+		goto err_release_tfdlist_3;
+	}
+
+	/* Resolve autonegotiation. */
+	if (ipg_config_autoneg(dev) < 0)
+		printk(KERN_INFO "%s: Auto-negotiation error.\n", dev->name);
+
+#ifdef JUMBO_FRAME
+	/* initialize JUMBO Frame control variable */
+	sp->Jumbo.FoundStart = 0;
+	sp->Jumbo.CurrentSize = 0;
+	sp->Jumbo.skb = 0;
+	dev->mtu = IPG_TXFRAG_SIZE;
+#endif
+
+	/* Enable transmit and receive operation of the IPG. */
+	ipg_w32((ipg_r32(MAC_CTRL) | IPG_MC_RX_ENABLE | IPG_MC_TX_ENABLE) &
+		 IPG_MC_RSVD_MASK, MAC_CTRL);
+
+	netif_start_queue(dev);
+out:
+	return rc;
+
+err_release_tfdlist_3:
+	ipg_tx_clear(sp);
+	ipg_rx_clear(sp);
+err_free_tx_2:
+	dma_free_coherent(&pdev->dev, IPG_TX_RING_BYTES, sp->txd, sp->txd_map);
+err_free_rx_1:
+	dma_free_coherent(&pdev->dev, IPG_RX_RING_BYTES, sp->rxd, sp->rxd_map);
+err_free_irq_0:
+	free_irq(pdev->irq, dev);
+	goto out;
+}
+
+static int ipg_nic_stop(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	struct pci_dev *pdev = sp->pdev;
+
+	IPG_DEBUG_MSG("_nic_stop\n");
+
+	netif_stop_queue(dev);
+
+	IPG_DDEBUG_MSG("RFDlistendCount = %i\n", sp->RFDlistendCount);
+	IPG_DDEBUG_MSG("RFDListCheckedCount = %i\n", sp->rxdCheckedCount);
+	IPG_DDEBUG_MSG("EmptyRFDListCount = %i\n", sp->EmptyRFDListCount);
+	IPG_DUMPTFDLIST(dev);
+
+	do {
+		(void) ipg_r16(INT_STATUS_ACK);
+
+		ipg_reset(dev, IPG_AC_GLOBAL_RESET | IPG_AC_HOST | IPG_AC_DMA);
+
+		synchronize_irq(pdev->irq);
+	} while (ipg_r16(INT_ENABLE) & IPG_IE_RSVD_MASK);
+
+	ipg_rx_clear(sp);
+
+	ipg_tx_clear(sp);
+
+	pci_free_consistent(pdev, IPG_RX_RING_BYTES, sp->rxd, sp->rxd_map);
+	pci_free_consistent(pdev, IPG_TX_RING_BYTES, sp->txd, sp->txd_map);
+
+	free_irq(pdev->irq, dev);
+
+	return 0;
+}
+
+static int ipg_nic_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int entry = sp->tx_current % IPG_TFDLIST_LENGTH;
+	unsigned long flags;
+	struct ipg_tx *txfd;
+
+	IPG_DDEBUG_MSG("_nic_hard_start_xmit\n");
+
+	/* If in 10Mbps mode, stop the transmit queue so
+	 * no more transmit frames are accepted.
+	 */
+	if (sp->tenmbpsmode)
+		netif_stop_queue(dev);
+
+	if (sp->ResetCurrentTFD) {
+		sp->ResetCurrentTFD = 0;
+		entry = 0;
+	}
+
+	txfd = sp->txd + entry;
+
+	sp->TxBuff[entry] = skb;
+
+	/* Clear all TFC fields, except TFDDONE. */
+	txfd->tfc = cpu_to_le64(IPG_TFC_TFDDONE);
+
+	/* Specify the TFC field within the TFD. */
+	txfd->tfc |= cpu_to_le64(IPG_TFC_WORDALIGNDISABLED |
+		(IPG_TFC_FRAMEID & cpu_to_le64(sp->tx_current)) |
+		(IPG_TFC_FRAGCOUNT & (1 << 24)));
+
+	/* Request TxComplete interrupts at an interval defined
+	 * by the constant IPG_FRAMESBETWEENTXCOMPLETES.
+	 * Request TxComplete interrupt for every frame
+	 * if in 10Mbps mode to accomodate problem with 10Mbps
+	 * processing.
+	 */
+	if (sp->tenmbpsmode)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TXINDICATE);
+	else if (!((sp->tx_current - sp->tx_dirty + 1) >
+	    IPG_FRAMESBETWEENTXDMACOMPLETES)) {
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TXDMAINDICATE);
+	}
+	/* Based on compilation option, determine if FCS is to be
+	 * appended to transmit frame by IPG.
+	 */
+	if (!(IPG_APPEND_FCS_ON_TX))
+		txfd->tfc |= cpu_to_le64(IPG_TFC_FCSAPPENDDISABLE);
+
+	/* Based on compilation option, determine if IP, TCP and/or
+	 * UDP checksums are to be added to transmit frame by IPG.
+	 */
+	if (IPG_ADD_IPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_IPCHECKSUMENABLE);
+
+	if (IPG_ADD_TCPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_TCPCHECKSUMENABLE);
+
+	if (IPG_ADD_UDPCHECKSUM_ON_TX)
+		txfd->tfc |= cpu_to_le64(IPG_TFC_UDPCHECKSUMENABLE);
+
+	/* Based on compilation option, determine if VLAN tag info is to be
+	 * inserted into transmit frame by IPG.
+	 */
+	if (IPG_INSERT_MANUAL_VLAN_TAG) {
+		txfd->tfc |= cpu_to_le64(IPG_TFC_VLANTAGINSERT |
+			((u64) IPG_MANUAL_VLAN_VID << 32) |
+			((u64) IPG_MANUAL_VLAN_CFI << 44) |
+			((u64) IPG_MANUAL_VLAN_USERPRIORITY << 45));
+	}
+
+	/* The fragment start location within system memory is defined
+	 * by the sk_buff structure's data field. The physical address
+	 * of this location within the system's virtual memory space
+	 * is determined using the IPG_HOST2BUS_MAP function.
+	 */
+	txfd->frag_info = cpu_to_le64(pci_map_single(sp->pdev, skb->data,
+		skb->len, PCI_DMA_TODEVICE));
+
+	/* The length of the fragment within system memory is defined by
+	 * the sk_buff structure's len field.
+	 */
+	txfd->frag_info |= cpu_to_le64(IPG_TFI_FRAGLEN &
+		((u64) (skb->len & 0xffff) << 48));
+
+	/* Clear the TFDDone bit last to indicate the TFD is ready
+	 * for transfer to the IPG.
+	 */
+	txfd->tfc &= cpu_to_le64(~IPG_TFC_TFDDONE);
+
+	spin_lock_irqsave(&sp->lock, flags);
+
+	sp->tx_current++;
+
+	mmiowb();
+
+	ipg_w32(IPG_DC_TX_DMA_POLL_NOW, DMA_CTRL);
+
+	if (sp->tx_current == (sp->tx_dirty + IPG_TFDLIST_LENGTH))
+		netif_wake_queue(dev);
+
+	spin_unlock_irqrestore(&sp->lock, flags);
+
+	return NETDEV_TX_OK;
+}
+
+static void ipg_set_phy_default_param(unsigned char rev,
+				      struct net_device *dev, int phy_address)
+{
+	unsigned short length;
+	unsigned char revision;
+	unsigned short *phy_param;
+	unsigned short address, value;
+
+	phy_param = &DefaultPhyParam[0];
+	length = *phy_param & 0x00FF;
+	revision = (unsigned char)((*phy_param) >> 8);
+	phy_param++;
+	while (length != 0) {
+		if (rev == revision) {
+			while (length > 1) {
+				address = *phy_param;
+				value = *(phy_param + 1);
+				phy_param += 2;
+				mdio_write(dev, phy_address, address, value);
+				length -= 4;
+			}
+			break;
+		} else {
+			phy_param += length / 2;
+			length = *phy_param & 0x00FF;
+			revision = (unsigned char)((*phy_param) >> 8);
+			phy_param++;
+		}
+	}
+}
+
+/* JES20040127EEPROM */
+static int read_eeprom(struct net_device *dev, int eep_addr)
+{
+	void __iomem *ioaddr = ipg_ioaddr(dev);
+	unsigned int i;
+	int ret = 0;
+	u16 value;
+
+	value = IPG_EC_EEPROM_READOPCODE | (eep_addr & 0xff);
+	ipg_w16(value, EEPROM_CTRL);
+
+	for (i = 0; i < 1000; i++) {
+		u16 data;
+
+		mdelay(10);
+		data = ipg_r16(EEPROM_CTRL);
+		if (!(data & IPG_EC_EEPROM_BUSY)) {
+			ret = ipg_r16(EEPROM_DATA);
+			break;
+		}
+	}
+	return ret;
+}
+
+static void ipg_init_mii(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	struct mii_if_info *mii_if = &sp->mii_if;
+	int phyaddr;
+
+	mii_if->dev          = dev;
+	mii_if->mdio_read    = mdio_read;
+	mii_if->mdio_write   = mdio_write;
+	mii_if->phy_id_mask  = 0x1f;
+	mii_if->reg_num_mask = 0x1f;
+
+	mii_if->phy_id = phyaddr = ipg_find_phyaddr(dev);
+
+	if (phyaddr != 0x1f) {
+		u16 mii_phyctrl, mii_1000cr;
+		u8 revisionid = 0;
+
+		mii_1000cr  = mdio_read(dev, phyaddr, MII_CTRL1000);
+		mii_1000cr |= ADVERTISE_1000FULL | ADVERTISE_1000HALF |
+			GMII_PHY_1000BASETCONTROL_PreferMaster;
+		mdio_write(dev, phyaddr, MII_CTRL1000, mii_1000cr);
+
+		mii_phyctrl = mdio_read(dev, phyaddr, MII_BMCR);
+
+		/* Set default phyparam */
+		pci_read_config_byte(sp->pdev, PCI_REVISION_ID, &revisionid);
+		ipg_set_phy_default_param(revisionid, dev, phyaddr);
+
+		/* Reset PHY */
+		mii_phyctrl |= BMCR_RESET | BMCR_ANRESTART;
+		mdio_write(dev, phyaddr, MII_BMCR, mii_phyctrl);
+
+	}
+}
+
+static int ipg_hw_init(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	void __iomem *ioaddr = sp->ioaddr;
+	unsigned int i;
+	int rc;
+
+	/* Read/Write and Reset EEPROM Value Jesse20040128EEPROM_VALUE */
+	/* Read LED Mode Configuration from EEPROM */
+	sp->LED_Mode = read_eeprom(dev, 6);
+
+	/* Reset all functions within the IPG. Do not assert
+	 * RST_OUT as not compatible with some PHYs.
+	 */
+	rc = ipg_reset(dev, IPG_RESET_MASK);
+	if (rc < 0)
+		goto out;
+
+	ipg_init_mii(dev);
+
+	/* Read MAC Address from EEPROM */
+	for (i = 0; i < 3; i++)
+		sp->station_addr[i] = read_eeprom(dev, 16 + i);
+
+	for (i = 0; i < 3; i++)
+		ipg_w16(sp->station_addr[i], STATION_ADDRESS_0 + 2*i);
+
+	/* Set station address in ethernet_device structure. */
+	dev->dev_addr[0] =  ipg_r16(STATION_ADDRESS_0) & 0x00ff;
+	dev->dev_addr[1] = (ipg_r16(STATION_ADDRESS_0) & 0xff00) >> 8;
+	dev->dev_addr[2] =  ipg_r16(STATION_ADDRESS_1) & 0x00ff;
+	dev->dev_addr[3] = (ipg_r16(STATION_ADDRESS_1) & 0xff00) >> 8;
+	dev->dev_addr[4] =  ipg_r16(STATION_ADDRESS_2) & 0x00ff;
+	dev->dev_addr[5] = (ipg_r16(STATION_ADDRESS_2) & 0xff00) >> 8;
+out:
+	return rc;
+}
+
+static int ipg_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = generic_mii_ioctl(&sp->mii_if, if_mii(ifr), cmd, NULL);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_nic_change_mtu(struct net_device *dev, int new_mtu)
+{
+	/* Function to accomodate changes to Maximum Transfer Unit
+	 * (or MTU) of IPG NIC. Cannot use default function since
+	 * the default will not allow for MTU > 1500 bytes.
+	 */
+
+	IPG_DEBUG_MSG("_nic_change_mtu\n");
+
+	/* Check that the new MTU value is between 68 (14 byte header, 46
+	 * byte payload, 4 byte FCS) and IPG_MAX_RXFRAME_SIZE, which
+	 * corresponds to the MAXFRAMESIZE register in the IPG.
+	 */
+	if ((new_mtu < 68) || (new_mtu > IPG_MAX_RXFRAME_SIZE))
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static int ipg_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_ethtool_gset(&sp->mii_if, cmd);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_ethtool_sset(&sp->mii_if, cmd);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static int ipg_nway_reset(struct net_device *dev)
+{
+	struct ipg_nic_private *sp = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&sp->mii_mutex);
+	rc = mii_nway_restart(&sp->mii_if);
+	mutex_unlock(&sp->mii_mutex);
+
+	return rc;
+}
+
+static struct ethtool_ops ipg_ethtool_ops = {
+	.get_settings = ipg_get_settings,
+	.set_settings = ipg_set_settings,
+	.nway_reset   = ipg_nway_reset,
+};
+
+static void ipg_remove(struct pci_dev *pdev)
+{
+	struct net_device *dev = pci_get_drvdata(pdev);
+	struct ipg_nic_private *sp = netdev_priv(dev);
+
+	IPG_DEBUG_MSG("_remove\n");
+
+	/* Un-register Ethernet device. */
+	unregister_netdev(dev);
+
+	pci_iounmap(pdev, sp->ioaddr);
+
+	pci_release_regions(pdev);
+
+	free_netdev(dev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static int __devinit ipg_probe(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	unsigned int i = id->driver_data;
+	struct ipg_nic_private *sp;
+	struct net_device *dev;
+	void __iomem *ioaddr;
+	int rc;
+
+	rc = pci_enable_device(pdev);
+	if (rc < 0)
+		goto out;
+
+	printk(KERN_INFO "%s: %s\n", pci_name(pdev), ipg_brand_name[i]);
+
+	pci_set_master(pdev);
+
+	rc = pci_set_dma_mask(pdev, DMA_40BIT_MASK);
+	if (rc < 0) {
+		rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK);
+		if (rc < 0) {
+			printk(KERN_ERR "%s: DMA config failed.\n",
+			       pci_name(pdev));
+			goto err_disable_0;
+		}
+	}
+
+	/*
+	 * Initialize net device.
+	 */
+	dev = alloc_etherdev(sizeof(struct ipg_nic_private));
+	if (!dev) {
+		printk(KERN_ERR "%s: alloc_etherdev failed\n", pci_name(pdev));
+		rc = -ENOMEM;
+		goto err_disable_0;
+	}
+
+	sp = netdev_priv(dev);
+	spin_lock_init(&sp->lock);
+	mutex_init(&sp->mii_mutex);
+
+	/* Declare IPG NIC functions for Ethernet device methods.
+	 */
+	dev->open = &ipg_nic_open;
+	dev->stop = &ipg_nic_stop;
+	dev->hard_start_xmit = &ipg_nic_hard_start_xmit;
+	dev->get_stats = &ipg_nic_get_stats;
+	dev->set_multicast_list = &ipg_nic_set_multicast_list;
+	dev->do_ioctl = ipg_ioctl;
+	dev->tx_timeout = ipg_tx_timeout;
+	dev->change_mtu = &ipg_nic_change_mtu;
+
+	SET_MODULE_OWNER(dev);
+	SET_NETDEV_DEV(dev, &pdev->dev);
+	SET_ETHTOOL_OPS(dev, &ipg_ethtool_ops);
+
+	rc = pci_request_regions(pdev, DRV_NAME);
+	if (rc)
+		goto err_free_dev_1;
+
+	ioaddr = pci_iomap(pdev, 1, pci_resource_len(pdev, 1));
+	if (!ioaddr) {
+		printk(KERN_ERR "%s cannot map MMIO\n", pci_name(pdev));
+		rc = -EIO;
+		goto err_release_regions_2;
+	}
+
+	/* Save the pointer to the PCI device information. */
+	sp->ioaddr = ioaddr;
+	sp->pdev = pdev;
+	sp->dev = dev;
+
+	INIT_DELAYED_WORK(&sp->task, ipg_reset_after_host_error);
+
+	pci_set_drvdata(pdev, dev);
+
+	rc = ipg_hw_init(dev);
+	if (rc < 0)
+		goto err_unmap_3;
+
+	rc = register_netdev(dev);
+	if (rc < 0)
+		goto err_unmap_3;
+
+	printk(KERN_INFO "Ethernet device registered as: %s\n", dev->name);
+out:
+	return rc;
+
+err_unmap_3:
+	pci_iounmap(pdev, ioaddr);
+err_release_regions_2:
+	pci_release_regions(pdev);
+err_free_dev_1:
+	free_netdev(dev);
+err_disable_0:
+	pci_disable_device(pdev);
+	goto out;
+}
+
+static struct pci_driver ipg_pci_driver = {
+	.name		= IPG_DRIVER_NAME,
+	.id_table	= ipg_pci_tbl,
+	.probe		= ipg_probe,
+	.remove		= __devexit_p(ipg_remove),
+};
+
+static int __init ipg_init_module(void)
+{
+	return pci_register_driver(&ipg_pci_driver);
+}
+
+static void __exit ipg_exit_module(void)
+{
+	pci_unregister_driver(&ipg_pci_driver);
+}
+
+module_init(ipg_init_module);
+module_exit(ipg_exit_module);
diff --git a/drivers/net/ipg.h b/drivers/net/ipg.h
new file mode 100755
index 0000000..9b8e3bb
--- /dev/null
+++ b/drivers/net/ipg.h
@@ -0,0 +1,856 @@
+/*
+ *
+ * ipg.h
+ *
+ * Include file for Gigabit Ethernet device driver for Network
+ * Interface Cards (NICs) utilizing the Tamarack Microelectronics
+ * Inc. IPG Gigabit or Triple Speed Ethernet Media Access
+ * Controller.
+ *
+ * Craig Rich
+ * Sundance Technology, Inc.
+ * 1485 Saratoga Avenue
+ * Suite 200
+ * San Jose, CA 95129
+ * 408 873 4117
+ * www.sundanceti.com
+ * craig_rich@sundanceti.com
+ */
+#ifndef __LINUX_IPG_H
+#define __LINUX_IPG_H
+
+#include <linux/version.h>
+#include <linux/module.h>
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#include <linux/types.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/version.h>
+#include <asm/bitops.h>
+/*#include <asm/spinlock.h>*/
+
+#define DrvVer "2.09d"
+
+#define IPG_DEV_KFREE_SKB(skb) dev_kfree_skb_irq(skb)
+
+/*
+ *	Constants
+ */
+
+/* GMII based PHY IDs */
+#define		NS				0x2000
+#define		MARVELL				0x0141
+#define		ICPLUS_PHY		0x243
+
+/* NIC Physical Layer Device MII register fields. */
+#define         MII_PHY_SELECTOR_IEEE8023       0x0001
+#define         MII_PHY_TECHABILITYFIELD        0x1FE0
+
+/* GMII_PHY_1000 need to set to prefer master */
+#define         GMII_PHY_1000BASETCONTROL_PreferMaster 0x0400
+
+/* NIC Physical Layer Device GMII constants. */
+#define         GMII_PREAMBLE                    0xFFFFFFFF
+#define         GMII_ST                          0x1
+#define         GMII_READ                        0x2
+#define         GMII_WRITE                       0x1
+#define         GMII_TA_READ_MASK                0x1
+#define         GMII_TA_WRITE                    0x2
+
+/* I/O register offsets. */
+enum ipg_regs {
+	DMA_CTRL		= 0x00,
+	RX_DMA_STATUS		= 0x08, // Unused + reserved
+	TFD_LIST_PTR_0		= 0x10,
+	TFD_LIST_PTR_1		= 0x14,
+	TX_DMA_BURST_THRESH	= 0x18,
+	TX_DMA_URGENT_THRESH	= 0x19,
+	TX_DMA_POLL_PERIOD	= 0x1a,
+	RFD_LIST_PTR_0		= 0x1c,
+	RFD_LIST_PTR_1		= 0x20,
+	RX_DMA_BURST_THRESH	= 0x24,
+	RX_DMA_URGENT_THRESH	= 0x25,
+	RX_DMA_POLL_PERIOD	= 0x26,
+	DEBUG_CTRL		= 0x2c,
+	ASIC_CTRL		= 0x30,
+	FIFO_CTRL		= 0x38, // Unused
+	FLOW_OFF_THRESH		= 0x3c,
+	FLOW_ON_THRESH		= 0x3e,
+	EEPROM_DATA		= 0x48,
+	EEPROM_CTRL		= 0x4a,
+	EXPROM_ADDR		= 0x4c, // Unused
+	EXPROM_DATA		= 0x50, // Unused
+	WAKE_EVENT		= 0x51, // Unused
+	COUNTDOWN		= 0x54, // Unused
+	INT_STATUS_ACK		= 0x5a,
+	INT_ENABLE		= 0x5c,
+	INT_STATUS		= 0x5e, // Unused
+	TX_STATUS		= 0x60,
+	MAC_CTRL		= 0x6c,
+	VLAN_TAG		= 0x70, // Unused
+	PHY_SET			= 0x75,	// JES20040127EEPROM
+	PHY_CTRL		= 0x76,
+	STATION_ADDRESS_0	= 0x78,
+	STATION_ADDRESS_1	= 0x7a,
+	STATION_ADDRESS_2	= 0x7c,
+	MAX_FRAME_SIZE		= 0x86,
+	RECEIVE_MODE		= 0x88,
+	HASHTABLE_0		= 0x8c,
+	HASHTABLE_1		= 0x90,
+	RMON_STATISTICS_MASK	= 0x98,
+	STATISTICS_MASK		= 0x9c,
+	RX_JUMBO_FRAMES		= 0xbc, // Unused
+	TCP_CHECKSUM_ERRORS	= 0xc0, // Unused
+	IP_CHECKSUM_ERRORS	= 0xc2, // Unused
+	UDP_CHECKSUM_ERRORS	= 0xc4, // Unused
+	TX_JUMBO_FRAMES		= 0xf4  // Unused
+};
+
+/* Ethernet MIB statistic register offsets. */
+#define	IPG_OCTETRCVOK		0xA8
+#define	IPG_MCSTOCTETRCVDOK		0xAC
+#define	IPG_BCSTOCTETRCVOK		0xB0
+#define	IPG_FRAMESRCVDOK		0xB4
+#define	IPG_MCSTFRAMESRCVDOK		0xB8
+#define	IPG_BCSTFRAMESRCVDOK		0xBE
+#define	IPG_MACCONTROLFRAMESRCVD	0xC6
+#define	IPG_FRAMETOOLONGERRRORS	0xC8
+#define	IPG_INRANGELENGTHERRORS	0xCA
+#define	IPG_FRAMECHECKSEQERRORS	0xCC
+#define	IPG_FRAMESLOSTRXERRORS	0xCE
+#define	IPG_OCTETXMTOK		0xD0
+#define	IPG_MCSTOCTETXMTOK		0xD4
+#define	IPG_BCSTOCTETXMTOK		0xD8
+#define	IPG_FRAMESXMTDOK		0xDC
+#define	IPG_MCSTFRAMESXMTDOK		0xE0
+#define	IPG_FRAMESWDEFERREDXMT	0xE4
+#define	IPG_LATECOLLISIONS		0xE8
+#define	IPG_MULTICOLFRAMES		0xEC
+#define	IPG_SINGLECOLFRAMES		0xF0
+#define	IPG_BCSTFRAMESXMTDOK		0xF6
+#define	IPG_CARRIERSENSEERRORS	0xF8
+#define	IPG_MACCONTROLFRAMESXMTDOK	0xFA
+#define	IPG_FRAMESABORTXSCOLLS	0xFC
+#define	IPG_FRAMESWEXDEFERRAL	0xFE
+
+/* RMON statistic register offsets. */
+#define	IPG_ETHERSTATSCOLLISIONS			0x100
+#define	IPG_ETHERSTATSOCTETSTRANSMIT			0x104
+#define	IPG_ETHERSTATSPKTSTRANSMIT			0x108
+#define	IPG_ETHERSTATSPKTS64OCTESTSTRANSMIT		0x10C
+#define	IPG_ETHERSTATSPKTS65TO127OCTESTSTRANSMIT	0x110
+#define	IPG_ETHERSTATSPKTS128TO255OCTESTSTRANSMIT	0x114
+#define	IPG_ETHERSTATSPKTS256TO511OCTESTSTRANSMIT	0x118
+#define	IPG_ETHERSTATSPKTS512TO1023OCTESTSTRANSMIT	0x11C
+#define	IPG_ETHERSTATSPKTS1024TO1518OCTESTSTRANSMIT	0x120
+#define	IPG_ETHERSTATSCRCALIGNERRORS			0x124
+#define	IPG_ETHERSTATSUNDERSIZEPKTS			0x128
+#define	IPG_ETHERSTATSFRAGMENTS			0x12C
+#define	IPG_ETHERSTATSJABBERS			0x130
+#define	IPG_ETHERSTATSOCTETS				0x134
+#define	IPG_ETHERSTATSPKTS				0x138
+#define	IPG_ETHERSTATSPKTS64OCTESTS			0x13C
+#define	IPG_ETHERSTATSPKTS65TO127OCTESTS		0x140
+#define	IPG_ETHERSTATSPKTS128TO255OCTESTS		0x144
+#define	IPG_ETHERSTATSPKTS256TO511OCTESTS		0x148
+#define	IPG_ETHERSTATSPKTS512TO1023OCTESTS		0x14C
+#define	IPG_ETHERSTATSPKTS1024TO1518OCTESTS		0x150
+
+/* RMON statistic register equivalents. */
+#define	IPG_ETHERSTATSMULTICASTPKTSTRANSMIT		0xE0
+#define	IPG_ETHERSTATSBROADCASTPKTSTRANSMIT		0xF6
+#define	IPG_ETHERSTATSMULTICASTPKTS			0xB8
+#define	IPG_ETHERSTATSBROADCASTPKTS			0xBE
+#define	IPG_ETHERSTATSOVERSIZEPKTS			0xC8
+#define	IPG_ETHERSTATSDROPEVENTS			0xCE
+
+/* Serial EEPROM offsets */
+#define	IPG_EEPROM_CONFIGPARAM	0x00
+#define	IPG_EEPROM_ASICCTRL		0x01
+#define	IPG_EEPROM_SUBSYSTEMVENDORID	0x02
+#define	IPG_EEPROM_SUBSYSTEMID	0x03
+#define	IPG_EEPROM_STATIONADDRESS0	0x10
+#define	IPG_EEPROM_STATIONADDRESS1	0x11
+#define	IPG_EEPROM_STATIONADDRESS2	0x12
+
+/* Register & data structure bit masks */
+
+/* PCI register masks. */
+
+/* IOBaseAddress */
+#define         IPG_PIB_RSVD_MASK		0xFFFFFE01
+#define         IPG_PIB_IOBASEADDRESS	0xFFFFFF00
+#define         IPG_PIB_IOBASEADDRIND	0x00000001
+
+/* MemBaseAddress */
+#define         IPG_PMB_RSVD_MASK		0xFFFFFE07
+#define         IPG_PMB_MEMBASEADDRIND	0x00000001
+#define         IPG_PMB_MEMMAPTYPE		0x00000006
+#define         IPG_PMB_MEMMAPTYPE0		0x00000002
+#define         IPG_PMB_MEMMAPTYPE1		0x00000004
+#define         IPG_PMB_MEMBASEADDRESS	0xFFFFFE00
+
+/* ConfigStatus */
+#define IPG_CS_RSVD_MASK                0xFFB0
+#define IPG_CS_CAPABILITIES             0x0010
+#define IPG_CS_66MHZCAPABLE             0x0020
+#define IPG_CS_FASTBACK2BACK            0x0080
+#define IPG_CS_DATAPARITYREPORTED       0x0100
+#define IPG_CS_DEVSELTIMING             0x0600
+#define IPG_CS_SIGNALEDTARGETABORT      0x0800
+#define IPG_CS_RECEIVEDTARGETABORT      0x1000
+#define IPG_CS_RECEIVEDMASTERABORT      0x2000
+#define IPG_CS_SIGNALEDSYSTEMERROR      0x4000
+#define IPG_CS_DETECTEDPARITYERROR      0x8000
+
+/* TFD data structure masks. */
+
+/* TFDList, TFC */
+#define	IPG_TFC_RSVD_MASK			0x0000FFFF9FFFFFFF
+#define	IPG_TFC_FRAMEID			0x000000000000FFFF
+#define	IPG_TFC_WORDALIGN			0x0000000000030000
+#define	IPG_TFC_WORDALIGNTODWORD		0x0000000000000000
+#define	IPG_TFC_WORDALIGNTOWORD		0x0000000000020000
+#define	IPG_TFC_WORDALIGNDISABLED		0x0000000000030000
+#define	IPG_TFC_TCPCHECKSUMENABLE		0x0000000000040000
+#define	IPG_TFC_UDPCHECKSUMENABLE		0x0000000000080000
+#define	IPG_TFC_IPCHECKSUMENABLE		0x0000000000100000
+#define	IPG_TFC_FCSAPPENDDISABLE		0x0000000000200000
+#define	IPG_TFC_TXINDICATE			0x0000000000400000
+#define	IPG_TFC_TXDMAINDICATE		0x0000000000800000
+#define	IPG_TFC_FRAGCOUNT			0x000000000F000000
+#define	IPG_TFC_VLANTAGINSERT		0x0000000010000000
+#define	IPG_TFC_TFDDONE			0x0000000080000000
+#define	IPG_TFC_VID				0x00000FFF00000000
+#define	IPG_TFC_CFI				0x0000100000000000
+#define	IPG_TFC_USERPRIORITY			0x0000E00000000000
+
+/* TFDList, FragInfo */
+#define	IPG_TFI_RSVD_MASK			0xFFFF00FFFFFFFFFF
+#define	IPG_TFI_FRAGADDR			0x000000FFFFFFFFFF
+#define	IPG_TFI_FRAGLEN			0xFFFF000000000000LL
+
+/* RFD data structure masks. */
+
+/* RFDList, RFS */
+#define	IPG_RFS_RSVD_MASK			0x0000FFFFFFFFFFFF
+#define	IPG_RFS_RXFRAMELEN			0x000000000000FFFF
+#define	IPG_RFS_RXFIFOOVERRUN		0x0000000000010000
+#define	IPG_RFS_RXRUNTFRAME			0x0000000000020000
+#define	IPG_RFS_RXALIGNMENTERROR		0x0000000000040000
+#define	IPG_RFS_RXFCSERROR			0x0000000000080000
+#define	IPG_RFS_RXOVERSIZEDFRAME		0x0000000000100000
+#define	IPG_RFS_RXLENGTHERROR		0x0000000000200000
+#define	IPG_RFS_VLANDETECTED			0x0000000000400000
+#define	IPG_RFS_TCPDETECTED			0x0000000000800000
+#define	IPG_RFS_TCPERROR			0x0000000001000000
+#define	IPG_RFS_UDPDETECTED			0x0000000002000000
+#define	IPG_RFS_UDPERROR			0x0000000004000000
+#define	IPG_RFS_IPDETECTED			0x0000000008000000
+#define	IPG_RFS_IPERROR			0x0000000010000000
+#define	IPG_RFS_FRAMESTART			0x0000000020000000
+#define	IPG_RFS_FRAMEEND			0x0000000040000000
+#define	IPG_RFS_RFDDONE			0x0000000080000000
+#define	IPG_RFS_TCI				0x0000FFFF00000000
+
+/* RFDList, FragInfo */
+#define	IPG_RFI_RSVD_MASK			0xFFFF00FFFFFFFFFF
+#define	IPG_RFI_FRAGADDR			0x000000FFFFFFFFFF
+#define	IPG_RFI_FRAGLEN			0xFFFF000000000000LL
+
+/* I/O Register masks. */
+
+/* RMON Statistics Mask */
+#define	IPG_RZ_ALL					0x0FFFFFFF
+
+/* Statistics Mask */
+#define	IPG_SM_ALL					0x0FFFFFFF
+#define	IPG_SM_OCTETRCVOK_FRAMESRCVDOK		0x00000001
+#define	IPG_SM_MCSTOCTETRCVDOK_MCSTFRAMESRCVDOK	0x00000002
+#define	IPG_SM_BCSTOCTETRCVDOK_BCSTFRAMESRCVDOK	0x00000004
+#define	IPG_SM_RXJUMBOFRAMES				0x00000008
+#define	IPG_SM_TCPCHECKSUMERRORS			0x00000010
+#define	IPG_SM_IPCHECKSUMERRORS			0x00000020
+#define	IPG_SM_UDPCHECKSUMERRORS			0x00000040
+#define	IPG_SM_MACCONTROLFRAMESRCVD			0x00000080
+#define	IPG_SM_FRAMESTOOLONGERRORS			0x00000100
+#define	IPG_SM_INRANGELENGTHERRORS			0x00000200
+#define	IPG_SM_FRAMECHECKSEQERRORS			0x00000400
+#define	IPG_SM_FRAMESLOSTRXERRORS			0x00000800
+#define	IPG_SM_OCTETXMTOK_FRAMESXMTOK		0x00001000
+#define	IPG_SM_MCSTOCTETXMTOK_MCSTFRAMESXMTDOK	0x00002000
+#define	IPG_SM_BCSTOCTETXMTOK_BCSTFRAMESXMTDOK	0x00004000
+#define	IPG_SM_FRAMESWDEFERREDXMT			0x00008000
+#define	IPG_SM_LATECOLLISIONS			0x00010000
+#define	IPG_SM_MULTICOLFRAMES			0x00020000
+#define	IPG_SM_SINGLECOLFRAMES			0x00040000
+#define	IPG_SM_TXJUMBOFRAMES				0x00080000
+#define	IPG_SM_CARRIERSENSEERRORS			0x00100000
+#define	IPG_SM_MACCONTROLFRAMESXMTD			0x00200000
+#define	IPG_SM_FRAMESABORTXSCOLLS			0x00400000
+#define	IPG_SM_FRAMESWEXDEFERAL			0x00800000
+
+/* Countdown */
+#define	IPG_CD_RSVD_MASK		0x0700FFFF
+#define	IPG_CD_COUNT			0x0000FFFF
+#define	IPG_CD_COUNTDOWNSPEED	0x01000000
+#define	IPG_CD_COUNTDOWNMODE		0x02000000
+#define	IPG_CD_COUNTINTENABLED	0x04000000
+
+/* TxDMABurstThresh */
+#define IPG_TB_RSVD_MASK                0xFF
+
+/* TxDMAUrgentThresh */
+#define IPG_TU_RSVD_MASK                0xFF
+
+/* TxDMAPollPeriod */
+#define IPG_TP_RSVD_MASK                0xFF
+
+/* RxDMAUrgentThresh */
+#define IPG_RU_RSVD_MASK                0xFF
+
+/* RxDMAPollPeriod */
+#define IPG_RP_RSVD_MASK                0xFF
+
+/* ReceiveMode */
+#define IPG_RM_RSVD_MASK                0x3F
+#define IPG_RM_RECEIVEUNICAST           0x01
+#define IPG_RM_RECEIVEMULTICAST         0x02
+#define IPG_RM_RECEIVEBROADCAST         0x04
+#define IPG_RM_RECEIVEALLFRAMES         0x08
+#define IPG_RM_RECEIVEMULTICASTHASH     0x10
+#define IPG_RM_RECEIVEIPMULTICAST       0x20
+
+/* PhySet JES20040127EEPROM*/
+#define IPG_PS_MEM_LENB9B               0x01
+#define IPG_PS_MEM_LEN9                 0x02
+#define IPG_PS_NON_COMPDET              0x04
+
+/* PhyCtrl */
+#define IPG_PC_RSVD_MASK                0xFF
+#define IPG_PC_MGMTCLK_LO               0x00
+#define IPG_PC_MGMTCLK_HI               0x01
+#define IPG_PC_MGMTCLK                  0x01
+#define IPG_PC_MGMTDATA                 0x02
+#define IPG_PC_MGMTDIR                  0x04
+#define IPG_PC_DUPLEX_POLARITY          0x08
+#define IPG_PC_DUPLEX_STATUS            0x10
+#define IPG_PC_LINK_POLARITY            0x20
+#define IPG_PC_LINK_SPEED               0xC0
+#define IPG_PC_LINK_SPEED_10MBPS        0x40
+#define IPG_PC_LINK_SPEED_100MBPS       0x80
+#define IPG_PC_LINK_SPEED_1000MBPS      0xC0
+
+/* DMACtrl */
+#define IPG_DC_RSVD_MASK                0xC07D9818
+#define IPG_DC_RX_DMA_COMPLETE          0x00000008
+#define IPG_DC_RX_DMA_POLL_NOW          0x00000010
+#define IPG_DC_TX_DMA_COMPLETE          0x00000800
+#define IPG_DC_TX_DMA_POLL_NOW          0x00001000
+#define IPG_DC_TX_DMA_IN_PROG           0x00008000
+#define IPG_DC_RX_EARLY_DISABLE         0x00010000
+#define IPG_DC_MWI_DISABLE              0x00040000
+#define IPG_DC_TX_WRITE_BACK_DISABLE    0x00080000
+#define IPG_DC_TX_BURST_LIMIT           0x00700000
+#define IPG_DC_TARGET_ABORT             0x40000000
+#define IPG_DC_MASTER_ABORT             0x80000000
+
+/* ASICCtrl */
+#define IPG_AC_RSVD_MASK                0x07FFEFF2
+#define IPG_AC_EXP_ROM_SIZE             0x00000002
+#define IPG_AC_PHY_SPEED10              0x00000010
+#define IPG_AC_PHY_SPEED100             0x00000020
+#define IPG_AC_PHY_SPEED1000            0x00000040
+#define IPG_AC_PHY_MEDIA                0x00000080
+#define IPG_AC_FORCED_CFG               0x00000700
+#define IPG_AC_D3RESETDISABLE           0x00000800
+#define IPG_AC_SPEED_UP_MODE            0x00002000
+#define IPG_AC_LED_MODE                 0x00004000
+#define IPG_AC_RST_OUT_POLARITY         0x00008000
+#define IPG_AC_GLOBAL_RESET             0x00010000
+#define IPG_AC_RX_RESET                 0x00020000
+#define IPG_AC_TX_RESET                 0x00040000
+#define IPG_AC_DMA                      0x00080000
+#define IPG_AC_FIFO                     0x00100000
+#define IPG_AC_NETWORK                  0x00200000
+#define IPG_AC_HOST                     0x00400000
+#define IPG_AC_AUTO_INIT                0x00800000
+#define IPG_AC_RST_OUT                  0x01000000
+#define IPG_AC_INT_REQUEST              0x02000000
+#define IPG_AC_RESET_BUSY               0x04000000
+#define IPG_AC_LED_SPEED                0x08000000	//JES20040127EEPROM
+#define IPG_AC_LED_MODE_BIT_1           0x20000000	//JES20040127EEPROM
+
+/* EepromCtrl */
+#define IPG_EC_RSVD_MASK                0x83FF
+#define IPG_EC_EEPROM_ADDR              0x00FF
+#define IPG_EC_EEPROM_OPCODE            0x0300
+#define IPG_EC_EEPROM_SUBCOMMAD         0x0000
+#define IPG_EC_EEPROM_WRITEOPCODE       0x0100
+#define IPG_EC_EEPROM_READOPCODE        0x0200
+#define IPG_EC_EEPROM_ERASEOPCODE       0x0300
+#define IPG_EC_EEPROM_BUSY              0x8000
+
+/* FIFOCtrl */
+#define IPG_FC_RSVD_MASK                0xC001
+#define IPG_FC_RAM_TEST_MODE            0x0001
+#define IPG_FC_TRANSMITTING             0x4000
+#define IPG_FC_RECEIVING                0x8000
+
+/* TxStatus */
+#define IPG_TS_RSVD_MASK                0xFFFF00DD
+#define IPG_TS_TX_ERROR                 0x00000001
+#define IPG_TS_LATE_COLLISION           0x00000004
+#define IPG_TS_TX_MAX_COLL              0x00000008
+#define IPG_TS_TX_UNDERRUN              0x00000010
+#define IPG_TS_TX_IND_REQD              0x00000040
+#define IPG_TS_TX_COMPLETE              0x00000080
+#define IPG_TS_TX_FRAMEID               0xFFFF0000
+
+/* WakeEvent */
+#define IPG_WE_WAKE_PKT_ENABLE          0x01
+#define IPG_WE_MAGIC_PKT_ENABLE         0x02
+#define IPG_WE_LINK_EVT_ENABLE          0x04
+#define IPG_WE_WAKE_POLARITY            0x08
+#define IPG_WE_WAKE_PKT_EVT             0x10
+#define IPG_WE_MAGIC_PKT_EVT            0x20
+#define IPG_WE_LINK_EVT                 0x40
+#define IPG_WE_WOL_ENABLE               0x80
+
+/* IntEnable */
+#define IPG_IE_RSVD_MASK                0x1FFE
+#define IPG_IE_HOST_ERROR               0x0002
+#define IPG_IE_TX_COMPLETE              0x0004
+#define IPG_IE_MAC_CTRL_FRAME           0x0008
+#define IPG_IE_RX_COMPLETE              0x0010
+#define IPG_IE_RX_EARLY                 0x0020
+#define IPG_IE_INT_REQUESTED            0x0040
+#define IPG_IE_UPDATE_STATS             0x0080
+#define IPG_IE_LINK_EVENT               0x0100
+#define IPG_IE_TX_DMA_COMPLETE          0x0200
+#define IPG_IE_RX_DMA_COMPLETE          0x0400
+#define IPG_IE_RFD_LIST_END             0x0800
+#define IPG_IE_RX_DMA_PRIORITY          0x1000
+
+/* IntStatus */
+#define IPG_IS_RSVD_MASK                0x1FFF
+#define IPG_IS_INTERRUPT_STATUS         0x0001
+#define IPG_IS_HOST_ERROR               0x0002
+#define IPG_IS_TX_COMPLETE              0x0004
+#define IPG_IS_MAC_CTRL_FRAME           0x0008
+#define IPG_IS_RX_COMPLETE              0x0010
+#define IPG_IS_RX_EARLY                 0x0020
+#define IPG_IS_INT_REQUESTED            0x0040
+#define IPG_IS_UPDATE_STATS             0x0080
+#define IPG_IS_LINK_EVENT               0x0100
+#define IPG_IS_TX_DMA_COMPLETE          0x0200
+#define IPG_IS_RX_DMA_COMPLETE          0x0400
+#define IPG_IS_RFD_LIST_END             0x0800
+#define IPG_IS_RX_DMA_PRIORITY          0x1000
+
+/* MACCtrl */
+#define IPG_MC_RSVD_MASK                0x7FE33FA3
+#define IPG_MC_IFS_SELECT               0x00000003
+#define IPG_MC_IFS_4352BIT              0x00000003
+#define IPG_MC_IFS_1792BIT              0x00000002
+#define IPG_MC_IFS_1024BIT              0x00000001
+#define IPG_MC_IFS_96BIT                0x00000000
+#define IPG_MC_DUPLEX_SELECT            0x00000020
+#define IPG_MC_DUPLEX_SELECT_FD         0x00000020
+#define IPG_MC_DUPLEX_SELECT_HD         0x00000000
+#define IPG_MC_TX_FLOW_CONTROL_ENABLE   0x00000080
+#define IPG_MC_RX_FLOW_CONTROL_ENABLE   0x00000100
+#define IPG_MC_RCV_FCS                  0x00000200
+#define IPG_MC_FIFO_LOOPBACK            0x00000400
+#define IPG_MC_MAC_LOOPBACK             0x00000800
+#define IPG_MC_AUTO_VLAN_TAGGING        0x00001000
+#define IPG_MC_AUTO_VLAN_UNTAGGING      0x00002000
+#define IPG_MC_COLLISION_DETECT         0x00010000
+#define IPG_MC_CARRIER_SENSE            0x00020000
+#define IPG_MC_STATISTICS_ENABLE        0x00200000
+#define IPG_MC_STATISTICS_DISABLE       0x00400000
+#define IPG_MC_STATISTICS_ENABLED       0x00800000
+#define IPG_MC_TX_ENABLE                0x01000000
+#define IPG_MC_TX_DISABLE               0x02000000
+#define IPG_MC_TX_ENABLED               0x04000000
+#define IPG_MC_RX_ENABLE                0x08000000
+#define IPG_MC_RX_DISABLE               0x10000000
+#define IPG_MC_RX_ENABLED               0x20000000
+#define IPG_MC_PAUSED                   0x40000000
+
+/*
+ *	Tune
+ */
+
+/* Miscellaneous Constants. */
+#define   TRUE  1
+#define   FALSE 0
+
+/* Assign IPG_APPEND_FCS_ON_TX > 0 for auto FCS append on TX. */
+#define         IPG_APPEND_FCS_ON_TX         TRUE
+
+/* Assign IPG_APPEND_FCS_ON_TX > 0 for auto FCS strip on RX. */
+#define         IPG_STRIP_FCS_ON_RX          TRUE
+
+/* Assign IPG_DROP_ON_RX_ETH_ERRORS > 0 to drop RX frames with
+ * Ethernet errors.
+ */
+#define         IPG_DROP_ON_RX_ETH_ERRORS    TRUE
+
+/* Assign IPG_INSERT_MANUAL_VLAN_TAG > 0 to insert VLAN tags manually
+ * (via TFC).
+ */
+#define		IPG_INSERT_MANUAL_VLAN_TAG   FALSE
+
+/* Assign IPG_ADD_IPCHECKSUM_ON_TX > 0 for auto IP checksum on TX. */
+#define         IPG_ADD_IPCHECKSUM_ON_TX     FALSE
+
+/* Assign IPG_ADD_TCPCHECKSUM_ON_TX > 0 for auto TCP checksum on TX.
+ * DO NOT USE FOR SILICON REVISIONS B3 AND EARLIER.
+ */
+#define         IPG_ADD_TCPCHECKSUM_ON_TX    FALSE
+
+/* Assign IPG_ADD_UDPCHECKSUM_ON_TX > 0 for auto UDP checksum on TX.
+ * DO NOT USE FOR SILICON REVISIONS B3 AND EARLIER.
+ */
+#define         IPG_ADD_UDPCHECKSUM_ON_TX    FALSE
+
+/* If inserting VLAN tags manually, assign the IPG_MANUAL_VLAN_xx
+ * constants as desired.
+ */
+#define		IPG_MANUAL_VLAN_VID		0xABC
+#define		IPG_MANUAL_VLAN_CFI		0x1
+#define		IPG_MANUAL_VLAN_USERPRIORITY 0x5
+
+#define         IPG_IO_REG_RANGE		0xFF
+#define         IPG_MEM_REG_RANGE		0x154
+#define         IPG_DRIVER_NAME		"Sundance Technology IPG Triple-Speed Ethernet"
+#define         IPG_NIC_PHY_ADDRESS          0x01
+#define		IPG_DMALIST_ALIGN_PAD	0x07
+#define		IPG_MULTICAST_HASHTABLE_SIZE	0x40
+
+/* Number of miliseconds to wait after issuing a software reset.
+ * 0x05 <= IPG_AC_RESETWAIT to account for proper 10Mbps operation.
+ */
+#define         IPG_AC_RESETWAIT             0x05
+
+/* Number of IPG_AC_RESETWAIT timeperiods before declaring timeout. */
+#define         IPG_AC_RESET_TIMEOUT         0x0A
+
+/* Minimum number of nanoseconds used to toggle MDC clock during
+ * MII/GMII register access.
+ */
+#define		IPG_PC_PHYCTRLWAIT_NS		200
+
+#define		IPG_TFDLIST_LENGTH		0x100
+
+/* Number of frames between TxDMAComplete interrupt.
+ * 0 < IPG_FRAMESBETWEENTXDMACOMPLETES <= IPG_TFDLIST_LENGTH
+ */
+#define		IPG_FRAMESBETWEENTXDMACOMPLETES 0x1
+
+#ifdef JUMBO_FRAME
+
+# ifdef JUMBO_FRAME_SIZE_2K
+# define JUMBO_FRAME_SIZE 2048
+# define __IPG_RXFRAG_SIZE 2048
+# else
+#  ifdef JUMBO_FRAME_SIZE_3K
+#  define JUMBO_FRAME_SIZE 3072
+#  define __IPG_RXFRAG_SIZE 3072
+#  else
+#   ifdef JUMBO_FRAME_SIZE_4K
+#   define JUMBO_FRAME_SIZE 4096
+#   define __IPG_RXFRAG_SIZE 4088
+#   else
+#    ifdef JUMBO_FRAME_SIZE_5K
+#    define JUMBO_FRAME_SIZE 5120
+#    define __IPG_RXFRAG_SIZE 4088
+#    else
+#     ifdef JUMBO_FRAME_SIZE_6K
+#     define JUMBO_FRAME_SIZE 6144
+#     define __IPG_RXFRAG_SIZE 4088
+#     else
+#      ifdef JUMBO_FRAME_SIZE_7K
+#      define JUMBO_FRAME_SIZE 7168
+#      define __IPG_RXFRAG_SIZE 4088
+#      else
+#       ifdef JUMBO_FRAME_SIZE_8K
+#       define JUMBO_FRAME_SIZE 8192
+#       define __IPG_RXFRAG_SIZE 4088
+#       else
+#        ifdef JUMBO_FRAME_SIZE_9K
+#        define JUMBO_FRAME_SIZE 9216
+#        define __IPG_RXFRAG_SIZE 4088
+#        else
+#         ifdef JUMBO_FRAME_SIZE_10K
+#         define JUMBO_FRAME_SIZE 10240
+#         define __IPG_RXFRAG_SIZE 4088
+#         else
+#         define JUMBO_FRAME_SIZE 4096
+#         endif
+#        endif
+#       endif
+#      endif
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+#endif
+
+/* Size of allocated received buffers. Nominally 0x0600.
+ * Define larger if expecting jumbo frames.
+ */
+#ifdef JUMBO_FRAME
+//IPG_TXFRAG_SIZE must <= 0x2b00, or TX will crash
+#define		IPG_TXFRAG_SIZE		JUMBO_FRAME_SIZE
+#endif
+
+/* Size of allocated received buffers. Nominally 0x0600.
+ * Define larger if expecting jumbo frames.
+ */
+#ifdef JUMBO_FRAME
+//4088=4096-8
+#define		IPG_RXFRAG_SIZE		__IPG_RXFRAG_SIZE
+#define     IPG_RXSUPPORT_SIZE   IPG_MAX_RXFRAME_SIZE
+#else
+#define		IPG_RXFRAG_SIZE		0x0600
+#define     IPG_RXSUPPORT_SIZE   IPG_RXFRAG_SIZE
+#endif
+
+/* IPG_MAX_RXFRAME_SIZE <= IPG_RXFRAG_SIZE */
+#ifdef JUMBO_FRAME
+#define		IPG_MAX_RXFRAME_SIZE		JUMBO_FRAME_SIZE
+#else
+#define		IPG_MAX_RXFRAME_SIZE		0x0600
+#endif
+
+#define		IPG_RFDLIST_LENGTH		0x100
+
+/* Maximum number of RFDs to process per interrupt.
+ * 1 < IPG_MAXRFDPROCESS_COUNT < IPG_RFDLIST_LENGTH
+ */
+#define		IPG_MAXRFDPROCESS_COUNT	0x80
+
+/* Minimum margin between last freed RFD, and current RFD.
+ * 1 < IPG_MINUSEDRFDSTOFREE < IPG_RFDLIST_LENGTH
+ */
+#define		IPG_MINUSEDRFDSTOFREE	0x80
+
+/* specify the jumbo frame maximum size
+ * per unit is 0x600 (the RxBuffer size that one RFD can carry)
+ */
+#define     MAX_JUMBOSIZE	        0x8	// max is 12K
+
+/* Key register values loaded at driver start up. */
+
+/* TXDMAPollPeriod is specified in 320ns increments.
+ *
+ * Value	Time
+ * ---------------------
+ * 0x00-0x01	320ns
+ * 0x03		~1us
+ * 0x1F		~10us
+ * 0xFF		~82us
+ */
+#define		IPG_TXDMAPOLLPERIOD_VALUE	0x26
+
+/* TxDMAUrgentThresh specifies the minimum amount of
+ * data in the transmit FIFO before asserting an
+ * urgent transmit DMA request.
+ *
+ * Value	Min TxFIFO occupied space before urgent TX request
+ * ---------------------------------------------------------------
+ * 0x00-0x04	128 bytes (1024 bits)
+ * 0x27		1248 bytes (~10000 bits)
+ * 0x30		1536 bytes (12288 bits)
+ * 0xFF		8192 bytes (65535 bits)
+ */
+#define		IPG_TXDMAURGENTTHRESH_VALUE	0x04
+
+/* TxDMABurstThresh specifies the minimum amount of
+ * free space in the transmit FIFO before asserting an
+ * transmit DMA request.
+ *
+ * Value	Min TxFIFO free space before TX request
+ * ----------------------------------------------------
+ * 0x00-0x08	256 bytes
+ * 0x30		1536 bytes
+ * 0xFF		8192 bytes
+ */
+#define		IPG_TXDMABURSTTHRESH_VALUE	0x30
+
+/* RXDMAPollPeriod is specified in 320ns increments.
+ *
+ * Value	Time
+ * ---------------------
+ * 0x00-0x01	320ns
+ * 0x03		~1us
+ * 0x1F		~10us
+ * 0xFF		~82us
+ */
+#define		IPG_RXDMAPOLLPERIOD_VALUE	0x01
+
+/* RxDMAUrgentThresh specifies the minimum amount of
+ * free space within the receive FIFO before asserting
+ * a urgent receive DMA request.
+ *
+ * Value	Min RxFIFO free space before urgent RX request
+ * ---------------------------------------------------------------
+ * 0x00-0x04	128 bytes (1024 bits)
+ * 0x27		1248 bytes (~10000 bits)
+ * 0x30		1536 bytes (12288 bits)
+ * 0xFF		8192 bytes (65535 bits)
+ */
+#define		IPG_RXDMAURGENTTHRESH_VALUE	0x30
+
+/* RxDMABurstThresh specifies the minimum amount of
+ * occupied space within the receive FIFO before asserting
+ * a receive DMA request.
+ *
+ * Value	Min TxFIFO free space before TX request
+ * ----------------------------------------------------
+ * 0x00-0x08	256 bytes
+ * 0x30		1536 bytes
+ * 0xFF		8192 bytes
+ */
+#define		IPG_RXDMABURSTTHRESH_VALUE	0x30
+
+/* FlowOnThresh specifies the maximum amount of occupied
+ * space in the receive FIFO before a PAUSE frame with
+ * maximum pause time transmitted.
+ *
+ * Value	Max RxFIFO occupied space before PAUSE
+ * ---------------------------------------------------
+ * 0x0000	0 bytes
+ * 0x0740	29,696 bytes
+ * 0x07FF	32,752 bytes
+ */
+#define		IPG_FLOWONTHRESH_VALUE	0x0740
+
+/* FlowOffThresh specifies the minimum amount of occupied
+ * space in the receive FIFO before a PAUSE frame with
+ * zero pause time is transmitted.
+ *
+ * Value	Max RxFIFO occupied space before PAUSE
+ * ---------------------------------------------------
+ * 0x0000	0 bytes
+ * 0x00BF	3056 bytes
+ * 0x07FF	32,752 bytes
+ */
+#define		IPG_FLOWOFFTHRESH_VALUE	0x00BF
+
+/*
+ * Miscellaneous macros.
+ */
+
+/* Marco for printing debug statements.
+#  define IPG_DDEBUG_MSG(args...) printk(KERN_DEBUG "IPG: " ## args) */
+#ifdef IPG_DEBUG
+#  define IPG_DEBUG_MSG(args...)
+#  define IPG_DDEBUG_MSG(args...) printk(KERN_DEBUG "IPG: " args)
+#  define IPG_DUMPRFDLIST(args) ipg_dump_rfdlist(args)
+#  define IPG_DUMPTFDLIST(args) ipg_dump_tfdlist(args)
+#else
+#  define IPG_DEBUG_MSG(args...)
+#  define IPG_DDEBUG_MSG(args...)
+#  define IPG_DUMPRFDLIST(args)
+#  define IPG_DUMPTFDLIST(args)
+#endif
+
+/*
+ * End miscellaneous macros.
+ */
+
+/* Transmit Frame Descriptor. The IPG supports 15 fragments,
+ * however Linux requires only a single fragment. Note, each
+ * TFD field is 64 bits wide.
+ */
+struct ipg_tx {
+	u64 next_desc;
+	u64 tfc;
+	u64 frag_info;
+};
+
+/* Receive Frame Descriptor. Note, each RFD field is 64 bits wide.
+ */
+struct ipg_rx {
+	u64 next_desc;
+	u64 rfs;
+	u64 frag_info;
+};
+
+struct SJumbo {
+	int FoundStart;
+	int CurrentSize;
+	struct sk_buff *skb;
+};
+/* Structure of IPG NIC specific data. */
+struct ipg_nic_private {
+	void __iomem *ioaddr;
+	struct ipg_tx *txd;
+	struct ipg_rx *rxd;
+	dma_addr_t txd_map;
+	dma_addr_t rxd_map;
+	struct sk_buff *TxBuff[IPG_TFDLIST_LENGTH];
+	struct sk_buff *RxBuff[IPG_RFDLIST_LENGTH];
+	unsigned int tx_current;
+	unsigned int tx_dirty;
+	unsigned int rx_current;
+	unsigned int rx_dirty;
+// Add by Grace 2005/05/19
+#ifdef JUMBO_FRAME
+	struct SJumbo Jumbo;
+#endif
+	unsigned int rx_buf_sz;
+	struct pci_dev *pdev;
+	struct net_device *dev;
+	struct net_device_stats stats;
+	spinlock_t lock;
+	int tenmbpsmode;
+
+	/*Jesse20040128EEPROM_VALUE */
+	u16 LED_Mode;
+	u16 station_addr[3];	/* Station Address in EEPROM Reg 0x10..0x12 */
+
+	struct mutex		mii_mutex;
+	struct mii_if_info	mii_if;
+	int ResetCurrentTFD;
+#ifdef IPG_DEBUG
+	int RFDlistendCount;
+	int RFDListCheckedCount;
+	int EmptyRFDListCount;
+#endif
+	struct delayed_work task;
+};
+
+//variable record -- index by leading revision/length
+//Revision/Length(=N*4), Address1, Data1, Address2, Data2,...,AddressN,DataN
+unsigned short DefaultPhyParam[] = {
+	// 11/12/03 IP1000A v1-3 rev=0x40
+	/*--------------------------------------------------------------------------
+	(0x4000|(15*4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 22, 0x85bd, 24, 0xfff2,
+		    		 27, 0x0c10, 28, 0x0c10, 29, 0x2c10, 31, 0x0003, 23, 0x92f6,
+		    		 31, 0x0000, 23, 0x003d, 30, 0x00de, 20, 0x20e7,  9, 0x0700,
+	  --------------------------------------------------------------------------*/
+	// 12/17/03 IP1000A v1-4 rev=0x40
+	(0x4000 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
+	    0x0000,
+	30, 0x005e, 9, 0x0700,
+	// 01/09/04 IP1000A v1-5 rev=0x41
+	(0x4100 | (07 * 4)), 31, 0x0001, 27, 0x01e0, 31, 0x0002, 27, 0xeb8e, 31,
+	    0x0000,
+	30, 0x005e, 9, 0x0700,
+	0x0000
+};
+
+#endif				/* __LINUX_IPG_H */
-- 
1.3.GIT




^ permalink raw reply related

* [PATCH][2/2] Add ICMPMsgStats MIB (RFC 4293)
From: David Stevens @ 2007-09-11  3:12 UTC (permalink / raw)
  To: davem, yoshfuji; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 19807 bytes --]

Background: RFC 4293 deprecates existing individual, named ICMP
type counters to be replaced with the ICMPMsgStatsTable. This table
includes entries for both IPv4 and IPv6, and requires counting of all
ICMP types, whether or not the machine implements the type.

These patches "remove" (but not really) the existing counters, and
replace them with the ICMPMsgStats tables for v4 and v6.
It includes the named counters in the /proc places they were, but gets the
values for them from the new tables. It also counts packets generated
from raw socket output (e.g., OutEchoes, MLD queries, RA's from
radvd, etc).

Changes:
1) create icmpmsg_statistics mib
2) create icmpv6msg_statistics mib
3) modify existing counters to use these
4) modify /proc/net/snmp to add "IcmpMsg" with all ICMP types
        listed by number for easy SNMP parsing
5) modify /proc/net/snmp printing for "Icmp" to get the named data
        from new counters.

IPv6 patch attached.

                                        +-DLS

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruNp linux-2.6.22.5/include/linux/snmp.h 
linux-2.6.22.5_ICMPv6MSG/include/linux/snmp.h
--- linux-2.6.22.5/include/linux/snmp.h 2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/include/linux/snmp.h       2007-09-10 
15:02:43.000000000 -0700
@@ -91,35 +91,12 @@ enum
        ICMP6_MIB_NUM = 0,
        ICMP6_MIB_INMSGS,                       /* InMsgs */
        ICMP6_MIB_INERRORS,                     /* InErrors */
-       ICMP6_MIB_INDESTUNREACHS,               /* InDestUnreachs */
-       ICMP6_MIB_INPKTTOOBIGS,                 /* InPktTooBigs */
-       ICMP6_MIB_INTIMEEXCDS,                  /* InTimeExcds */
-       ICMP6_MIB_INPARMPROBLEMS,               /* InParmProblems */
-       ICMP6_MIB_INECHOS,                      /* InEchos */
-       ICMP6_MIB_INECHOREPLIES,                /* InEchoReplies */
-       ICMP6_MIB_INGROUPMEMBQUERIES,           /* InGroupMembQueries */
-       ICMP6_MIB_INGROUPMEMBRESPONSES,         /* InGroupMembResponses */
-       ICMP6_MIB_INGROUPMEMBREDUCTIONS,        /* InGroupMembReductions 
*/
-       ICMP6_MIB_INROUTERSOLICITS,             /* InRouterSolicits */
-       ICMP6_MIB_INROUTERADVERTISEMENTS,       /* InRouterAdvertisements 
*/
-       ICMP6_MIB_INNEIGHBORSOLICITS,           /* InNeighborSolicits */
-       ICMP6_MIB_INNEIGHBORADVERTISEMENTS,     /* 
InNeighborAdvertisements */
-       ICMP6_MIB_INREDIRECTS,                  /* InRedirects */
        ICMP6_MIB_OUTMSGS,                      /* OutMsgs */
-       ICMP6_MIB_OUTDESTUNREACHS,              /* OutDestUnreachs */
-       ICMP6_MIB_OUTPKTTOOBIGS,                /* OutPktTooBigs */
-       ICMP6_MIB_OUTTIMEEXCDS,                 /* OutTimeExcds */
-       ICMP6_MIB_OUTPARMPROBLEMS,              /* OutParmProblems */
-       ICMP6_MIB_OUTECHOREPLIES,               /* OutEchoReplies */
-       ICMP6_MIB_OUTROUTERSOLICITS,            /* OutRouterSolicits */
-       ICMP6_MIB_OUTNEIGHBORSOLICITS,          /* OutNeighborSolicits */
-       ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS,    /* 
OutNeighborAdvertisements */
-       ICMP6_MIB_OUTREDIRECTS,                 /* OutRedirects */
-       ICMP6_MIB_OUTGROUPMEMBRESPONSES,        /* OutGroupMembResponses 
*/
-       ICMP6_MIB_OUTGROUPMEMBREDUCTIONS,       /* OutGroupMembReductions 
*/
        __ICMP6_MIB_MAX
 };
 
+#define __ICMP6MSG_MIB_MAX 512 /* Out+In for all 8-bit ICMPv6 types */
+
 /* tcp mib definitions */
 /*
  * RFC 1213:  MIB-II TCP group
diff -ruNp linux-2.6.22.5/include/net/ipv6.h 
linux-2.6.22.5_ICMPv6MSG/include/net/ipv6.h
--- linux-2.6.22.5/include/net/ipv6.h   2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/include/net/ipv6.h 2007-09-10 
15:41:19.000000000 -0700
@@ -132,6 +132,7 @@ DECLARE_SNMP_STAT(struct ipstats_mib, ip
        SNMP_INC_STATS_USER(ipv6_statistics, field);                    \
 })
 DECLARE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+DECLARE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics);
 #define ICMP6_INC_STATS(idev, field)           ({                      \
        struct inet6_dev *_idev = (idev);                               \
        if (likely(_idev != NULL))                                      \
@@ -157,6 +158,14 @@ DECLARE_SNMP_STAT(struct icmpv6_mib, icm
                SNMP_INC_STATS_OFFSET_BH(_idev->stats.icmpv6, field, 
_offset);       \
        SNMP_INC_STATS_OFFSET_BH(icmpv6_statistics, field, _offset); \
 })
+
+#define ICMP6MSGOUT_INC_STATS(field) SNMP_INC_STATS(icmpv6msg_statistics, 
field+256)
+#define ICMP6MSGOUT_INC_STATS_BH(field) 
SNMP_INC_STATS_BH(icmpv6msg_statistics, field+256)
+#define ICMP6MSGOUT_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(icmpv6msg_statistics, field+256)
+#define ICMP6MSGIN_INC_STATS(field) SNMP_INC_STATS(icmpv6msg_statistics, 
field)
+#define ICMP6MSGIN_INC_STATS_BH(field) 
SNMP_INC_STATS_BH(icmpv6msg_statistics, field)
+#define ICMP6MSGIN_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(icmpv6msg_statistics, field)
+
 DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6);
 DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
 #define UDP6_INC_STATS_BH(field, is_udplite)                         do { 
 \
diff -ruNp linux-2.6.22.5/include/net/snmp.h 
linux-2.6.22.5_ICMPv6MSG/include/net/snmp.h
--- linux-2.6.22.5/include/net/snmp.h   2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/include/net/snmp.h 2007-09-10 
15:00:18.000000000 -0700
@@ -88,6 +88,12 @@ struct icmpv6_mib {
        unsigned long   mibs[ICMP6_MIB_MAX];
 } __SNMP_MIB_ALIGN__;
 
+#define ICMP6MSG_MIB_MAX  __ICMP6MSG_MIB_MAX
+struct icmpv6msg_mib {
+       unsigned long   mibs[ICMP6MSG_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
+
 /* TCP */
 #define TCP_MIB_MAX    __TCP_MIB_MAX
 struct tcp_mib {
diff -ruNp linux-2.6.22.5/net/ipv6/af_inet6.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/af_inet6.c
--- linux-2.6.22.5/net/ipv6/af_inet6.c  2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/af_inet6.c        2007-09-10 
14:19:57.000000000 -0700
@@ -719,6 +719,9 @@ static int __init init_ipv6_mibs(void)
        if (snmp_mib_init((void **)icmpv6_statistics, sizeof (struct 
icmpv6_mib),
                          __alignof__(struct icmpv6_mib)) < 0)
                goto err_icmp_mib;
+       if (snmp_mib_init((void **)icmpv6msg_statistics,
+           sizeof (struct icmpv6msg_mib), __alignof__(struct icmpv6_mib)) 
< 0)
+               goto err_icmpmsg_mib;
        if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
                          __alignof__(struct udp_mib)) < 0)
                goto err_udp_mib;
@@ -730,6 +733,8 @@ static int __init init_ipv6_mibs(void)
 err_udplite_mib:
        snmp_mib_free((void **)udp_stats_in6);
 err_udp_mib:
+       snmp_mib_free((void **)icmpv6msg_statistics);
+err_icmpmsg_mib:
        snmp_mib_free((void **)icmpv6_statistics);
 err_icmp_mib:
        snmp_mib_free((void **)ipv6_statistics);
diff -ruNp linux-2.6.22.5/net/ipv6/icmp.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/icmp.c
--- linux-2.6.22.5/net/ipv6/icmp.c      2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/icmp.c    2007-09-10 
15:24:27.000000000 -0700
@@ -69,6 +69,8 @@
 
 DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
 EXPORT_SYMBOL(icmpv6_statistics);
+DEFINE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics) 
__read_mostly;
+EXPORT_SYMBOL(icmpv6msg_statistics);
 
 /*
  *     The ICMP socket(s). This is the most convenient way to flow 
control
@@ -247,6 +249,7 @@ static int icmpv6_push_pending_frames(st
                                                      len, fl->proto,
                                                      tmp_csum);
        }
+       ICMP6MSGOUT_INC_STATS_BH(icmp6h->icmp6_type);
        ip6_push_pending_frames(sk);
 out:
        return err;
@@ -456,8 +459,6 @@ void icmpv6_send(struct sk_buff *skb, in
        }
        err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + 
sizeof(struct icmp6hdr));
 
-       if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
-               ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_OUTDESTUNREACHS, 
type - ICMPV6_DEST_UNREACH);
        ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
 
 out_put:
@@ -547,7 +548,6 @@ static void icmpv6_echo_reply(struct sk_
        }
        err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + 
sizeof(struct icmp6hdr));
 
-       ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTECHOREPLIES);
        ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
 
 out_put:
@@ -656,10 +656,7 @@ static int icmpv6_rcv(struct sk_buff **p
 
        type = hdr->icmp6_type;
 
-       if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
-               ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INDESTUNREACHS, 
type - ICMPV6_DEST_UNREACH);
-       else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT)
-               ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INECHOS, type - 
ICMPV6_ECHO_REQUEST);
+       ICMP6MSGIN_INC_STATS_BH(type);
 
        switch (type) {
        case ICMPV6_ECHO_REQUEST:
diff -ruNp linux-2.6.22.5/net/ipv6/mcast.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/mcast.c
--- linux-2.6.22.5/net/ipv6/mcast.c     2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/mcast.c   2007-09-10 
14:24:33.000000000 -0700
@@ -1478,6 +1478,7 @@ static void mld_sendpack(struct sk_buff 
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
                mld_dev_queue_xmit);
        if (!err) {
+               ICMP6MSGOUT_INC_STATS(ICMPV6_MLD2_REPORT);
                ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
                IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
        } else
@@ -1821,10 +1822,7 @@ static void igmp6_send(struct in6_addr *
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
                mld_dev_queue_xmit);
        if (!err) {
-               if (type == ICMPV6_MGM_REDUCTION)
-                       ICMP6_INC_STATS(idev, 
ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
-               else
-                       ICMP6_INC_STATS(idev, 
ICMP6_MIB_OUTGROUPMEMBRESPONSES);
+               ICMP6MSGOUT_INC_STATS(type);
                ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
                IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
        } else
diff -ruNp linux-2.6.22.5/net/ipv6/ndisc.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/ndisc.c
--- linux-2.6.22.5/net/ipv6/ndisc.c     2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/ndisc.c   2007-09-10 
15:04:51.000000000 -0700
@@ -431,7 +431,7 @@ static void __ndisc_send(struct net_devi
                         struct neighbour *neigh,
                         struct in6_addr *daddr, struct in6_addr *saddr,
                         struct icmp6hdr *icmp6h, struct in6_addr *target,
-                        int llinfo, int icmp6_mib_outnd)
+                        int llinfo)
 {
        struct flowi fl;
        struct dst_entry *dst;
@@ -441,9 +441,11 @@ static void __ndisc_send(struct net_devi
        struct inet6_dev *idev;
        int len;
        int err;
-       u8 *opt;
+       u8 *opt, type;
+
+       type = icmp6h->icmp6_type;
 
-       ndisc_flow_init(&fl, icmp6h->icmp6_type, saddr, daddr,
+       ndisc_flow_init(&fl, type, saddr, daddr,
                        dev->ifindex);
 
        dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output);
@@ -504,7 +506,7 @@ static void __ndisc_send(struct net_devi
 
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, 
dst_output);
        if (!err) {
-               ICMP6_INC_STATS(idev, icmp6_mib_outnd);
+               ICMP6MSGOUT_INC_STATS(type);
                ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
        }
 
@@ -542,8 +544,7 @@ static void ndisc_send_na(struct net_dev
 
        __ndisc_send(dev, neigh, daddr, src_addr,
                     &icmp6h, solicited_addr,
-                    inc_opt ? ND_OPT_TARGET_LL_ADDR : 0,
-                    ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS);
+                    inc_opt ? ND_OPT_TARGET_LL_ADDR : 0);
 }
 
 void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
@@ -564,8 +565,7 @@ void ndisc_send_ns(struct net_device *de
 
        __ndisc_send(dev, neigh, daddr, saddr,
                     &icmp6h, solicit,
-                    !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0,
-                    ICMP6_MIB_OUTNEIGHBORSOLICITS);
+                    !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0);
 }
 
 void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
@@ -599,8 +599,7 @@ void ndisc_send_rs(struct net_device *de
 #endif
        __ndisc_send(dev, NULL, daddr, saddr,
                     &icmp6h, NULL,
-                    send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0,
-                    ICMP6_MIB_OUTROUTERSOLICITS);
+                    send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0);
 }
 
 
@@ -1454,7 +1453,7 @@ void ndisc_send_redirect(struct sk_buff 
        IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
        err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, 
dst_output);
        if (!err) {
-               ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS);
+               ICMP6MSGOUT_INC_STATS(NDISC_REDIRECT);
                ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
        }
 
diff -ruNp linux-2.6.22.5/net/ipv6/proc.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/proc.c
--- linux-2.6.22.5/net/ipv6/proc.c      2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/proc.c    2007-09-10 
15:15:31.000000000 -0700
@@ -85,47 +85,33 @@ static struct snmp_mib snmp6_ipstats_lis
 };
 
 static struct snmp_mib snmp6_icmp6_list[] = {
-/* icmpv6 mib according to RFC 2466
-
-   Exceptions:  {In|Out}AdminProhibs are removed, because I see
-               no good reasons to account them separately
-               of another dest.unreachs.
-               OutErrs is zero identically.
-               OutEchos too.
-               OutRouterAdvertisements too.
-               OutGroupMembQueries too.
- */
+/* icmpv6 mib according to RFC 2466 */
        SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
        SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
-       SNMP_MIB_ITEM("Icmp6InDestUnreachs", ICMP6_MIB_INDESTUNREACHS),
-       SNMP_MIB_ITEM("Icmp6InPktTooBigs", ICMP6_MIB_INPKTTOOBIGS),
-       SNMP_MIB_ITEM("Icmp6InTimeExcds", ICMP6_MIB_INTIMEEXCDS),
-       SNMP_MIB_ITEM("Icmp6InParmProblems", ICMP6_MIB_INPARMPROBLEMS),
-       SNMP_MIB_ITEM("Icmp6InEchos", ICMP6_MIB_INECHOS),
-       SNMP_MIB_ITEM("Icmp6InEchoReplies", ICMP6_MIB_INECHOREPLIES),
-       SNMP_MIB_ITEM("Icmp6InGroupMembQueries", 
ICMP6_MIB_INGROUPMEMBQUERIES),
-       SNMP_MIB_ITEM("Icmp6InGroupMembResponses", 
ICMP6_MIB_INGROUPMEMBRESPONSES),
-       SNMP_MIB_ITEM("Icmp6InGroupMembReductions", 
ICMP6_MIB_INGROUPMEMBREDUCTIONS),
-       SNMP_MIB_ITEM("Icmp6InRouterSolicits", 
ICMP6_MIB_INROUTERSOLICITS),
-       SNMP_MIB_ITEM("Icmp6InRouterAdvertisements", 
ICMP6_MIB_INROUTERADVERTISEMENTS),
-       SNMP_MIB_ITEM("Icmp6InNeighborSolicits", 
ICMP6_MIB_INNEIGHBORSOLICITS),
-       SNMP_MIB_ITEM("Icmp6InNeighborAdvertisements", 
ICMP6_MIB_INNEIGHBORADVERTISEMENTS),
-       SNMP_MIB_ITEM("Icmp6InRedirects", ICMP6_MIB_INREDIRECTS),
        SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
-       SNMP_MIB_ITEM("Icmp6OutDestUnreachs", ICMP6_MIB_OUTDESTUNREACHS),
-       SNMP_MIB_ITEM("Icmp6OutPktTooBigs", ICMP6_MIB_OUTPKTTOOBIGS),
-       SNMP_MIB_ITEM("Icmp6OutTimeExcds", ICMP6_MIB_OUTTIMEEXCDS),
-       SNMP_MIB_ITEM("Icmp6OutParmProblems", ICMP6_MIB_OUTPARMPROBLEMS),
-       SNMP_MIB_ITEM("Icmp6OutEchoReplies", ICMP6_MIB_OUTECHOREPLIES),
-       SNMP_MIB_ITEM("Icmp6OutRouterSolicits", 
ICMP6_MIB_OUTROUTERSOLICITS),
-       SNMP_MIB_ITEM("Icmp6OutNeighborSolicits", 
ICMP6_MIB_OUTNEIGHBORSOLICITS),
-       SNMP_MIB_ITEM("Icmp6OutNeighborAdvertisements", 
ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS),
-       SNMP_MIB_ITEM("Icmp6OutRedirects", ICMP6_MIB_OUTREDIRECTS),
-       SNMP_MIB_ITEM("Icmp6OutGroupMembResponses", 
ICMP6_MIB_OUTGROUPMEMBRESPONSES),
-       SNMP_MIB_ITEM("Icmp6OutGroupMembReductions", 
ICMP6_MIB_OUTGROUPMEMBREDUCTIONS),
        SNMP_MIB_SENTINEL
 };
 
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility 
*/
+static char *icmp6type2name[256] = {
+       [ICMPV6_DEST_UNREACH] = "DestUnreachs",
+       [ICMPV6_PKT_TOOBIG] = "PktTooBigs",
+       [ICMPV6_TIME_EXCEED] = "TimeExcds",
+       [ICMPV6_PARAMPROB] = "ParmProblems",
+       [ICMPV6_ECHO_REQUEST] = "EchoRequest",
+       [ICMPV6_ECHO_REPLY] = "EchoReplies",
+       [ICMPV6_MGM_QUERY] = "GroupMembQueries",
+       [ICMPV6_MGM_REPORT] = "GroupMembResponses",
+       [ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
+       [ICMPV6_MLD2_REPORT] = "MLDv2Reports",
+       [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
+       [NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
+       [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
+       [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
+       [NDISC_REDIRECT] = "NeighborRedirects",
+};
+
+
 static struct snmp_mib snmp6_udp6_list[] = {
        SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
        SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
@@ -142,6 +128,40 @@ static struct snmp_mib snmp6_udplite6_li
        SNMP_MIB_SENTINEL
 };
 
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq)
+{
+       static char name[32];
+       int i;
+
+       /* print by name -- deprecated items */
+       for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+               int icmptype;
+               char *p;
+
+               icmptype = i & 0xff;
+               p = icmp6type2name[icmptype];
+               if (!p) /* don't print un-named types here */
+                       continue;
+               (void) snprintf(name, sizeof(name)-1, "Icmp6%s%s",
+                       i & 0x100 ? "Out" : "In", p);
+               seq_printf(seq, "%-32s\t%lu\n", name,
+                       snmp_fold_field((void **) icmpv6msg_statistics, 
i));
+       }
+
+       /* print by number (nonzero only) - ICMPMsgStat format */
+       for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+               unsigned long val;
+
+               val = snmp_fold_field((void **) icmpv6msg_statistics, i);
+               if (!val)
+                       continue;
+               (void) snprintf(name, sizeof(name)-1, "Icmp6%sType%u",
+                       i & 0x100 ?  "Out" : "In", i & 0xff);
+               seq_printf(seq, "%-32s\t%lu\n", name, val);
+       }
+       return;
+}
+
 static inline void
 snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib 
*itemlist)
 {
@@ -162,6 +182,7 @@ static int snmp6_seq_show(struct seq_fil
        } else {
                snmp6_seq_show_item(seq, (void **)ipv6_statistics, 
snmp6_ipstats_list);
                snmp6_seq_show_item(seq, (void **)icmpv6_statistics, 
snmp6_icmp6_list);
+               snmp6_seq_show_icmpv6msg(seq);
                snmp6_seq_show_item(seq, (void **)udp_stats_in6, 
snmp6_udp6_list);
                snmp6_seq_show_item(seq, (void **)udplite_stats_in6, 
snmp6_udplite6_list);
        }
diff -ruNp linux-2.6.22.5/net/ipv6/raw.c 
linux-2.6.22.5_ICMPv6MSG/net/ipv6/raw.c
--- linux-2.6.22.5/net/ipv6/raw.c       2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/raw.c     2007-09-10 
15:09:37.000000000 -0700
@@ -543,6 +543,11 @@ static int rawv6_push_pending_frames(str
        if (skb_store_bits(skb, offset, &csum, 2))
                BUG();
 
+       if (fl->proto == IPPROTO_ICMPV6) {
+               ICMP6_INC_STATS_BH(NULL, ICMP6_MIB_OUTMSGS);
+               ICMP6MSGOUT_INC_STATS_BH(icmp6_hdr(skb)->icmp6_type);
+       }
+
 send:
        err = ip6_push_pending_frames(sk);
 out:


[-- Attachment #2: icmpmsgmib6.patch --]
[-- Type: application/octet-stream, Size: 16082 bytes --]

diff -ruNp linux-2.6.22.5/include/linux/snmp.h linux-2.6.22.5_ICMPv6MSG/include/linux/snmp.h
--- linux-2.6.22.5/include/linux/snmp.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/include/linux/snmp.h	2007-09-10 15:02:43.000000000 -0700
@@ -91,35 +91,12 @@ enum
 	ICMP6_MIB_NUM = 0,
 	ICMP6_MIB_INMSGS,			/* InMsgs */
 	ICMP6_MIB_INERRORS,			/* InErrors */
-	ICMP6_MIB_INDESTUNREACHS,		/* InDestUnreachs */
-	ICMP6_MIB_INPKTTOOBIGS,			/* InPktTooBigs */
-	ICMP6_MIB_INTIMEEXCDS,			/* InTimeExcds */
-	ICMP6_MIB_INPARMPROBLEMS,		/* InParmProblems */
-	ICMP6_MIB_INECHOS,			/* InEchos */
-	ICMP6_MIB_INECHOREPLIES,		/* InEchoReplies */
-	ICMP6_MIB_INGROUPMEMBQUERIES,		/* InGroupMembQueries */
-	ICMP6_MIB_INGROUPMEMBRESPONSES,		/* InGroupMembResponses */
-	ICMP6_MIB_INGROUPMEMBREDUCTIONS,	/* InGroupMembReductions */
-	ICMP6_MIB_INROUTERSOLICITS,		/* InRouterSolicits */
-	ICMP6_MIB_INROUTERADVERTISEMENTS,	/* InRouterAdvertisements */
-	ICMP6_MIB_INNEIGHBORSOLICITS,		/* InNeighborSolicits */
-	ICMP6_MIB_INNEIGHBORADVERTISEMENTS,	/* InNeighborAdvertisements */
-	ICMP6_MIB_INREDIRECTS,			/* InRedirects */
 	ICMP6_MIB_OUTMSGS,			/* OutMsgs */
-	ICMP6_MIB_OUTDESTUNREACHS,		/* OutDestUnreachs */
-	ICMP6_MIB_OUTPKTTOOBIGS,		/* OutPktTooBigs */
-	ICMP6_MIB_OUTTIMEEXCDS,			/* OutTimeExcds */
-	ICMP6_MIB_OUTPARMPROBLEMS,		/* OutParmProblems */
-	ICMP6_MIB_OUTECHOREPLIES,		/* OutEchoReplies */
-	ICMP6_MIB_OUTROUTERSOLICITS,		/* OutRouterSolicits */
-	ICMP6_MIB_OUTNEIGHBORSOLICITS,		/* OutNeighborSolicits */
-	ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS,	/* OutNeighborAdvertisements */
-	ICMP6_MIB_OUTREDIRECTS,			/* OutRedirects */
-	ICMP6_MIB_OUTGROUPMEMBRESPONSES,	/* OutGroupMembResponses */
-	ICMP6_MIB_OUTGROUPMEMBREDUCTIONS,	/* OutGroupMembReductions */
 	__ICMP6_MIB_MAX
 };
 
+#define __ICMP6MSG_MIB_MAX 512 /* Out+In for all 8-bit ICMPv6 types */
+
 /* tcp mib definitions */
 /*
  * RFC 1213:  MIB-II TCP group
diff -ruNp linux-2.6.22.5/include/net/ipv6.h linux-2.6.22.5_ICMPv6MSG/include/net/ipv6.h
--- linux-2.6.22.5/include/net/ipv6.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/include/net/ipv6.h	2007-09-10 15:41:19.000000000 -0700
@@ -132,6 +132,7 @@ DECLARE_SNMP_STAT(struct ipstats_mib, ip
 	SNMP_INC_STATS_USER(ipv6_statistics, field);			\
 })
 DECLARE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+DECLARE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics);
 #define ICMP6_INC_STATS(idev, field)		({			\
 	struct inet6_dev *_idev = (idev);				\
 	if (likely(_idev != NULL))					\
@@ -157,6 +158,14 @@ DECLARE_SNMP_STAT(struct icmpv6_mib, icm
 		SNMP_INC_STATS_OFFSET_BH(_idev->stats.icmpv6, field, _offset);	\
 	SNMP_INC_STATS_OFFSET_BH(icmpv6_statistics, field, _offset);    	\
 })
+
+#define ICMP6MSGOUT_INC_STATS(field) SNMP_INC_STATS(icmpv6msg_statistics, field+256)
+#define ICMP6MSGOUT_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpv6msg_statistics, field+256)
+#define ICMP6MSGOUT_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpv6msg_statistics, field+256)
+#define ICMP6MSGIN_INC_STATS(field) SNMP_INC_STATS(icmpv6msg_statistics, field)
+#define ICMP6MSGIN_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpv6msg_statistics, field)
+#define ICMP6MSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpv6msg_statistics, field)
+
 DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6);
 DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
 #define UDP6_INC_STATS_BH(field, is_udplite) 			      do  {  \
diff -ruNp linux-2.6.22.5/include/net/snmp.h linux-2.6.22.5_ICMPv6MSG/include/net/snmp.h
--- linux-2.6.22.5/include/net/snmp.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/include/net/snmp.h	2007-09-10 15:00:18.000000000 -0700
@@ -88,6 +88,12 @@ struct icmpv6_mib {
 	unsigned long	mibs[ICMP6_MIB_MAX];
 } __SNMP_MIB_ALIGN__;
 
+#define ICMP6MSG_MIB_MAX  __ICMP6MSG_MIB_MAX
+struct icmpv6msg_mib {
+	unsigned long	mibs[ICMP6MSG_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
+
 /* TCP */
 #define TCP_MIB_MAX	__TCP_MIB_MAX
 struct tcp_mib {
diff -ruNp linux-2.6.22.5/net/ipv6/af_inet6.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/af_inet6.c
--- linux-2.6.22.5/net/ipv6/af_inet6.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/af_inet6.c	2007-09-10 14:19:57.000000000 -0700
@@ -719,6 +719,9 @@ static int __init init_ipv6_mibs(void)
 	if (snmp_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
 			  __alignof__(struct icmpv6_mib)) < 0)
 		goto err_icmp_mib;
+	if (snmp_mib_init((void **)icmpv6msg_statistics,
+	    sizeof (struct icmpv6msg_mib), __alignof__(struct icmpv6_mib)) < 0)
+		goto err_icmpmsg_mib;
 	if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
 			  __alignof__(struct udp_mib)) < 0)
 		goto err_udp_mib;
@@ -730,6 +733,8 @@ static int __init init_ipv6_mibs(void)
 err_udplite_mib:
 	snmp_mib_free((void **)udp_stats_in6);
 err_udp_mib:
+	snmp_mib_free((void **)icmpv6msg_statistics);
+err_icmpmsg_mib:
 	snmp_mib_free((void **)icmpv6_statistics);
 err_icmp_mib:
 	snmp_mib_free((void **)ipv6_statistics);
diff -ruNp linux-2.6.22.5/net/ipv6/icmp.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/icmp.c
--- linux-2.6.22.5/net/ipv6/icmp.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/icmp.c	2007-09-10 15:24:27.000000000 -0700
@@ -69,6 +69,8 @@
 
 DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics) __read_mostly;
 EXPORT_SYMBOL(icmpv6_statistics);
+DEFINE_SNMP_STAT(struct icmpv6msg_mib, icmpv6msg_statistics) __read_mostly;
+EXPORT_SYMBOL(icmpv6msg_statistics);
 
 /*
  *	The ICMP socket(s). This is the most convenient way to flow control
@@ -247,6 +249,7 @@ static int icmpv6_push_pending_frames(st
 						      len, fl->proto,
 						      tmp_csum);
 	}
+	ICMP6MSGOUT_INC_STATS_BH(icmp6h->icmp6_type);
 	ip6_push_pending_frames(sk);
 out:
 	return err;
@@ -456,8 +459,6 @@ void icmpv6_send(struct sk_buff *skb, in
 	}
 	err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr));
 
-	if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
-		ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_OUTDESTUNREACHS, type - ICMPV6_DEST_UNREACH);
 	ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
 
 out_put:
@@ -547,7 +548,6 @@ static void icmpv6_echo_reply(struct sk_
 	}
 	err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr));
 
-	ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTECHOREPLIES);
 	ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
 
 out_put:
@@ -656,10 +656,7 @@ static int icmpv6_rcv(struct sk_buff **p
 
 	type = hdr->icmp6_type;
 
-	if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB)
-		ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INDESTUNREACHS, type - ICMPV6_DEST_UNREACH);
-	else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT)
-		ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INECHOS, type - ICMPV6_ECHO_REQUEST);
+	ICMP6MSGIN_INC_STATS_BH(type);
 
 	switch (type) {
 	case ICMPV6_ECHO_REQUEST:
diff -ruNp linux-2.6.22.5/net/ipv6/mcast.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/mcast.c
--- linux-2.6.22.5/net/ipv6/mcast.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/mcast.c	2007-09-10 14:24:33.000000000 -0700
@@ -1478,6 +1478,7 @@ static void mld_sendpack(struct sk_buff 
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
 		mld_dev_queue_xmit);
 	if (!err) {
+		ICMP6MSGOUT_INC_STATS(ICMPV6_MLD2_REPORT);
 		ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
@@ -1821,10 +1822,7 @@ static void igmp6_send(struct in6_addr *
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
 		mld_dev_queue_xmit);
 	if (!err) {
-		if (type == ICMPV6_MGM_REDUCTION)
-			ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
-		else
-			ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBRESPONSES);
+		ICMP6MSGOUT_INC_STATS(type);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
 	} else
diff -ruNp linux-2.6.22.5/net/ipv6/ndisc.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/ndisc.c
--- linux-2.6.22.5/net/ipv6/ndisc.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/ndisc.c	2007-09-10 15:04:51.000000000 -0700
@@ -431,7 +431,7 @@ static void __ndisc_send(struct net_devi
 			 struct neighbour *neigh,
 			 struct in6_addr *daddr, struct in6_addr *saddr,
 			 struct icmp6hdr *icmp6h, struct in6_addr *target,
-			 int llinfo, int icmp6_mib_outnd)
+			 int llinfo)
 {
 	struct flowi fl;
 	struct dst_entry *dst;
@@ -441,9 +441,11 @@ static void __ndisc_send(struct net_devi
 	struct inet6_dev *idev;
 	int len;
 	int err;
-	u8 *opt;
+	u8 *opt, type;
+
+	type = icmp6h->icmp6_type;
 
-	ndisc_flow_init(&fl, icmp6h->icmp6_type, saddr, daddr,
+	ndisc_flow_init(&fl, type, saddr, daddr,
 			dev->ifindex);
 
 	dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output);
@@ -504,7 +506,7 @@ static void __ndisc_send(struct net_devi
 
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output);
 	if (!err) {
-		ICMP6_INC_STATS(idev, icmp6_mib_outnd);
+		ICMP6MSGOUT_INC_STATS(type);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
 	}
 
@@ -542,8 +544,7 @@ static void ndisc_send_na(struct net_dev
 
 	__ndisc_send(dev, neigh, daddr, src_addr,
 		     &icmp6h, solicited_addr,
-		     inc_opt ? ND_OPT_TARGET_LL_ADDR : 0,
-		     ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS);
+		     inc_opt ? ND_OPT_TARGET_LL_ADDR : 0);
 }
 
 void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh,
@@ -564,8 +565,7 @@ void ndisc_send_ns(struct net_device *de
 
 	__ndisc_send(dev, neigh, daddr, saddr,
 		     &icmp6h, solicit,
-		     !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0,
-		     ICMP6_MIB_OUTNEIGHBORSOLICITS);
+		     !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0);
 }
 
 void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr,
@@ -599,8 +599,7 @@ void ndisc_send_rs(struct net_device *de
 #endif
 	__ndisc_send(dev, NULL, daddr, saddr,
 		     &icmp6h, NULL,
-		     send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0,
-		     ICMP6_MIB_OUTROUTERSOLICITS);
+		     send_sllao ? ND_OPT_SOURCE_LL_ADDR : 0);
 }
 
 
@@ -1454,7 +1453,7 @@ void ndisc_send_redirect(struct sk_buff 
 	IP6_INC_STATS(idev, IPSTATS_MIB_OUTREQUESTS);
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output);
 	if (!err) {
-		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS);
+		ICMP6MSGOUT_INC_STATS(NDISC_REDIRECT);
 		ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS);
 	}
 
diff -ruNp linux-2.6.22.5/net/ipv6/proc.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/proc.c
--- linux-2.6.22.5/net/ipv6/proc.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/proc.c	2007-09-10 15:15:31.000000000 -0700
@@ -85,47 +85,33 @@ static struct snmp_mib snmp6_ipstats_lis
 };
 
 static struct snmp_mib snmp6_icmp6_list[] = {
-/* icmpv6 mib according to RFC 2466
-
-   Exceptions:  {In|Out}AdminProhibs are removed, because I see
-		no good reasons to account them separately
-		of another dest.unreachs.
-		OutErrs is zero identically.
-		OutEchos too.
-		OutRouterAdvertisements too.
-		OutGroupMembQueries too.
- */
+/* icmpv6 mib according to RFC 2466 */
 	SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
 	SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
-	SNMP_MIB_ITEM("Icmp6InDestUnreachs", ICMP6_MIB_INDESTUNREACHS),
-	SNMP_MIB_ITEM("Icmp6InPktTooBigs", ICMP6_MIB_INPKTTOOBIGS),
-	SNMP_MIB_ITEM("Icmp6InTimeExcds", ICMP6_MIB_INTIMEEXCDS),
-	SNMP_MIB_ITEM("Icmp6InParmProblems", ICMP6_MIB_INPARMPROBLEMS),
-	SNMP_MIB_ITEM("Icmp6InEchos", ICMP6_MIB_INECHOS),
-	SNMP_MIB_ITEM("Icmp6InEchoReplies", ICMP6_MIB_INECHOREPLIES),
-	SNMP_MIB_ITEM("Icmp6InGroupMembQueries", ICMP6_MIB_INGROUPMEMBQUERIES),
-	SNMP_MIB_ITEM("Icmp6InGroupMembResponses", ICMP6_MIB_INGROUPMEMBRESPONSES),
-	SNMP_MIB_ITEM("Icmp6InGroupMembReductions", ICMP6_MIB_INGROUPMEMBREDUCTIONS),
-	SNMP_MIB_ITEM("Icmp6InRouterSolicits", ICMP6_MIB_INROUTERSOLICITS),
-	SNMP_MIB_ITEM("Icmp6InRouterAdvertisements", ICMP6_MIB_INROUTERADVERTISEMENTS),
-	SNMP_MIB_ITEM("Icmp6InNeighborSolicits", ICMP6_MIB_INNEIGHBORSOLICITS),
-	SNMP_MIB_ITEM("Icmp6InNeighborAdvertisements", ICMP6_MIB_INNEIGHBORADVERTISEMENTS),
-	SNMP_MIB_ITEM("Icmp6InRedirects", ICMP6_MIB_INREDIRECTS),
 	SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
-	SNMP_MIB_ITEM("Icmp6OutDestUnreachs", ICMP6_MIB_OUTDESTUNREACHS),
-	SNMP_MIB_ITEM("Icmp6OutPktTooBigs", ICMP6_MIB_OUTPKTTOOBIGS),
-	SNMP_MIB_ITEM("Icmp6OutTimeExcds", ICMP6_MIB_OUTTIMEEXCDS),
-	SNMP_MIB_ITEM("Icmp6OutParmProblems", ICMP6_MIB_OUTPARMPROBLEMS),
-	SNMP_MIB_ITEM("Icmp6OutEchoReplies", ICMP6_MIB_OUTECHOREPLIES),
-	SNMP_MIB_ITEM("Icmp6OutRouterSolicits", ICMP6_MIB_OUTROUTERSOLICITS),
-	SNMP_MIB_ITEM("Icmp6OutNeighborSolicits", ICMP6_MIB_OUTNEIGHBORSOLICITS),
-	SNMP_MIB_ITEM("Icmp6OutNeighborAdvertisements", ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS),
-	SNMP_MIB_ITEM("Icmp6OutRedirects", ICMP6_MIB_OUTREDIRECTS),
-	SNMP_MIB_ITEM("Icmp6OutGroupMembResponses", ICMP6_MIB_OUTGROUPMEMBRESPONSES),
-	SNMP_MIB_ITEM("Icmp6OutGroupMembReductions", ICMP6_MIB_OUTGROUPMEMBREDUCTIONS),
 	SNMP_MIB_SENTINEL
 };
 
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+static char *icmp6type2name[256] = {
+	[ICMPV6_DEST_UNREACH] = "DestUnreachs",
+	[ICMPV6_PKT_TOOBIG] = "PktTooBigs",
+	[ICMPV6_TIME_EXCEED] = "TimeExcds",
+	[ICMPV6_PARAMPROB] = "ParmProblems",
+	[ICMPV6_ECHO_REQUEST] = "EchoRequest",
+	[ICMPV6_ECHO_REPLY] = "EchoReplies",
+	[ICMPV6_MGM_QUERY] = "GroupMembQueries",
+	[ICMPV6_MGM_REPORT] = "GroupMembResponses",
+	[ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
+	[ICMPV6_MLD2_REPORT] = "MLDv2Reports",
+	[NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
+	[NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
+	[NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
+	[NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
+	[NDISC_REDIRECT] = "NeighborRedirects",
+};
+
+
 static struct snmp_mib snmp6_udp6_list[] = {
 	SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
 	SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
@@ -142,6 +128,40 @@ static struct snmp_mib snmp6_udplite6_li
 	SNMP_MIB_SENTINEL
 };
 
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq)
+{
+	static char name[32];
+	int i;
+
+	/* print by name -- deprecated items */
+	for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+		int icmptype;
+		char *p;
+
+		icmptype = i & 0xff;
+		p = icmp6type2name[icmptype];
+		if (!p)	/* don't print un-named types here */
+			continue;
+		(void) snprintf(name, sizeof(name)-1, "Icmp6%s%s",
+			i & 0x100 ? "Out" : "In", p);
+		seq_printf(seq, "%-32s\t%lu\n", name,
+			snmp_fold_field((void **) icmpv6msg_statistics, i));
+	}
+
+	/* print by number (nonzero only) - ICMPMsgStat format */
+	for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+		unsigned long val;
+
+		val = snmp_fold_field((void **) icmpv6msg_statistics, i);
+		if (!val)
+			continue;
+		(void) snprintf(name, sizeof(name)-1, "Icmp6%sType%u",
+			i & 0x100 ?  "Out" : "In", i & 0xff);
+		seq_printf(seq, "%-32s\t%lu\n", name, val);
+	}
+	return;
+}
+
 static inline void
 snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist)
 {
@@ -162,6 +182,7 @@ static int snmp6_seq_show(struct seq_fil
 	} else {
 		snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
 		snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
+		snmp6_seq_show_icmpv6msg(seq);
 		snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
 		snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list);
 	}
diff -ruNp linux-2.6.22.5/net/ipv6/raw.c linux-2.6.22.5_ICMPv6MSG/net/ipv6/raw.c
--- linux-2.6.22.5/net/ipv6/raw.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPv6MSG/net/ipv6/raw.c	2007-09-10 15:09:37.000000000 -0700
@@ -543,6 +543,11 @@ static int rawv6_push_pending_frames(str
 	if (skb_store_bits(skb, offset, &csum, 2))
 		BUG();
 
+	if (fl->proto == IPPROTO_ICMPV6) {
+		ICMP6_INC_STATS_BH(NULL, ICMP6_MIB_OUTMSGS);
+		ICMP6MSGOUT_INC_STATS_BH(icmp6_hdr(skb)->icmp6_type);
+	}
+
 send:
 	err = ip6_push_pending_frames(sk);
 out:

^ permalink raw reply

* Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
From: Segher Boessenkool @ 2007-09-11  2:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Paul Mackerras, heiko.carstens, horms, Stefan Richter,
	Satyam Sharma, Linux Kernel Mailing List, David Miller,
	Paul E. McKenney, Ilpo Järvinen, ak, cfriesen, rpjday,
	Netdev, jesper.juhl, linux-arch, Andrew Morton, zlynx,
	schwidefsky, Chris Snook, Herbert Xu, Linus Torvalds, wensong,
	wjiang
In-Reply-To: <Pine.LNX.4.64.0709101157060.24491@schroedinger.engr.sgi.com>

>> "volatile" has nothing to do with reordering.  atomic_dec() writes
>> to memory, so it _does_ have "volatile semantics", implicitly, as
>> long as the compiler cannot optimise the atomic variable away
>> completely -- any store counts as a side effect.
>
> Stores can be reordered. Only x86 has (mostly) implicit write ordering.
> So no atomic_dec has no volatile semantics

Read again: I said the C "volatile" construct has nothing to do
with CPU memory access reordering.

> and may be reordered on a variety
> of processors. Writes to memory may not follow code order on several
> processors.

The _compiler_ isn't allowed to reorder things here.  Yes, of course
you do need stronger barriers for many purposes, volatile isn't all
that useful you know.


Segher

^ permalink raw reply

* [PATCH][1/2] Add ICMPMsgStats MIB (RFC 4293)
From: David Stevens @ 2007-09-11  2:25 UTC (permalink / raw)
  To: davem, yoshfuji; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 18992 bytes --]

Background: RFC 4293 deprecates existing individual, named ICMP
type counters to be replaced with the ICMPMsgStatsTable. This table
includes entries for both IPv4 and IPv6, and requires counting of all
ICMP types, whether or not the machine implements the type.

These patches "remove" (but not really) the existing counters, and
replace them with the ICMPMsgStats tables for v4 and v6.
It includes the named counters in the /proc places they were, but gets the
values for them from the new tables. It also counts packets generated
from raw socket output (e.g., OutEchoes, MLD queries, RA's from
radvd, etc).

Changes:
1) create icmpmsg_statistics mib
2) create icmpv6msg_statistics mib
3) modify existing counters to use these
4) modify /proc/net/snmp to add "IcmpMsg" with all ICMP types
        listed by number for easy SNMP parsing
5) modify /proc/net/snmp printing for "Icmp" to get the named data
        from new counters.

IPv4 patch attached, IPv6 patch to follow.

                                        +-DLS

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruNp linux-2.6.22.5/include/linux/snmp.h 
linux-2.6.22.5_ICMPMSG/include/linux/snmp.h
--- linux-2.6.22.5/include/linux/snmp.h 2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/include/linux/snmp.h 2007-08-23 
15:32:29.000000000 -0700
@@ -82,6 +82,8 @@ enum
        __ICMP_MIB_MAX
 };
 
+#define __ICMPMSG_MIB_MAX 512  /* Out+In for all 8-bit ICMP types */
+
 /* icmp6 mib definitions */
 /*
  * RFC 2466:  ICMPv6-MIB
diff -ruNp linux-2.6.22.5/include/net/icmp.h 
linux-2.6.22.5_ICMPMSG/include/net/icmp.h
--- linux-2.6.22.5/include/net/icmp.h   2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/include/net/icmp.h   2007-08-23 
15:56:45.000000000 -0700
@@ -30,9 +30,16 @@ struct icmp_err {
 
 extern struct icmp_err icmp_err_convert[];
 DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DECLARE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics);
 #define ICMP_INC_STATS(field)          SNMP_INC_STATS(icmp_statistics, 
field)
 #define ICMP_INC_STATS_BH(field)       SNMP_INC_STATS_BH(icmp_statistics, 
field)
 #define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, 
field)
+#define ICMPMSGOUT_INC_STATS(field)    SNMP_INC_STATS(icmpmsg_statistics, 
field+256)
+#define ICMPMSGOUT_INC_STATS_BH(field) 
SNMP_INC_STATS_BH(icmpmsg_statistics, field+256)
+#define ICMPMSGOUT_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(icmpmsg_statistics, field+256)
+#define ICMPMSGIN_INC_STATS(field)     SNMP_INC_STATS(icmpmsg_statistics, 
field)
+#define ICMPMSGIN_INC_STATS_BH(field) 
SNMP_INC_STATS_BH(icmpmsg_statistics, field)
+#define ICMPMSGIN_INC_STATS_USER(field) 
SNMP_INC_STATS_USER(icmpmsg_statistics, field)
 
 struct dst_entry;
 struct net_proto_family;
@@ -42,6 +49,7 @@ extern void   icmp_send(struct sk_buff *sk
 extern int     icmp_rcv(struct sk_buff *skb);
 extern int     icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern void    icmp_init(struct net_proto_family *ops);
+extern void    icmp_out_count(unsigned char type);
 
 /* Move into dst.h ? */
 extern int     xrlim_allow(struct dst_entry *dst, int timeout);
diff -ruNp linux-2.6.22.5/include/net/snmp.h 
linux-2.6.22.5_ICMPMSG/include/net/snmp.h
--- linux-2.6.22.5/include/net/snmp.h   2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/include/net/snmp.h   2007-08-23 
14:42:50.000000000 -0700
@@ -82,6 +82,11 @@ struct icmp_mib {
        unsigned long   mibs[ICMP_MIB_MAX];
 } __SNMP_MIB_ALIGN__;
 
+#define ICMPMSG_MIB_MAX        __ICMPMSG_MIB_MAX
+struct icmpmsg_mib {
+       unsigned long   mibs[ICMPMSG_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
 /* ICMP6 (IPv6-ICMP) */
 #define ICMP6_MIB_MAX  __ICMP6_MIB_MAX
 struct icmpv6_mib {
diff -ruNp linux-2.6.22.5/net/ipv4/af_inet.c 
linux-2.6.22.5_ICMPMSG/net/ipv4/af_inet.c
--- linux-2.6.22.5/net/ipv4/af_inet.c   2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/af_inet.c   2007-08-23 
14:47:26.000000000 -0700
@@ -1296,6 +1296,10 @@ static int __init init_ipv4_mibs(void)
                          sizeof(struct icmp_mib),
                          __alignof__(struct icmp_mib)) < 0)
                goto err_icmp_mib;
+       if (snmp_mib_init((void **)icmpmsg_statistics,
+                         sizeof(struct icmpmsg_mib),
+                         __alignof__(struct icmpmsg_mib)) < 0)
+               goto err_icmpmsg_mib;
        if (snmp_mib_init((void **)tcp_statistics,
                          sizeof(struct tcp_mib),
                          __alignof__(struct tcp_mib)) < 0)
@@ -1318,6 +1322,8 @@ err_udplite_mib:
 err_udp_mib:
        snmp_mib_free((void **)tcp_statistics);
 err_tcp_mib:
+       snmp_mib_free((void **)icmpmsg_statistics);
+err_icmpmsg_mib:
        snmp_mib_free((void **)icmp_statistics);
 err_icmp_mib:
        snmp_mib_free((void **)ip_statistics);
diff -ruNp linux-2.6.22.5/net/ipv4/icmp.c 
linux-2.6.22.5_ICMPMSG/net/ipv4/icmp.c
--- linux-2.6.22.5/net/ipv4/icmp.c      2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/icmp.c      2007-08-23 
15:57:07.000000000 -0700
@@ -115,6 +115,7 @@ struct icmp_bxm {
  *     Statistics
  */
 DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
+DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly;
 
 /* An array of errno for error messages from dest unreach. */
 /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED 
MUST be considered 'transient errs'. */
@@ -214,8 +215,6 @@ int sysctl_icmp_errors_use_inbound_ifadd
  */
 
 struct icmp_control {
-       int output_entry;       /* Field for increment on output */
-       int input_entry;        /* Field for increment on input */
        void (*handler)(struct sk_buff *skb);
        short   error;          /* This ICMP is classed as an error 
message */
 };
@@ -316,12 +315,10 @@ out:
 /*
  *     Maintain the counters used in the SNMP statistics for outgoing 
ICMP
  */
-static void icmp_out_count(int type)
+void icmp_out_count(unsigned char type)
 {
-       if (type <= NR_ICMP_TYPES) {
-               ICMP_INC_STATS(icmp_pointers[type].output_entry);
-               ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
-       }
+       ICMPMSGOUT_INC_STATS(type);
+       ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
 }
 
 /*
@@ -390,7 +387,6 @@ static void icmp_reply(struct icmp_bxm *
                return;
 
        icmp_param->data.icmph.checksum = 0;
-       icmp_out_count(icmp_param->data.icmph.type);
 
        inet->tos = ip_hdr(skb)->tos;
        daddr = ipc.addr = rt->rt_src;
@@ -952,6 +948,7 @@ int icmp_rcv(struct sk_buff *skb)
 
        icmph = icmp_hdr(skb);
 
+       ICMPMSGIN_INC_STATS_BH(icmph->type);
        /*
         *      18 is the highest 'known' ICMP type. Anything else is a 
mystery
         *
@@ -986,7 +983,6 @@ int icmp_rcv(struct sk_buff *skb)
                }
        }
 
-       ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
        icmp_pointers[icmph->type].handler(skb);
 
 drop:
@@ -1002,109 +998,71 @@ error:
  */
 static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
        [ICMP_ECHOREPLY] = {
-               .output_entry = ICMP_MIB_OUTECHOREPS,
-               .input_entry = ICMP_MIB_INECHOREPS,
                .handler = icmp_discard,
        },
        [1] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [2] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [ICMP_DEST_UNREACH] = {
-               .output_entry = ICMP_MIB_OUTDESTUNREACHS,
-               .input_entry = ICMP_MIB_INDESTUNREACHS,
                .handler = icmp_unreach,
                .error = 1,
        },
        [ICMP_SOURCE_QUENCH] = {
-               .output_entry = ICMP_MIB_OUTSRCQUENCHS,
-               .input_entry = ICMP_MIB_INSRCQUENCHS,
                .handler = icmp_unreach,
                .error = 1,
        },
        [ICMP_REDIRECT] = {
-               .output_entry = ICMP_MIB_OUTREDIRECTS,
-               .input_entry = ICMP_MIB_INREDIRECTS,
                .handler = icmp_redirect,
                .error = 1,
        },
        [6] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [7] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [ICMP_ECHO] = {
-               .output_entry = ICMP_MIB_OUTECHOS,
-               .input_entry = ICMP_MIB_INECHOS,
                .handler = icmp_echo,
        },
        [9] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [10] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_INERRORS,
                .handler = icmp_discard,
                .error = 1,
        },
        [ICMP_TIME_EXCEEDED] = {
-               .output_entry = ICMP_MIB_OUTTIMEEXCDS,
-               .input_entry = ICMP_MIB_INTIMEEXCDS,
                .handler = icmp_unreach,
                .error = 1,
        },
        [ICMP_PARAMETERPROB] = {
-               .output_entry = ICMP_MIB_OUTPARMPROBS,
-               .input_entry = ICMP_MIB_INPARMPROBS,
                .handler = icmp_unreach,
                .error = 1,
        },
        [ICMP_TIMESTAMP] = {
-               .output_entry = ICMP_MIB_OUTTIMESTAMPS,
-               .input_entry = ICMP_MIB_INTIMESTAMPS,
                .handler = icmp_timestamp,
        },
        [ICMP_TIMESTAMPREPLY] = {
-               .output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
-               .input_entry = ICMP_MIB_INTIMESTAMPREPS,
                .handler = icmp_discard,
        },
        [ICMP_INFO_REQUEST] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_DUMMY,
                .handler = icmp_discard,
        },
        [ICMP_INFO_REPLY] = {
-               .output_entry = ICMP_MIB_DUMMY,
-               .input_entry = ICMP_MIB_DUMMY,
                .handler = icmp_discard,
        },
        [ICMP_ADDRESS] = {
-               .output_entry = ICMP_MIB_OUTADDRMASKS,
-               .input_entry = ICMP_MIB_INADDRMASKS,
                .handler = icmp_address,
        },
        [ICMP_ADDRESSREPLY] = {
-               .output_entry = ICMP_MIB_OUTADDRMASKREPS,
-               .input_entry = ICMP_MIB_INADDRMASKREPS,
                .handler = icmp_address_reply,
        },
 };
@@ -1146,4 +1104,5 @@ void __init icmp_init(struct net_proto_f
 EXPORT_SYMBOL(icmp_err_convert);
 EXPORT_SYMBOL(icmp_send);
 EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(icmpmsg_statistics);
 EXPORT_SYMBOL(xrlim_allow);
diff -ruNp linux-2.6.22.5/net/ipv4/ip_output.c 
linux-2.6.22.5_ICMPMSG/net/ipv4/ip_output.c
--- linux-2.6.22.5/net/ipv4/ip_output.c 2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/ip_output.c 2007-08-23 
15:54:45.000000000 -0700
@@ -1258,6 +1258,10 @@ int ip_push_pending_frames(struct sock *
        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);
 
+       if (iph->protocol == IPPROTO_ICMP)
+               icmp_out_count(((struct icmphdr *)
+                       skb_transport_header(skb))->type);
+
        /* Netfilter gets whole the not fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
diff -ruNp linux-2.6.22.5/net/ipv4/proc.c 
linux-2.6.22.5_ICMPMSG/net/ipv4/proc.c
--- linux-2.6.22.5/net/ipv4/proc.c      2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/proc.c      2007-08-23 
15:58:42.000000000 -0700
@@ -123,33 +123,30 @@ static const struct snmp_mib snmp4_ipext
 static const struct snmp_mib snmp4_icmp_list[] = {
        SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
        SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
-       SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
-       SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
-       SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
-       SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
-       SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
-       SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
-       SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
-       SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
-       SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
-       SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
-       SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
        SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
        SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
-       SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
-       SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
-       SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
-       SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
-       SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
-       SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
-       SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
-       SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
-       SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
-       SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
-       SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
        SNMP_MIB_SENTINEL
 };
 
+static struct {
+       char *name;
+       int index;
+} icmpmibmap[] = {
+       { "DestUnreachs", ICMP_DEST_UNREACH },
+       { "TimeExcds", ICMP_TIME_EXCEEDED },
+       { "ParmProbs", ICMP_PARAMETERPROB },
+       { "SrcQuenchs", ICMP_SOURCE_QUENCH },
+       { "Redirects", ICMP_REDIRECT },
+       { "Echos", ICMP_ECHO },
+       { "EchoReps", ICMP_ECHOREPLY },
+       { "Timestamps", ICMP_TIMESTAMP },
+       { "TimestampReps", ICMP_TIMESTAMPREPLY },
+       { "AddrMasks", ICMP_ADDRESS },
+       { "AddrMaskReps", ICMP_ADDRESSREPLY },
+       { 0, 0 }
+};
+
+
 static const struct snmp_mib snmp4_tcp_list[] = {
        SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
        SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
@@ -247,6 +244,72 @@ static const struct snmp_mib snmp4_net_l
        SNMP_MIB_SENTINEL
 };
 
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE        16
+
+       int j, i, count;
+       static int out[PERLINE];
+
+       count = 0;
+       for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+
+               if (snmp_fold_field((void **) icmpmsg_statistics, i))
+                       out[count++] = i;
+               if (count < PERLINE)
+                       continue;
+
+               seq_printf(seq, "\nIcmpMsg:");
+               for (j = 0; j < PERLINE; ++j)
+                       seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : 
"In",
+                                       i & 0xff);
+               seq_printf(seq, "\nIcmpMsg: ");
+               for (j = 0; j < PERLINE; ++j)
+                       seq_printf(seq, " %lu",
+                               snmp_fold_field((void **) 
icmpmsg_statistics,
+                               out[j]));
+               seq_putc(seq, '\n');
+       }
+       if (count) {
+               seq_printf(seq, "\nIcmpMsg:");
+               for (j = 0; j < count; ++j)
+                       seq_printf(seq, " %sType%u", out[j] & 0x100 ? 
"Out" :
+                               "In", out[j] & 0xff);
+               seq_printf(seq, "\nIcmpMsg:");
+               for (j = 0; j < count; ++j)
+                       seq_printf(seq, " %lu", snmp_fold_field((void **)
+                               icmpmsg_statistics, out[j]));
+       }
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+       int i;
+
+       seq_puts(seq, "\nIcmp: InMsgs InErrors");
+       for (i=0; icmpmibmap[i].name != NULL; i++)
+               seq_printf(seq, " In%s", icmpmibmap[i].name);
+       seq_printf(seq, " OutMsgs OutErrors");
+       for (i=0; icmpmibmap[i].name != NULL; i++)
+               seq_printf(seq, " Out%s", icmpmibmap[i].name);
+       seq_printf(seq, "\nIcmp: %lu %lu",
+               snmp_fold_field((void **) icmp_statistics, 
ICMP_MIB_INMSGS),
+               snmp_fold_field((void **) icmp_statistics, 
ICMP_MIB_INERRORS));
+       for (i=0; icmpmibmap[i].name != NULL; i++)
+               seq_printf(seq, " %lu",
+                       snmp_fold_field((void **) icmpmsg_statistics,
+                               icmpmibmap[i].index));
+       seq_printf(seq, " %lu %lu",
+               snmp_fold_field((void **) icmp_statistics, 
ICMP_MIB_OUTMSGS),
+               snmp_fold_field((void **) icmp_statistics, 
ICMP_MIB_OUTERRORS));
+       for (i=0; icmpmibmap[i].name != NULL; i++)
+               seq_printf(seq, " %lu",
+                       snmp_fold_field((void **) icmpmsg_statistics,
+                               icmpmibmap[i].index));
+}
+
 /*
  *     Called from the PROCfs module. This outputs /proc/net/snmp.
  */
@@ -267,15 +330,8 @@ static int snmp_seq_show(struct seq_file
                           snmp_fold_field((void **)ip_statistics,
                                           snmp4_ipstats_list[i].entry));
 
-       seq_puts(seq, "\nIcmp:");
-       for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
-               seq_printf(seq, " %s", snmp4_icmp_list[i].name);
-
-       seq_puts(seq, "\nIcmp:");
-       for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
-               seq_printf(seq, " %lu",
-                          snmp_fold_field((void **)icmp_statistics,
-                                          snmp4_icmp_list[i].entry));
+       icmp_put(seq);  /* RFC 2011 compatibility */
+       icmpmsg_put(seq);
 
        seq_puts(seq, "\nTcp:");
        for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
@@ -332,6 +388,8 @@ static const struct file_operations snmp
        .release = single_release,
 };
 
+
+
 /*
  *     Output /proc/net/netstat
  */
diff -ruNp linux-2.6.22.5/net/ipv4/raw.c 
linux-2.6.22.5_ICMPMSG/net/ipv4/raw.c
--- linux-2.6.22.5/net/ipv4/raw.c       2007-08-22 16:23:54.000000000 
-0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/raw.c       2007-08-23 
15:54:12.000000000 -0700
@@ -313,6 +313,9 @@ static int raw_send_hdrinc(struct sock *
 
                iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
        }
+       if (iph->protocol == IPPROTO_ICMP)
+               icmp_out_count(((struct icmphdr *)
+                       skb_transport_header(skb))->type);
 
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                      dst_output);


[-- Attachment #2: icmpmsgmib4.patch --]
[-- Type: application/octet-stream, Size: 14936 bytes --]

diff -ruNp linux-2.6.22.5/include/linux/snmp.h linux-2.6.22.5_ICMPMSG/include/linux/snmp.h
--- linux-2.6.22.5/include/linux/snmp.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/include/linux/snmp.h	2007-08-23 15:32:29.000000000 -0700
@@ -82,6 +82,8 @@ enum
 	__ICMP_MIB_MAX
 };
 
+#define __ICMPMSG_MIB_MAX 512	/* Out+In for all 8-bit ICMP types */
+
 /* icmp6 mib definitions */
 /*
  * RFC 2466:  ICMPv6-MIB
diff -ruNp linux-2.6.22.5/include/net/icmp.h linux-2.6.22.5_ICMPMSG/include/net/icmp.h
--- linux-2.6.22.5/include/net/icmp.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/include/net/icmp.h	2007-08-23 15:56:45.000000000 -0700
@@ -30,9 +30,16 @@ struct icmp_err {
 
 extern struct icmp_err icmp_err_convert[];
 DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+DECLARE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics);
 #define ICMP_INC_STATS(field)		SNMP_INC_STATS(icmp_statistics, field)
 #define ICMP_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmp_statistics, field)
 #define ICMP_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmp_statistics, field)
+#define ICMPMSGOUT_INC_STATS(field)	SNMP_INC_STATS(icmpmsg_statistics, field+256)
+#define ICMPMSGOUT_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmpmsg_statistics, field+256)
+#define ICMPMSGOUT_INC_STATS_USER(field) 	SNMP_INC_STATS_USER(icmpmsg_statistics, field+256)
+#define ICMPMSGIN_INC_STATS(field)	SNMP_INC_STATS(icmpmsg_statistics, field)
+#define ICMPMSGIN_INC_STATS_BH(field)	SNMP_INC_STATS_BH(icmpmsg_statistics, field)
+#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field)
 
 struct dst_entry;
 struct net_proto_family;
@@ -42,6 +49,7 @@ extern void	icmp_send(struct sk_buff *sk
 extern int	icmp_rcv(struct sk_buff *skb);
 extern int	icmp_ioctl(struct sock *sk, int cmd, unsigned long arg);
 extern void	icmp_init(struct net_proto_family *ops);
+extern void	icmp_out_count(unsigned char type);
 
 /* Move into dst.h ? */
 extern int 	xrlim_allow(struct dst_entry *dst, int timeout);
diff -ruNp linux-2.6.22.5/include/net/snmp.h linux-2.6.22.5_ICMPMSG/include/net/snmp.h
--- linux-2.6.22.5/include/net/snmp.h	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/include/net/snmp.h	2007-08-23 14:42:50.000000000 -0700
@@ -82,6 +82,11 @@ struct icmp_mib {
 	unsigned long	mibs[ICMP_MIB_MAX];
 } __SNMP_MIB_ALIGN__;
 
+#define ICMPMSG_MIB_MAX	__ICMPMSG_MIB_MAX
+struct icmpmsg_mib {
+	unsigned long	mibs[ICMPMSG_MIB_MAX];
+} __SNMP_MIB_ALIGN__;
+
 /* ICMP6 (IPv6-ICMP) */
 #define ICMP6_MIB_MAX	__ICMP6_MIB_MAX
 struct icmpv6_mib {
diff -ruNp linux-2.6.22.5/net/ipv4/af_inet.c linux-2.6.22.5_ICMPMSG/net/ipv4/af_inet.c
--- linux-2.6.22.5/net/ipv4/af_inet.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/af_inet.c	2007-08-23 14:47:26.000000000 -0700
@@ -1296,6 +1296,10 @@ static int __init init_ipv4_mibs(void)
 			  sizeof(struct icmp_mib),
 			  __alignof__(struct icmp_mib)) < 0)
 		goto err_icmp_mib;
+	if (snmp_mib_init((void **)icmpmsg_statistics,
+			  sizeof(struct icmpmsg_mib),
+			  __alignof__(struct icmpmsg_mib)) < 0)
+		goto err_icmpmsg_mib;
 	if (snmp_mib_init((void **)tcp_statistics,
 			  sizeof(struct tcp_mib),
 			  __alignof__(struct tcp_mib)) < 0)
@@ -1318,6 +1322,8 @@ err_udplite_mib:
 err_udp_mib:
 	snmp_mib_free((void **)tcp_statistics);
 err_tcp_mib:
+	snmp_mib_free((void **)icmpmsg_statistics);
+err_icmpmsg_mib:
 	snmp_mib_free((void **)icmp_statistics);
 err_icmp_mib:
 	snmp_mib_free((void **)ip_statistics);
diff -ruNp linux-2.6.22.5/net/ipv4/icmp.c linux-2.6.22.5_ICMPMSG/net/ipv4/icmp.c
--- linux-2.6.22.5/net/ipv4/icmp.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/icmp.c	2007-08-23 15:57:07.000000000 -0700
@@ -115,6 +115,7 @@ struct icmp_bxm {
  *	Statistics
  */
 DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly;
+DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly;
 
 /* An array of errno for error messages from dest unreach. */
 /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
@@ -214,8 +215,6 @@ int sysctl_icmp_errors_use_inbound_ifadd
  */
 
 struct icmp_control {
-	int output_entry;	/* Field for increment on output */
-	int input_entry;	/* Field for increment on input */
 	void (*handler)(struct sk_buff *skb);
 	short   error;		/* This ICMP is classed as an error message */
 };
@@ -316,12 +315,10 @@ out:
 /*
  *	Maintain the counters used in the SNMP statistics for outgoing ICMP
  */
-static void icmp_out_count(int type)
+void icmp_out_count(unsigned char type)
 {
-	if (type <= NR_ICMP_TYPES) {
-		ICMP_INC_STATS(icmp_pointers[type].output_entry);
-		ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
-	}
+	ICMPMSGOUT_INC_STATS(type);
+	ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
 }
 
 /*
@@ -390,7 +387,6 @@ static void icmp_reply(struct icmp_bxm *
 		return;
 
 	icmp_param->data.icmph.checksum = 0;
-	icmp_out_count(icmp_param->data.icmph.type);
 
 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = rt->rt_src;
@@ -952,6 +948,7 @@ int icmp_rcv(struct sk_buff *skb)
 
 	icmph = icmp_hdr(skb);
 
+	ICMPMSGIN_INC_STATS_BH(icmph->type);
 	/*
 	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
 	 *
@@ -986,7 +983,6 @@ int icmp_rcv(struct sk_buff *skb)
 		}
 	}
 
-	ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
 	icmp_pointers[icmph->type].handler(skb);
 
 drop:
@@ -1002,109 +998,71 @@ error:
  */
 static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
 	[ICMP_ECHOREPLY] = {
-		.output_entry = ICMP_MIB_OUTECHOREPS,
-		.input_entry = ICMP_MIB_INECHOREPS,
 		.handler = icmp_discard,
 	},
 	[1] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[2] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[ICMP_DEST_UNREACH] = {
-		.output_entry = ICMP_MIB_OUTDESTUNREACHS,
-		.input_entry = ICMP_MIB_INDESTUNREACHS,
 		.handler = icmp_unreach,
 		.error = 1,
 	},
 	[ICMP_SOURCE_QUENCH] = {
-		.output_entry = ICMP_MIB_OUTSRCQUENCHS,
-		.input_entry = ICMP_MIB_INSRCQUENCHS,
 		.handler = icmp_unreach,
 		.error = 1,
 	},
 	[ICMP_REDIRECT] = {
-		.output_entry = ICMP_MIB_OUTREDIRECTS,
-		.input_entry = ICMP_MIB_INREDIRECTS,
 		.handler = icmp_redirect,
 		.error = 1,
 	},
 	[6] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[7] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[ICMP_ECHO] = {
-		.output_entry = ICMP_MIB_OUTECHOS,
-		.input_entry = ICMP_MIB_INECHOS,
 		.handler = icmp_echo,
 	},
 	[9] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[10] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_INERRORS,
 		.handler = icmp_discard,
 		.error = 1,
 	},
 	[ICMP_TIME_EXCEEDED] = {
-		.output_entry = ICMP_MIB_OUTTIMEEXCDS,
-		.input_entry = ICMP_MIB_INTIMEEXCDS,
 		.handler = icmp_unreach,
 		.error = 1,
 	},
 	[ICMP_PARAMETERPROB] = {
-		.output_entry = ICMP_MIB_OUTPARMPROBS,
-		.input_entry = ICMP_MIB_INPARMPROBS,
 		.handler = icmp_unreach,
 		.error = 1,
 	},
 	[ICMP_TIMESTAMP] = {
-		.output_entry = ICMP_MIB_OUTTIMESTAMPS,
-		.input_entry = ICMP_MIB_INTIMESTAMPS,
 		.handler = icmp_timestamp,
 	},
 	[ICMP_TIMESTAMPREPLY] = {
-		.output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
-		.input_entry = ICMP_MIB_INTIMESTAMPREPS,
 		.handler = icmp_discard,
 	},
 	[ICMP_INFO_REQUEST] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_DUMMY,
 		.handler = icmp_discard,
 	},
 	[ICMP_INFO_REPLY] = {
-		.output_entry = ICMP_MIB_DUMMY,
-		.input_entry = ICMP_MIB_DUMMY,
 		.handler = icmp_discard,
 	},
 	[ICMP_ADDRESS] = {
-		.output_entry = ICMP_MIB_OUTADDRMASKS,
-		.input_entry = ICMP_MIB_INADDRMASKS,
 		.handler = icmp_address,
 	},
 	[ICMP_ADDRESSREPLY] = {
-		.output_entry = ICMP_MIB_OUTADDRMASKREPS,
-		.input_entry = ICMP_MIB_INADDRMASKREPS,
 		.handler = icmp_address_reply,
 	},
 };
@@ -1146,4 +1104,5 @@ void __init icmp_init(struct net_proto_f
 EXPORT_SYMBOL(icmp_err_convert);
 EXPORT_SYMBOL(icmp_send);
 EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(icmpmsg_statistics);
 EXPORT_SYMBOL(xrlim_allow);
diff -ruNp linux-2.6.22.5/net/ipv4/ip_output.c linux-2.6.22.5_ICMPMSG/net/ipv4/ip_output.c
--- linux-2.6.22.5/net/ipv4/ip_output.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/ip_output.c	2007-08-23 15:54:45.000000000 -0700
@@ -1258,6 +1258,10 @@ int ip_push_pending_frames(struct sock *
 	skb->priority = sk->sk_priority;
 	skb->dst = dst_clone(&rt->u.dst);
 
+	if (iph->protocol == IPPROTO_ICMP)
+		icmp_out_count(((struct icmphdr *)
+			skb_transport_header(skb))->type);
+
 	/* Netfilter gets whole the not fragmented skb. */
 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
 		      skb->dst->dev, dst_output);
diff -ruNp linux-2.6.22.5/net/ipv4/proc.c linux-2.6.22.5_ICMPMSG/net/ipv4/proc.c
--- linux-2.6.22.5/net/ipv4/proc.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/proc.c	2007-08-23 15:58:42.000000000 -0700
@@ -123,33 +123,30 @@ static const struct snmp_mib snmp4_ipext
 static const struct snmp_mib snmp4_icmp_list[] = {
 	SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
 	SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
-	SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
-	SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
-	SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
-	SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
-	SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
-	SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
-	SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
-	SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
-	SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
-	SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
-	SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
 	SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
 	SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
-	SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
-	SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
-	SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
-	SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
-	SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
-	SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
-	SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
-	SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
-	SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
-	SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
-	SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
 	SNMP_MIB_SENTINEL
 };
 
+static struct {
+	char *name;
+	int index;
+} icmpmibmap[] = {
+	{ "DestUnreachs", ICMP_DEST_UNREACH },
+	{ "TimeExcds", ICMP_TIME_EXCEEDED },
+	{ "ParmProbs", ICMP_PARAMETERPROB },
+	{ "SrcQuenchs", ICMP_SOURCE_QUENCH },
+	{ "Redirects", ICMP_REDIRECT },
+	{ "Echos", ICMP_ECHO },
+	{ "EchoReps", ICMP_ECHOREPLY },
+	{ "Timestamps", ICMP_TIMESTAMP },
+	{ "TimestampReps", ICMP_TIMESTAMPREPLY },
+	{ "AddrMasks", ICMP_ADDRESS },
+	{ "AddrMaskReps", ICMP_ADDRESSREPLY },
+	{ 0, 0 }
+};
+
+
 static const struct snmp_mib snmp4_tcp_list[] = {
 	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
 	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
@@ -247,6 +244,72 @@ static const struct snmp_mib snmp4_net_l
 	SNMP_MIB_SENTINEL
 };
 
+static void icmpmsg_put(struct seq_file *seq)
+{
+#define PERLINE	16
+
+	int j, i, count;
+	static int out[PERLINE];
+
+	count = 0;
+	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
+
+		if (snmp_fold_field((void **) icmpmsg_statistics, i))
+			out[count++] = i;
+		if (count < PERLINE)
+			continue;
+
+		seq_printf(seq, "\nIcmpMsg:");
+		for (j = 0; j < PERLINE; ++j)
+			seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In",
+					i & 0xff);
+		seq_printf(seq, "\nIcmpMsg: ");
+		for (j = 0; j < PERLINE; ++j)
+			seq_printf(seq, " %lu",
+				snmp_fold_field((void **) icmpmsg_statistics,
+				out[j]));
+		seq_putc(seq, '\n');
+	}
+	if (count) {
+		seq_printf(seq, "\nIcmpMsg:");
+		for (j = 0; j < count; ++j)
+			seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" :
+				"In", out[j] & 0xff);
+		seq_printf(seq, "\nIcmpMsg:");
+		for (j = 0; j < count; ++j)
+			seq_printf(seq, " %lu", snmp_fold_field((void **)
+				icmpmsg_statistics, out[j]));
+	}
+
+#undef PERLINE
+}
+
+static void icmp_put(struct seq_file *seq)
+{
+	int i;
+
+	seq_puts(seq, "\nIcmp: InMsgs InErrors");
+	for (i=0; icmpmibmap[i].name != NULL; i++)
+		seq_printf(seq, " In%s", icmpmibmap[i].name);
+	seq_printf(seq, " OutMsgs OutErrors");
+	for (i=0; icmpmibmap[i].name != NULL; i++)
+		seq_printf(seq, " Out%s", icmpmibmap[i].name);
+	seq_printf(seq, "\nIcmp: %lu %lu",
+		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS),
+		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS));
+	for (i=0; icmpmibmap[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			snmp_fold_field((void **) icmpmsg_statistics,
+				icmpmibmap[i].index));
+	seq_printf(seq, " %lu %lu",
+		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS),
+		snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS));
+	for (i=0; icmpmibmap[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			snmp_fold_field((void **) icmpmsg_statistics,
+				icmpmibmap[i].index));
+}
+
 /*
  *	Called from the PROCfs module. This outputs /proc/net/snmp.
  */
@@ -267,15 +330,8 @@ static int snmp_seq_show(struct seq_file
 			   snmp_fold_field((void **)ip_statistics,
 					   snmp4_ipstats_list[i].entry));
 
-	seq_puts(seq, "\nIcmp:");
-	for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
-		seq_printf(seq, " %s", snmp4_icmp_list[i].name);
-
-	seq_puts(seq, "\nIcmp:");
-	for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			   snmp_fold_field((void **)icmp_statistics,
-					   snmp4_icmp_list[i].entry));
+	icmp_put(seq);	/* RFC 2011 compatibility */
+	icmpmsg_put(seq);
 
 	seq_puts(seq, "\nTcp:");
 	for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
@@ -332,6 +388,8 @@ static const struct file_operations snmp
 	.release = single_release,
 };
 
+
+
 /*
  *	Output /proc/net/netstat
  */
diff -ruNp linux-2.6.22.5/net/ipv4/raw.c linux-2.6.22.5_ICMPMSG/net/ipv4/raw.c
--- linux-2.6.22.5/net/ipv4/raw.c	2007-08-22 16:23:54.000000000 -0700
+++ linux-2.6.22.5_ICMPMSG/net/ipv4/raw.c	2007-08-23 15:54:12.000000000 -0700
@@ -313,6 +313,9 @@ static int raw_send_hdrinc(struct sock *
 
 		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
 	}
+	if (iph->protocol == IPPROTO_ICMP)
+		icmp_out_count(((struct icmphdr *)
+			skb_transport_header(skb))->type);
 
 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
 		      dst_output);

^ permalink raw reply

* Re: [PATCH resend] Fix a lock problem in generic phy code
From: Herbert Xu @ 2007-09-11  2:21 UTC (permalink / raw)
  To: Hans-Jürgen Koch; +Cc: linux-kernel, netdev, jeff, afleming
In-Reply-To: <200709102045.51063.hjk@linutronix.de>

On Mon, Sep 10, 2007 at 08:45:50PM +0200, Hans-Jürgen Koch wrote:
>
> > Could you please audit all instances of physdev->lock and add
> > _bh where necessary?  I can see that at least phys_stop also
> > needs the _bh.
> 
> I think the patch does all that's necessary. At least, there're no error
> messages in the logs anymore. I didn't check if there's an error on
> unload, though.

Sorry, but you can't rely on the non-existence of lockdep
messages as a proof of correctness :)

If we're going to fix the obvious bugs here, we should fix
the subtle ones too as otherwise they'll be much harder to
notice with this patch merged.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: 2.6.23-rc5: possible irq lock inversion dependency detected
From: Herbert Xu @ 2007-09-11  2:18 UTC (permalink / raw)
  To: jamal; +Cc: Christian Kujau, linux-kernel, netdev
In-Reply-To: <1189469081.14002.3.camel@localhost>

On Mon, Sep 10, 2007 at 08:04:41PM -0400, jamal wrote:
>
> disabling BH would make it more symmetric to the way we handle
> egress. I couldnt reproduce the issue, but this should hopefully resolve
> it.
> Christian, can you test with this patch?

Jamal, it's the police_lock that we need to make _bh.  The
ingress_lock is already _bh because of the spin_lock_bh that
directly precedes it.

Oh and I think the same thing applies for the other actions
too.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] cfg80211: fix initialisation if built-in
From: Rob Hussey @ 2007-09-11  1:18 UTC (permalink / raw)
  To: Magnus Damm
  Cc: Johannes Berg, John W. Linville, linux-wireless, netdev, stable
In-Reply-To: <aec7e5c30709101804p337084e9o63b5bcb4f007828@mail.gmail.com>

On 9/10/07, Magnus Damm <magnus.damm@gmail.com> wrote:
> -module_init(rate_control_simple_init);
> +//module_init(rate_control_simple_init);
> +postcore_initcall(rate_control_simple_init);
>  module_exit(rate_control_simple_exit);
>
>  MODULE_DESCRIPTION("Simple rate control algorithm for ieee80211");

Same problem here, except with the rt2x00 driver. I changed it to
subsys_initcall(rate_control_simple_init), which also worked. I also
found that without this change, it was failing at this point in
ieee80211_rate.c:

ieee80211_try_rate_control_ops_get(const char *name)
{
        struct rate_control_alg *alg;
        struct rate_control_ops *ops = NULL;

        mutex_lock(&rate_ctrl_mutex);
        list_for_each_entry(alg, &rate_ctrl_algs, list) {   <===== Here
                if (!name || !strcmp(alg->ops->name, name))
                        if (try_module_get(alg->ops->module)) {
                                ops = alg->ops;
                                break;
                        }

^ permalink raw reply

* Re: [PATCH] cfg80211: fix initialisation if built-in
From: Magnus Damm @ 2007-09-11  1:04 UTC (permalink / raw)
  To: Johannes Berg
  Cc: John W. Linville, linux-wireless, netdev, Rob Hussey,
	stable-DgEjT+Ai2ygdnm+yROfE0A
In-Reply-To: <1189424685.4506.63.camel-YfaajirXv214zXjbi5bjpg@public.gmane.org>

On 9/10/07, Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org> wrote:
> When cfg80211 is built into the kernel it needs to init earlier
> so that device registrations are run after it has initialised.
>
> Signed-off-by: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

Yep, I need this fix as well. Without it the ath5k driver built in
bombs out during module_init(). Something with kref and a struct
device pointing to an uninitialized ieee80211_class.

I need a similar fix for net/mac80211/rc80211_simple.c as well to get
ath5k working though, not sure why at the moment. There may be some
bug with request_module() not being called properly.
ieee80211_register_hw() calls ieee80211_init_rate_ctrl_alg() with NULL
as name which calls rate_control_alloc() with NULL which always seems
to fail when built in.

This hack works around that problem, not sure what the real fix is.

--- 0002/net/mac80211/rc80211_simple.c
+++ work/net/mac80211/rc80211_simple.c  2007-09-09 18:11:48.000000000 +0900
@@ -431,7 +431,8 @@ static void __exit rate_control_simple_e
 }

-module_init(rate_control_simple_init);
+//module_init(rate_control_simple_init);
+postcore_initcall(rate_control_simple_init);
 module_exit(rate_control_simple_exit);

 MODULE_DESCRIPTION("Simple rate control algorithm for ieee80211");

Thanks,

/ magnus

^ permalink raw reply

* Re: [PATCH 16/16] net: netlink support for moving devices between network namespaces.
From: Serge E. Hallyn @ 2007-09-11  0:54 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Serge E. Hallyn, David Miller, Linux Containers, netdev
In-Reply-To: <m1fy1me2s9.fsf@ebiederm.dsl.xmission.com>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> "Serge E. Hallyn" <serue@us.ibm.com> writes:
> >> 
> >> +static struct net *get_net_ns_by_pid(pid_t pid)
> >> +{
> >> +	struct task_struct *tsk;
> >> +	struct net *net;
> >> +
> >> +	/* Lookup the network namespace */
> >> +	net = ERR_PTR(-ESRCH);
> >> +	rcu_read_lock();
> >> +	tsk = find_task_by_pid(pid);
> >> +	if (tsk) {
> >> +		task_lock(tsk);
> >> +		if (tsk->nsproxy)
> >> +			net = get_net(tsk->nsproxy->net_ns);
> >> +		task_unlock(tsk);
> >
> > Thinking...  Ok, I'm not sure this is 100% safe in the target tree, but
> > the long-term correct way probably isn't yet implemented in the net-
> > tree.  Eventually you will want to:
> >
> > 	net_ns = NULL;
> > 	rcu_read_lock();
> > 	tsk = find_task_by_pid();  /* or _pidns equiv? */
> > 	nsproxy = task_nsproxy(tsk);
> > 	if (nsproxy)
> > 		net_ns = get_net(nsproxy->net_ns);
> > 	rcu_read_unlock;
> >
> > What you have here is probably unsafe if tsk is the last task pointing
> > to it's nsproxy and it does an unshare, bc unshare isn't protected by
> > task_lock, and you're not rcu_dereferencing tsk->nsproxy (which
> > task_nsproxy does).  At one point we floated a patch to reuse the same
> > nsproxy in that case which would prevent you having to worry about it,
> > but that isn't being done in -mm now so i doubt it's in -net.
> 
> 
> That change isn't merged upstream yet, so it isn't in David's
> net-2.6.24 tree.  Currently task->nsproxy is protected but
> task_lock(current). So the code is fine.
> 
> I am aware that removing the task_lock(current) for the setting
> of current->nsproxy is currently in the works, and I have planned
> to revisit this later when all of these pieces come together.
> 
> For now the code is fine.
> 
> If need be we can drop this patch to remove the potential merge
> conflict.

No, no.  Like you say it's correct at the moment.  Just something we
need to watch out for when it does get merged with the newer changes.

> But I figured it was useful

Absolutely.

> for this part of the user space
> interface to be available for review.

Agreed.  And the rest of the patchset looks good to me.

Thanks.

-serge

^ permalink raw reply

* Re: [PATCH] include listenq max backlog in /proc/net/tcp and include in tcp_info
From: Sridhar Samudrala @ 2007-09-11  0:39 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev
In-Reply-To: <200709102313.QAA01722@tardy.cup.hp.com>

On Mon, 2007-09-10 at 16:13 -0700, Rick Jones wrote:
> Return some useful information such as the maximum listen backlog and
> the current listen backlog in the tcp_info structure and have that 
> match what one can see in /proc/net/tcp and /proc/net/tcp6.

If we are also exporting max listen backlog, another place to
consider adding this is to tcp_diag_get_info() called via INET_DIAG_INFO.
Current listen backlog is returned in inet_diag_msg->idiag_rqueue.
max listen backlog can be returned in inet_diag_msg->idiag_wqueue.

Thanks
Sridhar
> 
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
> ---
> 
> diff -r bdcdd0e1ee9d Documentation/networking/proc_net_tcp.txt
> --- a/Documentation/networking/proc_net_tcp.txt	Sat Sep 01 07:00:31 2007 +0000
> +++ b/Documentation/networking/proc_net_tcp.txt	Mon Sep 10 16:09:46 2007 -0700
> @@ -20,8 +20,8 @@ up into 3 parts because of the length of
>        |        |     |     |       |--> number of unrecovered RTO timeouts
>        |        |     |     |----------> number of jiffies until timer expires
>        |        |     |----------------> timer_active (see below)
> -      |        |----------------------> receive-queue
> -      |-------------------------------> transmit-queue
> +      |        |----------------------> receive-queue or connection backlog
> +      |-------------------------------> transmit-queue or connection limit
> 
>     1000        0 54165785 4 cd1e6040 25 4 27 3 -1
>      |          |    |     |    |     |  | |  | |--> slow start size threshold, 
> diff -r bdcdd0e1ee9d net/ipv4/tcp.c
> --- a/net/ipv4/tcp.c	Sat Sep 01 07:00:31 2007 +0000
> +++ b/net/ipv4/tcp.c	Mon Sep 10 16:09:46 2007 -0700
> @@ -2030,8 +2030,14 @@ void tcp_get_info(struct sock *sk, struc
>  	info->tcpi_snd_mss = tp->mss_cache;
>  	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
> 
> -	info->tcpi_unacked = tp->packets_out;
> -	info->tcpi_sacked = tp->sacked_out;
> +	if (sk->sk_state == TCP_LISTEN) {
> +		info->tcpi_unacked = sk->sk_ack_backlog;
> +		info->tcpi_sacked = sk->sk_max_ack_backlog;
> +	}
> +	else {
> +		info->tcpi_unacked = tp->packets_out;
> +		info->tcpi_sacked = tp->sacked_out;
> +	}
>  	info->tcpi_lost = tp->lost_out;
>  	info->tcpi_retrans = tp->retrans_out;
>  	info->tcpi_fackets = tp->fackets_out;
> diff -r bdcdd0e1ee9d net/ipv4/tcp_ipv4.c
> --- a/net/ipv4/tcp_ipv4.c	Sat Sep 01 07:00:31 2007 +0000
> +++ b/net/ipv4/tcp_ipv4.c	Mon Sep 10 16:09:46 2007 -0700
> @@ -2320,7 +2320,8 @@ static void get_tcp4_sock(struct sock *s
>  	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
>  			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
>  		i, src, srcp, dest, destp, sk->sk_state,
> -		tp->write_seq - tp->snd_una,
> +		sk->sk_state == TCP_LISTEN ? sk->sk_max_ack_backlog :
> +					     (tp->write_seq - tp->snd_una),
>  		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
>  					     (tp->rcv_nxt - tp->copied_seq),
>  		timer_active,
> diff -r bdcdd0e1ee9d net/ipv6/tcp_ipv6.c
> --- a/net/ipv6/tcp_ipv6.c	Sat Sep 01 07:00:31 2007 +0000
> +++ b/net/ipv6/tcp_ipv6.c	Mon Sep 10 16:09:46 2007 -0700
> @@ -2005,8 +2005,10 @@ static void get_tcp6_sock(struct seq_fil
>  		   dest->s6_addr32[0], dest->s6_addr32[1],
>  		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
>  		   sp->sk_state,
> -		   tp->write_seq-tp->snd_una,
> -		   (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
> +		   (sp->sk_state == TCP_LISTEN) ? sp->sk_max_ack_backlog:
> +						  tp->write_seq-tp->snd_una,
> +		   (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : 
> +					(tp->rcv_nxt - tp->copied_seq),
>  		   timer_active,
>  		   jiffies_to_clock_t(timer_expires - jiffies),
>  		   icsk->icsk_retransmits,
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: New NAPI API: Need for netif_napi_remove() ?!
From: Kok, Auke @ 2007-09-11  0:27 UTC (permalink / raw)
  To: Kok, Auke
  Cc: David S. Miller, Stephen Hemminger, Waskiewicz Jr, Peter P,
	Jesse Brandeburg, NetDev
In-Reply-To: <46E5DECE.30206@intel.com>

Kok, Auke wrote:
> David,
> 
>  From an old thread:
> 
>  > 5) Since, in the NETPOLL case, netif_napi_init() adds the NAPI struct
>  >   to the per-device list I renamed it to netif_napi_add().  Currently
>  >   no teardown is really necessary, anything that would need to be done
>  >   would be driver internal, so I didn't create the corollary
>  >   netif_napi_remove() for the time being.  Let's not add it unless it
>  >   really becomes necessary.
> 
> while coding the NAPI API changes into the ixgbe driver, I notice that I'm in 
> need for an implementation for netif_napi_remove(). The ixgbe driver itself 
> already modifies it's polling routing on open() and close() based on whether it 
> was able to acquire MSI-X vectors or not, and can thus logically change as the 
> system suspends/resumes and new hardware is inserted that change the balance in 
> the MSI-X vectors in the system. Or, even more bluntly, all MSI support is 
> disabled and we want the driver to come up in legacy mode and use a completely 
> different poll routine alltogether. We can't do this at probe time.
> 
> In any case I think we have a legitimate case for netif_napi_remove() to be 
> implemented.


hm, I spoke too soon, I think I can get by for now by just modifying 
adapter->napi.poll when needed, and this would be clean enough for now. This 
might change as I enable multiqueue in this driver later though.

Auke

^ permalink raw reply

* New NAPI API: Need for netif_napi_remove() ?!
From: Kok, Auke @ 2007-09-11  0:18 UTC (permalink / raw)
  To: David S. Miller, Stephen Hemminger, Waskiewicz Jr, Peter P,
	Jesse Brandeburg
  Cc: NetDev

David,

 From an old thread:

 > 5) Since, in the NETPOLL case, netif_napi_init() adds the NAPI struct
 >   to the per-device list I renamed it to netif_napi_add().  Currently
 >   no teardown is really necessary, anything that would need to be done
 >   would be driver internal, so I didn't create the corollary
 >   netif_napi_remove() for the time being.  Let's not add it unless it
 >   really becomes necessary.

while coding the NAPI API changes into the ixgbe driver, I notice that I'm in 
need for an implementation for netif_napi_remove(). The ixgbe driver itself 
already modifies it's polling routing on open() and close() based on whether it 
was able to acquire MSI-X vectors or not, and can thus logically change as the 
system suspends/resumes and new hardware is inserted that change the balance in 
the MSI-X vectors in the system. Or, even more bluntly, all MSI support is 
disabled and we want the driver to come up in legacy mode and use a completely 
different poll routine alltogether. We can't do this at probe time.

In any case I think we have a legitimate case for netif_napi_remove() to be 
implemented.

Auke

^ permalink raw reply

* noob dev question
From: DHAJOGLO @ 2007-09-11  0:09 UTC (permalink / raw)
  To: netdev

I'm new to the list and new to just about everything involving kernel development.  I'm working on a project where I have successfully written a LKM to handle IP packets for protocol type 253 (an experimental protocol number).  Now, I'm working on sending my reply and I hit a road block.  I'm not sure if I should:

1) create a new skb with my protocol data and try to pass it to the IP handler
OR
2) use the inbound skb and just turn it around and use it to send back out manually via a NF Hook.

Ideally I would like to get my reply formatted and sent into ip_queue_xmit but I'm at the limit of what I can do regarding the manipulation of the socket buffers.  I'm currently working in 2.6.18.8 as the book I'm using does not reflect some of the newer changes to the sk_buff struct.  Any help or pointers would be nice.  I'm looking more for online tutorials at this point.

Regards,
-dave

^ permalink raw reply

* Re: 2.6.23-rc5: possible irq lock inversion dependency detected
From: jamal @ 2007-09-11  0:04 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Christian Kujau, linux-kernel, netdev
In-Reply-To: <20070910130024.GA27939@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 336 bytes --]

On Mon, 2007-10-09 at 21:00 +0800, Herbert Xu wrote:


> The minimal fix would be to make sure that we disable BH on
> the first CPU. 

disabling BH would make it more symmetric to the way we handle
egress. I couldnt reproduce the issue, but this should hopefully resolve
it.
Christian, can you test with this patch?

cheers,
jamal





[-- Attachment #2: ing1 --]
[-- Type: text/plain, Size: 549 bytes --]

[NET_SCHED] make ingress qlock symmetric to egress

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>

--- a/net/sched/sch_generic.c	2007/09/10 23:19:45	1.1
+++ b/net/sched/sch_generic.c	2007/09/10 23:52:45
@@ -42,12 +42,12 @@
 void qdisc_lock_tree(struct net_device *dev)
 {
 	spin_lock_bh(&dev->queue_lock);
-	spin_lock(&dev->ingress_lock);
+	spin_lock_bh(&dev->ingress_lock);
 }
 
 void qdisc_unlock_tree(struct net_device *dev)
 {
-	spin_unlock(&dev->ingress_lock);
+	spin_unlock_bh(&dev->ingress_lock);
 	spin_unlock_bh(&dev->queue_lock);
 }
 

^ permalink raw reply

* Re: [PATCH] Document non-semantics of atomic_read() and atomic_set()
From: Paul E. McKenney @ 2007-09-10 23:44 UTC (permalink / raw)
  To: Chris Snook
  Cc: Linus Torvalds, Denys Vlasenko, Kyle Moffett, Arjan van de Ven,
	Nick Piggin, Satyam Sharma, Herbert Xu, Paul Mackerras,
	Christoph Lameter, Ilpo Jarvinen, Stefan Richter,
	Linux Kernel Mailing List, linux-arch, Netdev, Andrew Morton, ak,
	heiko.carstens, David Miller, schwidefsky, wensong, horms, wjiang,
	cfriesen, zlynx, rpjday, jesper.juhl, segher
In-Reply-To: <20070910231944.GA3484@shell.boston.redhat.com>

On Mon, Sep 10, 2007 at 07:19:44PM -0400, Chris Snook wrote:
> From: Chris Snook <csnook@redhat.com>
> 
> Unambiguously document the fact that atomic_read() and atomic_set()
> do not imply any ordering or memory access, and that callers are
> obligated to explicitly invoke barriers as needed to ensure that
> changes to atomic variables are visible in all contexts that need
> to see them.

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Chris Snook <csnook@redhat.com>
> 
> --- a/Documentation/atomic_ops.txt	2007-07-08 19:32:17.000000000 -0400
> +++ b/Documentation/atomic_ops.txt	2007-09-10 19:02:50.000000000 -0400
> @@ -12,7 +12,11 @@
>  C integer type will fail.  Something like the following should
>  suffice:
> 
> -	typedef struct { volatile int counter; } atomic_t;
> +	typedef struct { int counter; } atomic_t;
> +
> +	Historically, counter has been declared volatile.  This is now
> +discouraged.  See Documentation/volatile-considered-harmful.txt for the
> +complete rationale.
> 
>  	The first operations to implement for atomic_t's are the
>  initializers and plain reads.
> @@ -42,6 +46,22 @@
> 
>  which simply reads the current value of the counter.
> 
> +*** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***
> +
> +Some architectures may choose to use the volatile keyword, barriers, or
> +inline assembly to guarantee some degree of immediacy for atomic_read()
> +and atomic_set().  This is not uniformly guaranteed, and may change in
> +the future, so all users of atomic_t should treat atomic_read() and
> +atomic_set() as simple C assignment statements that may be reordered or
> +optimized away entirely by the compiler or processor, and explicitly
> +invoke the appropriate compiler and/or memory barrier for each use case.
> +Failure to do so will result in code that may suddenly break when used with
> +different architectures or compiler optimizations, or even changes in
> +unrelated code which changes how the compiler optimizes the section
> +accessing atomic_t variables.
> +
> +*** YOU HAVE BEEN WARNED! ***
> +
>  Now, we move onto the actual atomic operation interfaces.
> 
>  	void atomic_add(int i, atomic_t *v);

^ permalink raw reply

* [PATCH] [-MM, FIX V4] e1000e: incorporate napi_struct changes from net-2.6.24.git
From: Auke Kok @ 2007-09-10 23:35 UTC (permalink / raw)
  To: akpm, davem; +Cc: e1000-devel, netdev, jirislaby, jeff, Robert.Olsson

This incorporates the new napi_struct changes into e1000e. Included
bugfix for ifdown hang from Krishna Kumar for e1000.

Disabling polling is no longer needed at init time, so remove
napi_disable() call from _probe().

This also fixes an endless polling loop where the driver signalled
"polling done" improperly back to the stack.

Signed-off-by: Auke Kok <auke-jan.h.kok@intel.com>
---

 drivers/net/e1000e/e1000.h  |    2 ++
 drivers/net/e1000e/netdev.c |   40 ++++++++++++++++------------------------
 2 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
index c57e35a..d2499bb 100644
--- a/drivers/net/e1000e/e1000.h
+++ b/drivers/net/e1000e/e1000.h
@@ -187,6 +187,8 @@ struct e1000_adapter {
 	struct e1000_ring *tx_ring /* One per active queue */
 						____cacheline_aligned_in_smp;
 
+	struct napi_struct napi;
+
 	unsigned long tx_queue_len;
 	unsigned int restart_queue;
 	u32 txd_cmd;
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 372da46..eeb40cc 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -1149,12 +1149,12 @@ static irqreturn_t e1000_intr_msi(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	if (netif_rx_schedule_prep(netdev)) {
+	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev);
+		__netif_rx_schedule(netdev, &adapter->napi);
 	} else {
 		atomic_dec(&adapter->irq_sem);
 	}
@@ -1212,12 +1212,12 @@ static irqreturn_t e1000_intr(int irq, void *data)
 			mod_timer(&adapter->watchdog_timer, jiffies + 1);
 	}
 
-	if (netif_rx_schedule_prep(netdev)) {
+	if (netif_rx_schedule_prep(netdev, &adapter->napi)) {
 		adapter->total_tx_bytes = 0;
 		adapter->total_tx_packets = 0;
 		adapter->total_rx_bytes = 0;
 		adapter->total_rx_packets = 0;
-		__netif_rx_schedule(netdev);
+		__netif_rx_schedule(netdev, &adapter->napi);
 	} else {
 		atomic_dec(&adapter->irq_sem);
 	}
@@ -1662,10 +1662,10 @@ set_itr_now:
  * e1000_clean - NAPI Rx polling callback
  * @adapter: board private structure
  **/
-static int e1000_clean(struct net_device *poll_dev, int *budget)
+static int e1000_clean(struct napi_struct *napi, int budget)
 {
-	struct e1000_adapter *adapter;
-	int work_to_do = min(*budget, poll_dev->quota);
+	struct e1000_adapter *adapter = container_of(napi, struct e1000_adapter, napi);
+	struct net_device *poll_dev = adapter->netdev;
 	int tx_cleaned = 0, work_done = 0;
 
 	/* Must NOT use netdev_priv macro here. */
@@ -1684,25 +1684,19 @@ static int e1000_clean(struct net_device *poll_dev, int *budget)
 		spin_unlock(&adapter->tx_queue_lock);
 	}
 
-	adapter->clean_rx(adapter, &work_done, work_to_do);
-	*budget -= work_done;
-	poll_dev->quota -= work_done;
+	adapter->clean_rx(adapter, &work_done, budget);
 
 	/* If no Tx and not enough Rx work done, exit the polling mode */
-	if ((!tx_cleaned && (work_done == 0)) ||
+	if ((!tx_cleaned && (work_done < budget)) ||
 	   !netif_running(poll_dev)) {
 quit_polling:
 		if (adapter->itr_setting & 3)
 			e1000_set_itr(adapter);
-		netif_rx_complete(poll_dev);
-		if (test_bit(__E1000_DOWN, &adapter->state))
-			atomic_dec(&adapter->irq_sem);
-		else
-			e1000_irq_enable(adapter);
-		return 0;
+		netif_rx_complete(poll_dev, napi);
+		e1000_irq_enable(adapter);
 	}
 
-	return 1;
+	return work_done;
 }
 
 static void e1000_vlan_rx_add_vid(struct net_device *netdev, u16 vid)
@@ -2439,7 +2433,7 @@ int e1000e_up(struct e1000_adapter *adapter)
 
 	clear_bit(__E1000_DOWN, &adapter->state);
 
-	netif_poll_enable(adapter->netdev);
+	napi_enable(&adapter->napi);
 	e1000_irq_enable(adapter);
 
 	/* fire a link change interrupt to start the watchdog */
@@ -2472,7 +2466,7 @@ void e1000e_down(struct e1000_adapter *adapter)
 	e1e_flush();
 	msleep(10);
 
-	netif_poll_disable(netdev);
+	napi_disable(&adapter->napi);
 	e1000_irq_disable(adapter);
 
 	del_timer_sync(&adapter->watchdog_timer);
@@ -2605,7 +2599,7 @@ static int e1000_open(struct net_device *netdev)
 	/* From here on the code is the same as e1000e_up() */
 	clear_bit(__E1000_DOWN, &adapter->state);
 
-	netif_poll_enable(netdev);
+	napi_enable(&adapter->napi);
 
 	e1000_irq_enable(adapter);
 
@@ -4090,8 +4084,7 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	e1000e_set_ethtool_ops(netdev);
 	netdev->tx_timeout		= &e1000_tx_timeout;
 	netdev->watchdog_timeo		= 5 * HZ;
-	netdev->poll			= &e1000_clean;
-	netdev->weight			= 64;
+	netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);
 	netdev->vlan_rx_register	= e1000_vlan_rx_register;
 	netdev->vlan_rx_add_vid		= e1000_vlan_rx_add_vid;
 	netdev->vlan_rx_kill_vid	= e1000_vlan_rx_kill_vid;
@@ -4260,7 +4253,6 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	/* tell the stack to leave us alone until e1000_open() is called */
 	netif_carrier_off(netdev);
 	netif_stop_queue(netdev);
-	netif_poll_disable(netdev);
 
 	strcpy(netdev->name, "eth%d");
 	err = register_netdev(netdev);

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply related

* [PATCH] Document non-semantics of atomic_read() and atomic_set()
From: Chris Snook @ 2007-09-10 23:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Denys Vlasenko, Kyle Moffett, Arjan van de Ven, Nick Piggin,
	Satyam Sharma, Herbert Xu, Paul Mackerras, Christoph Lameter,
	Ilpo Jarvinen, Paul E. McKenney, Stefan Richter,
	Linux Kernel Mailing List, linux-arch, Netdev, Andrew Morton, ak,
	heiko.carstens, David Miller, schwidefsky, wensong, horms, wjiang,
	cfriesen, zlynx, rpjday, jesper.juhl, segher
In-Reply-To: <alpine.LFD.0.999.0709100807260.16478@woody.linux-foundation.org>

From: Chris Snook <csnook@redhat.com>

Unambiguously document the fact that atomic_read() and atomic_set()
do not imply any ordering or memory access, and that callers are
obligated to explicitly invoke barriers as needed to ensure that
changes to atomic variables are visible in all contexts that need
to see them.

Signed-off-by: Chris Snook <csnook@redhat.com>

--- a/Documentation/atomic_ops.txt	2007-07-08 19:32:17.000000000 -0400
+++ b/Documentation/atomic_ops.txt	2007-09-10 19:02:50.000000000 -0400
@@ -12,7 +12,11 @@
 C integer type will fail.  Something like the following should
 suffice:
 
-	typedef struct { volatile int counter; } atomic_t;
+	typedef struct { int counter; } atomic_t;
+
+	Historically, counter has been declared volatile.  This is now
+discouraged.  See Documentation/volatile-considered-harmful.txt for the
+complete rationale.
 
 	The first operations to implement for atomic_t's are the
 initializers and plain reads.
@@ -42,6 +46,22 @@
 
 which simply reads the current value of the counter.
 
+*** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! ***
+
+Some architectures may choose to use the volatile keyword, barriers, or
+inline assembly to guarantee some degree of immediacy for atomic_read()
+and atomic_set().  This is not uniformly guaranteed, and may change in
+the future, so all users of atomic_t should treat atomic_read() and
+atomic_set() as simple C assignment statements that may be reordered or
+optimized away entirely by the compiler or processor, and explicitly
+invoke the appropriate compiler and/or memory barrier for each use case.
+Failure to do so will result in code that may suddenly break when used with
+different architectures or compiler optimizations, or even changes in
+unrelated code which changes how the compiler optimizes the section
+accessing atomic_t variables.
+
+*** YOU HAVE BEEN WARNED! ***
+
 Now, we move onto the actual atomic operation interfaces.
 
 	void atomic_add(int i, atomic_t *v);

^ permalink raw reply

* Re: [PATCH 3/3] rfkill: Add rfkill documentation
From: Randy Dunlap @ 2007-09-10 18:26 UTC (permalink / raw)
  To: Ivo van Doorn; +Cc: davem, Dmitry Torokhov, netdev, Inaky Perez-Gonzalez
In-Reply-To: <200709101956.03861.IvDoorn@gmail.com>

On Mon, 10 Sep 2007 19:56:03 +0200 Ivo van Doorn wrote:

> Add a documentation file which contains
> a short description about rfkill with some
> notes about drivers and the userspace interface.

Thanks.  I have noted a few typo/editorial changes below.


> Signed-off-by: Ivo van Doorn <IvDoorn@gmail.com>
> Acked-by: Dmitry Torokhov <dtor@mail.ru>
> ---
>  Documentation/rfkill.txt |   88 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 88 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/rfkill.txt
> 
> diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
> new file mode 100644
> index 0000000..93c76fc
> --- /dev/null
> +++ b/Documentation/rfkill.txt
> @@ -0,0 +1,88 @@
> +rfkill - RF switch subsystem support
> +====================================
> +
> +1 Implementation details
> +2 Driver support
> +3 Userspace support
> +
> +===============================================================================
> +1: Implementation details
> +
> +The rfkill switch subsystem offers support for keys often found on laptops
> +to enable wireless devices like WiFi and Bluetooth.
> +
> +This is done by providing the user 3 possibilities:
> + - The rfkill system handles all events, userspace is not aware of events.
> + - The rfkill system handles all events, userspace is informed about the event.
> + - The rfkill system does not handle events, userspace handles all events.

I would s/,/;/ in the 3 lines above.

> +The buttons to enable and disable the wireless radios are important in
> +situations where the user is for example using his laptop on a location where
> +wireless radios _must_ be disabled (e.g airplanes).
> +Because of this requirement, userspace support for the keys should not be
> +made mandatory. Because userspace might want to perform some additional smarter
> +tasks when the key is pressed, rfkill still provides userspace the possibility
> +to take over the task to handle the key events.
> +
> +The system inside the kernel has been split into 2 seperate sections:

                                                      separate

> +	1 - RFKILL
> +	2 - RFKILL_INPUT
> +
> +The first option enables rfkill support and will make sure userspace will
> +be notified of any events through the input device. It also creates several
> +sysfs entries which can be used by userspace. See section "Userspace support".
> +
> +The second option provides a rfkill input handler. This handler will

                              an

> +listen to all rfkill key events and will toggle the radio accordingly,

end above with ; or .  If '.', s/with/With/ on next line.

> +with this option enabled userspace could either do nothing or simply
> +perform monitoring tasks.
> +
> +====================================
> +2: Driver support
> +
> +Drivers who wish to build in rfkill subsystem support should

   Drivers that

But, drivers can't/don't wish, so it would be better to say something
like:

To build a driver with rfkill subsystem support, the driver should
depend on the Kconfig symbol RFKILL; it should _not_ depend on
RKFILL_INPUT.


> +make sure their driver depends of the Kconfig option RFKILL, it should
> +_not_ depend on RFKILL_INPUT.
> +
> +Unless key events trigger a interrupt to which the driver listens, polling

                             an interrupt

> +will be required to determine the key state changes. For this the input
> +layer providers the input-polldev handler.
> +
> +A driver should implement a few steps to correctly make use of the
> +rfkill subsystem. First for non-polling drivers:
> +
> +	- rfkill_allocate()
> +	- input_allocate_device()
> +	- rfkill_register()
> +	- input_register_device()
> +
> +For polling drivers:
> +
> +	- rfkill_allocate()
> +	- input_allocate_polled_device()
> +	- rfkill_register()
> +	- input_register_polled_device()
> +
> +When a key event has been detected, the correct event should be
> +send over the input device which has been registered by the driver.

   sent

> +
> +====================================
> +3: Userspace support
> +
> +For each key a input device will be created which will send out the correct

                an

> +key event when the rfkill key has been pressed.
> +
> +The following sysfs entries will be created:
> +
> +	name: Name assigned by driver to this key (interface or driver name).
> +	type: Name of the key type ("wlan", "bluetooth", etc).
> +	state: Current state of the key. 1: On, 0: Off.
> +	claim: 1: Userspace handles events, 0: Kernel handles events
> +
> +Both the "state" and "claim" entries are also writable. For the "state" entry
> +this means that when 1 or 0 is written all radios will be toggled accordingly.

will be written even if they are already in that state?

> +For the "claim" entry writing 1 to it will mean that the kernel will no longer

s/will mean/means/
s/will no longer handle/no longer handles/

> +handle key events even though RFKILL_INPUT input was enabled. When "claim" has
> +been set to 0, userspace should make sure it will listen for the input events

s/it will listen/that it listens/

> +or check the sysfs "state" entry regularly to correctly perform the required
> +tasks when the rkfill key is pressed.
> -- 

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply

* [PATCH] include listenq max backlog in /proc/net/tcp and include in tcp_info
From: Rick Jones @ 2007-09-10 23:13 UTC (permalink / raw)
  To: netdev

Return some useful information such as the maximum listen backlog and
the current listen backlog in the tcp_info structure and have that 
match what one can see in /proc/net/tcp and /proc/net/tcp6.

Signed-off-by: Rick Jones <rick.jones2@hp.com>
---

diff -r bdcdd0e1ee9d Documentation/networking/proc_net_tcp.txt
--- a/Documentation/networking/proc_net_tcp.txt	Sat Sep 01 07:00:31 2007 +0000
+++ b/Documentation/networking/proc_net_tcp.txt	Mon Sep 10 16:09:46 2007 -0700
@@ -20,8 +20,8 @@ up into 3 parts because of the length of
       |        |     |     |       |--> number of unrecovered RTO timeouts
       |        |     |     |----------> number of jiffies until timer expires
       |        |     |----------------> timer_active (see below)
-      |        |----------------------> receive-queue
-      |-------------------------------> transmit-queue
+      |        |----------------------> receive-queue or connection backlog
+      |-------------------------------> transmit-queue or connection limit
 
    1000        0 54165785 4 cd1e6040 25 4 27 3 -1
     |          |    |     |    |     |  | |  | |--> slow start size threshold, 
diff -r bdcdd0e1ee9d net/ipv4/tcp.c
--- a/net/ipv4/tcp.c	Sat Sep 01 07:00:31 2007 +0000
+++ b/net/ipv4/tcp.c	Mon Sep 10 16:09:46 2007 -0700
@@ -2030,8 +2030,14 @@ void tcp_get_info(struct sock *sk, struc
 	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
 
-	info->tcpi_unacked = tp->packets_out;
-	info->tcpi_sacked = tp->sacked_out;
+	if (sk->sk_state == TCP_LISTEN) {
+		info->tcpi_unacked = sk->sk_ack_backlog;
+		info->tcpi_sacked = sk->sk_max_ack_backlog;
+	}
+	else {
+		info->tcpi_unacked = tp->packets_out;
+		info->tcpi_sacked = tp->sacked_out;
+	}
 	info->tcpi_lost = tp->lost_out;
 	info->tcpi_retrans = tp->retrans_out;
 	info->tcpi_fackets = tp->fackets_out;
diff -r bdcdd0e1ee9d net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	Sat Sep 01 07:00:31 2007 +0000
+++ b/net/ipv4/tcp_ipv4.c	Mon Sep 10 16:09:46 2007 -0700
@@ -2320,7 +2320,8 @@ static void get_tcp4_sock(struct sock *s
 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
 			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
 		i, src, srcp, dest, destp, sk->sk_state,
-		tp->write_seq - tp->snd_una,
+		sk->sk_state == TCP_LISTEN ? sk->sk_max_ack_backlog :
+					     (tp->write_seq - tp->snd_una),
 		sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
 					     (tp->rcv_nxt - tp->copied_seq),
 		timer_active,
diff -r bdcdd0e1ee9d net/ipv6/tcp_ipv6.c
--- a/net/ipv6/tcp_ipv6.c	Sat Sep 01 07:00:31 2007 +0000
+++ b/net/ipv6/tcp_ipv6.c	Mon Sep 10 16:09:46 2007 -0700
@@ -2005,8 +2005,10 @@ static void get_tcp6_sock(struct seq_fil
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3], destp,
 		   sp->sk_state,
-		   tp->write_seq-tp->snd_una,
-		   (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+		   (sp->sk_state == TCP_LISTEN) ? sp->sk_max_ack_backlog:
+						  tp->write_seq-tp->snd_una,
+		   (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : 
+					(tp->rcv_nxt - tp->copied_seq),
 		   timer_active,
 		   jiffies_to_clock_t(timer_expires - jiffies),
 		   icsk->icsk_retransmits,

^ permalink raw reply

* Re: why does tcp_v[46]_conn_request not inc MIB stats
From: Rick Jones @ 2007-09-10 22:22 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: Linux Network Development list
In-Reply-To: <1189461284.11066.10.camel@w-sridhar2.beaverton.ibm.com>

Sridhar Samudrala wrote:
> On Mon, 2007-09-10 at 11:42 -0700, Rick Jones wrote:
> 
>>I've been digging around to see about inducing /proc/net/tcp to show 
>>some "interesting" things for listen sockets (eg backlog depth, its max, 
>>and dropped connection requests).
> 
> 
> backlog depth(acceptq length) for a listening socket should be available
> with the newer kernels. The following patch exports this value via the rx_queue
> field in /proc/net/tcp.
>  http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=47da8ee681d04e68ca1b1812c10e28162150d453

Yep, I see it in the 2.6.23-rc5 tree I'm using.  At the risk of yet 
another "merely practice" patching excercise I'm putting together a 
patch which will also return that in a TCP_INFO and add the max backlog 
(to the tx_queue field)

While doing that, I've noticed that 
Documenation/networking/proc-net-tcp.txt (?) talks about a tcp6_get_info 
which I cannot find anywhere in the tree.  I'm not sure if that simply 
means that tcp_get_info is what is used for a "tcp6" connection and the 
text can simply be removed from the documentation, or if it is called 
something else.

>>  While there I've noticed that both 
>>tcp_v[46]_syn_recv_sock and tcp_v[46]conn_request both check that the 
>>listen queue is full, but only tcp_v[46]_syn_recv_sock increments some 
>>mib stats for dropped connection requests.
>>
>>Is that deliberate, or is that a hole in the stats?
> 
> 
> looks like it is a hole in the stats. I think we should increment
> LISTENOVERFLOWS or LISTENDROPS in tcp_v[46]_conn_request too if the
> SYN is dropped.

OK.  Now, can we get a third to Sridhar's second?-)

rick jones
struggling through the maze of twisty routines for connection establishment

^ permalink raw reply

* Re: Distributed storage. Security attributes and ducumentation update.
From: Paul E. McKenney @ 2007-09-10 22:14 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: netdev, linux-kernel, linux-fsdevel
In-Reply-To: <20070831160611.GA21660@2ka.mipt.ru>

On Fri, Aug 31, 2007 at 08:06:13PM +0400, Evgeniy Polyakov wrote:
> On Tue, Jul 31, 2007 at 09:13:47PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> Hi.
> 
> I'm pleased to announce third release of the distributed storage
> subsystem, which allows to form a storage on top of remote and local
> nodes, which in turn can be exported to another storage as a node to
> form tree-like storages.
> 
> This release includes following changes:
> * security attributes (permission mask assigned to addresses, allowed to
> 	connect to given local export node)
> * big documentation update (userspace documentation on the site also
> 	includes various usage case examples and descirption of the
> 	configuration utilitiy, protocols and userspace target)
> * mirror algorithm has been moved from per-page to per-sector dirty
> 	bitmask
> 
> Further TODO list includes:
> * implement optional saving of mirroring/linear information on the remote
> 	nodes (simple)
> * implement netlink based setup (simple)
> * new redundancy algorithm (complex)
> 
> Homepage:
> http://tservice.net.ru/~s0mbre/old/?section=projects&item=dst

A couple questions below, but otherwise looks good from an RCU viewpoint.

							Thanx, Paul

> Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> 
> diff --git a/Documentation/dst/algorithms.txt b/Documentation/dst/algorithms.txt
> new file mode 100644
> index 0000000..bfc6984
> --- /dev/null
> +++ b/Documentation/dst/algorithms.txt
> @@ -0,0 +1,115 @@
> +Each storage by itself is just a set of contiguous logical blocks, with
> +allowed number of operations. Nodes, each of which has own start and size,
> +are placed into storage by appropriate algorithm, which remaps
> +logical sector number into real node's sector. One can create
> +own algorithms, since DST has pluggable interface for that.
> +Currently mirrored and linear algorithms are supported.
> +
> +Let's briefly describe how they work.
> +
> +Linear algorithm.
> +Simple approach of concatenating storages into single device with
> +increased size is used in this algorithm. Essentially new device
> +has size equal to sum of sizes of underlying nodes and nodes are
> +placed one after another.
> +
> +  /----- Node 1 ---\                         /------ Node 3 ----\
> +start              end                     start               end
> + |==================|========================|==================|
> + |                start                     end                 |
> + |                  \------- Node 2 ---------/                  |
> + |                                                              |
> +start                                                          end
> + \-------------------------- DST storage ----------------------/
> +
> +			        /\
> +			        ||
> +			        ||
> +
> +			   IO operations
> +
> +			    Figure 1. 
> +     3 nodes combined into single storage using linear algorithm.
> +
> +Mirror algorithm.
> +In this algorithms nodes are placed under each other, so when
> +operation comes to the first one, it can be mirrored to all
> +underlying nodes. In case of reading, actual data is obtained from
> +the nearest node - algoritm keeps track of previous operation
> +and knows where it was stopped, so that subsequent seek to the 
> +start of the new request will take the shortest time.
> +Writing is always mirrored to all underlying nodes.
> +
> +                  IO operations
> +                       ||
> +                       ||
> +                       \/
> +
> +|---------------- DST storate -------------------|
> +|      prev position                             |
> +|-------|------------ Node 1 --------------------|
> +|                              prev pos          |
> +|-------------------- Node 2 -----|--------------|
> +|prev pos                                        |
> +|---|---------------- Node 3 --------------------|
> +
> +		Figure 2.
> +   3 nodes combined into single storage using mirror algorithm.
> +
> +Each algorithm must implement number of callbacks,
> +which must be registered during initialization time.
> +
> +struct dst_alg_ops
> +{
> +	int			(*add_node)(struct dst_node *n);
> +	void			(*del_node)(struct dst_node *n);
> +	int 			(*remap)(struct dst_request *req);
> +	int			(*error)(struct kst_state *state, int err);
> +	struct module 		*owner;
> +};
> +
> +@add_node.
> +This callback is invoked when new node is being added into the storage,
> +but before node is actually added into the storage, so that it could
> +be accessed from it. When it is called, all appropriate initialization
> +of the underlying device is already completed (system has been connected
> +to remote node or got a reference to the local block device). At this
> +stage algorithm can add node into private map. 
> +It must return zero on success or negative value otherwise.
> +
> +@del_node.
> +This callback is invoked when node is being deleted from the storage,
> +i.e. when its reference counter hits zero. It is called before
> +any cleaning is performed.
> +It must return zero on success or negative value otherwise.
> +
> +@remap.
> +This callback is invoked each time new bio hits the storage.
> +Request structure contains BIO itself, pointer to the node, which originally
> +stores the whole region under given IO request, and various parameters
> +used by storage core to process this block request.
> +It must return zero on success or negative value otherwise. It is upto
> +this method to call all cleaning if remapping failed, for example it must
> +call kst_bio_endio() for given callback in case of error, which in turn
> +will call bio_endio(). Note, that dst_request structure provided in this
> +callback is allocated on stack, so if there is a need to use it outside
> +of the given function, it must be cloned (it will happen automatically
> +in state's push callback, but that copy will not be shared by any other
> +user).
> +
> +@error.
> +This callback is invoked for each error, which happend when processed
> +requests for remote nodes or when talking to remote size
> +of the local export node (state contains data related to data
> +transfers over the network).
> +If this function has fixed given error, it must return 0 or negative
> +error value otherwise.
> +
> +@owner.
> +This is module reference counter updated automatically by DST core.
> +
> +Algorithm must provide its name and above structure to the 
> +dst_alloc_alg() function, which will return a reference to the newly
> +created algorithm.
> +To remove it, one needs to call dst_remove_alg() with given algorithm
> +pointer.
> diff --git a/Documentation/dst/dst.txt b/Documentation/dst/dst.txt
> new file mode 100644
> index 0000000..3b326aa
> --- /dev/null
> +++ b/Documentation/dst/dst.txt
> @@ -0,0 +1,66 @@
> +Distributed storage. Design and implementation.
> +http://tservice.net.ru/~s0mbre/old/?section=projects&item=dst
> +
> +	     Evgeniy Polyakov
> +
> +This document is intended to briefly describe design and
> +implementation details of the distributed storage project,
> +aimed to create ability to group physically and/or logically
> +distributed storages into single device.
> +
> +Main operational unit in the storage is node. Node can represent
> +either remote storage, connected to local machine, or local
> +device, or storage exported to the outside of the system.
> +Here goes small explaination of basic therms.
> +
> +Local node.
> +This node is just a logical link between block device (with given
> +major and minor numbers) and structure in the DST hierarchy,
> +which represents number of sectors on the area, corresponding to given
> +block device. it can be a disk, a device mapper node or stacked
> +block device on top of another underlying DST nodes.
> +
> +Local export node.
> +Essentially the same as local node, but it allows to access
> +to its data via network. Remote clients can connect to given local 
> +export node and read or write blocks according to its size.
> +Blocks are then forwarded to underlying local node and processed
> +there accordingly to the nature of the local node.
> +
> +Remote node.
> +This type of nodes contain remotely accessible devices. One can think
> +about remote nodes as remote disks, which can be connected to
> +local system and combined into single storage. Remote nodes
> +are presented as number of sectors accessed over the network
> +by the local machine, where distributed storage is being formed.
> +
> +
> +Each node or set of them can be formed into single array, which
> +in turn becomes a local node, which can be exported further by stacking
> +a local export node on top of it.
> +
> +Each storage by itself is just a set of contiguous logical blocks, with
> +allowed number of operations. Nodes, each of which has own start and size,
> +are placed into storage by appropriate algorithm, which remaps
> +logical sector number into real node's sector. One can create
> +own algorithms, since DST has pluggable interface for that.
> +Currently mirrored and linear algorithms are supported.
> +One can find more details in Documentation/dst/algorithms.txt file.
> +
> +Main goal of the distributed storage is to combine remote nodes into
> +single device, so each block IO request is being sent over the network
> +(contrary requests for local nodes are handled by the gneric block
> +layer features). Each network connection has number of variables which
> +describe it (socket, list of requests, error handling and so on),
> +which form kst_state structure. This network state is added into per-socket
> +polling state machine, and can be processed by dedicated thread when
> +becomes ready. This system forms asynchronous IO for given block
> +requests. If block request can be processed without blocking, then
> +no new structures are allocated and async part of the state is not used.
> +
> +When connection to the remote peer breaks, DST core tries to reconnect
> +to failed node and no requests are marked as errorneous, instead
> +they live in the queue until reconnectin is established.
> +
> +Userspace code, setup documentation and examples can be found on project's
> +homepage above.
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index b4c8319..ca6592d 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -451,6 +451,8 @@ config ATA_OVER_ETH
>  	This driver provides Support for ATA over Ethernet block
>  	devices like the Coraid EtherDrive (R) Storage Blade.
> 
> +source "drivers/block/dst/Kconfig"
> +
>  source "drivers/s390/block/Kconfig"
> 
>  endmenu
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index dd88e33..fcf042d 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -29,3 +29,4 @@ obj-$(CONFIG_VIODASD)		+= viodasd.o
>  obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
>  obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
> 
> +obj-$(CONFIG_DST)		+= dst/
> diff --git a/drivers/block/dst/Kconfig b/drivers/block/dst/Kconfig
> new file mode 100644
> index 0000000..9c5eba2
> --- /dev/null
> +++ b/drivers/block/dst/Kconfig
> @@ -0,0 +1,19 @@
> +config DST
> +	tristate "Distributed storage"
> +	depends on NET
> +	---help---
> +	This driver allows to create a distributed storage.
> +
> +config DST_ALG_LINEAR
> +	tristate "Linear distribution algorithm"
> +	depends on DST
> +	---help---
> +	This module allows to create linear mapping of the nodes
> +	in the distributed storage.
> +
> +config DST_ALG_MIRROR
> +	tristate "Mirror distribution algorithm"
> +	depends on DST
> +	---help---
> +	This module allows to create a mirror of the noes in the
> +	distributed storage.
> diff --git a/drivers/block/dst/Makefile b/drivers/block/dst/Makefile
> new file mode 100644
> index 0000000..1400e94
> --- /dev/null
> +++ b/drivers/block/dst/Makefile
> @@ -0,0 +1,6 @@
> +obj-$(CONFIG_DST) += dst.o
> +
> +dst-y := dcore.o kst.o
> +
> +obj-$(CONFIG_DST_ALG_LINEAR) += alg_linear.o
> +obj-$(CONFIG_DST_ALG_MIRROR) += alg_mirror.o
> diff --git a/drivers/block/dst/alg_linear.c b/drivers/block/dst/alg_linear.c
> new file mode 100644
> index 0000000..584f99e
> --- /dev/null
> +++ b/drivers/block/dst/alg_linear.c
> @@ -0,0 +1,99 @@
> +/*
> + * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/dst.h>
> +
> +static struct dst_alg *alg_linear;
> +
> +/*
> + * This callback is invoked when node is removed from storage.
> + */
> +static void dst_linear_del_node(struct dst_node *n)
> +{
> +}
> +
> +/*
> + * This callback is invoked when node is added to storage.
> + */
> +static int dst_linear_add_node(struct dst_node *n)
> +{
> +	struct dst_storage *st = n->st;
> +
> +	n->start = st->disk_size;
> +	st->disk_size += n->size;
> +
> +	return 0;
> +}
> +
> +static int dst_linear_remap(struct dst_request *req)
> +{
> +	int err;
> +
> +	if (req->node->bdev) {
> +		generic_make_request(req->bio);
> +		return 0;
> +	}
> +
> +	err = kst_check_permissions(req->state, req->bio);
> +	if (err)
> +		return err;
> +
> +	return req->state->ops->push(req);
> +}
> +
> +/*
> + * Failover callback - it is invoked each time error happens during
> + * request processing.
> + */
> +static int dst_linear_error(struct kst_state *st, int err)
> +{
> +	if (err)
> +		set_bit(DST_NODE_FROZEN, &st->node->flags);
> +	else
> +		clear_bit(DST_NODE_FROZEN, &st->node->flags);
> +	return 0;
> +}
> +
> +static struct dst_alg_ops alg_linear_ops = {
> +	.remap		= dst_linear_remap,
> +	.add_node 	= dst_linear_add_node,
> +	.del_node 	= dst_linear_del_node,
> +	.error		= dst_linear_error,
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int __devinit alg_linear_init(void)
> +{
> +	alg_linear = dst_alloc_alg("alg_linear", &alg_linear_ops);
> +	if (!alg_linear)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void __devexit alg_linear_exit(void)
> +{
> +	dst_remove_alg(alg_linear);
> +}
> +
> +module_init(alg_linear_init);
> +module_exit(alg_linear_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>");
> +MODULE_DESCRIPTION("Linear distributed algorithm.");
> diff --git a/drivers/block/dst/alg_mirror.c b/drivers/block/dst/alg_mirror.c
> new file mode 100644
> index 0000000..be42350
> --- /dev/null
> +++ b/drivers/block/dst/alg_mirror.c
> @@ -0,0 +1,765 @@
> +/*
> + * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + * 
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/poll.h>
> +#include <linux/dst.h>
> +
> +#define DST_MIRROR_MAX_CHUNKS		4096
> +
> +struct dst_mirror_priv
> +{
> +	unsigned int		chunk_num;
> +
> +	u64			last_start;
> +
> +	spinlock_t		backlog_lock;
> +	struct list_head	backlog_list;
> +
> +	unsigned long		*chunk;
> +};
> +
> +static struct dst_alg *alg_mirror;
> +static struct bio_set *dst_mirror_bio_set;
> +
> +static ssize_t dst_mirror_chunk_mask_show(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_node *n = container_of(dev, struct dst_node, device);
> +	struct dst_mirror_priv *priv = n->priv;
> +	unsigned int i;
> +	int rest = PAGE_SIZE;
> +
> +	for (i = 0; i < priv->chunk_num/BITS_PER_LONG; ++i) {
> +		int bit, j;
> +
> +		for (j = 0; j < BITS_PER_LONG; ++j) {
> +			bit = (priv->chunk[i] >> j) & 1;
> +			sprintf(buf, "%c", (bit)?'+':'-');
> +			buf++;
> +		}
> +
> +		rest -= BITS_PER_LONG;
> +
> +		if (rest < BITS_PER_LONG)
> +			break;
> +	}
> +
> +	return PAGE_SIZE - rest;
> +}
> +
> +static DEVICE_ATTR(chunks, 0444, dst_mirror_chunk_mask_show, NULL);
> +
> +/*
> + * This callback is invoked when node is removed from storage.
> + */
> +static void dst_mirror_del_node(struct dst_node *n)
> +{
> +	struct dst_mirror_priv *priv = n->priv;
> +
> +	vfree(priv->chunk);
> +	kfree(priv);
> +	n->priv = NULL;
> +
> +	if (n->device.parent == &n->st->device)
> +		device_remove_file(&n->device, &dev_attr_chunks);
> +}
> +
> +static void dst_mirror_handle_priv(struct dst_node *n)
> +{
> +	if (n->priv) {
> +		int err;
> +		err = device_create_file(&n->device, &dev_attr_chunks);
> +	}
> +}
> +
> +/*
> + * This callback is invoked when node is added to storage.
> + */
> +static int dst_mirror_add_node(struct dst_node *n)
> +{
> +	struct dst_storage *st = n->st;
> +	struct dst_mirror_priv *priv;
> +
> +	if (st->disk_size)
> +		st->disk_size = min(n->size, st->disk_size);
> +	else
> +		st->disk_size = n->size;
> +
> +	priv = kzalloc(sizeof(struct dst_mirror_priv), GFP_KERNEL);
> +	if (!priv)
> +		return -ENOMEM;
> +
> +	priv->chunk_num = st->disk_size;
> +
> +	priv->chunk = vmalloc(priv->chunk_num/BITS_PER_LONG * sizeof(long));
> +	if (!priv->chunk)
> +		goto err_out_free;
> +
> +	memset(priv->chunk, 0, priv->chunk_num/BITS_PER_LONG * sizeof(long));
> +
> +	spin_lock_init(&priv->backlog_lock);
> +	INIT_LIST_HEAD(&priv->backlog_list);
> +
> +	dprintk("%s: %llu:%llu, chunk_num: %u, disk_size: %llu.\n",
> +			__func__, n->start, n->size,
> +			priv->chunk_num, st->disk_size);
> +
> +	n->priv_callback = &dst_mirror_handle_priv;
> +	n->priv = priv;
> +
> +	return 0;
> +
> +err_out_free:
> +	kfree(priv);
> +	return -ENOMEM;
> +}
> +
> +static void dst_mirror_sync_destructor(struct bio *bio)
> +{
> +	struct bio_vec *bv;
> +	int i;
> +
> +	bio_for_each_segment(bv, bio, i)
> +		__free_page(bv->bv_page);
> +	bio_free(bio, dst_mirror_bio_set);
> +}
> +
> +static void dst_mirror_sync_requeue(struct dst_node *n)
> +{
> +	struct dst_mirror_priv *p = n->priv;
> +	struct dst_request *req;
> +	unsigned int num, idx, i;
> +	u64 start;
> +	unsigned long flags;
> +	int err;
> +
> +	while (!list_empty(&p->backlog_list)) {
> +		req = NULL;
> +		spin_lock_irqsave(&p->backlog_lock, flags);
> +		if (!list_empty(&p->backlog_list)) {
> +			req = list_entry(p->backlog_list.next,
> +					struct dst_request,
> +					request_list_entry);
> +			list_del(&req->request_list_entry);
> +		}
> +		spin_unlock_irqrestore(&p->backlog_lock, flags);
> +
> +		if (!req)
> +			break;
> +
> +		start = req->start - to_sector(req->orig_size - req->size);
> +
> +		idx = start;
> +		num = to_sector(req->orig_size);
> +
> +		for (i=0; i<num; ++i)
> +			if (test_bit(idx+i, p->chunk))
> +				break;
> +
> +		dprintk("%s: idx: %u, num: %u, i: %u, req: %p, "
> +				"start: %llu, size: %llu.\n",
> +				__func__, idx, num, i, req, 
> +				req->start, req->orig_size);
> +
> +		err = -1;
> +		if (i != num) {
> +			err = kst_enqueue_req(n->state, req);
> +			if (err) {
> +				printk("%s: congestion [%c]: req: %p, "
> +						"start: %llu, size: %llu.\n",
> +					__func__,
> +					(bio_rw(req->bio) == WRITE)?'W':'R',
> +					req, req->start, req->size);
> +				kst_del_req(req);
> +			}
> +		}
> +		if (err) {
> +			req->bio_endio(req, err);
> +			dst_free_request(req);
> +		}
> +	}
> +
> +	kst_wake(n->state);
> +}
> +
> +static void dst_mirror_mark_sync(struct dst_node *n)
> +{
> +	if (test_bit(DST_NODE_NOTSYNC, &n->flags)) {
> +		clear_bit(DST_NODE_NOTSYNC, &n->flags);
> +		printk("%s: node: %p, %llu:%llu synchronization "
> +				"has been completed.\n",
> +			__func__, n, n->start, n->size);
> +	}
> +}
> +
> +static void dst_mirror_mark_notsync(struct dst_node *n)
> +{
> +	if (!test_bit(DST_NODE_NOTSYNC, &n->flags)) {
> +		set_bit(DST_NODE_NOTSYNC, &n->flags);
> +		printk("%s: not synced node n: %p.\n", __func__, n);
> +	}
> +}
> +
> +/*
> + * Without errors it is always called under node's request lock,
> + * so it is safe to requeue them.
> + */
> +static void dst_mirror_bio_error(struct dst_request *req, int err)
> +{
> +	int i;
> +	struct dst_mirror_priv *priv = req->node->priv;
> +	unsigned int num, idx;
> +	void (*process_bit[])(int nr, volatile void *addr) =
> +		{&__clear_bit, &__set_bit};
> +	u64 start = req->start - to_sector(req->orig_size - req->size);
> +
> +	if (err)
> +		dst_mirror_mark_notsync(req->node);
> +	else
> +		dst_mirror_sync_requeue(req->node);
> +
> +	priv->last_start = req->start;
> +
> +	idx = start;
> +	num = to_sector(req->orig_size);
> +
> +	dprintk("%s: req_priv: %p, chunk %p, %llu:%llu start: %llu, size: %llu, "
> +		"chunk_num: %u, idx: %d, num: %d, err: %d.\n",
> +		__func__, req->priv, priv->chunk, req->node->start, 
> +		req->node->size, start, req->orig_size, priv->chunk_num, 
> +		idx, num, err);
> +
> +	if (unlikely(idx >= priv->chunk_num || idx + num > priv->chunk_num)) {
> +		printk("%s: %llu:%llu req: %p, start: %llu, orig_size: %llu, "
> +			"req_start: %llu, req_size: %llu, "
> +			"chunk_num: %u, idx: %d, num: %d, err: %d.\n",
> +			__func__, req->node->start, req->node->size, req,
> +			start, req->orig_size, 
> +			req->start, req->size,
> +			priv->chunk_num, idx, num, err);
> +		return;
> +	}
> +
> +	for (i=0; i<num; ++i)
> +		process_bit[!!err](idx+i, priv->chunk);
> +}
> +
> +static void dst_mirror_sync_req_endio(struct dst_request *req, int err)
> +{
> +	unsigned long notsync = 0;
> +	struct dst_mirror_priv *priv = req->node->priv;
> +	int i;
> +
> +	dst_mirror_bio_error(req, err);
> +
> +	printk("%s: freeing bio: %p, bi_size: %u, "
> +			"orig_size: %llu, req: %p, node: %p.\n",
> +		__func__, req->bio, req->bio->bi_size, req->orig_size, req,
> +		req->node);
> +
> +	bio_put(req->bio);
> +
> +	for (i = 0; i < priv->chunk_num/BITS_PER_LONG; ++i) {
> +		notsync = priv->chunk[i];
> +
> +		if (notsync)
> +			break;
> +	}
> +
> +	if (!notsync)
> +		dst_mirror_mark_sync(req->node);
> +}
> +
> +static int dst_mirror_sync_endio(struct bio *bio, unsigned int size, int err)
> +{
> +	struct dst_request *req = bio->bi_private;
> +	struct dst_node *n = req->node;
> +	struct dst_mirror_priv *priv = n->priv;
> +	unsigned long flags;
> +
> +	printk("%s: bio: %p, err: %d, size: %u, req: %p.\n",
> +			__func__, bio, err, bio->bi_size, req);
> +
> +	if (bio->bi_size)
> +		return 1;
> +
> +	bio->bi_rw = WRITE;
> +	bio->bi_size = req->orig_size;
> +	bio->bi_sector = req->start;
> +
> +	if (!err) {
> +		spin_lock_irqsave(&priv->backlog_lock, flags);
> +		list_add_tail(&req->request_list_entry, &priv->backlog_list);
> +		spin_unlock_irqrestore(&priv->backlog_lock, flags);
> +		kst_wake(req->state);
> +	} else {
> +		req->bio_endio(req, err);
> +		dst_free_request(req);
> +	}
> +	return 0;
> +}
> +
> +static int dst_mirror_sync_block(struct dst_node *n,
> +		int bit_start, int bit_num)
> +{
> +	u64 start = to_bytes(bit_start);
> +	struct bio *bio;
> +	unsigned int nr_pages = to_bytes(bit_num)/PAGE_SIZE, i;
> +	struct page *page;
> +	int err = -ENOMEM;
> +	struct dst_request *req;
> +
> +	printk("%s: bit_start: %d, bit_num: %d, start: %llu, nr_pages: %u, "
> +			"disk_size: %llu.\n",
> +			__func__, bit_start, bit_num, start, nr_pages,
> +			n->st->disk_size);
> +
> +	while (nr_pages) {
> +		req = dst_clone_request(NULL, n->w->req_pool);
> +		if (!req)
> +			return -ENOMEM;
> +
> +		bio = bio_alloc_bioset(GFP_NOIO, nr_pages, dst_mirror_bio_set);
> +		if (!bio)
> +			goto err_out_free_req;
> +
> +		bio->bi_rw = READ;
> +		bio->bi_private = req;
> +		bio->bi_sector = to_sector(start);
> +		bio->bi_bdev = NULL;
> +		bio->bi_destructor = dst_mirror_sync_destructor;
> +		bio->bi_end_io = dst_mirror_sync_endio;
> +
> +		for (i = 0; i < nr_pages; ++i) {
> +			err = -ENOMEM;
> +
> +			page = alloc_page(GFP_NOIO);
> +			if (!page)
> +				break;
> +
> +			err = bio_add_pc_page(n->st->queue, bio,
> +					page, PAGE_SIZE, 0);
> +			if (err <= 0)
> +				break;
> +			err = 0;
> +		}
> +
> +		if (err && !bio->bi_vcnt)
> +			goto err_out_put_bio;
> +
> +		req->node = n;
> +		req->state = n->state;
> +		req->start = bio->bi_sector;
> +		req->size = req->orig_size = bio->bi_size;
> +		req->bio = bio;
> +		req->idx = bio->bi_idx;
> +		req->num = bio->bi_vcnt;
> +		req->bio_endio = &dst_mirror_sync_req_endio;
> +		req->callback = &kst_data_callback;
> +
> +		dprintk("%s: start: %llu, size(pages): %u, bio: %p, "
> +				"size: %u, cnt: %d, req: %p, size: %llu.\n",
> +				__func__, bio->bi_sector, nr_pages, bio,
> +				bio->bi_size, bio->bi_vcnt, req, req->size);
> +
> +		err = n->st->queue->make_request_fn(n->st->queue, bio);
> +		if (err)
> +			goto err_out_put_bio;
> +
> +		nr_pages -= bio->bi_vcnt;
> +		start += bio->bi_size;
> +	}
> +
> +	return 0;
> +
> +err_out_put_bio:
> +	bio_put(bio);
> +err_out_free_req:
> +	dst_free_request(req);
> +	return err;
> +}
> +
> +/*
> + * Resync logic.
> + *
> + * System allocates and queues requests for number of regions.
> + * Each request initially is reading from the one of the nodes.
> + * When it is completed, system checks if given region was already
> + * written to, and in such case just drops read request, otherwise
> + * it writes it to the node being updated. Any write clears not-uptodate
> + * bit, which is used as a flag that region must be synchronized or not.
> + * Reading is never performed from the node under resync.
> + */
> +static int dst_mirror_resync(struct dst_node *n)
> +{
> +	int err = 0, sync = 0;
> +	struct dst_mirror_priv *priv = n->priv;
> +	unsigned int i;
> +
> +	printk("%s: node: %p, %llu:%llu synchronization has been started.\n",
> +			__func__, n, n->start, n->size);
> +
> +	for (i = 0; i < priv->chunk_num/BITS_PER_LONG; ++i) {
> +		int bit, num, start;
> +		unsigned long word = priv->chunk[i];
> +
> +		if (!word)
> +			continue;
> +
> +		num = 0;
> +		start = -1;
> +		while (word && num < BITS_PER_LONG) {
> +			bit = __ffs(word);
> +			if (start == -1)
> +				start = bit;
> +			num++;
> +			word >>= (bit+1);
> +		}
> +
> +		if (start != -1) {
> +			err = dst_mirror_sync_block(n, start + i*BITS_PER_LONG,
> +					num);
> +			if (err)
> +				break;
> +			sync++;
> +		}
> +	}
> +
> +	if (!sync && !err)
> +		dst_mirror_mark_sync(n);
> +
> +	return err;
> +}
> +
> +static void dst_mirror_destructor(struct bio *bio)
> +{
> +	dprintk("%s: bio: %p.\n", __func__, bio);
> +	bio_free(bio, dst_mirror_bio_set);
> +}
> +
> +static int dst_mirror_end_io(struct bio *bio, unsigned int size, int err)
> +{
> +	struct dst_request *req = bio->bi_private;
> +
> +	if (bio->bi_size)
> +		return 0;
> +
> +	dprintk("%s: req: %p, bio: %p, req->bio: %p, err: %d.\n",
> +			__func__, req, bio, req->bio, err);
> +	req->bio_endio(req, err);
> +	bio_put(bio);
> +	return 0;
> +}
> +
> +static void dst_mirror_read_endio(struct dst_request *req, int err)
> +{
> +	dst_mirror_bio_error(req, err);
> +
> +	if (!err)
> +		kst_bio_endio(req, 0);
> +}
> +
> +static void dst_mirror_write_endio(struct dst_request *req, int err)
> +{
> +	dst_mirror_bio_error(req, err);
> +
> +	req = req->priv;
> +
> +	dprintk("%s: req: %p, priv: %p err: %d, bio: %p, "
> +			"cnt: %d, orig_size: %llu.\n",
> +		__func__, req, req->priv, err, req->bio,
> +		atomic_read(&req->refcnt), req->orig_size);
> +
> +	if (atomic_dec_and_test(&req->refcnt)) {
> +		dprintk("%s: freeing bio %p.\n", __func__, req->bio);
> +		bio_endio(req->bio, req->orig_size, 0);
> +		dst_free_request(req);
> +	}
> +}
> +
> +static int dst_mirror_process_request(struct dst_request *req,
> +		struct dst_node *n)
> +{
> +	int err = 0;
> +
> +	/*
> +	 * Block layer requires to clone a bio.
> +	 */
> +	if (n->bdev) {
> +		struct bio *clone = bio_alloc_bioset(GFP_NOIO,
> +			req->bio->bi_max_vecs, dst_mirror_bio_set);
> +
> +		__bio_clone(clone, req->bio);
> +
> +		clone->bi_bdev = n->bdev;
> +		clone->bi_destructor = dst_mirror_destructor;
> +		clone->bi_private = req;
> +		clone->bi_end_io = &dst_mirror_end_io;
> +
> +		dprintk("%s: clone: %p, bio: %p, req: %p.\n",
> +				__func__, clone, req->bio, req);
> +
> +		generic_make_request(clone);
> +	} else {
> +		struct dst_request nr;
> +		/*
> +		 * Network state processing engine will clone request 
> +		 * by itself if needed. We can not use the same structure
> +		 * here, since number of its fields will be modified.
> +		 */
> +		memcpy(&nr, req, sizeof(struct dst_request));
> +
> +		nr.node = n;
> +		nr.state = n->state;
> +		nr.priv = req;
> +
> +		err = kst_check_permissions(n->state, req->bio);
> +		if (!err)
> +			err = req->state->ops->push(&nr);
> +	}
> +
> +	dprintk("%s: req: %p, n: %p, bdev: %p, err: %d.\n",
> +			__func__, req, n, n->bdev, err);
> +	return err;
> +}
> +
> +static int dst_mirror_write(struct dst_request *oreq)
> +{
> +	struct dst_node *n, *node = req->node;
> +	int num, err = 0, err_num = 0, orig_num;
> +
> +	req = dst_clone_request(oreq, oreq->node->w->req_pool);
> +	if (!req) {
> +		kst_bio_endio(oreq, -ENOMEM);
> +		return -ENOMEM;
> +	}
> +
> +	req->priv = req;
> +
> +	/*
> +	 * This logic is pretty simple - req->bio_endio will not
> +	 * call bio_endio() until all mirror devices completed
> +	 * processing of the request (no matter with or without error).
> +	 * Mirror's req->bio_endio callback will take care of that.
> +	 */
> +	orig_num = num = atomic_read(&req->node->shared_num) + 1;
> +	atomic_set(&req->refcnt, num);
> +
> +	req->bio_endio = &dst_mirror_write_endio;
> +
> +	dprintk("\n%s: req: %p, mirror to %d nodes.\n",
> +			__func__, req, num);
> +
> +	err = dst_mirror_process_request(req, node);
> +	if (err)
> +		err_num++;
> +
> +	if (--num) {
> +		list_for_each_entry_rcu(n, &node->shared, shared) {

This function is called under rcu_read_lock() or similar, right?
(Can't tell from this patch.)  It is also OK to call it from under the
update-side mutex, of course.

> +			dprintk("\n%s: req: %p, start: %llu, size: %llu, "
> +					"num: %d, n: %p.\n",
> +				__func__, req, req->start, 
> +				req->size, num, n);
> +
> +			err = dst_mirror_process_request(req, n);
> +			if (err)
> +				err_num++;
> +
> +			if (--num <= 0)
> +				break;
> +		}
> +	}
> +
> +	if (err_num == orig_num) {
> +		dprintk("%s: req: %p, num: %d, err: %d.\n",
> +				__func__, req, num, err);
> +		return -ENODEV;
> +	}
> +
> +	return 0;
> +}
> +
> +static int dst_mirror_read(struct dst_request *req)
> +{
> +	struct dst_node *node = req->node, *n, *min_dist_node;
> +	struct dst_mirror_priv *priv = node->priv;
> +	u64 dist, d;
> +	int err;
> +
> +	req->bio_endio = &dst_mirror_read_endio;
> +
> +	do {
> +		err = -ENODEV;
> +		min_dist_node = NULL;
> +		dist = -1ULL;
> + 
> +		/*
> +		 * Reading is never performed from the node under resync.
> +		 * If this will cause any troubles (like all nodes must be
> +		 * resynced between each other), this check can be removed
> +		 * and per-chunk dirty bit can be tested instead.
> +		 */
> +
> +		if (!test_bit(DST_NODE_NOTSYNC, &node->flags)) {
> +			priv = node->priv;
> +			if (req->start > priv->last_start)
> +				dist = req->start - priv->last_start;
> +			else
> +				dist = priv->last_start - req->start;
> +			min_dist_node = req->node;
> +		}
> +
> +		list_for_each_entry_rcu(n, &node->shared, shared) {

I see one call to this function that appears to be under the update-side
mutex, but I cannot tell if the other calls are safe.  (Safe as in either
under the update-side mutex or under rcu_read_lock() and friends.)

> +			if (test_bit(DST_NODE_NOTSYNC, &n->flags))
> +				continue;
> +
> +			priv = n->priv;
> +
> +			if (req->start > priv->last_start)
> +				d = req->start - priv->last_start;
> +			else
> +				d = priv->last_start - req->start;
> +
> +			if (d < dist)
> +				min_dist_node = n;
> +		}
> +
> +		if (!min_dist_node)
> +			break;
> +
> +		req->node = min_dist_node;
> +		req->state = req->node->state;
> +
> +		if (req->node->bdev) {
> +			req->bio->bi_bdev = req->node->bdev;
> +			generic_make_request(req->bio);
> +			err = 0;
> +			break;
> +		}
> +
> +		err = req->state->ops->push(req);
> +		if (err) {
> +			printk("%s: 1 req: %p, bio: %p, node: %p, err: %d.\n",
> +				__func__, req, req->bio, min_dist_node, err);
> +			dst_mirror_mark_notsync(req->node);
> +		}
> +	} while (err && min_dist_node);
> +
> +	if (err) {
> +		printk("%s: req: %p, bio: %p, node: %p, err: %d.\n",
> +			__func__, req, req->bio, min_dist_node, err);
> +		kst_bio_endio(req, err);
> +	}
> +	return err;
> +}
> +
> +/*
> + * This callback is invoked from block layer request processing function,
> + * its task is to remap block request to different nodes.
> + */
> +static int dst_mirror_remap(struct dst_request *req)
> +{
> +	int (*remap[])(struct dst_request *) = 
> +		{&dst_mirror_read, &dst_mirror_write};
> +
> +	return remap[bio_rw(req->bio) == WRITE](req);
> +}
> +
> +static int dst_mirror_error(struct kst_state *st, int err)
> +{
> +	struct dst_request *req, *tmp;
> +	unsigned int revents = st->socket->ops->poll(NULL, st->socket, NULL);
> +
> +	if (err == -EEXIST)
> +		return err;
> +
> +	if (!(revents & (POLLERR | POLLHUP))) {
> +		if (test_bit(DST_NODE_NOTSYNC, &st->node->flags)) {
> +			return dst_mirror_resync(st->node);
> +		}
> +		return 0;
> +	}
> +
> +	dst_mirror_mark_notsync(st->node);
> +
> +	mutex_lock(&st->request_lock);
> +	list_for_each_entry_safe(req, tmp, &st->request_list,
> +					request_list_entry) {
> +		kst_del_req(req);
> +		dprintk("%s: requeue [%c], start: %llu, idx: %d,"
> +				" num: %d, size: %llu, offset: %u, err: %d.\n",
> +			__func__, (bio_rw(req->bio) == WRITE)?'W':'R',
> +			req->start, req->idx, req->num, req->size,
> +			req->offset, err);
> +
> +		if (bio_rw(req->bio) == READ) {
> +			req->start -= to_sector(req->orig_size - req->size);
> +			req->size = req->orig_size;
> +			req->flags &= ~DST_REQ_HEADER_SENT;
> +			req->idx = 0;
> +			if (dst_mirror_read(req))
> +				kst_complete_req(req, err);
> +			else
> +				dst_free_request(req);
> +		} else {
> +			kst_complete_req(req, err);
> +		}
> +	}
> +	mutex_unlock(&st->request_lock);
> +	return err;
> +}
> +
> +static struct dst_alg_ops alg_mirror_ops = {
> +	.remap		= dst_mirror_remap,
> +	.add_node	= dst_mirror_add_node,
> +	.del_node	= dst_mirror_del_node,
> +	.error		= dst_mirror_error,
> +	.owner		= THIS_MODULE,
> +};
> +
> +static int __devinit alg_mirror_init(void)
> +{
> +	int err = -ENOMEM;
> +
> +	dst_mirror_bio_set = bioset_create(256, 256);
> +	if (!dst_mirror_bio_set)
> +		return -ENOMEM;
> +
> +	alg_mirror = dst_alloc_alg("alg_mirror", &alg_mirror_ops);
> +	if (!alg_mirror)
> +		goto err_out;
> +
> +	return 0;
> +
> +err_out:
> +	bioset_free(dst_mirror_bio_set);
> +	return err;
> +}
> +
> +static void __devexit alg_mirror_exit(void)
> +{
> +	dst_remove_alg(alg_mirror);
> +	bioset_free(dst_mirror_bio_set);
> +}
> +
> +module_init(alg_mirror_init);
> +module_exit(alg_mirror_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>");
> +MODULE_DESCRIPTION("Mirror distributed algorithm.");
> diff --git a/drivers/block/dst/dcore.c b/drivers/block/dst/dcore.c
> new file mode 100644
> index 0000000..2bf7fc1
> --- /dev/null
> +++ b/drivers/block/dst/dcore.c
> @@ -0,0 +1,1526 @@
> +/*
> + * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/blkdev.h>
> +#include <linux/bio.h>
> +#include <linux/slab.h>
> +#include <linux/miscdevice.h>
> +#include <linux/socket.h>
> +#include <linux/dst.h>
> +#include <linux/device.h>
> +#include <linux/in.h>
> +#include <linux/in6.h>
> +#include <linux/buffer_head.h>
> +
> +#include <net/sock.h>
> +
> +static LIST_HEAD(dst_storage_list);
> +static LIST_HEAD(dst_alg_list);
> +static DEFINE_MUTEX(dst_storage_lock);
> +static DEFINE_MUTEX(dst_alg_lock);
> +static int dst_major;
> +static struct kst_worker *kst_main_worker;
> +
> +struct kmem_cache *dst_request_cache;
> +
> +/*
> + * DST sysfs tree. For device called 'storage' which is formed
> + * on top of two nodes this looks like this:
> + *
> + * /sys/devices/storage/
> + * /sys/devices/storage/alg : alg_linear
> + * /sys/devices/storage/n-800/type : R: 192.168.4.80:1025
> + * /sys/devices/storage/n-800/size : 800
> + * /sys/devices/storage/n-800/start : 800
> + * /sys/devices/storage/n-0/type : R: 192.168.4.81:1025
> + * /sys/devices/storage/n-0/size : 800
> + * /sys/devices/storage/n-0/start : 0
> + * /sys/devices/storage/remove_all_nodes
> + * /sys/devices/storage/nodes : sectors (start [size]): 0 [800] | 800 [800]
> + * /sys/devices/storage/name : storage
> + */
> +
> +static int dst_dev_match(struct device *dev, struct device_driver *drv)
> +{
> +	return 1;
> +}
> +
> +static void dst_dev_release(struct device *dev)
> +{
> +}
> +
> +static struct bus_type dst_dev_bus_type = {
> +	.name 		= "dst",
> +	.match 		= &dst_dev_match,
> +};
> +
> +static struct device dst_dev = {
> +	.bus 		= &dst_dev_bus_type,
> +	.release 	= &dst_dev_release
> +};
> +
> +static void dst_node_release(struct device *dev)
> +{
> +}
> +
> +static struct device dst_node_dev = {
> +	.release 	= &dst_node_release
> +};
> +
> +static struct bio_set *dst_bio_set;
> +
> +static void dst_destructor(struct bio *bio)
> +{
> +	bio_free(bio, dst_bio_set);
> +}
> +
> +/*
> + * Internal callback for local requests (i.e. for local disk),
> + * which are splitted between nodes (part with local node destination
> + * ends up with this ->bi_end_io() callback).
> + */
> +static int dst_end_io(struct bio *bio, unsigned int size, int err)
> +{
> +	struct bio *orig_bio = bio->bi_private;
> +
> +	if (bio->bi_size)
> +		return 0;
> +
> +	dprintk("%s: bio: %p, orig_bio: %p, size: %u, orig_size: %u.\n",
> +		__func__, bio, orig_bio, size, orig_bio->bi_size);
> +
> +	bio_endio(orig_bio, size, 0);
> +	bio_put(bio);
> +	return 0;
> +}
> +
> +/*
> + * This function sends processing request down to block layer (for local node)
> + * or to network state machine (for remote node).
> + */
> +static int dst_node_push(struct dst_request *req)
> +{
> +	int err = 0;
> +	struct dst_node *n = req->node;
> +
> +	if (n->bdev) {
> +		struct bio *bio = req->bio;
> +
> +		dprintk("%s: start: %llu, num: %d, idx: %d, offset: %u, "
> +				"size: %llu, bi_idx: %d, bi_vcnt: %d.\n",
> +			__func__, req->start, req->num, req->idx,
> +			req->offset, req->size,	bio->bi_idx, bio->bi_vcnt);
> +
> +		if (likely(bio->bi_idx == req->idx &&
> +					bio->bi_vcnt == req->num)) {
> +			bio->bi_bdev = n->bdev;
> +			bio->bi_sector = req->start;
> +		} else {
> +			struct bio *clone = bio_alloc_bioset(GFP_NOIO,
> +					bio->bi_max_vecs, dst_bio_set);
> +			struct bio_vec *bv;
> +
> +			err = -ENOMEM;
> +			if (!clone)
> +				goto out_put;
> +
> +			__bio_clone(clone, bio);
> +
> +			bv = bio_iovec_idx(clone, req->idx);
> +			bv->bv_offset += req->offset;
> +			clone->bi_idx = req->idx;
> +			clone->bi_vcnt = req->num;
> +			clone->bi_bdev = n->bdev;
> +			clone->bi_sector = req->start;
> +			clone->bi_destructor = dst_destructor;
> +			clone->bi_private = bio;
> +			clone->bi_size = req->orig_size;
> +			clone->bi_end_io = &dst_end_io;
> +			req->bio = clone;
> +
> +			dprintk("%s: start: %llu, num: %d, idx: %d, "
> +				"offset: %u, size: %llu, "
> +				"bi_idx: %d, bi_vcnt: %d, req: %p, bio: %p.\n",
> +				__func__, req->start, req->num, req->idx,
> +				req->offset, req->size,
> +				clone->bi_idx, clone->bi_vcnt, req, req->bio);
> +
> +		}
> +	}
> +
> +	err = n->st->alg->ops->remap(req);
> +
> +out_put:
> +	dst_node_put(n);
> +	return err;
> +}
> +
> +/*
> + * This function is invoked from block layer request processing function,
> + * its task is to remap block request to different nodes.
> + */
> +static int dst_remap(struct dst_storage *st, struct bio *bio)
> +{
> +	struct dst_node *n;
> +	int err = -EINVAL, i, cnt;
> +	unsigned int bio_sectors = bio->bi_size>>9;
> +	struct bio_vec *bv;
> +	struct dst_request req;
> +	u64 rest_in_node, start, total_size;
> +
> +	mutex_lock(&st->tree_lock);
> +	n = dst_storage_tree_search(st, bio->bi_sector);
> +	mutex_unlock(&st->tree_lock);
> +
> +	if (!n) {
> +		dprintk("%s: failed to find a node for bio: %p, "
> +				"sector: %llu.\n",
> +				__func__, bio, bio->bi_sector);
> +		return -ENODEV;
> +	}
> +
> +	dprintk("%s: bio: %llu-%llu, dev: %llu-%llu, in sectors.\n",
> +			__func__, bio->bi_sector, bio->bi_sector+bio_sectors,
> +			n->start, n->start+n->size);
> +
> +	memset(&req, 0, sizeof(struct dst_request));
> +
> +	start = bio->bi_sector;
> +	total_size = bio->bi_size;
> +
> +	req.flags = (test_bit(DST_NODE_FROZEN, &n->flags))?
> +				DST_REQ_ALWAYS_QUEUE:0;
> +	req.start = start - n->start;
> +	req.offset = 0;
> +	req.state = n->state;
> +	req.node = n;
> +	req.bio = bio;
> +
> +	req.size = bio->bi_size;
> +	req.orig_size = bio->bi_size;
> +	req.idx = bio->bi_idx;
> +	req.num = bio->bi_vcnt;
> +
> +	req.bio_endio = &kst_bio_endio;
> +
> +	/*
> +	 * Common fast path - block request does not cross
> +	 * boundaries between nodes.
> +	 */
> +	if (likely(bio->bi_sector + bio_sectors <= n->start + n->size))
> +		return dst_node_push(&req);
> +
> +	req.size = 0;
> +	req.idx = 0;
> +	req.num = 1;
> +
> +	cnt = bio->bi_vcnt;
> +
> +	rest_in_node = to_bytes(n->size - req.start);
> +
> +	for (i = 0; i < cnt; ++i) {
> +		bv = bio_iovec_idx(bio, i);
> +
> +		if (req.size + bv->bv_len >= rest_in_node) {
> +			unsigned int diff = req.size + bv->bv_len -
> +				rest_in_node;
> +
> +			req.size += bv->bv_len - diff;
> +			req.start = start - n->start;
> +			req.orig_size = req.size;
> +			req.bio = bio;
> +			req.bio_endio = &kst_bio_endio;
> +
> +			dprintk("%s: split: start: %llu/%llu, size: %llu, "
> +					"total_size: %llu, diff: %u, idx: %d, "
> +					"num: %d, bv_len: %u, bv_offset: %u.\n",
> +					__func__, start, req.start, req.size,
> +					total_size, diff, req.idx, req.num,
> +					bv->bv_len, bv->bv_offset);
> +
> +			err = dst_node_push(&req);
> +			if (err)
> +				break;
> +
> +			total_size -= req.orig_size;
> +
> +			if (!total_size)
> +				break;
> +
> +			start += to_sector(req.orig_size);
> +
> +			req.flags = (test_bit(DST_NODE_FROZEN, &n->flags))?
> +				DST_REQ_ALWAYS_QUEUE:0;
> +			req.orig_size = req.size = diff;
> +
> +			if (diff) {
> +				req.offset = bv->bv_len - diff;
> +				req.idx = req.num - 1;
> +			} else {
> +				req.idx = req.num;
> +				req.offset = 0;
> +			}
> +
> +			dprintk("%s: next: start: %llu, size: %llu, "
> +				"total_size: %llu, diff: %u, idx: %d, "
> +				"num: %d, offset: %u, bv_len: %u, "
> +				"bv_offset: %u.\n",
> +				__func__, start, req.size, total_size, diff,
> +				req.idx, req.num, req.offset,
> +				bv->bv_len, bv->bv_offset);
> +
> +			mutex_lock(&st->tree_lock);
> +			n = dst_storage_tree_search(st, start);
> +			mutex_unlock(&st->tree_lock);
> +
> +			if (!n) {
> +				err = -ENODEV;
> +				dprintk("%s: failed to find a split node for "
> +				  "bio: %p, sector: %llu, start: %llu.\n",
> +						__func__, bio, bio->bi_sector,
> +						req.start);
> +				break;
> +			}
> +
> +			req.state = n->state;
> +			req.node = n;
> +			req.start = start - n->start;
> +			rest_in_node = to_bytes(n->size - req.start);
> +
> +			dprintk("%s: req.start: %llu, start: %llu, "
> +					"dev_start: %llu, dev_size: %llu, "
> +					"rest_in_node: %llu.\n",
> +				__func__, req.start, start, n->start,
> +				n->size, rest_in_node);
> +		} else {
> +			req.size += bv->bv_len;
> +			req.num++;
> +		}
> +	}
> +
> +	dprintk("%s: last request: start: %llu, size: %llu, "
> +			"total_size: %llu.\n", __func__,
> +			req.start, req.size, total_size);
> +	if (total_size) {
> +		req.orig_size = req.size;
> +		req.bio = bio;
> +		req.bio_endio = &kst_bio_endio;
> +
> +		dprintk("%s: last: start: %llu/%llu, size: %llu, "
> +				"total_size: %llu, idx: %d, num: %d.\n",
> +			__func__, start, req.start, req.size,
> +			total_size, req.idx, req.num);
> +
> +		err = dst_node_push(&req);
> +		if (!err) {
> +			total_size -= req.orig_size;
> +
> +			BUG_ON(total_size != 0);
> +		}
> +	}
> +
> +	dprintk("%s: end bio: %p, err: %d.\n", __func__, bio, err);
> +	return err;
> +}
> +
> +
> +/*
> + * Distributed storage erquest processing function.
> + * It calls algorithm spcific remapping code only.
> + */
> +static int dst_request(request_queue_t *q, struct bio *bio)
> +{
> +	struct dst_storage *st = q->queuedata;
> +	int err;
> +
> +	dprintk("\n%s: start: st: %p, bio: %p, cnt: %u.\n",
> +			__func__, st, bio, bio->bi_vcnt);
> +
> +	err = dst_remap(st, bio);
> +
> +	dprintk("%s: end: st: %p, bio: %p, err: %d.\n",
> +			__func__, st, bio, err);
> +	return 0;
> +}
> +
> +static void dst_unplug(request_queue_t *q)
> +{
> +}
> +
> +static int dst_flush(request_queue_t *q, struct gendisk *disk, sector_t *sec)
> +{
> +	return 0;
> +}
> +
> +static struct block_device_operations dst_blk_ops = {
> +	.owner =	THIS_MODULE,
> +};
> +
> +/*
> + * Block layer binding - disk is created when array is fully configured
> + * by userspace request.
> + */
> +static int dst_create_disk(struct dst_storage *st)
> +{
> +	int err = -ENOMEM;
> +
> +	st->queue = blk_alloc_queue(GFP_KERNEL);
> +	if (!st->queue)
> +		goto err_out_exit;
> +
> +	st->queue->queuedata = st;
> +	blk_queue_make_request(st->queue, dst_request);
> +	blk_queue_bounce_limit(st->queue, BLK_BOUNCE_ANY);
> +	st->queue->unplug_fn = dst_unplug;
> +	st->queue->issue_flush_fn = dst_flush;
> +
> +	err = -EINVAL;
> +	st->disk = alloc_disk(1);
> +	if (!st->disk)
> +		goto err_out_free_queue;
> +
> +	st->disk->major = dst_major;
> +	st->disk->first_minor = (((unsigned long)st->disk) ^
> +		(((unsigned long)st->disk) >> 31)) & 0xff;
> +	st->disk->fops = &dst_blk_ops;
> +	st->disk->queue = st->queue;
> +	st->disk->private_data = st;
> +	snprintf(st->disk->disk_name, sizeof(st->disk->disk_name),
> +			"dst-%s-%d", st->name, st->disk->first_minor);
> +
> +	return 0;
> +
> +err_out_free_queue:
> +	blk_cleanup_queue(st->queue);
> +err_out_exit:
> +	return err;
> +}
> +
> +static void dst_remove_disk(struct dst_storage *st)
> +{
> +	del_gendisk(st->disk);
> +	put_disk(st->disk);
> +	blk_cleanup_queue(st->queue);
> +}
> +
> +/*
> + * Shows node name in sysfs.
> + */
> +static ssize_t dst_name_show(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_storage *st = container_of(dev, struct dst_storage, device);
> +
> +	return sprintf(buf, "%s\n", st->name);
> +}
> +
> +static void dst_remove_all_nodes(struct dst_storage *st)
> +{
> +	struct dst_node *n, *node, *tmp;
> +	struct rb_node *rb_node;
> +
> +	mutex_lock(&st->tree_lock);
> +	while ((rb_node = rb_first(&st->tree_root)) != NULL) {
> +		n = rb_entry(rb_node, struct dst_node, tree_node);
> +		dprintk("%s: n: %p, start: %llu, size: %llu.\n",
> +				__func__, n, n->start, n->size);
> +		rb_erase(&n->tree_node, &st->tree_root);
> +		if (!n->shared_head && atomic_read(&n->shared_num)) {
> +			list_for_each_entry_safe(node, tmp, &n->shared, shared) {
> +				list_del_rcu(&node->shared);

Under the update-side mutex, so OK.

> +				atomic_dec(&node->shared_head->refcnt);
> +				node->shared_head = NULL;
> +				dst_node_put(node);
> +			}
> +		}
> +		dst_node_put(n);
> +	}
> +	mutex_unlock(&st->tree_lock);
> +}
> +
> +/*
> + * Shows node layout in syfs.
> + */
> +static ssize_t dst_nodes_show(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_storage *st = container_of(dev, struct dst_storage, device);
> +	int size = PAGE_CACHE_SIZE, sz;
> +	struct dst_node *n;
> +	struct rb_node *rb_node;
> +
> +	sz = sprintf(buf, "sectors (start [size]): ");
> +	size -= sz;
> +	buf += sz;
> +
> +	mutex_lock(&st->tree_lock);
> +	for (rb_node = rb_first(&st->tree_root); rb_node;
> +			rb_node = rb_next(rb_node)) {
> +		n = rb_entry(rb_node, struct dst_node, tree_node);
> +		if (size < 32)
> +			break;
> +		sz = sprintf(buf, "%llu [%llu]", n->start, n->size);
> +		buf += sz;
> +		size -= sz;
> +
> +		if (!rb_next(rb_node))
> +			break;
> +
> +		sz = sprintf(buf, " | ");
> +		buf += sz;
> +		size -= sz;
> +	}
> +	mutex_unlock(&st->tree_lock);
> +	size -= sprintf(buf, "\n");
> +	return PAGE_CACHE_SIZE - size;
> +}
> +
> +/*
> + * Algorithm currently being used by given storage.
> + */
> +static ssize_t dst_alg_show(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_storage *st = container_of(dev, struct dst_storage, device);
> +	return sprintf(buf, "%s\n", st->alg->name);
> +}
> +
> +/*
> + * Writing to this sysfs file allows to remove all nodes
> + * and storage itself automatically.
> + */
> +static ssize_t dst_remove_nodes(struct device *dev,
> +		struct device_attribute *attr,
> +		const char *buf, size_t count)
> +{
> +	struct dst_storage *st = container_of(dev, struct dst_storage, device);
> +	dst_remove_all_nodes(st);
> +	return count;
> +}
> +
> +static DEVICE_ATTR(name, 0444, dst_name_show, NULL);
> +static DEVICE_ATTR(nodes, 0444, dst_nodes_show, NULL);
> +static DEVICE_ATTR(alg, 0444, dst_alg_show, NULL);
> +static DEVICE_ATTR(remove_all_nodes, 0644, NULL, dst_remove_nodes);
> +
> +static int dst_create_storage_attributes(struct dst_storage *st)
> +{
> +	int err;
> +
> +	err = device_create_file(&st->device, &dev_attr_name);
> +	err = device_create_file(&st->device, &dev_attr_nodes);
> +	err = device_create_file(&st->device, &dev_attr_alg);
> +	err = device_create_file(&st->device, &dev_attr_remove_all_nodes);
> +	return 0;
> +}
> +
> +static void dst_remove_storage_attributes(struct dst_storage *st)
> +{
> +	device_remove_file(&st->device, &dev_attr_name);
> +	device_remove_file(&st->device, &dev_attr_nodes);
> +	device_remove_file(&st->device, &dev_attr_alg);
> +	device_remove_file(&st->device, &dev_attr_remove_all_nodes);
> +}
> +
> +static void dst_storage_sysfs_exit(struct dst_storage *st)
> +{
> +	dst_remove_storage_attributes(st);
> +	device_unregister(&st->device);
> +}
> +
> +static int dst_storage_sysfs_init(struct dst_storage *st)
> +{
> +	int err;
> +
> +	memcpy(&st->device, &dst_dev, sizeof(struct device));
> +	snprintf(st->device.bus_id, sizeof(st->device.bus_id), "%s", st->name);
> +
> +	err = device_register(&st->device);
> +	if (err) {
> +		dprintk(KERN_ERR "Failed to register dst device %s, err: %d.\n",
> +			st->name, err);
> +		goto err_out_exit;
> +	}
> +
> +	dst_create_storage_attributes(st);
> +
> +	return 0;
> +
> +err_out_exit:
> +	return err;
> +}
> +
> +/*
> + * This functions shows size and start of the appropriate node.
> + * Both are in sectors.
> + */
> +static ssize_t dst_show_start(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_node *n = container_of(dev, struct dst_node, device);
> +
> +	return sprintf(buf, "%llu\n", n->start);
> +}
> +
> +static ssize_t dst_show_size(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_node *n = container_of(dev, struct dst_node, device);
> +
> +	return sprintf(buf, "%llu\n", n->size);
> +}
> +
> +/*
> + * Shows type of the remote node - device major/minor number
> + * for local nodes and address (af_inet ipv4/ipv6 only) for remote nodes.
> + */
> +static ssize_t dst_show_type(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct dst_node *n = container_of(dev, struct dst_node, device);
> +	struct sockaddr addr;
> +	struct socket *sock;
> +	int addrlen;
> +
> +	if (!n->state && !n->bdev)
> +		return 0;
> +
> +	if (n->bdev)
> +		return sprintf(buf, "L: %d:%d\n",
> +				MAJOR(n->bdev->bd_dev), MINOR(n->bdev->bd_dev));
> +
> +	sock = n->state->socket;
> +	if (sock->ops->getname(sock, &addr, &addrlen, 2))
> +		return 0;
> +
> +	if (sock->ops->family == AF_INET) {
> +		struct sockaddr_in *sin = (struct sockaddr_in *)&addr;
> +		return sprintf(buf, "R: %u.%u.%u.%u:%d\n",
> +			NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
> +	} else if (sock->ops->family == AF_INET6) {
> +		struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&addr;
> +		return sprintf(buf,
> +			"R: %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%d\n",
> +			NIP6(sin->sin6_addr), ntohs(sin->sin6_port));
> +	}
> +	return 0;
> +}
> +
> +static DEVICE_ATTR(start, 0444, dst_show_start, NULL);
> +static DEVICE_ATTR(size, 0444, dst_show_size, NULL);
> +static DEVICE_ATTR(type, 0444, dst_show_type, NULL);
> +
> +static int dst_create_node_attributes(struct dst_node *n)
> +{
> +	int err;
> +
> +	err = device_create_file(&n->device, &dev_attr_start);
> +	err = device_create_file(&n->device, &dev_attr_size);
> +	err = device_create_file(&n->device, &dev_attr_type);
> +	return 0;
> +}
> +
> +static void dst_remove_node_attributes(struct dst_node *n)
> +{
> +	device_remove_file(&n->device, &dev_attr_start);
> +	device_remove_file(&n->device, &dev_attr_size);
> +	device_remove_file(&n->device, &dev_attr_type);
> +}
> +
> +static void dst_node_sysfs_exit(struct dst_node *n)
> +{
> +	if (n->device.parent == &n->st->device) {
> +		dst_remove_node_attributes(n);
> +		device_unregister(&n->device);
> +		n->device.parent = NULL;
> +	}
> +}
> +
> +static int dst_node_sysfs_init(struct dst_node *n)
> +{
> +	int err;
> +
> +	memcpy(&n->device, &dst_node_dev, sizeof(struct device));
> +
> +	n->device.parent = &n->st->device;
> +
> +	snprintf(n->device.bus_id, sizeof(n->device.bus_id),
> +			"n-%llu-%p", n->start, n);
> +	err = device_register(&n->device);
> +	if (err) {
> +		dprintk(KERN_ERR "Failed to register node, err: %d.\n", err);
> +		goto err_out_exit;
> +	}
> +
> +	dst_create_node_attributes(n);
> +
> +	return 0;
> +
> +err_out_exit:
> +	n->device.parent = NULL;
> +	return err;
> +}
> +
> +/*
> + * Gets a reference for given storage, if
> + * storage with given name and algorithm being used
> + * does not exist it is created.
> + */
> +static struct dst_storage *dst_get_storage(char *name, char *aname, int alloc)
> +{
> +	struct dst_storage *st, *rst = NULL;
> +	int err;
> +	struct dst_alg *alg;
> +
> +	mutex_lock(&dst_storage_lock);
> +	list_for_each_entry(st, &dst_storage_list, entry) {
> +		if (!strcmp(name, st->name) && !strcmp(st->alg->name, aname)) {
> +			rst = st;
> +			atomic_inc(&st->refcnt);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&dst_storage_lock);
> +
> +	if (rst || !alloc)
> +		return rst;
> +
> +	st = kzalloc(sizeof(struct dst_storage), GFP_KERNEL);
> +	if (!st)
> +		return NULL;
> +
> +	mutex_init(&st->tree_lock);
> +	/*
> +	 * One for storage itself,
> +	 * another one for attached node below.
> +	 */
> +	atomic_set(&st->refcnt, 2);
> +	snprintf(st->name, DST_NAMELEN, "%s", name);
> +	st->tree_root.rb_node = NULL;
> +
> +	err = dst_storage_sysfs_init(st);
> +	if (err)
> +		goto err_out_free;
> +
> +	err = dst_create_disk(st);
> +	if (err)
> +		goto err_out_sysfs_exit;
> +
> +	mutex_lock(&dst_alg_lock);
> +	list_for_each_entry(alg, &dst_alg_list, entry) {
> +		if (!strcmp(alg->name, aname)) {
> +			atomic_inc(&alg->refcnt);
> +			try_module_get(alg->ops->owner);
> +			st->alg = alg;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&dst_alg_lock);
> +
> +	if (!st->alg)
> +		goto err_out_disk_remove;
> +
> +	mutex_lock(&dst_storage_lock);
> +	list_add_tail(&st->entry, &dst_storage_list);
> +	mutex_unlock(&dst_storage_lock);
> +
> +	return st;
> +
> +err_out_disk_remove:
> +	dst_remove_disk(st);
> +err_out_sysfs_exit:
> +	dst_storage_sysfs_init(st);
> +err_out_free:
> +	kfree(st);
> +	return NULL;
> +}
> +
> +/*
> + * Allows to allocate and add new algorithm by external modules.
> + */
> +struct dst_alg *dst_alloc_alg(char *name, struct dst_alg_ops *ops)
> +{
> +	struct dst_alg *alg;
> +
> +	alg = kzalloc(sizeof(struct dst_alg), GFP_KERNEL);
> +	if (!alg)
> +		return NULL;
> +	snprintf(alg->name, DST_NAMELEN, "%s", name);
> +	atomic_set(&alg->refcnt, 1);
> +	alg->ops = ops;
> +
> +	mutex_lock(&dst_alg_lock);
> +	list_add_tail(&alg->entry, &dst_alg_list);
> +	mutex_unlock(&dst_alg_lock);
> +
> +	return alg;
> +}
> +EXPORT_SYMBOL_GPL(dst_alloc_alg);
> +
> +static void dst_free_alg(struct dst_alg *alg)
> +{
> +	dprintk("%s: alg: %p.\n", __func__, alg);
> +	kfree(alg);
> +}
> +
> +/*
> + * Algorithm is never freed directly,
> + * since its module reference counter is increased
> + * by storage when it is created - just like network protocols.
> + */
> +static inline void dst_put_alg(struct dst_alg *alg)
> +{
> +	dprintk("%s: alg: %p, refcnt: %d.\n",
> +			__func__, alg, atomic_read(&alg->refcnt));
> +	module_put(alg->ops->owner);
> +	if (atomic_dec_and_test(&alg->refcnt))
> +		dst_free_alg(alg);
> +}
> +
> +/*
> + * Removing algorithm from main list of supported algorithms.
> + */
> +void dst_remove_alg(struct dst_alg *alg)
> +{
> +	mutex_lock(&dst_alg_lock);
> +	list_del_init(&alg->entry);
> +	mutex_unlock(&dst_alg_lock);
> +
> +	dst_put_alg(alg);
> +}
> +EXPORT_SYMBOL_GPL(dst_remove_alg);
> +
> +static void dst_cleanup_node(struct dst_node *n)
> +{
> +	struct dst_storage *st = n->st;
> +
> +	dprintk("%s: node: %p.\n", __func__, n);
> +
> +	n->st->alg->ops->del_node(n);
> +
> +	if (n->shared_head) {
> +		mutex_lock(&st->tree_lock);
> +		list_del_rcu(&n->shared);

Under the update-side mutex, so OK.

> +		mutex_unlock(&st->tree_lock);
> +
> +		atomic_dec(&n->shared_head->refcnt);
> +		dst_node_put(n->shared_head);
> +		n->shared_head = NULL;
> +	}
> +
> +	if (n->cleanup)
> +		n->cleanup(n);
> +	dst_node_sysfs_exit(n);
> +	kfree(n);
> +}
> +
> +static void dst_free_storage(struct dst_storage *st)
> +{
> +	dprintk("%s: st: %p.\n", __func__, st);
> +
> +	BUG_ON(rb_first(&st->tree_root) != NULL);
> +
> +	dst_put_alg(st->alg);
> +	kfree(st);
> +}
> +
> +static inline void dst_put_storage(struct dst_storage *st)
> +{
> +	dprintk("%s: st: %p, refcnt: %d.\n",
> +			__func__, st, atomic_read(&st->refcnt));
> +	if (atomic_dec_and_test(&st->refcnt))
> +		dst_free_storage(st);
> +}
> +
> +void dst_node_put(struct dst_node *n)
> +{
> +	dprintk("%s: node: %p, start: %llu, size: %llu, refcnt: %d.\n",
> +			__func__, n, n->start, n->size,
> +			atomic_read(&n->refcnt));
> +
> +	if (atomic_dec_and_test(&n->refcnt)) {
> +		struct dst_storage *st = n->st;
> +
> +		dprintk("%s: freeing node: %p, start: %llu, size: %llu, "
> +				"refcnt: %d.\n",
> +				__func__, n, n->start, n->size,
> +				atomic_read(&n->refcnt));
> +
> +		dst_cleanup_node(n);
> +		dst_put_storage(st);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(dst_node_put);
> +
> +static inline int dst_compare_id(struct dst_node *old, u64 new)
> +{
> +	if (old->start + old->size <= new)
> +		return 1;
> +	if (old->start > new)
> +		return -1;
> +	return 0;
> +}
> +
> +/*
> + * Tree of of the nodes, which form the storage.
> + * Tree is indexed via start of the node and its size.
> + * Comparison function above.
> + */
> +struct dst_node *dst_storage_tree_search(struct dst_storage *st, u64 start)
> +{
> +	struct rb_node *n = st->tree_root.rb_node;
> +	struct dst_node *dn;
> +	int cmp;
> +
> +	while (n) {
> +		dn = rb_entry(n, struct dst_node, tree_node);
> +
> +		cmp = dst_compare_id(dn, start);
> +		dprintk("%s: tree: %llu-%llu, new: %llu.\n",
> +			__func__, dn->start, dn->start+dn->size, start);
> +		if (cmp < 0)
> +			n = n->rb_left;
> +		else if (cmp > 0)
> +			n = n->rb_right;
> +		else {
> +			return dst_node_get(dn);
> +		}
> +	}
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(dst_storage_tree_search);
> +
> +/*
> + * This function allows to remove a node with given start address
> + * from the storage.
> + */
> +static struct dst_node *dst_storage_tree_del(struct dst_storage *st, u64 start)
> +{
> +	struct dst_node *n = dst_storage_tree_search(st, start);
> +
> +	if (!n)
> +		return NULL;
> +
> +	rb_erase(&n->tree_node, &st->tree_root);
> +	dst_node_put(n);
> +	return n;
> +}
> +
> +/*
> + * This function allows to add given node to the storage.
> + * Returns -EEXIST if the same area is already covered by another node.
> + * This is return must be checked for redundancy algorithms.
> + */
> +static struct dst_node *dst_storage_tree_add(struct dst_node *new,
> +		struct dst_storage *st)
> +{
> +	struct rb_node **n = &st->tree_root.rb_node, *parent = NULL;
> +	struct dst_node *dn;
> +	int cmp;
> +
> +	while (*n) {
> +		parent = *n;
> +		dn = rb_entry(parent, struct dst_node, tree_node);
> +
> +		cmp = dst_compare_id(dn, new->start);
> +		dprintk("%s: tree: %llu-%llu, new: %llu.\n",
> +				__func__, dn->start, dn->start+dn->size,
> +				new->start);
> +		if (cmp < 0)
> +			n = &parent->rb_left;
> +		else if (cmp > 0)
> +			n = &parent->rb_right;
> +		else {
> +			return dn;
> +		}
> +	}
> +
> +	rb_link_node(&new->tree_node, parent, n);
> +	rb_insert_color(&new->tree_node, &st->tree_root);
> +
> +	return NULL;
> +}
> +
> +/*
> + * This function finds devices major/minor numbers for given pathname.
> + */
> +static int dst_lookup_device(const char *path, dev_t *dev)
> +{
> +	int err;
> +	struct nameidata nd;
> +	struct inode *inode;
> +
> +	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
> +	if (err)
> +		return err;
> +
> +	inode = nd.dentry->d_inode;
> +	if (!inode) {
> +		err = -ENOENT;
> +		goto out;
> +	}
> +
> +	if (!S_ISBLK(inode->i_mode)) {
> +		err = -ENOTBLK;
> +		goto out;
> +	}
> +
> +	*dev = inode->i_rdev;
> +
> +out:
> +	path_release(&nd);
> +	return err;
> +}
> +
> +/*
> + * Cleanup routings for local, local exporting and remote nodes.
> + */
> +static void dst_cleanup_remote(struct dst_node *n)
> +{
> +	if (n->state) {
> +		kst_state_exit(n->state);
> +		n->state = NULL;
> +	}
> +}
> +
> +static void dst_cleanup_local(struct dst_node *n)
> +{
> +	if (n->bdev) {
> +		sync_blockdev(n->bdev);
> +		blkdev_put(n->bdev);
> +		n->bdev = NULL;
> +	}
> +}
> +
> +static void dst_cleanup_local_export(struct dst_node *n)
> +{
> +	dst_cleanup_local(n);
> +	dst_cleanup_remote(n);
> +}
> +
> +/*
> + * Setup routings for local, local exporting and remote nodes.
> + */
> +static int dst_setup_local(struct dst_node *n, struct dst_ctl *ctl,
> +		struct dst_local_ctl *l)
> +{
> +	dev_t dev;
> +	int err;
> +
> +	err = dst_lookup_device(l->name, &dev);
> +	if (err)
> +		return err;
> +
> +	n->bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
> +	if (!n->bdev)
> +		return -ENODEV;
> +
> +	if (!n->size)
> +		n->size = get_capacity(n->bdev->bd_disk);
> +
> +	return 0;
> +}
> +
> +static int dst_setup_local_export(struct dst_node *n, struct dst_ctl *ctl,
> +		struct dst_le_template *tmp)
> +{
> +	int err;
> +
> +	err = dst_setup_local(n, ctl, &tmp->le.lctl);
> +	if (err)
> +		goto err_out_exit;
> +
> +	n->state = kst_listener_state_init(n, tmp);
> +	if (IS_ERR(n->state)) {
> +		err = PTR_ERR(n->state);
> +		goto err_out_cleanup;
> +	}
> +
> +	return 0;
> +
> +err_out_cleanup:
> +	dst_cleanup_local(n);
> +err_out_exit:
> +	return err;
> +}
> +
> +static int dst_request_remote_config(struct dst_node *n, struct socket *sock)
> +{
> +	struct dst_remote_request cfg;
> +	struct msghdr msg;
> +	struct kvec iov;
> +	int err;
> +
> +	memset(&cfg, 0, sizeof(struct dst_remote_request));
> +	cfg.cmd = cpu_to_be32(DST_REMOTE_CFG);
> +
> +	iov.iov_base = &cfg;
> +	iov.iov_len = sizeof(struct dst_remote_request);
> +
> +	msg.msg_iov = (struct iovec *)&iov;
> +	msg.msg_iovlen = 1;
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_WAITALL;
> +
> +	err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len);
> +	if (err <= 0) {
> +		if (err == 0)
> +			err = -ECONNRESET;
> +		return err;
> +	}
> +
> +	iov.iov_base = &cfg;
> +	iov.iov_len = sizeof(struct dst_remote_request);
> +
> +	msg.msg_iov = (struct iovec *)&iov;
> +	msg.msg_iovlen = 1;
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_WAITALL;
> +
> +	err = kernel_recvmsg(sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
> +	if (err <= 0) {
> +		if (err == 0)
> +			err = -ECONNRESET;
> +		return err;
> +	}
> +
> +	if (be32_to_cpu(cfg.cmd) != DST_REMOTE_CFG)
> +		return -EINVAL;
> +
> +	n->size = be64_to_cpu(cfg.sector);
> +
> +	return 0;
> +}
> +
> +static int dst_setup_remote(struct dst_node *n, struct dst_ctl *ctl,
> +		struct dst_remote_ctl *r)
> +{
> +	int err;
> +	struct socket *sock;
> +
> +	err = sock_create(r->addr.sa_family, r->type, r->proto, &sock);
> +	if (err < 0)
> +		goto err_out_exit;
> +
> +	sock->sk->sk_sndtimeo = sock->sk->sk_rcvtimeo =
> +		msecs_to_jiffies(DST_DEFAULT_TIMEO);
> +
> +	err = sock->ops->connect(sock, (struct sockaddr *)&r->addr,
> +			r->addr.sa_data_len, 0);
> +	if (err)
> +		goto err_out_destroy;
> +
> +	if (!n->size) {
> +		err = dst_request_remote_config(n, sock);
> +		if (err)
> +			goto err_out_destroy;
> +	}
> +
> +	n->state = kst_data_state_init(n, sock);
> +	if (IS_ERR(n->state)) {
> +		err = PTR_ERR(n->state);
> +		goto err_out_destroy;
> +	}
> +
> +	return 0;
> +
> +err_out_destroy:
> +	sock_release(sock);
> +err_out_exit:
> +	return err;
> +}
> +
> +/*
> + * This function inserts node into storage.
> + */
> +static int dst_insert_node(struct dst_node *n)
> +{
> +	int err;
> +	struct dst_storage *st = n->st;
> +	struct dst_node *dn;
> +
> +	err = st->alg->ops->add_node(n);
> +	if (err)
> +		return err;
> +
> +	err = dst_node_sysfs_init(n);
> +	if (err)
> +		goto err_out_remove_node;
> +
> +	mutex_lock(&st->tree_lock);
> +	dn = dst_storage_tree_add(n, st);
> +	if (dn) {
> +		err = -EINVAL;
> +		dn->size = st->disk_size;
> +		if (dn->start == n->start) {
> +			err = 0;
> +			n->shared_head = dst_node_get(dn);
> +			atomic_inc(&dn->shared_num);
> +			list_add_tail_rcu(&n->shared, &dn->shared);

And this too is under the update-side mutex, so is OK.

> +		}
> +	}
> +	mutex_unlock(&st->tree_lock);
> +	if (err)
> +		goto err_out_sysfs_exit;
> +
> +	if (n->priv_callback)
> +		n->priv_callback(n);
> +
> +	return 0;
> +
> +err_out_sysfs_exit:
> +	dst_node_sysfs_exit(n);
> +err_out_remove_node:
> +	st->alg->ops->del_node(n);
> +	return err;
> +}
> +
> +static struct dst_node *dst_alloc_node(struct dst_ctl *ctl,
> +		void (*cleanup)(struct dst_node *))
> +{
> +	struct dst_storage *st;
> +	struct dst_node *n;
> +
> +	st = dst_get_storage(ctl->st, ctl->alg, 1);
> +	if (!st)
> +		goto err_out_exit;
> +
> +	n = kzalloc(sizeof(struct dst_node), GFP_KERNEL);
> +	if (!n)
> +		goto err_out_put_storage;
> +
> +	n->w = kst_main_worker;
> +	n->st = st;
> +	n->cleanup = cleanup;
> +	n->start = ctl->start;
> +	n->size = ctl->size;
> +	INIT_LIST_HEAD(&n->shared);
> +	n->shared_head = NULL;
> +	atomic_set(&n->shared_num, 0);
> +	atomic_set(&n->refcnt, 1);
> +
> +	return n;
> +
> +err_out_put_storage:
> +	mutex_lock(&dst_storage_lock);
> +	list_del_init(&st->entry);
> +	mutex_unlock(&dst_storage_lock);
> +
> +	dst_put_storage(st);
> +err_out_exit:
> +	return NULL;
> +}
> +
> +/*
> + * Control callback for userspace commands to setup
> + * different nodes and start/stop array.
> + */
> +static int dst_add_remote(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_node *n;
> +	int err;
> +	struct dst_remote_ctl rctl;
> +
> +	if (copy_from_user(&rctl, data, sizeof(struct dst_remote_ctl)))
> +		return -EFAULT;
> +
> +	n = dst_alloc_node(ctl, &dst_cleanup_remote);
> +	if (!n)
> +		return -ENOMEM;
> +
> +	err = dst_setup_remote(n, ctl, &rctl);
> +	if (err < 0)
> +		goto err_out_free;
> +
> +	err = dst_insert_node(n);
> +	if (err)
> +		goto err_out_free;
> +
> +	return 0;
> +
> +err_out_free:
> +	dst_node_put(n);
> +	return err;
> +}
> +
> +static int dst_add_local_export(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_node *n;
> +	int err;
> +	struct dst_le_template tmp;
> +
> +	if (copy_from_user(&tmp.le, data, sizeof(struct dst_local_export_ctl)))
> +		return -EFAULT;
> +
> +	tmp.data = data + sizeof(struct dst_local_export_ctl);
> +
> +	n = dst_alloc_node(ctl, &dst_cleanup_local_export);
> +	if (!n)
> +		return -EINVAL;
> +
> +	err = dst_setup_local_export(n, ctl, &tmp);
> +	if (err < 0)
> +		goto err_out_free;
> +
> +	err = dst_insert_node(n);
> +	if (err)
> +		goto err_out_free;
> +
> +
> +	return 0;
> +
> +err_out_free:
> +	dst_node_put(n);
> +	return err;
> +}
> +
> +static int dst_add_local(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_node *n;
> +	int err;
> +	struct dst_local_ctl lctl;
> +
> +	if (copy_from_user(&lctl, data, sizeof(struct dst_local_ctl)))
> +		return -EFAULT;
> +
> +	n = dst_alloc_node(ctl, &dst_cleanup_local);
> +	if (!n)
> +		return -EINVAL;
> +
> +	err = dst_setup_local(n, ctl, &lctl);
> +	if (err < 0)
> +		goto err_out_free;
> +
> +	err = dst_insert_node(n);
> +	if (err)
> +		goto err_out_free;
> +
> +	return 0;
> +
> +err_out_free:
> +	dst_node_put(n);
> +	return err;
> +}
> +
> +static int dst_del_node(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_node *n;
> +	struct dst_storage *st;
> +	int err = -ENODEV;
> +
> +	st = dst_get_storage(ctl->st, ctl->alg, 0);
> +	if (!st)
> +		goto err_out_exit;
> +
> +	mutex_lock(&st->tree_lock);
> +	n = dst_storage_tree_del(st, ctl->start);
> +	mutex_unlock(&st->tree_lock);
> +	if (!n)
> +		goto err_out_put;
> +
> +	dst_node_put(n);
> +	dst_put_storage(st);
> +
> +	return 0;
> +
> +err_out_put:
> +	dst_put_storage(st);
> +err_out_exit:
> +	return err;
> +}
> +
> +static int dst_start_storage(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_storage *st;
> +
> +	st = dst_get_storage(ctl->st, ctl->alg, 0);
> +	if (!st)
> +		return -ENODEV;
> +
> +	mutex_lock(&st->tree_lock);
> +	if (!(st->flags & DST_ST_STARTED)) {
> +		set_capacity(st->disk, st->disk_size);
> +		add_disk(st->disk);
> +		st->flags |= DST_ST_STARTED;
> +		dprintk("%s: STARTED st: %p, disk_size: %llu.\n",
> +				__func__, st, st->disk_size);
> +	}
> +	mutex_unlock(&st->tree_lock);
> +
> +	dst_put_storage(st);
> +
> +	return 0;
> +}
> +
> +static int dst_stop_storage(struct dst_ctl *ctl, void __user *data)
> +{
> +	struct dst_storage *st;
> +
> +	st = dst_get_storage(ctl->st, ctl->alg, 0);
> +	if (!st)
> +		return -ENODEV;
> +
> +	dprintk("%s: STOPPED storage: %s.\n", __func__, st->name);
> +
> +	dst_storage_sysfs_exit(st);
> +
> +	mutex_lock(&dst_storage_lock);
> +	list_del_init(&st->entry);
> +	mutex_unlock(&dst_storage_lock);
> +
> +	if (st->flags & DST_ST_STARTED)
> +		dst_remove_disk(st);
> +
> +	dst_remove_all_nodes(st);
> +	dst_put_storage(st); /* One reference got above */
> +	dst_put_storage(st); /* Another reference set during initialization */
> +
> +	return 0;
> +}
> +
> +typedef int (*dst_command_func)(struct dst_ctl *ctl, void __user *data);
> +
> +/*
> + * List of userspace commands.
> + */
> +static dst_command_func dst_commands[] = {
> +	[DST_ADD_REMOTE] = &dst_add_remote,
> +	[DST_ADD_LOCAL] = &dst_add_local,
> +	[DST_ADD_LOCAL_EXPORT] = &dst_add_local_export,
> +	[DST_DEL_NODE] = &dst_del_node,
> +	[DST_START_STORAGE] = &dst_start_storage,
> +	[DST_STOP_STORAGE] = &dst_stop_storage,
> +};
> +
> +/*
> + * Move to connector for configuration is in TODO list.
> + */
> +static int dst_ioctl(struct inode *inode, struct file *file,
> +		unsigned int command, unsigned long data)
> +{
> +	struct dst_ctl ctl;
> +	unsigned int cmd = _IOC_NR(command);
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	if (_IOC_TYPE(command) != DST_IOCTL)
> +		return -ENOTTY;
> +
> +	if (cmd >= DST_CMD_MAX)
> +		return -EINVAL;
> +
> +	if (copy_from_user(&ctl, (void __user *)data, sizeof(struct dst_ctl)))
> +		return -EFAULT;
> +
> +	data += sizeof(struct dst_ctl);
> +
> +	return dst_commands[cmd](&ctl, (void __user *)data);
> +}
> +
> +static const struct file_operations dst_fops = {
> +	.ioctl	 = dst_ioctl,
> +	.owner	 = THIS_MODULE,
> +};
> +
> +static struct miscdevice dst_misc = {
> +	.minor 		= MISC_DYNAMIC_MINOR,
> +	.name  		= DST_NAME,
> +	.fops  		= &dst_fops
> +};
> +
> +static int dst_sysfs_init(void)
> +{
> +	return bus_register(&dst_dev_bus_type);
> +}
> +
> +static void dst_sysfs_exit(void)
> +{
> +	bus_unregister(&dst_dev_bus_type);
> +}
> +
> +static int __devinit dst_sys_init(void)
> +{
> +	int err = -ENOMEM;
> +
> +	dst_request_cache = kmem_cache_create("dst", sizeof(struct dst_request),
> +				       0, 0, NULL, NULL);
> +	if (!dst_request_cache)
> +		return -ENOMEM;
> +
> +	dst_bio_set = bioset_create(32, 32);
> +	if (!dst_bio_set)
> +		goto err_out_destroy;
> +
> +	err = register_blkdev(dst_major, DST_NAME);
> +	if (err < 0)
> +		goto err_out_destroy_bioset;
> +	if (err)
> +		dst_major = err;
> +
> +	err = dst_sysfs_init();
> +	if (err)
> +		goto err_out_unregister;
> +
> +	kst_main_worker = kst_worker_init(0);
> +	if (IS_ERR(kst_main_worker)) {
> +		err = PTR_ERR(kst_main_worker);
> +		goto err_out_sysfs_exit;
> +	}
> +
> +	err = misc_register(&dst_misc);
> +	if (err)
> +		goto err_out_worker_exit;
> +
> +	return 0;
> +
> +err_out_worker_exit:
> +	kst_worker_exit(kst_main_worker);
> +err_out_sysfs_exit:
> +	dst_sysfs_exit();
> +err_out_unregister:
> +	unregister_blkdev(dst_major, DST_NAME);
> +err_out_destroy_bioset:
> +	bioset_free(dst_bio_set);
> +err_out_destroy:
> +	kmem_cache_destroy(dst_request_cache);
> +	return err;
> +}
> +
> +static void __devexit dst_sys_exit(void)
> +{
> +	misc_deregister(&dst_misc);
> +	dst_sysfs_exit();
> +	unregister_blkdev(dst_major, DST_NAME);
> +	kst_exit_all();
> +	bioset_free(dst_bio_set);
> +	kmem_cache_destroy(dst_request_cache);
> +}
> +
> +module_init(dst_sys_init);
> +module_exit(dst_sys_exit);
> +
> +MODULE_DESCRIPTION("Distributed storage");
> +MODULE_AUTHOR("Evgeniy Polyakov <johnpol@2ka.mipt.ru>");
> +MODULE_LICENSE("GPL");
> diff --git a/drivers/block/dst/kst.c b/drivers/block/dst/kst.c
> new file mode 100644
> index 0000000..b739402
> --- /dev/null
> +++ b/drivers/block/dst/kst.c
> @@ -0,0 +1,1609 @@
> +/*
> + * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/socket.h>
> +#include <linux/kthread.h>
> +#include <linux/net.h>
> +#include <linux/in.h>
> +#include <linux/poll.h>
> +#include <linux/bio.h>
> +#include <linux/dst.h>
> +
> +#include <net/sock.h>
> +
> +struct kst_poll_helper
> +{
> +	poll_table 		pt;
> +	struct kst_state	*st;
> +};
> +
> +static LIST_HEAD(kst_worker_list);
> +static DEFINE_MUTEX(kst_worker_mutex);
> +
> +/*
> + * This function creates bound socket for local export node.
> + */
> +static int kst_sock_create(struct kst_state *st, struct saddr *addr,
> +		int type, int proto, int backlog)
> +{
> +	int err;
> +
> +	err = sock_create(addr->sa_family, type, proto, &st->socket);
> +	if (err)
> +		goto err_out_exit;
> +
> +	err = st->socket->ops->bind(st->socket, (struct sockaddr *)addr,
> +			addr->sa_data_len);
> +
> +	err = st->socket->ops->listen(st->socket, backlog);
> +	if (err)
> +		goto err_out_release;
> +
> +	st->socket->sk->sk_allocation = GFP_NOIO;
> +
> +	return 0;
> +
> +err_out_release:
> +	sock_release(st->socket);
> +err_out_exit:
> +	return err;
> +}
> +
> +static void kst_sock_release(struct kst_state *st)
> +{
> +	if (st->socket) {
> +		sock_release(st->socket);
> +		st->socket = NULL;
> +	}
> +}
> +
> +void kst_wake(struct kst_state *st)
> +{
> +	struct kst_worker *w = st->node->w;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&w->ready_lock, flags);
> +	if (list_empty(&st->ready_entry))
> +		list_add_tail(&st->ready_entry, &w->ready_list);
> +	spin_unlock_irqrestore(&w->ready_lock, flags);
> +
> +	wake_up(&w->wait);
> +}
> +EXPORT_SYMBOL_GPL(kst_wake);
> +
> +/*
> + * Polling machinery.
> + */
> +static int kst_state_wake_callback(wait_queue_t *wait, unsigned mode,
> +		int sync, void *key)
> +{
> +	struct kst_state *st = container_of(wait, struct kst_state, wait);
> +	kst_wake(st);
> +	return 1;
> +}
> +
> +static void kst_queue_func(struct file *file, wait_queue_head_t *whead,
> +				 poll_table *pt)
> +{
> +	struct kst_state *st = container_of(pt, struct kst_poll_helper, pt)->st;
> +
> +	st->whead = whead;
> +	init_waitqueue_func_entry(&st->wait, kst_state_wake_callback);
> +	add_wait_queue(whead, &st->wait);
> +}
> +
> +static void kst_poll_exit(struct kst_state *st)
> +{
> +	if (st->whead) {
> +		remove_wait_queue(st->whead, &st->wait);
> +		st->whead = NULL;
> +	}
> +}
> +
> +/*
> + * This function removes request from state tree and ordering list.
> + */
> +void kst_del_req(struct dst_request *req)
> +{
> +	struct kst_state *st = req->state;
> +
> +	rb_erase(&req->request_entry, &st->request_root);
> +	RB_CLEAR_NODE(&req->request_entry);
> +	list_del_init(&req->request_list_entry);
> +}
> +EXPORT_SYMBOL_GPL(kst_del_req);
> +
> +static struct dst_request *kst_req_first(struct kst_state *st)
> +{
> +	struct dst_request *req = NULL;
> +
> +	if (!list_empty(&st->request_list))
> +		req = list_entry(st->request_list.next, struct dst_request,
> +				request_list_entry);
> +	return req;
> +}
> +
> +/*
> + * This function dequeues first request from the queue and tree.
> + */
> +static struct dst_request *kst_dequeue_req(struct kst_state *st)
> +{
> +	struct dst_request *req;
> +
> +	mutex_lock(&st->request_lock);
> +	req = kst_req_first(st);
> +	if (req)
> +		kst_del_req(req);
> +	mutex_unlock(&st->request_lock);
> +	return req;
> +}
> +
> +static inline int dst_compare_request_id(struct dst_request *old,
> +		struct dst_request *new)
> +{
> +	int cmd = 0;
> +
> +	if (old->start + to_sector(old->orig_size) <= new->start)
> +		cmd = 1;
> +	if (old->start >= new->start + to_sector(new->orig_size))
> +		cmd = -1;
> +
> +	dprintk("%s: old: op: %lu, start: %llu, size: %llu, off: %u, "
> +		"new: op: %lu, start: %llu, size: %llu, off: %u, cmp: %d.\n",
> +		__func__, bio_rw(old->bio), old->start, old->orig_size,
> +		old->offset,
> +		bio_rw(new->bio), new->start, new->orig_size,
> +		new->offset, cmd);
> +
> +	return cmd;
> +}
> +
> +/*
> + * This function enqueues request into tree, indexed by start of the request,
> + * and also puts request into ordered queue.
> + */
> +int kst_enqueue_req(struct kst_state *st, struct dst_request *req)
> +{
> +	struct rb_node **n = &st->request_root.rb_node, *parent = NULL;
> +	struct dst_request *old = NULL;
> +	int cmp, err = 0;
> +
> +	while (*n) {
> +		parent = *n;
> +		old = rb_entry(parent, struct dst_request, request_entry);
> +
> +		cmp = dst_compare_request_id(old, req);
> +		if (cmp < 0)
> +			n = &parent->rb_left;
> +		else if (cmp > 0)
> +			n = &parent->rb_right;
> +		else {
> +			printk("%s: [%c] old_req: %p, start: %llu, "
> +					"size: %llu.\n",
> +					__func__, 
> +					(bio_rw(old->bio) == WRITE)?'W':'R',
> +					old, old->start, old->orig_size);
> +			err = -EEXIST;
> +			break;
> +		}
> +	}
> +
> +	if (!err) {
> +		rb_link_node(&req->request_entry, parent, n);
> +		rb_insert_color(&req->request_entry, &st->request_root);
> +	}
> +
> +	if (req->size != req->orig_size)
> +		list_add(&req->request_list_entry, &st->request_list);
> +	else
> +		list_add_tail(&req->request_list_entry, &st->request_list);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(kst_enqueue_req);
> +
> +/*
> + * BIOs for local exporting node are freed via this function.
> + */
> +static void kst_export_put_bio(struct bio *bio)
> +{
> +	int i;
> +	struct bio_vec *bv;
> +
> +	dprintk("%s: bio: %p, size: %u, idx: %d, num: %d.\n",
> +			__func__, bio, bio->bi_size, bio->bi_idx,
> +			bio->bi_vcnt);
> +
> +	bio_for_each_segment(bv, bio, i)
> +		__free_page(bv->bv_page);
> +	bio_put(bio);
> +}
> +
> +/*
> + * This is a generic request completion function for requests,
> + * queued for async processing.
> + * If it is local export node, state machine is different,
> + * see details below.
> + */
> +void kst_complete_req(struct dst_request *req, int err)
> +{
> +	dprintk("%s: bio: %p, req: %p, size: %llu, orig_size: %llu, "
> +			"bi_size: %u, err: %d, flags: %u.\n",
> +			__func__, req->bio, req, req->size, req->orig_size,
> +			req->bio->bi_size, err, req->flags);
> +
> +	if (req->flags & DST_REQ_EXPORT) {
> +		if (req->flags & DST_REQ_EXPORT_WRITE) {
> +			req->bio->bi_rw = WRITE;
> +			generic_make_request(req->bio);
> +		} else
> +			kst_export_put_bio(req->bio);
> +	} else {
> +		req->bio_endio(req, err);
> +	}
> +	dst_free_request(req);
> +}
> +EXPORT_SYMBOL_GPL(kst_complete_req);
> +
> +static void kst_flush_requests(struct kst_state *st)
> +{
> +	struct dst_request *req;
> +
> +	while ((req = kst_dequeue_req(st)) != NULL)
> +		kst_complete_req(req, -EIO);
> +}
> +
> +static int kst_poll_init(struct kst_state *st)
> +{
> +	struct kst_poll_helper ph;
> +
> +	ph.st = st;
> +	init_poll_funcptr(&ph.pt, &kst_queue_func);
> +
> +	st->socket->ops->poll(NULL, st->socket, &ph.pt);
> +	return 0;
> +}
> +
> +/*
> + * Main state creation function.
> + * It creates new state according to given operations
> + * and links it into worker structure and node.
> + */
> +static struct kst_state *kst_state_init(struct dst_node *node,
> +		unsigned int permissions,
> +		struct kst_state_ops *ops, void *data)
> +{
> +	struct kst_state *st;
> +	int err;
> +
> +	st = kzalloc(sizeof(struct kst_state), GFP_KERNEL);
> +	if (!st)
> +		return ERR_PTR(-ENOMEM);
> +
> +	st->permissions = permissions;
> +	st->node = node;
> +	st->ops = ops;
> +	INIT_LIST_HEAD(&st->ready_entry);
> +	INIT_LIST_HEAD(&st->entry);
> +	st->request_root.rb_node = NULL;
> +	INIT_LIST_HEAD(&st->request_list);
> +	mutex_init(&st->request_lock);
> +
> +	err = st->ops->init(st, data);
> +	if (err)
> +		goto err_out_free;
> +	mutex_lock(&node->w->state_mutex);
> +	list_add_tail(&st->entry, &node->w->state_list);
> +	mutex_unlock(&node->w->state_mutex);
> +
> +	kst_wake(st);
> +
> +	return st;
> +
> +err_out_free:
> +	kfree(st);
> +	return ERR_PTR(err);
> +}
> +
> +/*
> + * This function is called when node is removed,
> + * or when state is destroyed for connected to local exporting
> + * node client.
> + */
> +void kst_state_exit(struct kst_state *st)
> +{
> +	struct kst_worker *w = st->node->w;
> +
> +	dprintk("%s: st: %p.\n", __func__, st);
> +
> +	mutex_lock(&w->state_mutex);
> +	list_del_init(&st->entry);
> +	mutex_unlock(&w->state_mutex);
> +
> +	st->ops->exit(st);
> +
> +	st->node->state = NULL;
> +
> +	kfree(st);
> +}
> +
> +static int kst_error(struct kst_state *st, int err)
> +{
> +	if ((err == -ECONNRESET || err == -EPIPE) && st->ops->recovery(st, err))
> +		err = st->ops->recovery(st, err);
> +
> +	return st->node->st->alg->ops->error(st, err);
> +}
> +
> +/*
> + * This is main state processing function.
> + * It tries to complete request and invoke appropriate
> + * callbacks in case of errors or successfull operation finish.
> + */
> +static int kst_thread_process_state(struct kst_state *st)
> +{
> +	int err, empty;
> +	unsigned int revents;
> +	struct dst_request *req, *tmp;
> +
> +	mutex_lock(&st->request_lock);
> +	if (st->ops->ready) {
> +		err = st->ops->ready(st);
> +		if (err) {
> +			mutex_unlock(&st->request_lock);
> +			if (err < 0)
> +				kst_state_exit(st);
> +			return err;
> +		}
> +	}
> +
> +	err = 0;
> +	empty = 1;
> +	req = NULL;
> +	list_for_each_entry_safe(req, tmp, &st->request_list,
> +			request_list_entry) {
> +		empty = 0;
> +		revents = st->socket->ops->poll(st->socket->file,
> +				st->socket, NULL);
> +		dprintk("\n%s: st: %p, revents: %x.\n", __func__, st, revents);
> +		if (!revents)
> +			break;
> +		err = req->callback(req, revents);
> +		dprintk("%s: callback returned, st: %p, err: %d.\n",
> +				__func__, st, err);
> +		if (err)
> +			break;
> +	}
> +	mutex_unlock(&st->request_lock);
> +
> +	dprintk("%s: req: %p, err: %d.\n", __func__, req, err);
> +	if (err < 0) {
> +		err = kst_error(st, err);
> +		if (err && (st != st->node->state)) {
> +			dprintk("%s: err: %d, st: %p, node->state: %p.\n",
> +					__func__, err, st, st->node->state);
> +			/*
> +			 * Accepted client has state not related to storage
> +			 * node, so it must be freed explicitely.
> +			 */
> +
> +			kst_state_exit(st);
> +			return err;
> +		}
> +
> +		kst_wake(st);
> +	}
> +
> +	if (list_empty(&st->request_list) && !empty)
> +		kst_wake(st);
> +
> +	return err;
> +}
> +
> +/*
> + * Main worker thread - one per storage.
> + */
> +static int kst_thread_func(void *data)
> +{
> +	struct kst_worker *w = data;
> +	struct kst_state *st;
> +	unsigned long flags;
> +	int err = 0;
> +
> +	while (!kthread_should_stop()) {
> +		wait_event_interruptible_timeout(w->wait,
> +				!list_empty(&w->ready_list) ||
> +				kthread_should_stop(),
> +				HZ);
> +
> +		st = NULL;
> +		spin_lock_irqsave(&w->ready_lock, flags);
> +		if (!list_empty(&w->ready_list)) {
> +			st = list_entry(w->ready_list.next, struct kst_state,
> +					ready_entry);
> +			list_del_init(&st->ready_entry);
> +		}
> +		spin_unlock_irqrestore(&w->ready_lock, flags);
> +
> +		if (!st)
> +			continue;
> +
> +		err = kst_thread_process_state(st);
> +	}
> +
> +	return err;
> +}
> +
> +/*
> + * Worker initialization - this object will host andprocess all states,
> + * which in turn host requests for remote targets.
> + */
> +struct kst_worker *kst_worker_init(int id)
> +{
> +	struct kst_worker *w;
> +	int err;
> +
> +	w = kzalloc(sizeof(struct kst_worker), GFP_KERNEL);
> +	if (!w)
> +		return ERR_PTR(-ENOMEM);
> +
> +	w->id = id;
> +	init_waitqueue_head(&w->wait);
> +	spin_lock_init(&w->ready_lock);
> +	mutex_init(&w->state_mutex);
> +
> +	INIT_LIST_HEAD(&w->ready_list);
> +	INIT_LIST_HEAD(&w->state_list);
> +
> +	w->req_pool = mempool_create_slab_pool(256, dst_request_cache);
> +	if (!w->req_pool) {
> +		err = -ENOMEM;
> +		goto err_out_free;
> +	}
> +
> +	w->thread = kthread_run(&kst_thread_func, w, "kst%d", w->id);
> +	if (IS_ERR(w->thread)) {
> +		err = PTR_ERR(w->thread);
> +		goto err_out_destroy;
> +	}
> +
> +	mutex_lock(&kst_worker_mutex);
> +	list_add_tail(&w->entry, &kst_worker_list);
> +	mutex_unlock(&kst_worker_mutex);
> +
> +	return w;
> +
> +err_out_destroy:
> +	mempool_destroy(w->req_pool);
> +err_out_free:
> +	kfree(w);
> +	return ERR_PTR(err);
> +}
> +
> +void kst_worker_exit(struct kst_worker *w)
> +{
> +	struct kst_state *st, *n;
> +
> +	mutex_lock(&kst_worker_mutex);
> +	list_del(&w->entry);
> +	mutex_unlock(&kst_worker_mutex);
> +
> +	kthread_stop(w->thread);
> +
> +	list_for_each_entry_safe(st, n, &w->state_list, entry) {
> +		kst_state_exit(st);
> +	}
> +
> +	mempool_destroy(w->req_pool);
> +	kfree(w);
> +}
> +
> +/*
> + * Common state exit callback.
> + * Removes itself from worker's list of states,
> + * releases socket and flushes all requests.
> + */
> +static void kst_common_exit(struct kst_state *st)
> +{
> +	unsigned long flags;
> +
> +	dprintk("%s: st: %p.\n", __func__, st);
> +	kst_poll_exit(st);
> +
> +	spin_lock_irqsave(&st->node->w->ready_lock, flags);
> +	list_del_init(&st->ready_entry);
> +	spin_unlock_irqrestore(&st->node->w->ready_lock, flags);
> +
> +	kst_sock_release(st);
> +	kst_flush_requests(st);
> +}
> +
> +/*
> + * Listen socket contains security attributes in request_list,
> + * so it can not be flushed via usual way.
> + */
> +static void kst_listen_flush(struct kst_state *st)
> +{
> +	struct dst_secure *s, *tmp;
> +
> +	list_for_each_entry_safe(s, tmp, &st->request_list, sec_entry) {
> +		list_del(&s->sec_entry);
> +		kfree(s);
> +	}
> +}
> +
> +static void kst_listen_exit(struct kst_state *st)
> +{
> +	kst_listen_flush(st);
> +	kst_common_exit(st);
> +}
> +
> +/*
> + * Header sending function - may block.
> + */
> +static int kst_data_send_header(struct kst_state *st,
> +		struct dst_remote_request *r)
> +{
> +	struct msghdr msg;
> +	struct kvec iov;
> +
> +	iov.iov_base = r;
> +	iov.iov_len = sizeof(struct dst_remote_request);
> +
> +	msg.msg_iov = (struct iovec *)&iov;
> +	msg.msg_iovlen = 1;
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL;
> +
> +	return kernel_sendmsg(st->socket, &msg, &iov, 1, iov.iov_len);
> +}
> +
> +/*
> + * BIO vector receiving function - does not block, but may sleep because
> + * of scheduling policy.
> + */
> +static int kst_data_recv_bio_vec(struct kst_state *st, struct bio_vec *bv,
> +		unsigned int offset, unsigned int size)
> +{
> +	struct msghdr msg;
> +	struct kvec iov;
> +	void *kaddr;
> +	int err;
> +
> +	kaddr = kmap(bv->bv_page);
> +
> +	iov.iov_base = kaddr + bv->bv_offset + offset;
> +	iov.iov_len = size;
> +
> +	msg.msg_iov = (struct iovec *)&iov;
> +	msg.msg_iovlen = 1;
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
> +
> +	err = kernel_recvmsg(st->socket, &msg, &iov, 1, iov.iov_len,
> +			msg.msg_flags);
> +	kunmap(bv->bv_page);
> +
> +	return err;
> +}
> +
> +/*
> + * BIO vector sending function - does not block, but may sleep because
> + * of scheduling policy.
> + */
> +static int kst_data_send_bio_vec(struct kst_state *st, struct bio_vec *bv,
> +		unsigned int offset, unsigned int size)
> +{
> +	return kernel_sendpage(st->socket, bv->bv_page,
> +			bv->bv_offset + offset, size,
> +			MSG_DONTWAIT | MSG_NOSIGNAL);
> +}
> +
> +typedef int (*kst_data_process_bio_vec_t)(struct kst_state *st,
> +		struct bio_vec *bv, unsigned int offset, unsigned int size);
> +
> +/*
> + * @req: processing request.
> + * Contains BIO and all related to its processing info.
> + *
> + * This function sends or receives requested number of pages from given BIO.
> + *
> + * In case of errors negative return value is returned and @size,
> + * @index and @off are set to the:
> + * - number of bytes not yet processed (i.e. the rest of the bytes to be
> + *   processed).
> + * - index of the last bio_vec started to be processed (header sent).
> + * - offset of the first byte to be processed in the bio_vec.
> + *
> + * If there are no errors, zero is returned.
> + * -EAGAIN is not an error and is transformed into zero return value,
> + * called must check if @size is zero, in that case whole BIO is processed
> + * and thus req->bio_endio() can be called, othervise new request must be allocated
> + * to be processed later.
> + */
> +static int kst_data_process_bio(struct dst_request *req)
> +{
> +	int err = -ENOSPC, partial = (req->size != req->orig_size);
> +	struct dst_remote_request r;
> +	kst_data_process_bio_vec_t func;
> +	unsigned int cur_size;
> +
> +	r.flags = cpu_to_be32(((unsigned long)req->bio) & 0xffffffff);
> +
> +	if (bio_rw(req->bio) == WRITE) {
> +		r.cmd = cpu_to_be32(DST_WRITE);
> +		func = kst_data_send_bio_vec;
> +	} else {
> +		r.cmd = cpu_to_be32(DST_READ);
> +		func = kst_data_recv_bio_vec;
> +	}
> +
> +	dprintk("%s: start: [%c], start: %llu, idx: %d, num: %d, "
> +			"size: %llu, offset: %u.\n",
> +			__func__, (bio_rw(req->bio) == WRITE)?'W':'R',
> +			req->start, req->idx, req->num, req->size, req->offset);
> +
> +	while (req->idx < req->num) {
> +		struct bio_vec *bv = bio_iovec_idx(req->bio, req->idx);
> +
> +		cur_size = min_t(u64, bv->bv_len - req->offset, req->size);
> +
> +		if (cur_size == 0) {
> +			printk("%s: %d/%d: start: %llu, "
> +				"bv_offset: %u, bv_len: %u, "
> +				"req_offset: %u, req_size: %llu, "
> +				"req: %p, bio: %p, err: %d.\n",
> +				__func__, req->idx, req->num, req->start, 
> +				bv->bv_offset, bv->bv_len,
> +				req->offset, req->size,
> +				req, req->bio, err);
> +			BUG();
> +		}
> +
> +		if (!(req->flags & DST_REQ_HEADER_SENT)) {
> +			r.sector = cpu_to_be64(req->start);
> +			r.offset = cpu_to_be32(bv->bv_offset + req->offset);
> +			r.size = cpu_to_be32(cur_size);
> +
> +			err = kst_data_send_header(req->state, &r);
> +			if (err != sizeof(struct dst_remote_request)) {
> +				dprintk("%s: %d/%d: header: start: %llu, "
> +					"bv_offset: %u, bv_len: %u, "
> +					"a offset: %u, offset: %u, "
> +					"cur_size: %u, err: %d.\n",
> +					__func__, req->idx, req->num,
> +					req->start, bv->bv_offset, bv->bv_len,
> +					bv->bv_offset + req->offset,
> +					req->offset, cur_size, err);
> +				if (err >= 0)
> +					err = -EINVAL;
> +				break;
> +			}
> +
> +			req->flags |= DST_REQ_HEADER_SENT;
> +		}
> +
> +		err = func(req->state, bv, req->offset, cur_size);
> +		if (err <= 0)
> +			break;
> +
> +		req->offset += err;
> +		req->size -= err;
> +
> +		if (req->offset != bv->bv_len) {
> +			dprintk("%s: %d/%d: this: start: %llu, bv_offset: %u, "
> +				"bv_len: %u, a offset: %u, offset: %u, "
> +				"cur_size: %u, err: %d.\n",
> +				__func__, req->idx, req->num, req->start,
> +				bv->bv_offset, bv->bv_len,
> +				bv->bv_offset + req->offset,
> +				req->offset, cur_size, err);
> +			err = -EAGAIN;
> +			break;
> +		}
> +		req->offset = 0;
> +		req->idx++;
> +		req->flags &= ~DST_REQ_HEADER_SENT;
> +
> +		req->start += to_sector(bv->bv_len);
> +	}
> +
> +	if (err <= 0 && err != -EAGAIN) {
> +		if (err == 0)
> +			err = -ECONNRESET;
> +	} else
> +		err = 0;
> +
> +	if (req->size) {
> +		req->state->flags |= KST_FLAG_PARTIAL;
> +	} else if (partial) {
> +		req->state->flags &= ~KST_FLAG_PARTIAL;
> +	}
> +
> +	if (err < 0 || (req->idx == req->num && req->size)) {
> +		dprintk("%s: return: idx: %d, num: %d, offset: %u, "
> +				"size: %llu, err: %d.\n",
> +			__func__, req->idx, req->num, req->offset,
> +			req->size, err);
> +	}
> +	dprintk("%s: end: start: %llu, idx: %d, num: %d, "
> +			"size: %llu, offset: %u.\n",
> +		__func__, req->start, req->idx, req->num,
> +		req->size, req->offset);
> +
> +	return err;
> +}
> +
> +void kst_bio_endio(struct dst_request *req, int err)
> +{
> +	if (err)
> +		printk("%s: freeing bio: %p, bi_size: %u, "
> +			"orig_size: %llu, req: %p.\n",
> +		__func__, req->bio, req->bio->bi_size, req->orig_size, req);
> +	bio_endio(req->bio, req->orig_size, err);
> +}
> +EXPORT_SYMBOL_GPL(kst_bio_endio);
> +
> +/*
> + * This callback is invoked by worker thread to process given request.
> + */
> +int kst_data_callback(struct dst_request *req, unsigned int revents)
> +{
> +	int err;
> +
> +	dprintk("%s: req: %p, num: %d, idx: %d, bio: %p, "
> +			"revents: %x, flags: %x.\n",
> +			__func__, req, req->num, req->idx, req->bio,
> +			revents, req->flags);
> +
> +	if (req->flags & DST_REQ_EXPORT_READ)
> +		return 1;
> +
> +	err = kst_data_process_bio(req);
> +	if (err < 0)
> +		goto err_out;
> +
> +	if (!req->size) {
> +		dprintk("%s: complete: req: %p, bio: %p.\n",
> +				__func__, req, req->bio);
> +		kst_del_req(req);
> +		kst_complete_req(req, 0);
> +		return 0;
> +	}
> +
> +	if (revents & (POLLERR | POLLHUP | POLLRDHUP)) {
> +		err = -EPIPE;
> +		goto err_out;
> +	}
> +
> +	return 1;
> +
> +err_out:
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(kst_data_callback);
> +
> +#define KST_CONG_COMPLETED		(0)
> +#define KST_CONG_NOT_FOUND		(1)
> +#define KST_CONG_QUEUE			(-1)
> +
> +/*
> + * kst_congestion - checks for data congestion, i.e. the case, when given
> + * 	block request crosses an area of the another block request which
> + * 	is not yet sent to the remote node.
> + *
> + * @req: dst request containing block io related information.
> + *
> + * Return value:
> + * %KST_CONG_COMPLETED  - congestion was found and processed,
> + * 	bio must be ended, request is completed.
> + * %KST_CONG_NOT_FOUND  - no congestion found,
> + * 	request must be processed as usual
> + * %KST_CONG_QUEUE - congestion has been found, but bio is not completed,
> + * 	new request must be allocated and processed.
> + */
> +static int kst_congestion(struct dst_request *req)
> +{
> +	int cmp, i;
> +	struct kst_state *st = req->state;
> +	struct rb_node *n = st->request_root.rb_node;
> +	struct dst_request *old = NULL, *dst_req, *src_req;
> +
> +	while (n) {
> +		src_req = rb_entry(n, struct dst_request, request_entry);
> +		cmp = dst_compare_request_id(src_req, req);
> +
> +		if (cmp < 0)
> +			n = n->rb_left;
> +		else if (cmp > 0)
> +			n = n->rb_right;
> +		else {
> +			old = src_req;
> +			break;
> +		}
> +	}
> +
> +	if (likely(!old))
> +		return KST_CONG_NOT_FOUND;
> +
> +	dprintk("%s: old: op: %lu, start: %llu, size: %llu, off: %u, "
> +			"new: op: %lu, start: %llu, size: %llu, off: %u.\n",
> +		__func__, bio_rw(old->bio), old->start, old->orig_size,
> +		old->offset,
> +		bio_rw(req->bio), req->start, req->orig_size, req->offset);
> +
> +	if ((bio_rw(old->bio) != WRITE) && (bio_rw(req->bio) != WRITE)) {
> +		return KST_CONG_QUEUE;
> +	}
> +
> +	if (unlikely(req->offset != old->offset))
> +		return KST_CONG_QUEUE;
> +
> +	src_req = old;
> +	dst_req = req;
> +	if (bio_rw(req->bio) == WRITE) {
> +		dst_req = old;
> +		src_req = req;
> +	}
> +
> +	/* Actually we could partially complete new request by copying
> +	 * part of the first one, but not now, consider this as a
> +	 * (low-priority) todo item.
> +	 */
> +	if (src_req->start + src_req->orig_size <
> +			dst_req->start + dst_req->orig_size)
> +		return KST_CONG_QUEUE;
> +
> +	/*
> +	 * So, only process if new request is differnt from old one,
> +	 * or subsequent write, i.e.:
> +	 * - not completed write and request to read
> +	 * - not completed read and request to write
> +	 * - not completed write and request to (over)write
> +	 */
> +	for (i = old->idx; i < old->num; ++i) {
> +		struct bio_vec *bv_src, *bv_dst;
> +		void *src, *dst;
> +		u64 len;
> +
> +		bv_src = bio_iovec_idx(src_req->bio, i);
> +		bv_dst = bio_iovec_idx(dst_req->bio, i);
> +
> +		if (unlikely(bv_dst->bv_offset != bv_src->bv_offset))
> +			return KST_CONG_QUEUE;
> +
> +		if (unlikely(bv_dst->bv_len != bv_src->bv_len))
> +			return KST_CONG_QUEUE;
> +
> +		src = kmap_atomic(bv_src->bv_page, KM_USER0);
> +		dst = kmap_atomic(bv_dst->bv_page, KM_USER1);
> +
> +		len = min_t(u64, bv_dst->bv_len, dst_req->size);
> +
> +		memcpy(dst + bv_dst->bv_offset, src + bv_src->bv_offset, len);
> +
> +		kunmap_atomic(src, KM_USER0);
> +		kunmap_atomic(dst, KM_USER1);
> +
> +		dst_req->idx++;
> +		dst_req->size -= len;
> +		dst_req->offset = 0;
> +		dst_req->start += to_sector(len);
> +
> +		if (!dst_req->size)
> +			break;
> +	}
> +
> +	if (req == dst_req)
> +		return KST_CONG_COMPLETED;
> +
> +	kst_del_req(dst_req);
> +	kst_complete_req(dst_req, 0);
> +
> +	return KST_CONG_NOT_FOUND;
> +}
> +
> +struct dst_request *dst_clone_request(struct dst_request *req, mempool_t *pool)
> +{
> +	struct dst_request *new_req;
> +
> +	new_req = mempool_alloc(pool, GFP_NOIO);
> +	if (!new_req)
> +		return NULL;
> +
> +	memset(new_req, 0, sizeof(struct dst_request));
> +
> +	dprintk("%s: req: %p, new_req: %p, bio: %p.\n",
> +			__func__, req, new_req, req->bio);
> +
> +	RB_CLEAR_NODE(&new_req->request_entry);
> +
> +	if (req) {
> +		new_req->bio = req->bio;
> +		new_req->state = req->state;
> +		new_req->node = req->node;
> +		new_req->idx = req->idx;
> +		new_req->num = req->num;
> +		new_req->size = req->size;
> +		new_req->orig_size = req->orig_size;
> +		new_req->offset = req->offset;
> +		new_req->start = req->start;
> +		new_req->flags = req->flags;
> +		new_req->bio_endio = req->bio_endio;
> +		new_req->priv = req->priv;
> +	}
> +
> +	return new_req;
> +}
> +EXPORT_SYMBOL_GPL(dst_clone_request);
> +
> +void dst_free_request(struct dst_request *req)
> +{
> +	dprintk("%s: free req: %p, pool: %p, bio: %p, state: %p, node: %p.\n",
> +			__func__, req, req->node->w->req_pool,
> +			req->bio, req->state, req->node);
> +	mempool_free(req, req->node->w->req_pool);
> +}
> +EXPORT_SYMBOL_GPL(dst_free_request);
> +
> +/*
> + * This is main data processing function, eventually invoked from block layer.
> + * It tries to complte request, but if it is about to block, it allocates
> + * new request and queues it to main worker to be processed when events allow.
> + */
> +static int kst_data_push(struct dst_request *req)
> +{
> +	struct kst_state *st = req->state;
> +	struct dst_request *new_req;
> +	unsigned int revents;
> +	int err, locked = 0;
> +
> +	dprintk("%s: start: %llu, size: %llu, bio: %p.\n",
> +			__func__, req->start, req->size, req->bio);
> +
> +	if (mutex_trylock(&st->request_lock)) {
> +		locked = 1;
> +
> +		if (st->flags & (KST_FLAG_PARTIAL | DST_REQ_ALWAYS_QUEUE))
> +			goto alloc_new_req;
> +
> +		err = kst_congestion(req);
> +		if (err == KST_CONG_COMPLETED) {
> +			err = 0;
> +			goto out_bio_endio;
> +		}
> +
> +		if (err == KST_CONG_NOT_FOUND) {
> +			revents = st->socket->ops->poll(NULL, st->socket, NULL);
> +			dprintk("%s: st: %p, bio: %p, revents: %x.\n",
> +					__func__, st, req->bio, revents);
> +			if (revents & POLLOUT) {
> +				err = kst_data_process_bio(req);
> +				if (err < 0)
> +					goto out_unlock;
> +
> +				if (!req->size) {
> +					err = 0;
> +					goto out_bio_endio;
> +				}
> +			}
> +		}
> +	}
> +
> +alloc_new_req:
> +	err = -ENOMEM;
> +	new_req = dst_clone_request(req, req->node->w->req_pool);
> +	if (!new_req)
> +		goto out_unlock;
> +
> +	new_req->callback = &kst_data_callback;
> +
> +	if (!locked)
> +		mutex_lock(&st->request_lock);
> +	locked = 1;
> +
> +	err = kst_enqueue_req(st, new_req);
> +	mutex_unlock(&st->request_lock);
> +	locked = 0;
> +	if (err) {
> +		printk(KERN_NOTICE "%s: congestion [%c], start: %llu, idx: %d,"
> +				" num: %d, size: %llu, offset: %u, err: %d.\n",
> +			__func__, (bio_rw(req->bio) == WRITE)?'W':'R',
> +			req->start, req->idx, req->num, req->size,
> +			req->offset, err);
> +	}
> +
> +	kst_wake(st);
> +
> +	return 0;
> +
> +out_bio_endio:
> +	req->bio_endio(req, err);
> +out_unlock:
> +	if (locked)
> +		mutex_unlock(&st->request_lock);
> +	locked = 0;
> +
> +	if (err) {
> +		err = kst_error(st, err);
> +		if (!err)
> +			goto alloc_new_req;
> +	}
> +
> +	if (err) {
> +		printk("%s: error [%c], start: %llu, idx: %d, num: %d, "
> +				"size: %llu, offset: %u, err: %d.\n",
> +			__func__, (bio_rw(req->bio) == WRITE)?'W':'R',
> +			req->start, req->idx, req->num, req->size,
> +			req->offset, err);
> +		req->bio_endio(req, err);
> +	}
> +
> +	kst_wake(st);
> +	return err;
> +}
> +
> +/*
> + * Remote node initialization callback.
> + */
> +static int kst_data_init(struct kst_state *st, void *data)
> +{
> +	int err;
> +
> +	st->socket = data;
> +	st->socket->sk->sk_allocation = GFP_NOIO;
> +	/*
> +	 * Why not?
> +	 */
> +	st->socket->sk->sk_sndbuf = st->socket->sk->sk_sndbuf = 1024*1024*10;
> +
> +	err = kst_poll_init(st);
> +	if (err)
> +		return err;
> +
> +	return 0;
> +}
> +
> +/*
> + * Remote node recovery function - tries to reconnect to given target.
> + */
> +static int kst_data_recovery(struct kst_state *st, int err)
> +{
> +	struct socket *sock;
> +	struct sockaddr addr;
> +	int addrlen;
> +	struct dst_request *req;
> +
> +	if (err != -ECONNRESET && err != -EPIPE) {
> +		dprintk("%s: state %p does not know how "
> +				"to recover from error %d.\n",
> +				__func__, st, err);
> +		return err;
> +	}
> +
> +	err = sock_create(st->socket->ops->family, st->socket->type,
> +			st->socket->sk->sk_protocol, &sock);
> +	if (err < 0)
> +		goto err_out_exit;
> +
> +	sock->sk->sk_sndtimeo = sock->sk->sk_rcvtimeo =
> +		msecs_to_jiffies(DST_DEFAULT_TIMEO);
> +
> +	err = sock->ops->getname(st->socket, &addr, &addrlen, 2);
> +	if (err)
> +		goto err_out_destroy;
> +
> +	err = sock->ops->connect(sock, &addr, addrlen, 0);
> +	if (err)
> +		goto err_out_destroy;
> +
> +	kst_poll_exit(st);
> +	kst_sock_release(st);
> +
> +	mutex_lock(&st->request_lock);
> +	err = st->ops->init(st, sock);
> +	if (!err) {
> +		/*
> +		 * After reconnection is completed all requests
> +		 * must be resent from the state they were finished previously,
> +		 * but with new headers.
> +		 */
> +		list_for_each_entry(req, &st->request_list, request_list_entry)
> +			req->flags &= ~DST_REQ_HEADER_SENT;
> +	}
> +	mutex_unlock(&st->request_lock);
> +	if (err < 0)
> +		goto err_out_destroy;
> +
> +	kst_wake(st);
> +	dprintk("%s: recovery completed.\n", __func__);
> +
> +	return 0;
> +
> +err_out_destroy:
> +	sock_release(sock);
> +err_out_exit:
> +	dprintk("%s: revovery failed: st: %p, err: %d.\n", __func__, st, err);
> +	return err;
> +}
> +
> +static inline void kst_convert_header(struct dst_remote_request *r)
> +{
> +	r->cmd = be32_to_cpu(r->cmd);
> +	r->sector = be64_to_cpu(r->sector);
> +	r->offset = be32_to_cpu(r->offset);
> +	r->size = be32_to_cpu(r->size);
> +	r->flags = be32_to_cpu(r->flags);
> +}
> +
> +/*
> + * Local exporting node end IO callbacks.
> + */
> +static int kst_export_write_end_io(struct bio *bio, unsigned int size, int err)
> +{
> +	dprintk("%s: bio: %p, size: %u, idx: %d, num: %d, err: %d.\n",
> +		__func__, bio, bio->bi_size, bio->bi_idx, bio->bi_vcnt, err);
> +
> +	if (bio->bi_size)
> +		return 1;
> +
> +	kst_export_put_bio(bio);
> +	return 0;
> +}
> +
> +static int kst_export_read_end_io(struct bio *bio, unsigned int size, int err)
> +{
> +	struct dst_request *req = bio->bi_private;
> +	struct kst_state *st = req->state;
> +
> +	dprintk("%s: bio: %p, req: %p, size: %u, idx: %d, num: %d, err: %d.\n",
> +		__func__, bio, req, bio->bi_size, bio->bi_idx,
> +		bio->bi_vcnt, err);
> +
> +	if (bio->bi_size)
> +		return 1;
> +
> +	bio->bi_size = req->size = req->orig_size;
> +	bio->bi_rw = WRITE;
> +	req->flags &= ~DST_REQ_EXPORT_READ;
> +	kst_wake(st);
> +	return 0;
> +}
> +
> +/*
> + * This callback is invoked each time new request from remote
> + * node to given local export node is received.
> + * It allocates new block IO request and queues it for processing.
> + */
> +static int kst_export_ready(struct kst_state *st)
> +{
> +	struct dst_remote_request r;
> +	struct msghdr msg;
> +	struct kvec iov;
> +	struct bio *bio;
> +	int err, nr, i;
> +	struct dst_request *req;
> +	sector_t data_size;
> +	unsigned int revents = st->socket->ops->poll(NULL, st->socket, NULL);
> +
> +	if (revents & (POLLERR | POLLHUP)) {
> +		err = -EPIPE;
> +		goto err_out_exit;
> +	}
> +
> +	if (!(revents & POLLIN) || !list_empty(&st->request_list))
> +		return 0;
> +
> +	iov.iov_base = &r;
> +	iov.iov_len = sizeof(struct dst_remote_request);
> +
> +	msg.msg_iov = (struct iovec *)&iov;
> +	msg.msg_iovlen = 1;
> +	msg.msg_name = NULL;
> +	msg.msg_namelen = 0;
> +	msg.msg_control = NULL;
> +	msg.msg_controllen = 0;
> +	msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL;
> +
> +	err = kernel_recvmsg(st->socket, &msg, &iov, 1,
> +			iov.iov_len, msg.msg_flags);
> +	if (err != sizeof(struct dst_remote_request)) {
> +		err = -EINVAL;
> +		goto err_out_exit;
> +	}
> +
> +	kst_convert_header(&r);
> +
> +	dprintk("\n%s: cmd: %u, sector: %llu, size: %u, "
> +			"flags: %x, offset: %u.\n",
> +			__func__, r.cmd, r.sector, r.size, r.flags, r.offset);
> +
> +	err = -EINVAL;
> +	if (r.cmd != DST_READ && r.cmd != DST_WRITE && r.cmd != DST_REMOTE_CFG)
> +		goto err_out_exit;
> +
> +	data_size = get_capacity(st->node->bdev->bd_disk);
> +	if ((signed)(r.sector + to_sector(r.size)) < 0 ||
> +			(signed)(r.sector + to_sector(r.size)) > data_size ||
> +			(signed)r.sector > data_size)
> +		goto err_out_exit;
> +
> +	if (r.cmd == DST_REMOTE_CFG) {
> +		r.sector = data_size;
> +		kst_convert_header(&r);
> +
> +		iov.iov_base = &r;
> +		iov.iov_len = sizeof(struct dst_remote_request);
> +
> +		msg.msg_iov = (struct iovec *)&iov;
> +		msg.msg_iovlen = 1;
> +		msg.msg_name = NULL;
> +		msg.msg_namelen = 0;
> +		msg.msg_control = NULL;
> +		msg.msg_controllen = 0;
> +		msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL;
> +
> +		err = kernel_sendmsg(st->socket, &msg, &iov, 1, iov.iov_len);
> +		if (err != sizeof(struct dst_remote_request)) {
> +			err = -EINVAL;
> +			goto err_out_exit;
> +		}
> +		kst_wake(st);
> +		return 0;
> +	}
> +
> +	nr = r.size/PAGE_SIZE + 1;
> +
> +	while (r.size) {
> +		int nr_pages = min(BIO_MAX_PAGES, nr);
> +		unsigned int size;
> +		struct page *page;
> +
> +		err = -ENOMEM;
> +		req = dst_clone_request(NULL, st->node->w->req_pool);
> +		if (!req)
> +			goto err_out_exit;
> +
> +		dprintk("%s: alloc req: %p, pool: %p.\n",
> +				__func__, req, st->node->w->req_pool);
> +
> +		bio = bio_alloc(GFP_NOIO, nr_pages);
> +		if (!bio)
> +			goto err_out_free_req;
> +
> +		req->flags = DST_REQ_EXPORT | DST_REQ_HEADER_SENT;
> +		req->bio = bio;
> +		req->state = st;
> +		req->node = st->node;
> +		req->callback = &kst_data_callback;
> +		req->bio_endio = &kst_bio_endio;
> +
> +		/*
> +		 * Yes, looks a bit weird.
> +		 * Logic is simple - for local exporting node all operations
> +		 * are reversed compared to usual nodes, since usual nodes
> +		 * process remote data and local export node process remote
> +		 * requests, so that writing data means sending data to
> +		 * remote node and receiving on the local export one.
> +		 *
> +		 * So, to process writing to the exported node we need first 
> +		 * to receive data from the net (i.e. to perform READ 
> +		 * operationin terms of usual node), and then put it to the 
> +		 * storage (WRITE command, so it will be changed before 
> +		 * calling generic_make_request()).
> +		 *
> +		 * To process read request from the exported node we need
> +		 * first to read it from storage (READ command for BIO)
> +		 * and then send it over the net (perform WRITE operation
> +		 * in terms of network).
> +		 */
> +		if (r.cmd == DST_WRITE) {
> +			req->flags |= DST_REQ_EXPORT_WRITE;
> +			bio->bi_end_io = kst_export_write_end_io;
> +		} else {
> +			req->flags |= DST_REQ_EXPORT_READ;
> +			bio->bi_end_io = kst_export_read_end_io;
> +		}
> +		bio->bi_rw = READ;
> +		bio->bi_private = req;
> +		bio->bi_sector = r.sector;
> +		bio->bi_bdev = st->node->bdev;
> +
> +		for (i = 0; i < nr_pages; ++i) {
> +			page = alloc_page(GFP_NOIO);
> +			if (!page)
> +				break;
> +
> +			size = min_t(u32, PAGE_SIZE, r.size);
> +
> +			err = bio_add_page(bio, page, size, r.offset);
> +			dprintk("%s: %d/%d: page: %p, size: %u, offset: %u, "
> +					"err: %d.\n",
> +					__func__, i, nr_pages, page, size,
> +					r.offset, err);
> +			if (err <= 0)
> +				break;
> +
> +			if (err == size) {
> +				r.offset = 0;
> +				nr--;
> +			} else {
> +				r.offset += err;
> +			}
> +
> +			r.size -= err;
> +			r.sector += to_sector(err);
> +
> +			if (!r.size)
> +				break;
> +		}
> +
> +		if (!bio->bi_vcnt) {
> +			err = -ENOMEM;
> +			goto err_out_put;
> +		}
> +
> +		req->size = req->orig_size = bio->bi_size;
> +		req->start = bio->bi_sector;
> +		req->idx = 0;
> +		req->num = bio->bi_vcnt;
> +
> +		dprintk("%s: submitting: bio: %p, req: %p, start: %llu, "
> +			"size: %llu, idx: %d, num: %d, offset: %u, err: %d.\n",
> +			__func__, bio, req, req->start, req->size,
> +			req->idx, req->num, req->offset, err);
> +
> +		err = kst_enqueue_req(st, req);
> +		if (err)
> +			goto err_out_put;
> +
> +		if (r.cmd == DST_READ) {
> +			generic_make_request(bio);
> +		}
> +	}
> +
> +	kst_wake(st);
> +	return 0;
> +
> +err_out_put:
> +	bio_put(bio);
> +err_out_free_req:
> +	dst_free_request(req);
> +err_out_exit:
> +	dprintk("%s: error: %d.\n", __func__, err);
> +	return err;
> +}
> +
> +static void kst_export_exit(struct kst_state *st)
> +{
> +	struct dst_node *n = st->node;
> +
> +	dprintk("%s: st: %p.\n", __func__, st);
> +
> +	kst_common_exit(st);
> +	dst_node_put(n);
> +}
> +
> +static struct kst_state_ops kst_data_export_ops = {
> +	.init = &kst_data_init,
> +	.push = &kst_data_push,
> +	.exit = &kst_export_exit,
> +	.ready = &kst_export_ready,
> +};
> +
> +/*
> + * This callback is invoked each time listening socket for
> + * given local export node becomes ready.
> + * It creates new state for connected client and queues for processing.
> + */
> +static int kst_listen_ready(struct kst_state *st)
> +{
> +	struct socket *newsock;
> +	struct saddr addr;
> +	struct kst_state *newst;
> +	int err;
> +	unsigned int revents, permissions = 0;
> +	struct dst_secure *s;
> +
> +	revents = st->socket->ops->poll(NULL, st->socket, NULL);
> +	if (!(revents & POLLIN))
> +		return 1;
> +
> +	err = sock_create(st->socket->ops->family, st->socket->type,
> +			st->socket->sk->sk_protocol, &newsock);
> +	if (err)
> +		goto err_out_exit;
> +
> +	err = st->socket->ops->accept(st->socket, newsock, 0);
> +	if (err)
> +		goto err_out_put;
> +
> +	if (newsock->ops->getname(newsock, (struct sockaddr *)&addr,
> +				  (int *)&addr.sa_data_len, 2) < 0) {
> +		err = -ECONNABORTED;
> +		goto err_out_put;
> +	}
> +
> +	list_for_each_entry(s, &st->request_list, sec_entry) {
> +		void *sec_addr, *new_addr;
> +
> +		sec_addr = ((void *)&s->sec.addr) + s->sec.check_offset;
> +		new_addr = ((void *)&addr) + s->sec.check_offset;
> +
> +		if (!memcmp(sec_addr, new_addr,	
> +				addr.sa_data_len - s->sec.check_offset)) {
> +			permissions = s->sec.permissions;
> +			break;
> +		}
> +	}
> +
> +	/*
> +	 * So far only reading and writing are supported.
> +	 * Block device does not know about anything else,
> +	 * but as far as I recall, there was a prognosis,
> +	 * that computer will never require more than 640kb of RAM.
> +	 */
> +	if (permissions == 0) {
> +		err = -EPERM;
> +		goto err_out_put;
> +	}
> +
> +	if (st->socket->ops->family == AF_INET) {
> +		struct sockaddr_in *sin = (struct sockaddr_in *)&addr;
> +		printk(KERN_INFO "%s: Client: %u.%u.%u.%u:%d.\n", __func__,
> +			NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port));
> +	} else if (st->socket->ops->family == AF_INET6) {
> +		struct sockaddr_in6 *sin = (struct sockaddr_in6 *)&addr;
> +		printk(KERN_INFO "%s: Client: "
> +			"%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%d",
> +			__func__, 
> +			NIP6(sin->sin6_addr), ntohs(sin->sin6_port));
> +	}
> +
> +	dst_node_get(st->node);
> +	newst = kst_state_init(st->node, permissions,
> +			&kst_data_export_ops, newsock);
> +	if (IS_ERR(newst)) {
> +		err = PTR_ERR(newst);
> +		goto err_out_put;
> +	}
> +
> +	/*
> +	 * Negative return value means error, positive - stop this state 
> +	 * processing. Zero allows to check state for pending requests.
> +	 * Listening socket contains security objects in request list,
> +	 * since it does not have any requests.
> +	 */
> +	return 1;
> +
> +err_out_put:
> +	sock_release(newsock);
> +err_out_exit:
> +	return 1;
> +}
> +
> +static int kst_listen_init(struct kst_state *st, void *data)
> +{
> +	int err = -ENOMEM, i;
> +	struct dst_le_template *tmp = data;
> +	struct dst_secure *s;
> +
> +	for (i=0; i<tmp->le.secure_attr_num; ++i) {
> +		s = kmalloc(sizeof(struct dst_secure), GFP_KERNEL);
> +		if (!s)
> +			goto err_out_exit;
> +
> +		if (copy_from_user(&s->sec, tmp->data,
> +				sizeof(struct dst_secure_user))) {
> +			kfree(s);
> +			err = -EFAULT;
> +			goto err_out_exit;
> +		}
> +
> +		list_add_tail(&s->sec_entry, &st->request_list);
> +		tmp->data += sizeof(struct dst_secure_user);
> +
> +		if (s->sec.addr.sa_family == AF_INET) {
> +			struct sockaddr_in *sin = 
> +				(struct sockaddr_in *)&s->sec.addr;
> +			printk(KERN_INFO "%s: Client: %u.%u.%u.%u:%d, "
> +					"permissions: %x.\n", 
> +				__func__, NIPQUAD(sin->sin_addr.s_addr), 
> +				ntohs(sin->sin_port), s->sec.permissions);
> +		} else if (s->sec.addr.sa_family == AF_INET6) {
> +			struct sockaddr_in6 *sin = 
> +				(struct sockaddr_in6 *)&s->sec.addr;
> +			printk(KERN_INFO "%s: Client: "
> +				"%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%d, "
> +				"permissions: %x.\n", 
> +				__func__, NIP6(sin->sin6_addr), 
> +				ntohs(sin->sin6_port), s->sec.permissions);
> +		}
> +	}
> +
> +	err = kst_sock_create(st, &tmp->le.rctl.addr, tmp->le.rctl.type,
> +			tmp->le.rctl.proto, tmp->le.backlog);
> +	if (err)
> +		goto err_out_exit;
> +
> +	err = kst_poll_init(st);
> +	if (err)
> +		goto err_out_release;
> +
> +	return 0;
> +
> +err_out_release:
> +	kst_sock_release(st);
> +err_out_exit:
> +	kst_listen_flush(st);
> +	return err;
> +}
> +
> +/*
> + * Operations for different types of states.
> + * There are three:
> + * data state - created for remote node, when distributed storage connects
> + * 	to remote node, which contain data.
> + * listen state - created for local export node, when remote distributed
> + * 	storage's node connects to given node to get/put data.
> + * data export state - created for each client connected to above listen
> + * 	state.
> + */
> +static struct kst_state_ops kst_listen_ops = {
> +	.init = &kst_listen_init,
> +	.exit = &kst_listen_exit,
> +	.ready = &kst_listen_ready,
> +};
> +static struct kst_state_ops kst_data_ops = {
> +	.init = &kst_data_init,
> +	.push = &kst_data_push,
> +	.exit = &kst_common_exit,
> +	.recovery = &kst_data_recovery,
> +};
> +
> +struct kst_state *kst_listener_state_init(struct dst_node *node,
> +		struct dst_le_template *tmp)
> +{
> +	return kst_state_init(node, DST_PERM_READ | DST_PERM_WRITE,
> +			&kst_listen_ops, tmp);
> +}
> +
> +struct kst_state *kst_data_state_init(struct dst_node *node,
> +		struct socket *newsock)
> +{
> +	return kst_state_init(node, DST_PERM_READ | DST_PERM_WRITE,
> +			&kst_data_ops, newsock);
> +}
> +
> +/*
> + * Remove all workers and associated states.
> + */
> +void kst_exit_all(void)
> +{
> +	struct kst_worker *w, *n;
> +
> +	list_for_each_entry_safe(w, n, &kst_worker_list, entry) {
> +		kst_worker_exit(w);
> +	}
> +}
> diff --git a/include/linux/dst.h b/include/linux/dst.h
> new file mode 100644
> index 0000000..7b0feb1
> --- /dev/null
> +++ b/include/linux/dst.h
> @@ -0,0 +1,354 @@
> +/*
> + * 2007+ Copyright (c) Evgeniy Polyakov <johnpol@2ka.mipt.ru>
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#ifndef __DST_H
> +#define __DST_H
> +
> +#include <linux/types.h>
> +
> +#define DST_NAMELEN		32
> +#define DST_NAME		"dst"
> +#define DST_IOCTL		0xba
> +
> +enum {
> +	DST_DEL_NODE	= 0,	/* Remove node with given id from storage */
> +	DST_ADD_REMOTE,		/* Add remote node with given id to the storage */
> +	DST_ADD_LOCAL,		/* Add local node with given id to the storage */
> +	DST_ADD_LOCAL_EXPORT,	/* Add local node with given id to the storage to be exported and used by remote peers */
> +	DST_START_STORAGE,	/* Array is ready and storage can be started, if there will be new nodes
> +				 * added to the storage, they will be checked against existing size and
> +				 * probably be dropped (for example in mirror format when new node has smaller
> +				 * size than array created) or inserted.
> +				 */
> +	DST_STOP_STORAGE,	/* Remove array and all nodes. */
> +	DST_CMD_MAX
> +};
> +
> +#define DST_CTL_FLAGS_REMOTE	(1<<0)
> +#define DST_CTL_FLAGS_EXPORT	(1<<1)
> +
> +struct dst_ctl
> +{
> +	char			st[DST_NAMELEN];
> +	char			alg[DST_NAMELEN];
> +	__u32			flags;
> +	__u64			start, size;
> +};
> +
> +struct dst_local_ctl
> +{
> +	char			name[DST_NAMELEN];
> +};
> +
> +#define SADDR_MAX_DATA	128
> +
> +struct saddr {
> +	unsigned short		sa_family;			/* address family, AF_xxx	*/
> +	char			sa_data[SADDR_MAX_DATA];	/* 14 bytes of protocol address	*/
> +	unsigned short		sa_data_len;			/* Number of bytes used in sa_data */
> +};
> +
> +struct dst_remote_ctl
> +{
> +	__u16			type;
> +	__u16			proto;
> +	struct saddr		addr;
> +};
> +
> +#define DST_PERM_READ		(1<<0)
> +#define DST_PERM_WRITE		(1<<1)
> +
> +/*
> + * Right now it is simple model, where each remote address
> + * is assigned to set of permissions it is allowed to perform.
> + * In real world block device does not know anything but
> + * reading and writing, so it should be more than enough.
> + */
> +struct dst_secure_user
> +{
> +	unsigned int		permissions;
> +	unsigned short		check_offset;
> +	struct saddr		addr;
> +};
> +
> +struct dst_local_export_ctl
> +{
> +	__u32			backlog;
> +	int			secure_attr_num;
> +	struct dst_local_ctl	lctl;
> +	struct dst_remote_ctl	rctl;
> +};
> +
> +enum {
> +	DST_REMOTE_CFG		= 1, 		/* Request remote configuration */
> +	DST_WRITE,				/* Writing */
> +	DST_READ,				/* Reading */
> +	DST_NCMD_MAX,
> +};
> +
> +struct dst_remote_request
> +{
> +	__u32			cmd;
> +	__u32			flags;
> +	__u64			sector;
> +	__u32			offset;
> +	__u32			size;
> +};
> +
> +#ifdef __KERNEL__
> +
> +#include <linux/rbtree.h>
> +#include <linux/net.h>
> +#include <linux/blkdev.h>
> +#include <linux/bio.h>
> +#include <linux/mempool.h>
> +#include <linux/device.h>
> +
> +//#define DST_DEBUG
> +
> +#ifdef DST_DEBUG
> +#define dprintk(f, a...) printk(KERN_NOTICE f, ##a)
> +#else
> +#define dprintk(f, a...) do {} while (0)
> +#endif
> +
> +struct kst_worker
> +{
> +	struct list_head	entry;
> +
> +	struct list_head	state_list;
> +	struct mutex		state_mutex;
> +
> +	struct list_head	ready_list;
> +	spinlock_t		ready_lock;
> +
> +	mempool_t		*req_pool;
> +
> +	struct task_struct	*thread;
> +
> +	wait_queue_head_t 	wait;
> +
> +	int			id;
> +};
> +
> +struct kst_state;
> +struct dst_node;
> +
> +#define DST_REQ_HEADER_SENT	(1<<0)
> +#define DST_REQ_EXPORT		(1<<1)
> +#define DST_REQ_EXPORT_WRITE	(1<<2)
> +#define DST_REQ_EXPORT_READ	(1<<3)
> +#define DST_REQ_ALWAYS_QUEUE	(1<<4)
> +
> +struct dst_request
> +{
> +	struct rb_node		request_entry;
> +	struct list_head	request_list_entry;
> +	struct bio		*bio;
> +	struct kst_state 	*state;
> +	struct dst_node 	*node;
> +
> +	u32			flags;
> +
> +	int 			(*callback)(struct dst_request *dst,
> +						unsigned int revents);
> +	void			(*bio_endio)(struct dst_request *dst, 
> +						int err);
> +
> +	void			*priv;
> +	atomic_t		refcnt;
> +
> +	u64			size, orig_size, start;
> +	int			idx, num;
> +	u32			offset;
> +};
> +
> +struct kst_state_ops
> +{
> +	int 		(*init)(struct kst_state *, void *);
> +	int 		(*push)(struct dst_request *req);
> +	int		(*ready)(struct kst_state *);
> +	int		(*recovery)(struct kst_state *, int err);
> +	void 		(*exit)(struct kst_state *);
> +};
> +
> +#define KST_FLAG_PARTIAL		(1<<0)
> +
> +struct kst_state
> +{
> +	struct list_head	entry;
> +	struct list_head	ready_entry;
> +
> +	wait_queue_t 		wait;
> +	wait_queue_head_t 	*whead;
> +
> +	struct dst_node		*node;
> +	struct socket		*socket;
> +
> +	u32			flags, permissions;
> +
> +	struct rb_root		request_root;
> +	struct mutex		request_lock;
> +	struct list_head	request_list;
> +
> +	struct kst_state_ops	*ops;
> +};
> +
> +#define DST_DEFAULT_TIMEO	2000
> +
> +struct dst_storage;
> +
> +struct dst_alg_ops
> +{
> +	int			(*add_node)(struct dst_node *n);
> +	void			(*del_node)(struct dst_node *n);
> +	int 			(*remap)(struct dst_request *req);
> +	int			(*error)(struct kst_state *state, int err);
> +	struct module 		*owner;
> +};
> +
> +struct dst_alg
> +{
> +	struct list_head	entry;
> +	char			name[DST_NAMELEN];
> +	atomic_t		refcnt;
> +	struct dst_alg_ops	*ops;
> +};
> +
> +#define DST_ST_STARTED		(1<<0)
> +
> +struct dst_storage
> +{
> +	struct list_head	entry;
> +	char			name[DST_NAMELEN];
> +	struct dst_alg		*alg;
> +	atomic_t		refcnt;
> +	struct mutex		tree_lock;
> +	struct rb_root		tree_root;
> +
> +	request_queue_t		*queue;
> +	struct gendisk		*disk;
> +
> +	long			flags;
> +	u64			disk_size;
> +
> +	struct device		device;
> +};
> +
> +#define DST_NODE_FROZEN		0
> +#define DST_NODE_NOTSYNC	1
> +
> +struct dst_node
> +{
> +	struct rb_node		tree_node;
> +
> +	struct list_head	shared;
> +	struct dst_node		*shared_head;
> +
> +	struct block_device 	*bdev;
> +	struct dst_storage	*st;
> +	struct kst_state	*state;
> +	struct kst_worker	*w;
> +
> +	atomic_t		refcnt;
> +	atomic_t		shared_num;
> +
> +	void			(*cleanup)(struct dst_node *);
> +
> +	long			flags;
> +
> +	u64			start, size;
> +
> +	void			(*priv_callback)(struct dst_node *);
> +	void			*priv;
> +
> +	struct device		device;
> +};
> +
> +struct dst_le_template
> +{
> +	struct dst_local_export_ctl	le;
> +	void __user			*data;
> +};
> +
> +struct dst_secure
> +{
> +	struct list_head	sec_entry;
> +	struct dst_secure_user	sec;
> +};
> +
> +void kst_state_exit(struct kst_state *st);
> +
> +struct kst_worker *kst_worker_init(int id);
> +void kst_worker_exit(struct kst_worker *w);
> +
> +struct kst_state *kst_listener_state_init(struct dst_node *node,
> +		struct dst_le_template *tmp);
> +struct kst_state *kst_data_state_init(struct dst_node *node,
> +		struct socket *newsock);
> +
> +void kst_wake(struct kst_state *st);
> +
> +void kst_exit_all(void);
> +
> +struct dst_alg *dst_alloc_alg(char *name, struct dst_alg_ops *ops);
> +void dst_remove_alg(struct dst_alg *alg);
> +
> +struct dst_node *dst_storage_tree_search(struct dst_storage *st, u64 start);
> +
> +void dst_node_put(struct dst_node *n);
> +
> +static inline struct dst_node *dst_node_get(struct dst_node *n)
> +{
> +	atomic_inc(&n->refcnt);
> +	return n;
> +}
> +
> +struct dst_request *dst_clone_request(struct dst_request *req, mempool_t *pool);
> +void dst_free_request(struct dst_request *req);
> +
> +void kst_complete_req(struct dst_request *req, int err);
> +void kst_bio_endio(struct dst_request *req, int err);
> +void kst_del_req(struct dst_request *req);
> +int kst_enqueue_req(struct kst_state *st, struct dst_request *req);
> +
> +int kst_data_callback(struct dst_request *req, unsigned int revents);
> +
> +extern struct kmem_cache *dst_request_cache;
> +
> +static inline sector_t to_sector(unsigned long n)
> +{
> +	return (n >> 9);
> +}
> +
> +static inline unsigned long to_bytes(sector_t n)
> +{
> +	return (n << 9);
> +}
> +
> +/*
> + * Checks state's permissions.
> + * Returns -EPERM if check failed.
> + */
> +static inline int kst_check_permissions(struct kst_state *st, struct bio *bio)
> +{
> +	if ((bio_rw(bio) == WRITE) && !(st->permissions & DST_PERM_WRITE))
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +#endif /* __KERNEL__ */
> +#endif /* __DST_H */
> 
> -- 
> 	Evgeniy Polyakov
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC PATCH 2/2] SCTP: Convert bind_addr_list locking to RCU
From: Paul E. McKenney @ 2007-09-10 22:08 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: netdev, lksctp-developers
In-Reply-To: <11894535912570-git-send-email-vladislav.yasevich@hp.com>

On Mon, Sep 10, 2007 at 03:46:30PM -0400, Vlad Yasevich wrote:
> Since the sctp_sockaddr_entry is now RCU enabled as part of
> the patch to synchronize sctp_localaddr_list, it makes sense to
> change all handling of these entries to RCU.  This includes the
> sctp_bind_addrs structure and it's list of bound addresses.
> 
> This list is currently protected by an external rw_lock and that
> looks like an overkill.  There are only 2 writers to the list:
> bind()/bindx() calls, and BH processing of ASCONF-ACK chunks.
> These are already seriealized via the socket lock, so they will
> not step on each other.  These are also relatively rare, so we
> should be good with RCU.
> 
> The readers are varied and they are easily converted to RCU.

Again, good start -- similar questions as for the other patch in this
series.  Also a few places where it looks like you are letting a pointer
to an RCU-protected data structure slip out of rcu_read_lock() protection,
and a case of mixing rcu_read_lock() and rcu_read_lock_bh() within the
same RCU-protected data structure.

						Thanx, Paul

> Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
> ---
>  include/net/sctp/structs.h |    3 -
>  net/sctp/associola.c       |   14 +------
>  net/sctp/bind_addr.c       |   59 ++++++++++++++++++----------
>  net/sctp/endpointola.c     |   26 ++++---------
>  net/sctp/ipv6.c            |   12 ++---
>  net/sctp/protocol.c        |   25 +++++-------
>  net/sctp/sm_make_chunk.c   |   17 +++-----
>  net/sctp/socket.c          |   92 ++++++++++++++------------------------------
>  8 files changed, 97 insertions(+), 151 deletions(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 2591c49..1d46f7d 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -1222,9 +1222,6 @@ struct sctp_ep_common {
>  	 * bind_addr.address_list is our set of local IP addresses.
>  	 */
>  	struct sctp_bind_addr bind_addr;
> -
> -	/* Protection during address list comparisons. */
> -	rwlock_t   addr_lock;
>  };
> 
> 
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index 2ad1caf..9bad8ba 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -99,7 +99,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
> 
>  	/* Initialize the bind addr area.  */
>  	sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port);
> -	rwlock_init(&asoc->base.addr_lock);
> 
>  	asoc->state = SCTP_STATE_CLOSED;
> 
> @@ -937,8 +936,6 @@ struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
>  {
>  	struct sctp_transport *transport;
> 
> -	sctp_read_lock(&asoc->base.addr_lock);
> -
>  	if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) &&
>  	    (htons(asoc->peer.port) == paddr->v4.sin_port)) {
>  		transport = sctp_assoc_lookup_paddr(asoc, paddr);
> @@ -952,7 +949,6 @@ struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc,
>  	transport = NULL;
> 
>  out:
> -	sctp_read_unlock(&asoc->base.addr_lock);
>  	return transport;
>  }
> 
> @@ -1376,19 +1372,13 @@ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
>  int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
>  			    const union sctp_addr *laddr)
>  {
> -	int found;
> +	int found = 0;
> 
> -	sctp_read_lock(&asoc->base.addr_lock);
>  	if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) &&
>  	    sctp_bind_addr_match(&asoc->base.bind_addr, laddr,
> -				 sctp_sk(asoc->base.sk))) {
> +				 sctp_sk(asoc->base.sk)))
>  		found = 1;
> -		goto out;
> -	}
> 
> -	found = 0;
> -out:
> -	sctp_read_unlock(&asoc->base.addr_lock);
>  	return found;
>  }
> 
> diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
> index 7fc369f..9c7db1f 100644
> --- a/net/sctp/bind_addr.c
> +++ b/net/sctp/bind_addr.c
> @@ -167,7 +167,10 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
> 
>  	INIT_LIST_HEAD(&addr->list);
>  	INIT_RCU_HEAD(&addr->rcu);
> -	list_add_tail(&addr->list, &bp->address_list);
> +
> +	rcu_read_lock();
> +	list_add_tail_rcu(&addr->list, &bp->address_list);
> +	rcu_read_unlock();

Given the original code, we presumably hold the update-side lock.  If so,
the rcu_read_lock() and rcu_read_unlock() are (harmlessly) redundant.

>  	SCTP_DBG_OBJCNT_INC(addr);
> 
>  	return 0;
> @@ -178,20 +181,23 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
>   */
>  int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
>  {
> -	struct list_head *pos, *temp;
> -	struct sctp_sockaddr_entry *addr;
> +	struct sctp_sockaddr_entry *addr, *temp;
> 
> -	list_for_each_safe(pos, temp, &bp->address_list) {
> -		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock_bh();
> +	list_for_each_entry_safe(addr, temp, &bp->address_list, list) {
>  		if (sctp_cmp_addr_exact(&addr->a, del_addr)) {
>  			/* Found the exact match. */
> -			list_del(pos);
> -			kfree(addr);
> -			SCTP_DBG_OBJCNT_DEC(addr);
> -
> -			return 0;
> +			addr->valid = 0;
> +			list_del_rcu(&addr->list);
> +			break;
>  		}
>  	}
> +	rcu_read_unlock_bh();

Ditto.

> +
> +	if (addr && !addr->valid) {
> +		call_rcu_bh(&addr->rcu, sctp_local_addr_free);
> +		SCTP_DBG_OBJCNT_DEC(addr);
> +	}
> 
>  	return -EINVAL;
>  }
> @@ -302,15 +308,20 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
>  			 struct sctp_sock *opt)
>  {
>  	struct sctp_sockaddr_entry *laddr;
> -	struct list_head *pos;
> -
> -	list_for_each(pos, &bp->address_list) {
> -		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
> -		if (opt->pf->cmp_addr(&laddr->a, addr, opt))
> -			return 1;
> +	int match = 0;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> +		if (!laddr->valid)
> +			continue;

As before, what happens if the entry is deleted by some other CPU at
this point, and thus ->valid is cleared?  If harmless, why bother with
->valid?

> +		if (opt->pf->cmp_addr(&laddr->a, addr, opt)) {
> +			match = 1;
> +			break;
> +		}
>  	}
> +	rcu_read_unlock();
> 
> -	return 0;
> +	return match;
>  }
> 
>  /* Find the first address in the bind address list that is not present in
> @@ -325,27 +336,31 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
>  	union sctp_addr			*addr;
>  	void 				*addr_buf;
>  	struct sctp_af			*af;
> -	struct list_head		*pos;
>  	int				i;
> 
> -	list_for_each(pos, &bp->address_list) {
> -		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> +		if (!laddr->valid)
> +			continue;

Ditto...

> 
>  		addr_buf = (union sctp_addr *)addrs;
>  		for (i = 0; i < addrcnt; i++) {
>  			addr = (union sctp_addr *)addr_buf;
>  			af = sctp_get_af_specific(addr->v4.sin_family);
>  			if (!af)
> -				return NULL;
> +				break;
> 
>  			if (opt->pf->cmp_addr(&laddr->a, addr, opt))
>  				break;
> 
>  			addr_buf += af->sockaddr_len;
>  		}
> -		if (i == addrcnt)
> +		if (i == addrcnt) {
> +			rcu_read_unlock();

Since rcu_read_unlock() just happened, some other CPU is free to
free up this data structure.  In a CONFIG_PREEMPT kernel (as well as a
CONFIG_PREEMPT_RT kernel, for that matter), this task might be preempted
at this point, and a full grace period might elapse.

In which case, the following statement returns a pointer to the freelist,
which is not good.

>  			return &laddr->a;
> +		}
>  	}
> +	rcu_read_unlock();
> 
>  	return NULL;
>  }
> diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
> index 1404a9e..fa10af5 100644
> --- a/net/sctp/endpointola.c
> +++ b/net/sctp/endpointola.c
> @@ -92,7 +92,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
> 
>  	/* Initialize the bind addr area */
>  	sctp_bind_addr_init(&ep->base.bind_addr, 0);
> -	rwlock_init(&ep->base.addr_lock);
> 
>  	/* Remember who we are attached to.  */
>  	ep->base.sk = sk;
> @@ -225,21 +224,14 @@ void sctp_endpoint_put(struct sctp_endpoint *ep)
>  struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
>  					       const union sctp_addr *laddr)
>  {
> -	struct sctp_endpoint *retval;
> +	struct sctp_endpoint *retval = NULL;
> 
> -	sctp_read_lock(&ep->base.addr_lock);
>  	if (htons(ep->base.bind_addr.port) == laddr->v4.sin_port) {
>  		if (sctp_bind_addr_match(&ep->base.bind_addr, laddr,
> -					 sctp_sk(ep->base.sk))) {
> +					 sctp_sk(ep->base.sk)))
>  			retval = ep;
> -			goto out;
> -		}
>  	}
> 
> -	retval = NULL;
> -
> -out:
> -	sctp_read_unlock(&ep->base.addr_lock);
>  	return retval;
>  }
> 
> @@ -261,9 +253,7 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc(
>  	list_for_each(pos, &ep->asocs) {
>  		asoc = list_entry(pos, struct sctp_association, asocs);
>  		if (rport == asoc->peer.port) {
> -			sctp_read_lock(&asoc->base.addr_lock);
>  			*transport = sctp_assoc_lookup_paddr(asoc, paddr);
> -			sctp_read_unlock(&asoc->base.addr_lock);
> 
>  			if (*transport)
>  				return asoc;
> @@ -295,20 +285,20 @@ struct sctp_association *sctp_endpoint_lookup_assoc(
>  int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
>  				const union sctp_addr *paddr)
>  {
> -	struct list_head *pos;
>  	struct sctp_sockaddr_entry *addr;
>  	struct sctp_bind_addr *bp;
> 
> -	sctp_read_lock(&ep->base.addr_lock);
>  	bp = &ep->base.bind_addr;
> -	list_for_each(pos, &bp->address_list) {
> -		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(addr, &bp->address_list, list) {
> +		if (!addr->valid)
> +			continue;

And ditto again...

>  		if (sctp_has_association(&addr->a, paddr)) {
> -			sctp_read_unlock(&ep->base.addr_lock);
> +			rcu_read_unlock();
>  			return 1;
>  		}
>  	}
> -	sctp_read_unlock(&ep->base.addr_lock);
> +	rcu_read_unlock();
> 
>  	return 0;
>  }
> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> index fc2e4e2..4f6dc55 100644
> --- a/net/sctp/ipv6.c
> +++ b/net/sctp/ipv6.c
> @@ -302,9 +302,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
>  			      union sctp_addr *saddr)
>  {
>  	struct sctp_bind_addr *bp;
> -	rwlock_t *addr_lock;
>  	struct sctp_sockaddr_entry *laddr;
> -	struct list_head *pos;
>  	sctp_scope_t scope;
>  	union sctp_addr *baddr = NULL;
>  	__u8 matchlen = 0;
> @@ -324,14 +322,14 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
>  	scope = sctp_scope(daddr);
> 
>  	bp = &asoc->base.bind_addr;
> -	addr_lock = &asoc->base.addr_lock;
> 
>  	/* Go through the bind address list and find the best source address
>  	 * that matches the scope of the destination address.
>  	 */
> -	sctp_read_lock(addr_lock);
> -	list_for_each(pos, &bp->address_list) {
> -		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> +		if (!laddr->valid)
> +			continue;

Ditto yet again...

>  		if ((laddr->use_as_src) &&
>  		    (laddr->a.sa.sa_family == AF_INET6) &&
>  		    (scope <= sctp_scope(&laddr->a))) {
> @@ -353,7 +351,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc,
>  		       __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr));
>  	}
> 
> -	sctp_read_unlock(addr_lock);
> +	rcu_read_unlock();
>  }
> 
>  /* Make a copy of all potential local addresses. */
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index ac52f9e..a1030ed 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -222,7 +222,7 @@ int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
>  			      (copy_flags & SCTP_ADDR6_ALLOWED) &&
>  			      (copy_flags & SCTP_ADDR6_PEERSUPP)))) {
>  				error = sctp_add_bind_addr(bp, &addr->a, 1,
> -							   GFP_ATOMIC);
> +						    GFP_ATOMIC);
>  				if (error)
>  					goto end_copy;
>  			}
> @@ -426,9 +426,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
>  	struct rtable *rt;
>  	struct flowi fl;
>  	struct sctp_bind_addr *bp;
> -	rwlock_t *addr_lock;
>  	struct sctp_sockaddr_entry *laddr;
> -	struct list_head *pos;
>  	struct dst_entry *dst = NULL;
>  	union sctp_addr dst_saddr;
> 
> @@ -457,23 +455,20 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
>  		goto out;
> 
>  	bp = &asoc->base.bind_addr;
> -	addr_lock = &asoc->base.addr_lock;
> 
>  	if (dst) {
>  		/* Walk through the bind address list and look for a bind
>  		 * address that matches the source address of the returned dst.
>  		 */
> -		sctp_read_lock(addr_lock);
> -		list_for_each(pos, &bp->address_list) {
> -			laddr = list_entry(pos, struct sctp_sockaddr_entry,
> -					   list);
> -			if (!laddr->use_as_src)
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> +			if (!laddr->valid || !laddr->use_as_src)
>  				continue;

And here as well...

>  			sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
>  			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
>  				goto out_unlock;
>  		}
> -		sctp_read_unlock(addr_lock);
> +		rcu_read_unlock();
> 
>  		/* None of the bound addresses match the source address of the
>  		 * dst. So release it.
> @@ -485,10 +480,10 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
>  	/* Walk through the bind address list and try to get a dst that
>  	 * matches a bind address as the source address.
>  	 */
> -	sctp_read_lock(addr_lock);
> -	list_for_each(pos, &bp->address_list) {
> -		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
> -
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> +		if (!laddr->valid)
> +			continue;

OK, this is the last one I am flagging, you can find the others.  ;-)

>  		if ((laddr->use_as_src) &&
>  		    (AF_INET == laddr->a.sa.sa_family)) {
>  			fl.fl4_src = laddr->a.v4.sin_addr.s_addr;
> @@ -500,7 +495,7 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc,
>  	}
> 
>  out_unlock:
> -	sctp_read_unlock(addr_lock);
> +	rcu_read_unlock();
>  out:
>  	if (dst)
>  		SCTP_DEBUG_PRINTK("rt_dst:%u.%u.%u.%u, rt_src:%u.%u.%u.%u\n",
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 79856c9..caaa29f 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -1531,7 +1531,7 @@ no_hmac:
>  	/* Also, add the destination address. */
>  	if (list_empty(&retval->base.bind_addr.address_list)) {
>  		sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, 1,
> -				   GFP_ATOMIC);
> +				GFP_ATOMIC);
>  	}
> 
>  	retval->next_tsn = retval->c.initial_tsn;
> @@ -2613,22 +2613,17 @@ static int sctp_asconf_param_success(struct sctp_association *asoc,
> 
>  	switch (asconf_param->param_hdr.type) {
>  	case SCTP_PARAM_ADD_IP:
> -		sctp_local_bh_disable();
> -		sctp_write_lock(&asoc->base.addr_lock);
> -		list_for_each(pos, &bp->address_list) {
> -			saddr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +		rcu_read_lock_bh();
> +		list_for_each_entry_rcu(saddr, &bp->address_list, list) {
> +			if (!saddr->valid)
> +				continue;
>  			if (sctp_cmp_addr_exact(&saddr->a, &addr))
>  				saddr->use_as_src = 1;
>  		}
> -		sctp_write_unlock(&asoc->base.addr_lock);
> -		sctp_local_bh_enable();
> +		rcu_read_unlock_bh();

If you use rcu_read_lock_bh() and rcu_read_unlock_bh() in one read path
for a given data structure, you need to use them in all the other read
paths for that data structure.  In addition, you must use call_rcu_bh()
when deleting the corresponding data elements.

The normal and the _bh RCU grace periods are unrelated, so mixing them
for a given RCU-protected data structure is a bad idea.  (Or are these
somehow two independent data structures?)

>  		break;
>  	case SCTP_PARAM_DEL_IP:
> -		sctp_local_bh_disable();
> -		sctp_write_lock(&asoc->base.addr_lock);
>  		retval = sctp_del_bind_addr(bp, &addr);
> -		sctp_write_unlock(&asoc->base.addr_lock);
> -		sctp_local_bh_enable();
>  		list_for_each(pos, &asoc->peer.transport_addr_list) {
>  			transport = list_entry(pos, struct sctp_transport,
>  						 transports);
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index a3acf78..35cc30c 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -367,14 +367,10 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
>  	if (!bp->port)
>  		bp->port = inet_sk(sk)->num;
> 
> -	/* Add the address to the bind address list.  */
> -	sctp_local_bh_disable();
> -	sctp_write_lock(&ep->base.addr_lock);
> -
> -	/* Use GFP_ATOMIC since BHs are disabled.  */
> +	/* Add the address to the bind address list.
> +	 * Use GFP_ATOMIC since BHs will be disabled.
> +	 */
>  	ret = sctp_add_bind_addr(bp, addr, 1, GFP_ATOMIC);
> -	sctp_write_unlock(&ep->base.addr_lock);
> -	sctp_local_bh_enable();
> 
>  	/* Copy back into socket for getsockname() use. */
>  	if (!ret) {
> @@ -497,7 +493,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
>  	void				*addr_buf;
>  	struct sctp_af			*af;
>  	struct list_head		*pos;
> -	struct list_head		*p;
>  	int 				i;
>  	int 				retval = 0;
> 
> @@ -544,14 +539,15 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
>  		if (i < addrcnt)
>  			continue;
> 
> -		/* Use the first address in bind addr list of association as
> -		 * Address Parameter of ASCONF CHUNK.
> +		/* Use the first valid address in bind addr list of
> +		 * association as Address Parameter of ASCONF CHUNK.
>  		 */
> -		sctp_read_lock(&asoc->base.addr_lock);
>  		bp = &asoc->base.bind_addr;
> -		p = bp->address_list.next;
> -		laddr = list_entry(p, struct sctp_sockaddr_entry, list);
> -		sctp_read_unlock(&asoc->base.addr_lock);
> +		rcu_read_lock();
> +		list_for_each_entry_rcu(laddr, &bp->address_list, list)
> +			if (laddr->valid)
> +				break;
> +		rcu_read_unlock();

Here you are carrying an RCU-protected data item (*laddr) outside of an
rcu_read_lock()/rcu_read_unlock() pair.  This is not good -- you need
to move the rcu_read_unlock() farther down to cover the full extend to
uses of the laddr pointer.

Again, RCU is within its rights allowing a grace period to elapse, so
that past this point, laddr might well point into the freelist.

> 
>  		chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs,
>  						   addrcnt, SCTP_PARAM_ADD_IP);
> @@ -567,8 +563,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
>  		/* Add the new addresses to the bind address list with
>  		 * use_as_src set to 0.
>  		 */
> -		sctp_local_bh_disable();
> -		sctp_write_lock(&asoc->base.addr_lock);
>  		addr_buf = addrs;
>  		for (i = 0; i < addrcnt; i++) {
>  			addr = (union sctp_addr *)addr_buf;
> @@ -578,8 +572,6 @@ static int sctp_send_asconf_add_ip(struct sock		*sk,
>  						    GFP_ATOMIC);
>  			addr_buf += af->sockaddr_len;
>  		}
> -		sctp_write_unlock(&asoc->base.addr_lock);
> -		sctp_local_bh_enable();
>  	}
> 
>  out:
> @@ -651,14 +643,8 @@ static int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt)
>  		 * socket routing and failover schemes. Refer to comments in
>  		 * sctp_do_bind(). -daisy
>  		 */
> -		sctp_local_bh_disable();
> -		sctp_write_lock(&ep->base.addr_lock);
> -
>  		retval = sctp_del_bind_addr(bp, sa_addr);
> 
> -		sctp_write_unlock(&ep->base.addr_lock);
> -		sctp_local_bh_enable();
> -
>  		addr_buf += af->sockaddr_len;
>  err_bindx_rem:
>  		if (retval < 0) {
> @@ -748,11 +734,9 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
>  		 * make sure that we do not delete all the addresses in the
>  		 * association.
>  		 */
> -		sctp_read_lock(&asoc->base.addr_lock);
>  		bp = &asoc->base.bind_addr;
>  		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
>  					       addrcnt, sp);
> -		sctp_read_unlock(&asoc->base.addr_lock);
>  		if (!laddr)
>  			continue;
> 
> @@ -766,23 +750,18 @@ static int sctp_send_asconf_del_ip(struct sock		*sk,
>  		/* Reset use_as_src flag for the addresses in the bind address
>  		 * list that are to be deleted.
>  		 */
> -		sctp_local_bh_disable();
> -		sctp_write_lock(&asoc->base.addr_lock);
>  		addr_buf = addrs;
>  		for (i = 0; i < addrcnt; i++) {
>  			laddr = (union sctp_addr *)addr_buf;
>  			af = sctp_get_af_specific(laddr->v4.sin_family);
> -			list_for_each(pos1, &bp->address_list) {
> -				saddr = list_entry(pos1,
> -						   struct sctp_sockaddr_entry,
> -						   list);
> +			rcu_read_lock();
> +			list_for_each_entry_rcu(saddr, &bp->address_list, list) {
>  				if (sctp_cmp_addr_exact(&saddr->a, laddr))
>  					saddr->use_as_src = 0;
>  			}
> +			rcu_read_unlock();
>  			addr_buf += af->sockaddr_len;
>  		}
> -		sctp_write_unlock(&asoc->base.addr_lock);
> -		sctp_local_bh_enable();
> 
>  		/* Update the route and saddr entries for all the transports
>  		 * as some of the addresses in the bind address list are
> @@ -4057,11 +4036,9 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
>  					       int __user *optlen)
>  {
>  	sctp_assoc_t id;
> -	struct list_head *pos;
>  	struct sctp_bind_addr *bp;
>  	struct sctp_association *asoc;
>  	struct sctp_sockaddr_entry *addr;
> -	rwlock_t *addr_lock;
>  	int cnt = 0;
> 
>  	if (len < sizeof(sctp_assoc_t))
> @@ -4078,17 +4055,13 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
>  	 */
>  	if (0 == id) {
>  		bp = &sctp_sk(sk)->ep->base.bind_addr;
> -		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
>  	} else {
>  		asoc = sctp_id2assoc(sk, id);
>  		if (!asoc)
>  			return -EINVAL;
>  		bp = &asoc->base.bind_addr;
> -		addr_lock = &asoc->base.addr_lock;
>  	}
> 
> -	sctp_read_lock(addr_lock);
> -
>  	/* If the endpoint is bound to 0.0.0.0 or ::0, count the valid
>  	 * addresses from the global local address list.
>  	 */
> @@ -4115,12 +4088,15 @@ static int sctp_getsockopt_local_addrs_num_old(struct sock *sk, int len,
>  		goto done;
>  	}
> 
> -	list_for_each(pos, &bp->address_list) {
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(addr, &bp->address_list, list) {
> +		if (!addr->valid)
> +			continue;
>  		cnt ++;
>  	}
> +	rcu_read_unlock();
> 
>  done:
> -	sctp_read_unlock(addr_lock);
>  	return cnt;
>  }
> 
> @@ -4204,7 +4180,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  {
>  	struct sctp_bind_addr *bp;
>  	struct sctp_association *asoc;
> -	struct list_head *pos;
>  	int cnt = 0;
>  	struct sctp_getaddrs_old getaddrs;
>  	struct sctp_sockaddr_entry *addr;
> @@ -4212,7 +4187,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  	union sctp_addr temp;
>  	struct sctp_sock *sp = sctp_sk(sk);
>  	int addrlen;
> -	rwlock_t *addr_lock;
>  	int err = 0;
>  	void *addrs;
>  	void *buf;
> @@ -4234,13 +4208,11 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  	 */
>  	if (0 == getaddrs.assoc_id) {
>  		bp = &sctp_sk(sk)->ep->base.bind_addr;
> -		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
>  	} else {
>  		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
>  		if (!asoc)
>  			return -EINVAL;
>  		bp = &asoc->base.bind_addr;
> -		addr_lock = &asoc->base.addr_lock;
>  	}
> 
>  	to = getaddrs.addrs;
> @@ -4254,8 +4226,6 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  	if (!addrs)
>  		return -ENOMEM;
> 
> -	sctp_read_lock(addr_lock);
> -
>  	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
>  	 * addresses from the global local address list.
>  	 */
> @@ -4271,8 +4241,10 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  	}
> 
>  	buf = addrs;
> -	list_for_each(pos, &bp->address_list) {
> -		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(addr, &bp->address_list, list) {
> +		if (!addr->valid)
> +			continue;
>  		memcpy(&temp, &addr->a, sizeof(temp));
>  		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
>  		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
> @@ -4282,10 +4254,9 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len,
>  		cnt ++;
>  		if (cnt >= getaddrs.addr_num) break;
>  	}
> +	rcu_read_unlock();
> 
>  copy_getaddrs:
> -	sctp_read_unlock(addr_lock);
> -
>  	/* copy the entire address list into the user provided space */
>  	if (copy_to_user(to, addrs, bytes_copied)) {
>  		err = -EFAULT;
> @@ -4307,7 +4278,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  {
>  	struct sctp_bind_addr *bp;
>  	struct sctp_association *asoc;
> -	struct list_head *pos;
>  	int cnt = 0;
>  	struct sctp_getaddrs getaddrs;
>  	struct sctp_sockaddr_entry *addr;
> @@ -4315,7 +4285,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  	union sctp_addr temp;
>  	struct sctp_sock *sp = sctp_sk(sk);
>  	int addrlen;
> -	rwlock_t *addr_lock;
>  	int err = 0;
>  	size_t space_left;
>  	int bytes_copied = 0;
> @@ -4336,13 +4305,11 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  	 */
>  	if (0 == getaddrs.assoc_id) {
>  		bp = &sctp_sk(sk)->ep->base.bind_addr;
> -		addr_lock = &sctp_sk(sk)->ep->base.addr_lock;
>  	} else {
>  		asoc = sctp_id2assoc(sk, getaddrs.assoc_id);
>  		if (!asoc)
>  			return -EINVAL;
>  		bp = &asoc->base.bind_addr;
> -		addr_lock = &asoc->base.addr_lock;
>  	}
> 
>  	to = optval + offsetof(struct sctp_getaddrs,addrs);
> @@ -4352,8 +4319,6 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  	if (!addrs)
>  		return -ENOMEM;
> 
> -	sctp_read_lock(addr_lock);
> -
>  	/* If the endpoint is bound to 0.0.0.0 or ::0, get the valid
>  	 * addresses from the global local address list.
>  	 */
> @@ -4372,8 +4337,10 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  	}
> 
>  	buf = addrs;
> -	list_for_each(pos, &bp->address_list) {
> -		addr = list_entry(pos, struct sctp_sockaddr_entry, list);
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(addr, &bp->address_list, list) {
> +		if (!addr->valid)
> +			continue;
>  		memcpy(&temp, &addr->a, sizeof(temp));
>  		sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp);
>  		addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len;
> @@ -4387,10 +4354,9 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
>  		cnt ++;
>  		space_left -= addrlen;
>  	}
> +	rcu_read_unlock();
> 
>  copy_getaddrs:
> -	sctp_read_unlock(addr_lock);
> -
>  	if (copy_to_user(to, addrs, bytes_copied)) {
>  		err = -EFAULT;
>  		goto out;
> @@ -4405,7 +4371,7 @@ copy_getaddrs:
>  	goto out;
> 
>  error_lock:
> -	sctp_read_unlock(addr_lock);
> +	rcu_read_unlock();
> 
>  out:
>  	kfree(addrs);
> -- 
> 1.5.2.4
> 
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] ipconfig.c: De-clutter IP configuration report
From: Jan Engelhardt @ 2007-09-10 22:04 UTC (permalink / raw)
  To: Maciej W. Rozycki; +Cc: netdev, David S. Miller, linux-kernel
In-Reply-To: <Pine.LNX.4.64N.0709101249230.25038@blysk.ds.pg.gda.pl>


On Sep 10 2007 13:09, Maciej W. Rozycki wrote:
> The new code builds fine; no semantic changes.
>
> Please apply,
>
>  Maciej
>
>patch-mips-2.6.23-rc5-20070904-ipconfig-printk-2
>diff -up --recursive --new-file linux-mips-2.6.23-rc5-20070904.macro/net/ipv4/ipconfig.c linux-mips-2.6.23-rc5-20070904/net/ipv4/ipconfig.c
>--- linux-mips-2.6.23-rc5-20070904.macro/net/ipv4/ipconfig.c	2007-09-04 04:56:22.000000000 +0000
>+++ linux-mips-2.6.23-rc5-20070904/net/ipv4/ipconfig.c	2007-09-10 11:53:19.000000000 +0000
>@@ -1364,17 +1364,17 @@ static int __init ip_auto_config(void)
> 	/*
> 	 * Clue in the operator.
> 	 */
>-	printk("IP-Config: Complete:");
>-	printk("\n      device=%s", ic_dev->name);
>-	printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
>-	printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
>-	printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
>-	printk(",\n     host=%s, domain=%s, nis-domain=%s",
>-	       utsname()->nodename, ic_domain, utsname()->domainname);
>-	printk(",\n     bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
>-	printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
>-	printk(", rootpath=%s", root_server_path);
>-	printk("\n");
>+	pr_info("IP-Config: Complete:\n");
>+	pr_info("      device=%s, addr=%u.%u.%u.%u, "
>+		"mask=%u.%u.%u.%u, gw=%u.%u.%u.%u,\n",
>+		ic_dev->name, NIPQUAD(ic_myaddr),
>+		NIPQUAD(ic_netmask), NIPQUAD(ic_gateway));
>+	pr_info("      host=%s, domain=%s, nis-domain=%s,\n",
>+		utsname()->nodename, ic_domain, utsname()->domainname);
>+	pr_info("      bootserver=%u.%u.%u.%u, "
>+		"rootserver=%u.%u.%u.%u, rootpath=%s\n",
>+		NIPQUAD(ic_servaddr),
>+		NIPQUAD(root_server_addr), root_server_path);
> #endif /* !SILENT */
> 
> 	return 0;

It should really be done in userspace. And ripped from the kernel.



	Jan
-- 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox