All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Eliezer Tamir" <eliezert@broadcom.com>
To: "Michael Buesch" <mb@bu3sch.de>
Cc: "Michael Chan" <mchan@broadcom.com>,
	davem@davemloft.net, jeff@garzik.org, netdev@vger.kernel.org,
	eilong@broadcom.com
Subject: Re: [RFC][BNX2X]: New driver for Broadcom 10Gb Ethernet.
Date: Thu, 02 Aug 2007 21:09:20 +0300	[thread overview]
Message-ID: <46B21DD0.7070206@broadcom.com> (raw)
In-Reply-To: <200708020006.13457.mb@bu3sch.de>

Michal,

Thanks for going over the code.
My responses are inline.

Eliezer

Michael Buesch wrote:
> On Wednesday 01 August 2007 10:31:17 Michael Chan wrote:
> 
>> +typedef struct {
>> +	u8 reserved[64];
>> +} license_key_t;
> 
> No typedef.
> What is a "license key" used for, anyway?
This will be removed.
This is just a placeholder that has the right size.

> 
>> +#define RUN_AT(x)		(jiffies + (x))
> 
> That macro does only obfuscate code, in my opinion.
> If you want jiffies+x, better opencode it.
OK

> 
>> +typedef enum {
>> +	BCM5710 = 0,
>> +} board_t;
> 
> No typedef. Do
> enum bnx2x_board {
> 	BCM5710 = 0,
> };
> Or something like that.
OK

> 
>> +static struct pci_device_id bnx2x_pci_tbl[] = {
> 
> static const struct...
> 
OK

>> +	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_NX2_5710,
>> +		PCI_ANY_ID, PCI_ANY_ID, 0, 0, BCM5710 },
>> +	{ 0 }
>> +};
> 
>> +static inline u32 bnx2x_bits_en(struct bnx2x *bp, u32 block, u32 reg,
>> +				u32 bits)
> 
> Does that really need to be inline? I'd suggest dropping inline.

Does not need to be inlined. Will change.

> 
>> +{
>> +	u32 val = REG_RD(bp, block, reg);
>> +
>> +	val |= bits;
>> +	REG_WR(bp, block, reg, val);
>> +	return val;
>> +}
>> +
>> +static inline u32 bnx2x_bits_dis(struct bnx2x *bp, u32 block, u32 reg,
>> +				 u32 bits)
> 
> Same here.
Same
> 
>> +{
>> +	u32 val = REG_RD(bp, block, reg);
>> +
>> +	val &= ~bits;
>> +	REG_WR(bp, block, reg, val);
>> +	return val;
>> +}
>> +
>> +static int bnx2x_mdio22_write(struct bnx2x *bp, u32 reg, u32 val)
>> +{
>> +	int rc;
>> +	u32 tmp, i;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		tmp &= ~EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, tmp);
>> +		REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		udelay(40);
>> +	}
>> +
>> +	tmp = ((bp->phy_addr << 21) | (reg << 16) |
>> +	       (val & EMAC_MDIO_COMM_DATA) |
>> +	       EMAC_MDIO_COMM_COMMAND_WRITE_22 |
>> +	       EMAC_MDIO_COMM_START_BUSY);
>> +	EMAC_WR(EMAC_REG_EMAC_MDIO_COMM, tmp);
>> +
>> +	for (i = 0; i < 50; i++) {
>> +		udelay(10);
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_COMM);
>> +		if (!(tmp & EMAC_MDIO_COMM_START_BUSY)) {
>> +			udelay(5);
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (tmp & EMAC_MDIO_COMM_START_BUSY) {
>> +		BNX2X_ERR("write phy register failed\n");
>> +
>> +		rc = -EBUSY;
>> +	} else {
>> +		rc = 0;
>> +	}
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		tmp |= EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, tmp);
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static int bnx2x_mdio22_read(struct bnx2x *bp, u32 reg, u32 *ret_val)
>> +{
>> +	int rc;
>> +	u32 val, i;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		val &= ~EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, val);
>> +		REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		udelay(40);
>> +	}
>> +
>> +	val = ((bp->phy_addr << 21) | (reg << 16) |
>> +	       EMAC_MDIO_COMM_COMMAND_READ_22 |
>> +	       EMAC_MDIO_COMM_START_BUSY);
>> +	EMAC_WR(EMAC_REG_EMAC_MDIO_COMM, val);
>> +
>> +	for (i = 0; i < 50; i++) {
>> +		udelay(10);
>> +
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_COMM);
>> +		if (!(val & EMAC_MDIO_COMM_START_BUSY)) {
> 
> No udelay(5) here, like in write above?
There is a udelay.

> 
>> +			val &= EMAC_MDIO_COMM_DATA;
>> +			break;
>> +		}
>> +	}
> 
>> +static int bnx2x_mdio45_vwrite(struct bnx2x *bp, u32 reg, u32 addr, u32 val)
>> +{
>> +	int i;
>> +	u32 rd_val;
>> +
>> +	for (i = 0; i < 10; i++) {
>> +		bnx2x_mdio45_write(bp, reg, addr, val);
>> +		mdelay(5);
> 
> Can you msleep(5) here?
Can't sleep, this can happen from a "sleepless" context.
Maybe we can break it down to smaller mdelays, will ask the HW guys.
> 
>> +		bnx2x_mdio45_read(bp, reg, addr, &rd_val);
>> +		/* if the read value is not the same as the value we wrote,
>> +		   we should write it again */
>> +		if (rd_val == val) {
>> +			return 0;
>> +		}
>> +	}
>> +	BNX2X_ERR("MDIO write in CL45 failed\n");
>> +	return -EBUSY;
>> +}
> 
>> +/* DMAE command positions used
>> + * Port0 14
>> + * Port1 15
>> + */
>> +static void bnx2x_wb_write_dmae(struct bnx2x *bp, u32 wb_addr, u32 *wb_write,
>> +				u32 wb_len)
>> +{
>> +	struct dmae_command *dmae = &bp->dmae;
>> +	int port = bp->port;
>> +	u32 *wb_comp = bnx2x_sp(bp, wb_comp);
>> +
>> +	memcpy(bnx2x_sp(bp, wb_write[0]), wb_write, wb_len * 4);
>> +	memset(dmae, 0, sizeof(struct dmae_command));
>> +
>> +	dmae->opcode = (DMAE_CMD_SRC_PCI | DMAE_CMD_DST_GRC |
>> +			DMAE_CMD_C_DST_PCI | DMAE_CMD_C_ENABLE |
>> +			DMAE_CMD_SRC_RESET | DMAE_CMD_DST_RESET |
>> +			DMAE_CMD_ENDIANITY_DW_SWAP |
>> +			(port? DMAE_CMD_PORT_1 : DMAE_CMD_PORT_0));
>> +	dmae->src_addr_lo = U64_LO(bnx2x_sp_mapping(bp, wb_write));
>> +	dmae->src_addr_hi = U64_HI(bnx2x_sp_mapping(bp, wb_write));
>> +	dmae->dst_addr_lo = wb_addr >> 2;
>> +	dmae->dst_addr_hi = 0;
>> +	dmae->len = wb_len;
>> +	dmae->comp_addr_lo = U64_LO(bnx2x_sp_mapping(bp, wb_comp));
>> +	dmae->comp_addr_hi = U64_HI(bnx2x_sp_mapping(bp, wb_comp));
>> +	dmae->comp_val = BNX2X_WB_COMP_VAL;
>> +	bnx2x_post_dmae(bp, port? 15 : 14);
>> +
>> +	*wb_comp = 0;
> 
> wmb();
Right.

> 
>> +	REG_WR32(bp, GRCBASE_DMAE,
>> +		 (bp->port? DMAE_REGISTERS_GO_C15 :
>> +			    DMAE_REGISTERS_GO_C14), 1);
>> +	udelay(5);
>> +	while (*wb_comp != BNX2X_WB_COMP_VAL) {
>> +		udelay(5);
> 
> Not sure what this is doing here.
> Do we need some DMA syncing here?
> 
> Anyway, we _do_ need a timeout here.
> Probably also some kind of memory barrier, if no DMA syncing is needed.
This causes the chip to read from pci consistent memory, so I guess we 
can use an wmb().
Will add timeout.

> 
>> +	}
>> +}
> 
>> +static void bnx2x_emac_enable(struct bnx2x *bp)
>> +{
>> +	u32 val;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
> 
>> +	while (val & EMAC_MODE_RESET) {
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MODE);
>> +		DP(NETIF_MSG_LINK, "EMAC reset reg is %u\n", val);
>> +	}
> 
> A timeout, please.
OK
> 
>> +	/* reset tx part */
>> +	EMAC_WR(EMAC_REG_EMAC_TX_MODE, EMAC_TX_MODE_RESET);
>> +
>> +	while (val & EMAC_TX_MODE_RESET) {
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_TX_MODE);
>> +		DP(NETIF_MSG_LINK, "EMAC reset reg is %u\n", val);
>> +	}
> 
> A timeout, please.
OK
> 
>> +static void bnx2x_pbf_update(struct bnx2x *bp)
>> +{
>> +	int port = bp->port;
>> +	u32 init_crd, crd;
>> +	u32 count = 1000;
>> +	u32 pause = 0;
>> +
>> +	/* disable port */
>> +	REG_WR(bp, GRCBASE_PBF,
>> +	       PBF_REGISTERS_DISABLE_NEW_TASK_PROC_P0 + port*4, 0x1);
>> +
>> +	/* wait for init credit */
>> +	init_crd = REG_RD(bp, GRCBASE_PBF,
>> +			  PBF_REGISTERS_P0_INIT_CRD + port*4);
>> +	crd = REG_RD(bp, GRCBASE_PBF, PBF_REGISTERS_P0_CREDIT + port*8);
>> +	DP(NETIF_MSG_LINK, "init_crd 0x%x  crd 0x%x\n", init_crd, crd);
>> +
>> +	while ((init_crd != crd) && count) {
>> +		mdelay(5);
> 
> msleep(5)?
Can't sleep in link change code since it can be called from the slowpath 
task.
> 
>> +		crd = REG_RD(bp, GRCBASE_PBF,
>> +			     PBF_REGISTERS_P0_CREDIT + port*8);
>> +		count--;
>> +	}
> 
>> +	/* probe the credit changes */
>> +	REG_WR(bp, GRCBASE_PBF, PBF_REGISTERS_INIT_P0 + port*4, 0x1);
>> +	mdelay(5);
> 
> msleep(5)?
> 
>> +	REG_WR(bp, GRCBASE_PBF, PBF_REGISTERS_INIT_P0 + port*4, 0x0);
>> +
>> +	/* enable port */
>> +	REG_WR(bp, GRCBASE_PBF,
>> +	       PBF_REGISTERS_DISABLE_NEW_TASK_PROC_P0 + port*4, 0x0);
>> +}
> 
>> +/* This function is called upon link interrupt */
>> +static void bnx2x_link_update(struct bnx2x *bp)
>> +{
>> +	u32 gp_status;
>> +	int port = bp->port;
>> +	int i;
>> +	int link_10g;
>> +
>> +	DP(NETIF_MSG_LINK, "port %x, is xgxs %x, stat_mask 0x%x, "
>> +	   "int_mask 0x%x, saved_mask 0x%x, MI_INT %x, SERDES_LINK %x,"
>> +	   " 10G %x, XGXS_LINK %x\n",
>> +	   port, (bp->phy_flags & PHY_XGSX_FLAG),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_STATUS_INTERRUPT_PORT0 + port*4),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_MASK_INTERRUPT_PORT0 + port*4),
>> +	   bp->nig_mask,
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_EMAC0_STATUS_MISC_MI_INT + port*0x18),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_SERDES0_STATUS_LINK_STATUS + port*0x3c),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_XGXS0_STATUS_LINK10G + port*0x68),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_XGXS0_STATUS_LINK_STATUS + port*0x68)
>> +	);
>> +
>> +	MDIO_SET_REG_BANK(bp, MDIO_REG_BANK_GP_STATUS);
>> +	/* avoid fast toggling */
>> +	for (i = 0 ; i < 10 ; i++) {
>> +		mdelay(10);
> 
> msleep(10)?
> 
>> +		bnx2x_mdio22_read(bp, MDIO_GP_STATUS_TOP_AN_STATUS1,
>> +				  &gp_status);
>> +	}
>> +
>> +	bnx2x_link_settings_status(bp, gp_status);
>> +
>> +	/* anything 10 and over uses the bmac */
>> +	link_10g = ((bp->line_speed >= SPEED_10000) &&
>> +		    (bp->line_speed <= SPEED_16000));
>> +
>> +	bnx2x_link_int_ack(bp, link_10g);
>> +
>> +	/* link is up only if both local phy and external phy are up */
>> +	if (bp->link_up && bnx2x_ext_phy_is_link_up(bp)) {
>> +		if (link_10g) {
>> +			bnx2x_bmac_enable(bp, 0);
>> +			bnx2x_leds_set(bp, SPEED_10000);
>> +
>> +		} else {
>> +			bnx2x_emac_enable(bp);
>> +			bnx2x_emac_program(bp);
>> +
>> +			/* AN complete? */
>> +			if (gp_status & MDIO_AN_CL73_OR_37_COMPLETE) {
>> +				if (!(bp->phy_flags & PHY_SGMII_FLAG)) {
>> +					bnx2x_set_sgmii_tx_driver(bp);
>> +				}/* Not SGMII */
>> +			}
>> +		}
>> +		bnx2x_link_up(bp);
>> +
>> +	} else { /* link down */
>> +		bnx2x_leds_unset(bp);
>> +		bnx2x_link_down(bp);
>> +	}
>> +}
> 
>> +static inline u16 bnx2x_update_dsb_idx(struct bnx2x *bp)
> 
> Too big for inlining.
> If this func is only used in one place, gcc will inline it automatically.

This is the slowpath status block so the inline can be removed.

> 
>> +{
>> +	struct host_def_status_block *dsb = bp->def_status_blk;
>> +	u16 rc = 0;
>> +
>> +	if (bp->def_att_idx != dsb->atten_status_block.attn_bits_index) {
>> +		bp->def_att_idx = dsb->atten_status_block.attn_bits_index;
>> +		rc |= 1;
>> +	}
>> +	if (bp->def_c_idx != dsb->c_def_status_block.status_block_index) {
>> +		bp->def_c_idx = dsb->c_def_status_block.status_block_index;
>> +		rc |= 2;
>> +	}
>> +	if (bp->def_u_idx != dsb->u_def_status_block.status_block_index) {
>> +		bp->def_u_idx = dsb->u_def_status_block.status_block_index;
>> +		rc |= 4;
>> +	}
>> +	if (bp->def_x_idx != dsb->x_def_status_block.status_block_index) {
>> +		bp->def_x_idx = dsb->x_def_status_block.status_block_index;
>> +		rc |= 8;
>> +	}
>> +	if (bp->def_t_idx != dsb->t_def_status_block.status_block_index) {
>> +		bp->def_t_idx = dsb->t_def_status_block.status_block_index;
>> +		rc |= 16;
>> +	}
>> +	rmb(); /* TBD chack this */
>> +	return rc;
>> +}
>> +
>> +static inline u16 bnx2x_update_fpsb_idx(struct bnx2x_fastpath *fp)
> 
> Too big for inlining.
>
This is fastpath code, I will have to see if removing the inline impacts 
performance.


>> +{
>> +	struct host_status_block *fpsb = fp->status_blk;
>> +	u16 rc = 0;
>> +
>> +	if (fp->fp_c_idx != fpsb->c_status_block.status_block_index) {
>> +		fp->fp_c_idx = fpsb->c_status_block.status_block_index;
>> +		rc |= 1;
>> +	}
>> +	if (fp->fp_u_idx != fpsb->u_status_block.status_block_index) {
>> +		fp->fp_u_idx = fpsb->u_status_block.status_block_index;
>> +		rc |= 2;
>> +	}
>> +	rmb(); /* TBD chack this */
>> +	return rc;
>> +}
>> +
>> +static inline u32 bnx2x_tx_avail(struct bnx2x_fastpath *fp)
> 
> Too big for inlining.

this is in the fast-fastpath I think it should be inlined.

> 
>> +{
>> +	u16 used;
>> +	u32 prod = fp->tx_bd_prod;
>> +	u32 cons = fp->tx_bd_cons;
>> +
>> +	smp_mb();
> 
> This barrier needs a comment. Why is it there? And why SMP only?
> 
>> +
>> +	used = (NUM_TX_BD - NUM_TX_RINGS + prod - cons +
>> +		(cons / TX_DESC_CNT) - (prod / TX_DESC_CNT));
>> +
>> +	if (prod >= cons) {
>> +		/* used = prod - cons - prod/size + cons/size */
>> +		used -= NUM_TX_BD - NUM_TX_RINGS;
>> +	}
>> +
>> +	BUG_TRAP(used <= fp->bp->tx_ring_size);
>> +	BUG_TRAP((fp->bp->tx_ring_size - used) <= MAX_TX_AVAIL);
>> +	return(fp->bp->tx_ring_size - used);
>> +}
> 
>> +/*========================================================================*/
>> +
>> +/* acquire split MCP access lock register */
>> +static int bnx2x_lock_alr( struct bnx2x *bp)
>> +{
>> +	int rc = 0;
>> +	u32 i, j, val;
>> +
>> +	i = 100;
>> +	val = 1UL << 31;
>> +
>> +	REG_WR(bp, GRCBASE_MCP, 0x9c, val);
>> +	for (j = 0; j < i*10; j++) {
>> +		val = REG_RD(bp, GRCBASE_MCP, 0x9c);
>> +		if (val & (1L << 31)) {
>> +			break;
>> +		}
>> +
>> +		mdelay(5);
> 
> msleep(5)?
called from link change event, can't sleep.
> 
>> +	}
>> +
>> +	if (!(val & (1L << 31))) {
>> +		BNX2X_ERR("Cannot acquire nvram interface.\n");
>> +
>> +		rc = -EBUSY;
>> +	}
>> +
>> +	return rc;
>> +}
> 
>> +static void bnx2x_free_mem(struct bnx2x *bp)
>> +{
>> +
>> +#define BNX2X_PCI_FREE(x, y, size) do { \
>> +	if (x) { \
>> +		pci_free_consistent(bp->pdev, size, x, y); \
>> +		x = NULL; \
>> +	} \
>> +} while (0)
>> +
>> +#define BNX2X_KFREE(x) do { \
>> +	if (x) { \
>> +		vfree(x); \
>> +		x = NULL; \
>> +	} \
>> +} while (0)
>> +
>> +	int i;
>> +
>> +	/* fastpath */
>> +	for_each_queue(bp, i) {
>> +
>> +		/* Status blocks */
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, status_blk),
>> +			       bnx2x_fp(bp, i, status_blk_mapping),
>> +			       sizeof(struct host_status_block)
>> +			       + sizeof(struct eth_tx_db_data));
>> +
>> +		/* fast path rings: tx_buf tx_desc rx_buf rx_desc rx_comp */
>> +		BNX2X_KFREE(bnx2x_fp(bp, i, tx_buf_ring));
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, tx_desc_ring),
>> +			       bnx2x_fp(bp, i, tx_desc_mapping),
>> +			       sizeof(struct eth_tx_bd) * NUM_TX_BD);
>> +
>> +		BNX2X_KFREE(bnx2x_fp(bp, i, rx_buf_ring));
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, rx_desc_ring),
>> +			       bnx2x_fp(bp, i, rx_desc_mapping),
>> +			       sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, rx_comp_ring),
>> +			       bnx2x_fp(bp, i, rx_comp_mapping),
>> +			       sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +	}
>> +
>> +	BNX2X_KFREE( bp->fp);
>> +	/* end of fast path*/
>> +
>> +	BNX2X_PCI_FREE(bp->def_status_blk, bp->def_status_blk_mapping,
>> +		       (sizeof(struct host_def_status_block)));
>> +
>> +	BNX2X_PCI_FREE(bp->slowpath, bp->slowpath_mapping,
>> +		       (sizeof(struct bnx2x_slowpath)));
>> +
>> +	if (iscsi_active) {
>> +		BNX2X_PCI_FREE(bp->t1, bp->t1_mapping, 64*1024);
>> +		BNX2X_PCI_FREE(bp->t2, bp->t2_mapping, 16*1024);
>> +		BNX2X_PCI_FREE(bp->timers, bp->timers_mapping, 8*1024);
>> +		BNX2X_PCI_FREE(bp->qm, bp->qm_mapping, 128*1024);
>> +	}
>> +
>> +	BNX2X_PCI_FREE(bp->spq, bp->spq_mapping, PAGE_SIZE);
>> +
>> +#undef BNX2X_PCI_FREE
>> +#undef BNX2X_KFREE
>> +}
>> +
>> +static int bnx2x_alloc_mem(struct bnx2x *bp)
>> +{
>> +	int i;
>> +
>> +#define BNX2X_PCI_ALLOC(x, y, size) do { \
>> +	x = pci_alloc_consistent(bp->pdev, size, y); \
>> +	if (x == NULL) \
>> +		goto alloc_mem_err; \
>> +	memset(x, 0, size); \
>> +} while (0)
>> +
>> +#define BNX2X_ALLOC(x, size) do { \
>> +	x = vmalloc(size); \
> 
> Why not zalloc()?

We want to allow allocating chunks bigger that zalloc will allow.

> 
>> +	if (x == NULL) \
>> +		goto alloc_mem_err; \
>> +	memset(x, 0, size); \
>> +} while (0)
>> +
>> +	/* fastpath */
>> +	BNX2X_ALLOC(bp->fp, sizeof(struct bnx2x_fastpath)*bp->num_queues);
>> +
>> +	for_each_queue(bp, i) {
>> +
>> +		bnx2x_fp(bp, i, bp) = bp;
>> +
>> +		/* Status blocks */
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, status_blk),
>> +				&bnx2x_fp(bp, i, status_blk_mapping),
>> +				sizeof(struct host_status_block) +
>> +				sizeof(struct eth_tx_db_data));
>> +
>> +		bnx2x_fp(bp, i, tx_prods_mapping) =
>> +				bnx2x_fp(bp, i, status_blk_mapping) +
>> +				sizeof(struct host_status_block);
>> +
>> +		bnx2x_fp(bp, i, hw_tx_prods) =
>> +				(void *)(bnx2x_fp(bp, i, status_blk) + 1);
>> +
>> +		/* fast path rings: tx_buf tx_desc rx_buf rx_desc rx_comp */
>> +		BNX2X_ALLOC(bnx2x_fp(bp, i, tx_buf_ring),
>> +				sizeof(struct sw_tx_bd) * NUM_TX_BD);
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, tx_desc_ring),
>> +				&bnx2x_fp(bp, i, tx_desc_mapping),
>> +				sizeof(struct eth_tx_bd) * NUM_TX_BD);
>> +
>> +		BNX2X_ALLOC(bnx2x_fp(bp, i, rx_buf_ring),
>> +				sizeof(struct sw_rx_bd) * NUM_RX_BD);
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, rx_desc_ring),
>> +				&bnx2x_fp(bp, i, rx_desc_mapping),
>> +				sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, rx_comp_ring),
>> +				&bnx2x_fp(bp, i, rx_comp_mapping),
>> +				sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +	}
>> +	/* end of fast path*/
>> +
>> +	BNX2X_PCI_ALLOC(bp->def_status_blk, &bp->def_status_blk_mapping,
>> +			sizeof(struct host_def_status_block));
>> +
>> +	BNX2X_PCI_ALLOC(bp->slowpath, &bp->slowpath_mapping,
>> +			sizeof(struct bnx2x_slowpath));
>> +
>> +	if (iscsi_active) {
>> +		BNX2X_PCI_ALLOC(bp->t1, &bp->t1_mapping, 64*1024);
>> +
>> +		/* Initialize T1 */
>> +		for (i = 0; i < 64*1024; i += 64) {
>> +			*(u64 *)((char *)bp->t1 + i+56) = 0x0UL;
>> +			*(u64 *)((char *)bp->t1 + i+3) = 0x0UL;
>> +		}
>> +
>> +		/* allocate searcher T2 table
>> +		   we allocate 1/4 of alloc num for T2
>> +		  (which is not entered into the ILT) */
>> +		BNX2X_PCI_ALLOC(bp->t2, &bp->t2_mapping, 16*1024);
>> +
>> +		/* Initialize T2 */
>> +		for (i = 0; i < 16*1024; i += 64) {
>> +			*(u64 *)((char *)bp->t2 + i+56) =
>> +				bp->t2_mapping + i + 64;
>> +		}
>> +
>> +		/* now sixup the last line in the block
>> +		 * to point to the next block
>> +		 */
>> +		*(u64 *)((char *)bp->t2 + 1024*16-8) = bp->t2_mapping;
>> +
>> +		/* Timer block array (MAX_CONN*8)
>> +		 * phys uncached for now 1024 conns
>> +		 */
>> +		BNX2X_PCI_ALLOC(bp->timers, &bp->timers_mapping, 8*1024);
>> +
>> +		/* QM queues (128*MAX_CONN) */
>> +		BNX2X_PCI_ALLOC(bp->qm, &bp->qm_mapping, 128*1024);
>> +	}
>> +
>> +	/* Slow path ring */
>> +	BNX2X_PCI_ALLOC(bp->spq, &bp->spq_mapping, PAGE_SIZE);
>> +
>> +	return 0;
>> +
>> +alloc_mem_err:
>> +	bnx2x_free_mem(bp);
>> +	return -ENOMEM;
>> +
>> +#undef BNX2X_PCI_ALLOC
>> +#undef BNX2X_ALLOC
>> +}
>> +
>> +
>> +
>> +static void bnx2x_set_mac_addr(struct bnx2x *bp)
>> +{
>> +	struct mac_configuration_cmd *config = bnx2x_sp(bp, mac_config);
>> +
>> +	/* CAM allocation
>> +	 * unicasts 0-31:port0 32-63:port1
>> +	 * multicast 64-127:port0 128-191:port1
>> +	 */
>> +	config->hdr.length = 2;
>> +	config->hdr.offset = bp->port ? 31 : 0;
>> +	config->hdr.reserved0 = 0;
>> +	config->hdr.reserved1 = 0;
>> +
>> +	/* primary MAC */
>> +	config->config_table[0].cam_entry.msb_mac_addr =
>> +		ntohl(*(u32 *)&bp->dev->dev_addr[0]);
>> +	config->config_table[0].cam_entry.lsb_mac_addr =
>> +		ntohs(*(u16 *)&bp->dev->dev_addr[4]);
> 
> Better use xx_to_cpu()
> anyway, ntohX is defined as beX_to_cpu().
> Are you sure this is correct? A byte array is little endian.
This is a HW quirk, will change to cpu_to_le().

> 
>> +static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
>> +				     struct bnx2x_fastpath *fp, u16 index)
> 
> Too big for inlining.
> If this is only called in one place, gcc will inline it automatically.
this is fast-fast-path (called once for each rx packet) I think it 
should be inlined.

> 
>> +{
>> +	struct sk_buff *skb;
>> +	struct sw_rx_bd *rx_buf = &fp->rx_buf_ring[index];
>> +	struct eth_rx_bd *rxbd = &fp->rx_desc_ring[index];
>> +	dma_addr_t mapping;
>> +
>> +	skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
>> +
>> +	if (skb == NULL) {
>> +		return -ENOMEM;
>> +	}
>> +
>> +	mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_use_size,
>> +				 PCI_DMA_FROMDEVICE);
> 
> You need to check if the mapping succeed.
> There's a macro for that: dma_mapping_error()
OK

> 
>> +	rx_buf->skb = skb;
>> +	pci_unmap_addr_set(rx_buf, mapping, mapping);
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	rxbd->reserved = fp->last_alloc++;
>> +#else
>> +	rxbd->reserved = 0;
>> +#endif
>> +	rxbd->len = bp->rx_buf_use_size;
>> +	rxbd->addr_hi = U64_HI(mapping);
>> +	rxbd->addr_lo = U64_LO(mapping);
>> +	return 0;
>> +}
> 
>> +static void bnx2x_tx_int(struct bnx2x_fastpath *fp, int work)
>> +{
>> +	struct bnx2x *bp = fp->bp;
>> +	u16 hw_cons, sw_cons, bd_cons = fp->tx_bd_cons;
>> +	int done = 0;
>> +
>> +	hw_cons = *fp->tx_cons_sb;
>> +	sw_cons = fp->tx_pkt_cons;
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return;
>> +	}
>> +#endif
>> +
>> +	while (sw_cons != hw_cons) {
>> +		u16 pkt_cons;
>> +
>> +		pkt_cons = TX_BD(sw_cons);
>> +
>> +		/* prefetch(bp->tx_buf_ring[pkt_cons].skb); */
>> +
>> +		DP(NETIF_MSG_TX_DONE, "sw_cons=%u  hw_cons=%u pkt_cons=%d\n",
>> +		   sw_cons, hw_cons, pkt_cons);
>> +
>> +/*        if (NEXT_TX_IDX(sw_cons)!=hw_cons){
>> +	    rmb();
>> +	    prefetch(fp->tx_buf_ring[NEXT_TX_IDX(sw_cons)].skb);
>> +	}
>> +*/
>> +
>> +		bd_cons = bnx2x_free_tx_pkt(bp, fp, pkt_cons);
>> +		sw_cons++;
>> +		done++;
>> +
>> +		if (done == work) {
>> +			break;
>> +		}
>> +	}
>> +
>> +	fp->tx_pkt_cons = sw_cons;
>> +	fp->tx_bd_cons = bd_cons;
>> +
>> +	smp_mb();
> 
> Please add a comment why we need a SMP MB here.
> 
>> +
>> +	/* TBD need a thresh?  */
>> +	if (unlikely(netif_queue_stopped(bp->dev))) {
>> +		netif_tx_lock(bp->dev);
>> +		if ((netif_queue_stopped(bp->dev)) &&
>> +		    (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)) {
>> +			netif_wake_queue(bp->dev);
>> +		}
>> +		netif_tx_unlock(bp->dev);
>> +	}
>> +}
> 
>> +static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
>> +				      struct sk_buff *skb, u16 cons, u16 prod)
> 
> Too big for inlining.
Now that I think about it, this is better off out-of-line.
Will change.

> 
>> +{
>> +	struct bnx2x *bp = fp->bp;
>> +	struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
>> +	struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
>> +	struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
>> +	struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
>> +
>> +	pci_dma_sync_single_for_device(bp->pdev,
>> +				       pci_unmap_addr(cons_rx_buf, mapping),
>> +				       bp->rx_offset + RX_COPY_THRESH,
>> +				       PCI_DMA_FROMDEVICE);
>> +
>> +	prod_rx_buf->skb = cons_rx_buf->skb;
>> +	pci_unmap_addr_set(prod_rx_buf, mapping,
>> +			   pci_unmap_addr(cons_rx_buf, mapping));
>> +	*prod_bd = *cons_bd;
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	prod_bd->reserved = fp->last_alloc++;
>> +#endif
>> +}
> 
>> +static void bnx2x_attn_int(struct bnx2x *bp)
>> +{
>> +	int port = bp->port;
>> +
>> +	/* read local copy of bits */
>> +	u16 attn_bits = bp->def_status_blk->atten_status_block.attn_bits;
>> +	u16 attn_ack = bp->def_status_blk->atten_status_block.attn_bits_ack;
>> +	u16 attn_state = bp->attn_state;
>> +
>> +	/* look for changed bits */
>> +	u16 asserted   =  attn_bits & ~attn_ack & ~attn_state;
>> +	u16 deasserted = ~attn_bits &  attn_ack &  attn_state;
> 
> Do you probably want | instead of & here?
This is code that came from the HW guys, it works.

> 
>> +
>> +	DP(NETIF_MSG_HW,
>> +	   "attn_bits %x, attn_ack %x, asserted %x, deasserted %x \n",
>> +	   attn_bits, attn_ack, asserted, deasserted);
>> +
>> +	if (~(attn_bits ^ attn_ack) & (attn_bits ^ attn_state)) {
> 
> Do you want && or | here, instead of & ?
> 
>> +		BNX2X_ERR("bad attention state\n");
>> +	}
> 
>> +}
>> +
>> +static inline int bnx2x_has_work(struct bnx2x_fastpath *fp)
> 
> Probably better not inline.
this is also fastpath, I will test to see if removing the inline impacts 
performance.

> 
>> +{
>> +	u16 rx_cons_sb = *fp->rx_cons_sb;
>> +
>> +	if ((rx_cons_sb & MAX_RX_DESC_CNT) == MAX_RX_DESC_CNT)
>> +		rx_cons_sb++;
>> +
>> +	if ((rx_cons_sb != fp->rx_comp_cons) ||
>> +	    (*fp->tx_cons_sb != fp->tx_pkt_cons))
>> +		return 1;
>> +
>> +	return 0;
>> +}
> 
>> +static irqreturn_t bnx2x_msix_sp_int(int irq, void *dev_instance)
>> +{
>> +	struct net_device *dev = dev_instance;
> 
> You need to check if dev==NULL and bail out.
> Another driver sharing the IRQ with this might choose to pass the dev
> pointer as NULL.
This is the MSI-X slowpath interrupt handler, it does not share it's 
interrupt.
Same for the MSI-X fastpath handler.

(snipping some lines)

>> +static irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
>> +{
>> +	struct net_device *dev = dev_instance;
> 
> Check if dev==NULL
OK

> 
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	u16 status = bnx2x_ack_int(bp);
>> +
>> +	if (unlikely(status == 0)) {
> 
> That's not unlikely.
We are optimizing for the most common case.
I think we should even think about adding a warn_once if we find out our 
line is shared since this is an extremely sub optimal configuration.
> 
>> +		DP(NETIF_MSG_INTR, "not our interrupt!\n");
>> +		return IRQ_NONE;
>> +	}
>> +
>> +	DP(NETIF_MSG_INTR, "got an interrupt status is %u\n", status);
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return IRQ_HANDLED;
>> +	}
>> +#endif
>> +
>> +	/* Return here if interrupt is shared and is disabled. */
>> +	if (unlikely(atomic_read(&bp->intr_sem) != 0)) {
>> +		DP(NETIF_MSG_INTR, "called but int_sem not 0, returning. \n");
>> +		return IRQ_HANDLED;
>> +	}
>> +
>> +	if (status & 0x2) {
>> +
>> +		struct bnx2x_fastpath *fp = &bp->fp[0];
>> +
>> +		prefetch(fp->rx_cons_sb);
>> +		prefetch(fp->tx_cons_sb);
>> +		prefetch(&fp->status_blk->c_status_block.status_block_index);
>> +		prefetch(&fp->status_blk->u_status_block.status_block_index);
>> +
>> +		netif_rx_schedule(dev);
>> +
>> +		status &= ~0x2;
>> +
>> +		if (!status) {
>> +			return IRQ_HANDLED;
>> +		}
>> +	}
>> +
>> +	if (unlikely(status & 0x1)) {
>> +
>> +		tasklet_schedule(&bp->sp_task);
>> +		status &= ~0x1;
>> +		if (!status) {
>> +			return IRQ_HANDLED;
>> +		}
>> +	}
>> +
>> +	DP(NETIF_MSG_INTR, "got an unknown interrupt! (status is %u)\n",
>> +	   status);
>> +
>> +	return IRQ_HANDLED;
>> +}
> 
>> +/* some of the internal memories are not directly readable from the driver
>> +   to test them we send debug packets
>> + */
>> +static int bnx2x_int_mem_test(struct bnx2x *bp)
>> +{
>> +	u32 val;
>> +	u32 count;
>> +	u8 hw;
>> +	u8 port;
>> +	u8 i;
>> +	u32 factor;
>> +
>> +	switch (CHIP_REV(bp)) {
>> +	case CHIP_REV_EMUL:
>> +		hw = INIT_EMULATION;
>> +		factor = 2000;
>> +		break;
>> +	case CHIP_REV_FPGA:
>> +		hw = INIT_FPGA;
>> +		factor = 120;
>> +		break;
>> +	default:
>> +		hw = INIT_ASIC;
>> +		factor = 1;
>> +		break;
>> +	}
>> +
>> +	port = PORT0 << bp->port;
>> +
>> +	DP(NETIF_MSG_HW, "mem_wrk start part1\n");
>> +
>> +	/* Disable inputs of parser neighbor blocks */
>> +	REG_WR(bp, GRCBASE_TSDM, TSDM_REGISTERS_ENABLE_IN1, 0x0);
>> +	REG_WR(bp, GRCBASE_TCM, TCM_REGISTERS_PRS_IFEN, 0x0);
>> +	REG_WR(bp, GRCBASE_CFC, CFC_REGISTERS_DEBUG0, 0x1);
>> +	NIG_WR(NIG_REGISTERS_PRS_REQ_IN_EN, 0x0);
>> +
>> +	/*  Write 0 to parser credits for CFC search request */
>> +	REG_WR(bp, GRCBASE_PRS, PRS_REGISTERS_CFC_SEARCH_INITIAL_CREDIT, 0x0);
>> +
>> +	/* send Ethernet packet */
>> +	bnx2x_lb_pckt(bp);
>> +
>> +	/* TODO do i reset NIG statistic ?*/
>> +	/* Wait until NIG register shows 1 packet of size 0x10 */
>> +	count = 1000;
>> +	while (count) {
>> +		val = REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET);
>> +		REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET + 4);
>> +
>> +		if (val == 0x10) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> That would be msleep(20000) for CHIP_REV_EMUL.
> besides the sleep being pretty big (does it really need to wait
> 20 sec?), some arches don't support such a big msleep value.
> use ssleep(). It would take 5 and a half hours for the
> timeout count to hit in.
The timeout is too long, will change the factor.
(However CHIP_REV_EMUL means we are not running on a real chip, we are 
running on a simulated chip.)

> 
>> +		count--;
>> +	}
>> +	if (val != 0x10) {
>> +		BNX2X_ERR("NIG timeout val = 0x%x\n", val);
>> +		return -1;
>> +	}
>> +
>> +	/* Wait until PRS register shows 1 packet */
>> +	count = 1000;
>> +	while (count) {
>> +		val = REG_RD(bp, GRCBASE_PRS, PRS_REGISTERS_NUM_OF_PACKETS);
>> +
>> +		if (val == 0x1) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> Same here.
> 
>> +		count--;
>> +	}
>> +	if (val != 0x1) {
>> +		BNX2X_ERR("PRS timeout val = 0x%x\n", val);
>> +		return -2;
>> +	}
> 
>> +	/* Wait until NIG register shows 10 + 1
>> +	   packets of size 11*0x10 = 0xb0 */
>> +	count = 1000;
>> +	while (count) {
>> +
>> +		val = REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET);
>> +		REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET + 4);
>> +
>> +		if (val == 0xb0) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> Same here.
> 
>> +		count--;
>> +	}
>> +	if (val != 0xb0) {
>> +		BNX2X_ERR("NIG timeout val = 0x%x\n", val);
>> +		return -3;
>> +	}
>> +
>> +	/* Wait until PRS register shows 2 packet */
>> +	val = REG_RD(bp, GRCBASE_PRS, PRS_REGISTERS_NUM_OF_PACKETS);
>> +
>> +	if (val != 0x2) {
>> +		BNX2X_ERR("PRS timeout val = 0x%x\n", val);
>> +	}
>> +
>> +	/* Write 1 to parser credits for CFC search request */
>> +	REG_WR(bp, GRCBASE_PRS, PRS_REGISTERS_CFC_SEARCH_INITIAL_CREDIT, 0x1);
>> +
>> +	/* Wait until PRS register shows 3 packet */
>> +	msleep(10 * factor);
> 
> Same.
> 
>> +	DP(NETIF_MSG_HW, "done\n");
>> +	return 0; /* OK */
>> +}
> 
>> +static void bnx2x_stop_stats(struct bnx2x *bp)
>> +{
>> +	atomic_set(&bp->stats_state, STATS_STATE_STOP);
>> +	DP(BNX2X_MSG_STATS, "stats_state - STOP\n");
>> +	while (atomic_read(&bp->stats_state) != STATS_STATE_DISABLE) {
> 
> This seems to need a memory barrier (on this side _and_ on the write side,
> wherever that is (comment!)).
> There are special mbs for atomic variables.
This does not need to be an atomic_t will change to int.

> 
>> +		msleep(100);
>> +	}
>> +	DP(BNX2X_MSG_STATS, "stats_state - DISABLE\n");
>> +}
> 
>> +static void bnx2x_update_stats(struct bnx2x *bp)
>> +{
>> +
>> +	if (!bnx2x_update_storm_stats(bp)) {
>> +
>> +		if (bp->phy_flags & PHY_BMAC_FLAG) {
>> +			bnx2x_update_bmac_stats(bp);
>> +
>> +		} else if (bp->phy_flags & PHY_EMAC_FLAG) {
>> +			bnx2x_update_emac_stats(bp);
>> +
>> +		} else { /* unreached */
>> +			BNX2X_ERR("no MAC active\n");
>> +			return;
>> +		}
>> +
>> +		bnx2x_update_net_stats(bp);
>> +	}
>> +
>> +	if (bp->msglevel & NETIF_MSG_TIMER) {
>> +		printk(KERN_DEBUG "%s:\n"
>> +		       KERN_DEBUG "tx avail(%x)   rx usage(%x)\n"
>> +		       KERN_DEBUG "tx pkt (%lx)   rx pkt(%lx)\n"
>> +		       KERN_DEBUG "tx hc idx(%x)  rx hc index(%x)\n"
>> +		       KERN_DEBUG "%s (Xoff events %u)\n"
>> +		       KERN_DEBUG "brb drops                     %u\n"
>> +		       KERN_DEBUG "tstats->no_buff_discard,      %u\n"
>> +		       KERN_DEBUG "tstats->errors_discard,       %u\n"
>> +		       KERN_DEBUG "tstats->mac_filter_discard,   %u\n"
>> +		       KERN_DEBUG "tstats->xxoverflow_discard,   %u\n"
>> +		       KERN_DEBUG "tstats->ttl0_discard,         %u\n",
>> +		       bp->dev->name,
>> +		       bnx2x_tx_avail(bp->fp),
>> +		       (u16)(*bp->fp->rx_cons_sb - bp->fp->rx_comp_cons),
>> +		       bp->net_stats.tx_packets, bp->net_stats.rx_packets,
>> +		       *bp->fp->tx_cons_sb, *bp->fp->rx_cons_sb,
>> +		       netif_queue_stopped(bp->dev)? "Xoff" : "Xon ",
>> +		       bp->slowpath->eth_stats.driver_xoff,
>> +		       bp->slowpath->eth_stats.brb_discard,
>> +		       bp->slowpath->eth_stats.no_buff_discard,
>> +		       bp->slowpath->eth_stats.errors_discard,
>> +		       bp->slowpath->eth_stats.mac_filter_discard,
>> +		       bp->slowpath->eth_stats.xxoverflow_discard,
>> +		       bp->slowpath->eth_stats.ttl0_discard);
>> +	}
>> +
>> +	if (bp->state != BNX2X_STATE_OPEN) {
>> +		DP(BNX2X_MSG_STATS, "state is %x, returning\n", bp->state);
>> +		return;
>> +	}
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return;
>> +	}
>> +#endif
>> +
>> +	if (bp->fw_mb) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C13 :
>> +				    DMAE_REGISTERS_GO_C12), 1);
>> +	}
>> +
>> +	if (atomic_read(&bp->stats_state) != STATS_STATE_ENABLE) {
>> +		atomic_set(&bp->stats_state, STATS_STATE_DISABLE);
> 
> Is that the write side? If yes, you need a mb here, too.
> And a comment.
>
>> +		return;
>> +	}
>> +
>> +	if (bp->phy_flags & PHY_BMAC_FLAG) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C6 :
>> +				    DMAE_REGISTERS_GO_C0), 1);
>> +
>> +	} else if (bp->phy_flags & PHY_EMAC_FLAG) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C8 :
>> +				    DMAE_REGISTERS_GO_C2), 1);
>> +	}
>> +
>> +	if (bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_STAT_QUERY, 0, 0, 0, 0) == 0) {
>> +		/* stats ramrod has it's own slot on the spe */
>> +		bp->spq_left++;
>> +		bp->stat_pending = 1;
>> +	}
>> +}
> 
>> +/* Called with netif_tx_lock.
>> + * bnx2x_tx_int() runs without netif_tx_lock unless it needs to call
>> + * netif_wake_queue().
>> + */
>> +static int bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	dma_addr_t mapping;
>> +	struct eth_tx_bd *txbd;
>> +	struct eth_tx_parse_bd *pbd = NULL;
>> +	struct sw_tx_bd *tx_buf;
>> +	u16 pkt_prod, bd_prod;
>> +	int nbd, fp_index = 0;
>> +	struct bnx2x_fastpath *fp;
>> +
>> +	fp = &bp->fp[fp_index];
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return NETDEV_TX_BUSY;
>> +	}
>> +#endif
>> +
>> +	if (unlikely(bnx2x_tx_avail(bp->fp) <
>> +		     (skb_shinfo(skb)->nr_frags + 3))) {
>> +		bp->slowpath->eth_stats.driver_xoff++,
>> +		netif_stop_queue(dev);
>> +		BNX2X_ERR("BUG! Tx ring full when queue awake!\n");
>> +		return NETDEV_TX_BUSY;
>> +	}
>> +
>> +	/*
>> +	This is a bit ugly. First we use one BD which we mark as start,
>> +	then for TSO or xsum we have a parsing info BD,
>> +	and only then we have the rest of the TSO bds.
>> +	(don't forget to mark the last one as last,
>> +	and to unmap only AFTER you write to the BD ...)
>> +	I would like to thank Dov Hirshfeld for this mess.
>> +	*/
>> +
>> +	pkt_prod = fp->tx_pkt_prod++;
>> +	bd_prod = fp->tx_bd_prod;
>> +
>> +	pkt_prod = TX_BD(pkt_prod);
>> +	bd_prod = TX_BD(bd_prod);
>> +
>> +	/* get a tx_buff and first bd */
>> +	tx_buf = &fp->tx_buf_ring[pkt_prod];
>> +	txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +	txbd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_START_BD;
>> +
>> +	/* remeber the first bd of the packet */
>> +	tx_buf->first_bd = bd_prod;
>> +
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "sending pkt=%u @%p, next_idx=%u, bd=%u @%p\n",
>> +	   pkt_prod, tx_buf, fp->tx_pkt_prod, bd_prod, txbd);
>> +
>> +	if (skb->ip_summed == CHECKSUM_PARTIAL) {
>> +		u8 len;
>> +		struct iphdr *iph = ip_hdr(skb);
>> +
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_IP_CSUM ;
>> +
>> +		/* turn on parsing and get a bd */
>> +		bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +		pbd = (void *)&fp->tx_desc_ring[bd_prod];
>> +		
>> +		len = ((u8 *)ip_hdr(skb) - (u8 *)skb->data)/2;
>> +
>> +		/* for now NS flag is not used in Linux */
>> +		pbd->global_data =
>> +			len | ((skb->protocol == ETH_P_8021Q)
>> +			       << ETH_TX_PARSE_BD_LLC_SNAP_EN_SHIFT);
>> +
>> +		pbd->ip_hlen = ip_hdrlen(skb)/2;
>> +		pbd->total_hlen = len + pbd->ip_hlen;
>> +
>> +		if (iph->protocol == IPPROTO_TCP) {
>> +			struct tcphdr *th = tcp_hdr(skb);
>> +			txbd->bd_flags.as_bitfield |=
>> +				ETH_TX_BD_FLAGS_TCP_CSUM;
>> +			pbd->tcp_flags = htonl(tcp_flag_word(skb))&0xFFFF;
>> +			pbd->total_hlen += tcp_hdrlen(skb)/2;
>> +
>> +			pbd->tcp_pseudo_csum = ntohs(th->check);
>> +		} else if (iph->protocol == IPPROTO_UDP) {
>> +			struct udphdr *uh = udp_hdr(skb);
>> +			txbd->bd_flags.as_bitfield |=
>> +				ETH_TX_BD_FLAGS_TCP_CSUM;
>> +			pbd->total_hlen += 4;
>> +			pbd->global_data |= ETH_TX_PARSE_BD_UDP_FLG;
>> +
>> +			/* HW bug: we need to subtract 10 bytes before the
>> +			 * UDP header from the csum
>> +			 */
>> +			uh->check = (u16) ~csum_fold(csum_sub(uh->check,
>> +				csum_partial(((u8 *)(uh)-10), 10, 0)));
>> +		}
>> +	}
>> +
>> +	if (bp->vlgrp != 0 && vlan_tx_tag_present(skb)) {
>> +		txbd->vlan = vlan_tx_tag_get(skb);
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_VLAN_TAG;
>> +	}
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	else {
>> +		txbd->vlan = pkt_prod;
>> +	}
>> +#endif
>> +
>> +	mapping = pci_map_single(bp->pdev, skb->data,
>> +				 skb->len, PCI_DMA_TODEVICE);
> 
> dma_mapping_error()
OK

> 
>> +
>> +	txbd->addr_hi = U64_HI(mapping);
>> +	txbd->addr_lo = U64_LO(mapping);
>> +	txbd->hdr_nbds = 1;
>> +	txbd->nbd = nbd = skb_shinfo(skb)->nr_frags + ((pbd == NULL)? 1 : 2);
>> +	txbd->nbytes = skb_headlen(skb);
>> +
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "first bd @%p, addr(%x:%x) hdr_nbds(%d) nbd(%d)"
>> +	   " nbytes(%d) flags(%x) vlan(%u)\n",
>> +	   txbd, txbd->addr_hi, txbd->addr_lo, txbd->hdr_nbds, txbd->nbd,
>> +	   txbd->nbytes, txbd->bd_flags.as_bitfield, txbd->vlan);
>> +
>> +	if ((skb_shinfo(skb)->gso_size) &&
>> +	    (skb->len > (bp->dev->mtu + ETH_HLEN))) {
>> +
>> +		int hlen = 2*pbd->total_hlen;
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "TSO packet len %d, hlen %d, total len %d, tso size %d\n",
>> +	   skb->len, hlen, skb_headlen(skb), skb_shinfo(skb)->gso_size);
>> +
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_SW_LSO;
>> +
>> +		if (txbd->nbytes > hlen) {
>> +		/* we split the first bd into headers and data bds
>> +		 * to ease the pain of our fellow micocode engineers
>> +		 * we use one mapping for both bds
>> +		 * So far this has only been observed to happen
>> +		 * in Other Operating Systems(TM)
>> +		 */
>> +
>> +			/* first fix first bd */
>> +			nbd++;
>> +			txbd->nbd++;
>> +			txbd->nbytes = hlen;
>> +
>> +			/* we only print this as an error
>> +			 * because we don't think this will ever happen.
>> +			 */
>> +			BNX2X_ERR("TSO split headr size is %d (%x:%x)"
>> +				  " nbd %d\n", txbd->nbytes, txbd->addr_hi,
>> +				  txbd->addr_lo, txbd->nbd);
>> +
>> +			/* now get a new data bd
>> +			 * (after the pbd) and fill it */
>> +			bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +			txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +			txbd->vlan = pkt_prod;
>> +#endif
>> +			txbd->addr_hi = U64_HI(mapping);
>> +			txbd->addr_lo = U64_LO(mapping) + hlen;
>> +			txbd->nbytes = skb_headlen(skb) - hlen;
>> +			/* this marks the bd
>> +			 * as one that has no individual mapping
>> +			 * the FW ignors this flag in a bd not maked start
>> +			 */
>> +			txbd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_SW_LSO;
>> +			DP(NETIF_MSG_TX_QUEUED,
>> +			   "TSO split data size is %d (%x:%x)\n",
>> +			   txbd->nbytes, txbd->addr_hi, txbd->addr_lo);
>> +		}
>> +
>> +		if (!pbd) {
>> +			/* supposed to be unreached
>> +			 * (and therefore not handled properly...)
>> +			 */
>> +			BNX2X_ERR("LSO with no PBD\n");
>> +			BUG();
>> +		}
>> +
>> +		pbd->lso_mss = skb_shinfo(skb)->gso_size;
>> +		pbd->tcp_send_seq = ntohl(tcp_hdr(skb)->seq);
>> +		pbd->ip_id  = ntohs(ip_hdr(skb)->id);
>> +		pbd->tcp_pseudo_csum =
>> +			ntohs(~csum_tcpudp_magic(ip_hdr(skb)->saddr,
>> +						 ip_hdr(skb)->daddr,
>> +						 0, IPPROTO_TCP, 0));
>> +		pbd->global_data |= ETH_TX_PARSE_BD_PSEUDO_CS_WITHOUT_LEN;
>> +	}
>> +
>> +	{
>> +		int i;
>> +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>> +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>> +
>> +			bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +			txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +			mapping = pci_map_page(bp->pdev, frag->page,
>> +					       frag->page_offset,
>> +					       frag->size, PCI_DMA_TODEVICE);
>> +
>> +			txbd->addr_hi = U64_HI(mapping);
>> +			txbd->addr_lo = U64_LO(mapping);
>> +			txbd->nbytes = frag->size;
>> +			txbd->bd_flags.as_bitfield = 0;
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +			txbd->vlan = pkt_prod;
>> +#endif
>> +			DP(NETIF_MSG_TX_QUEUED, "frag (%d) bd @%p,"
>> +			   " addr(%x:%x) nbytes(%d) flags(%x)\n",
>> +			   i, txbd, txbd->addr_hi, txbd->addr_lo,
>> +			   txbd->nbytes, txbd->bd_flags.as_bitfield);
>> +
>> +		}  /* for */
>> +	}
>> +
>> +	/* now at last mark the bd as the last bd */
>> +	txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_END_BD;
>> +
>> +	DP(NETIF_MSG_TX_QUEUED, "last bd @%p flags(%x)\n",
>> +	   txbd, txbd->bd_flags.as_bitfield);
>> +
>> +	tx_buf->skb = skb;
>> +
>> +	bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +
>> +	/* now send a tx doorbell, counting the next bd
>> +	 * if the packet contains or ends with it
>> +	 */
>> +	if (TX_BD_POFF(bd_prod) < nbd)
>> +		nbd++;
>> +
>> +	if (pbd) {
>> +		DP(NETIF_MSG_TX_QUEUED,
>> +		   "PBD @%p, ip_data=%x , ip_hlen %u, ip_id %u, lso_mss %u,"
>> +		   " tcp_flags %x, xsum %x, seq %u, hlen %u)\n",
>> +		   pbd, pbd->global_data, pbd->ip_hlen, pbd->ip_id,
>> +		   pbd->lso_mss, pbd->tcp_flags, pbd->tcp_pseudo_csum,
>> +		   pbd->tcp_send_seq, pbd->total_hlen);
>> +	}
>> +
>> +	DP(NETIF_MSG_TX_QUEUED, "doorbell: nbd=%u, bd=%d\n", nbd, bd_prod);
>> +
>> +	fp->hw_tx_prods->bds_prod += nbd;
>> +	fp->hw_tx_prods->packets_prod++;
>> +	DOORBELL(bp, fp_index, 0);
>> +
>> +	mmiowb();
>> +
>> +	fp->tx_bd_prod = bd_prod;
>> +	dev->trans_start = jiffies;
>> +
>> +	if (unlikely(bnx2x_tx_avail(fp) < MAX_SKB_FRAGS + 3)) {
>> +		netif_stop_queue(dev);
>> +		bp->slowpath->eth_stats.driver_xoff++;
>> +		if (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)
>> +			netif_wake_queue(dev);
>> +	}
>> +	fp->tx_pkt++;
>> +	return NETDEV_TX_OK;
>> +}
> 
>> +static int bnx2x_nic_unload(struct bnx2x *bp, int fre_irq)
>> +{
>> +	u32 reset_code = 0;
>> +	int rc /*, j */;
>> +	bp->state = BNX2X_STATE_CLOSING_WAIT4_HALT;
>> +
>> +	/* Calling flush_scheduled_work() may deadlock because
>> +	 * linkwatch_event() may be on the workqueue and it will try to get
>> +	 * the rtnl_lock which we are holding.
>> +	 */
>> +	while (bp->in_reset_task)
>> +		msleep(1);
> 
> Memory barrier?
Now that cancel_work is available we will use that instead of the 
in_rest_task flag.

> 
>> +
>> +	/* Delete the timer: do it before disabling interrupts, as it
>> +	   may be stil STAT_QUERY ramrod pending after stopping the timer */
>> +	del_timer_sync(&bp->timer);
>> +
>> +	/* Wait until stat ramrod returns and all SP tasks complete */
>> +	while (bp->stat_pending && (bp->spq_left != MAX_SPQ_PENDING)) {
> 
> Memory barrier?

OK
> 
>> +		msleep(1);
>> +	}
>> +
>> +	/* Stop fast path, disable MAC, disable interrupts */
>> +	bnx2x_netif_stop(bp);
> 
>> +	/* Remember cstorm producer in DSB to track it */
>> +	bp->dsb_prod_sp_idx = *bp->dsb_sp_prod;
>> +	/* Send CFC_DELETE ramrod */
>> +	bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_LEADING_CFC_DEL, 0, 0, 0, 1);
>> +	/* Wait for completion to arrive. Do nothing as we are going to reset
>> +	   the chip a few lines later */
>> +	while ( bp->dsb_prod_sp_idx == *bp->dsb_sp_prod ) {
> 
> What's that doing? Poking some DMA?
> Barrier needed? DMA sync needed?
sp_post send a command on the slowpath ring.
I will add a barrier inside it.

> 
>> +		msleep(1);
>> +	}
>> +
> 
>> +static void bnx2x_reset_task(struct work_struct *work)
>> +{
>> +	struct bnx2x *bp = container_of(work, struct bnx2x, reset_task);
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	BNX2X_ERR("reset taks called but STOP_ON_ERROR defined"
>> +		  " so reset not done to allow debug dump,\n"
>> +		  " you will need to reboot when done\n");
>> +	return;
>> +#endif
>> +
>> +	if (!netif_running(bp->dev))
>> +		return;
>> +
>> +	bp->in_reset_task = 1;
>> +	bnx2x_netif_stop(bp);
>> +
>> +	bnx2x_nic_unload(bp, 0);
>> +	bnx2x_nic_load(bp, 0);
> 
> 
>> +	bp->in_reset_task = 0;
> 
> smp_wmb()?
will be removed.
> 
>> +}
> 
>> +static int bnx2x_phys_id(struct net_device *dev, u32 data)
>> +{
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	int i;
>> +
>> +	if (data == 0)
>> +		data = 2;
>> +
>> +	for (i = 0; i < (data * 2); i++) {
>> +		if ((i % 2) == 0) {
>> +			bnx2x_leds_set(bp, SPEED_1000);
>> +		} else {
>> +			bnx2x_leds_unset(bp);
>> +		}
>> +		msleep_interruptible(500);
> 
> Check the return value of msleep_interruptible and drop the
> following check for signals.
OK

> 
>> +		if (signal_pending(current))
>> +			break;
>> +	}
>> +
>> +	if (bp->link_up) {
>> +		bnx2x_leds_set(bp, bp->line_speed);
>> +	}
>> +	return 0;
>> +}
> 
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +#warning stop on error defined
>> +#define bnx2x_panic() \
>> +	do { bp->panic = 1; \
>> +		BNX2X_ERR("driver assert\n"); \
>> +		bnx2x_disable_int(bp); \
>> +		bnx2x_panic_dump(bp); \
>> +	} while (0);
> 
> Drop the last semicolon
Opps... ;)

> 
>> +#else
>> +#define bnx2x_panic()
>> +#endif
> 
> 
> Puh, that was a lot :)
> 
> 




  parent reply	other threads:[~2007-08-02 18:09 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-08-01  8:31 [RFC][BNX2X]: New driver for Broadcom 10Gb Ethernet Michael Chan
2007-08-01 22:06 ` Michael Buesch
2007-08-01 22:28   ` Roland Dreier
2007-08-01 22:50     ` Jeff Garzik
2007-08-02 18:13       ` Eliezer Tamir
2007-08-02 18:09   ` Eliezer Tamir [this message]
2007-08-02 21:48   ` Michael Chan
2007-08-07 22:15   ` Jeff Garzik
2007-08-07 22:20     ` Michael Buesch
2007-08-07 23:04       ` Christoph Hellwig
2007-08-07 23:08         ` David Miller
2007-08-08  8:40           ` Michael Buesch
2007-08-07 23:04   ` Roland Dreier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=46B21DD0.7070206@broadcom.com \
    --to=eliezert@broadcom.com \
    --cc=davem@davemloft.net \
    --cc=eilong@broadcom.com \
    --cc=jeff@garzik.org \
    --cc=mb@bu3sch.de \
    --cc=mchan@broadcom.com \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.