Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC][BNX2X]: New driver for Broadcom 10Gb Ethernet.
From: Eliezer Tamir @ 2007-08-02 18:09 UTC (permalink / raw)
  To: Michael Buesch; +Cc: Michael Chan, davem, jeff, netdev, eilong
In-Reply-To: <200708020006.13457.mb@bu3sch.de>

Michal,

Thanks for going over the code.
My responses are inline.

Eliezer

Michael Buesch wrote:
> On Wednesday 01 August 2007 10:31:17 Michael Chan wrote:
> 
>> +typedef struct {
>> +	u8 reserved[64];
>> +} license_key_t;
> 
> No typedef.
> What is a "license key" used for, anyway?
This will be removed.
This is just a placeholder that has the right size.

> 
>> +#define RUN_AT(x)		(jiffies + (x))
> 
> That macro does only obfuscate code, in my opinion.
> If you want jiffies+x, better opencode it.
OK

> 
>> +typedef enum {
>> +	BCM5710 = 0,
>> +} board_t;
> 
> No typedef. Do
> enum bnx2x_board {
> 	BCM5710 = 0,
> };
> Or something like that.
OK

> 
>> +static struct pci_device_id bnx2x_pci_tbl[] = {
> 
> static const struct...
> 
OK

>> +	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_NX2_5710,
>> +		PCI_ANY_ID, PCI_ANY_ID, 0, 0, BCM5710 },
>> +	{ 0 }
>> +};
> 
>> +static inline u32 bnx2x_bits_en(struct bnx2x *bp, u32 block, u32 reg,
>> +				u32 bits)
> 
> Does that really need to be inline? I'd suggest dropping inline.

Does not need to be inlined. Will change.

> 
>> +{
>> +	u32 val = REG_RD(bp, block, reg);
>> +
>> +	val |= bits;
>> +	REG_WR(bp, block, reg, val);
>> +	return val;
>> +}
>> +
>> +static inline u32 bnx2x_bits_dis(struct bnx2x *bp, u32 block, u32 reg,
>> +				 u32 bits)
> 
> Same here.
Same
> 
>> +{
>> +	u32 val = REG_RD(bp, block, reg);
>> +
>> +	val &= ~bits;
>> +	REG_WR(bp, block, reg, val);
>> +	return val;
>> +}
>> +
>> +static int bnx2x_mdio22_write(struct bnx2x *bp, u32 reg, u32 val)
>> +{
>> +	int rc;
>> +	u32 tmp, i;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		tmp &= ~EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, tmp);
>> +		REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		udelay(40);
>> +	}
>> +
>> +	tmp = ((bp->phy_addr << 21) | (reg << 16) |
>> +	       (val & EMAC_MDIO_COMM_DATA) |
>> +	       EMAC_MDIO_COMM_COMMAND_WRITE_22 |
>> +	       EMAC_MDIO_COMM_START_BUSY);
>> +	EMAC_WR(EMAC_REG_EMAC_MDIO_COMM, tmp);
>> +
>> +	for (i = 0; i < 50; i++) {
>> +		udelay(10);
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_COMM);
>> +		if (!(tmp & EMAC_MDIO_COMM_START_BUSY)) {
>> +			udelay(5);
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (tmp & EMAC_MDIO_COMM_START_BUSY) {
>> +		BNX2X_ERR("write phy register failed\n");
>> +
>> +		rc = -EBUSY;
>> +	} else {
>> +		rc = 0;
>> +	}
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		tmp = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		tmp |= EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, tmp);
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static int bnx2x_mdio22_read(struct bnx2x *bp, u32 reg, u32 *ret_val)
>> +{
>> +	int rc;
>> +	u32 val, i;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
>> +
>> +	if (bp->phy_flags & PHY_INT_MODE_AUTO_POLLING_FLAG) {
>> +
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		val &= ~EMAC_MDIO_MODE_AUTO_POLL;
>> +		EMAC_WR(EMAC_REG_EMAC_MDIO_MODE, val);
>> +		REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_MODE);
>> +		udelay(40);
>> +	}
>> +
>> +	val = ((bp->phy_addr << 21) | (reg << 16) |
>> +	       EMAC_MDIO_COMM_COMMAND_READ_22 |
>> +	       EMAC_MDIO_COMM_START_BUSY);
>> +	EMAC_WR(EMAC_REG_EMAC_MDIO_COMM, val);
>> +
>> +	for (i = 0; i < 50; i++) {
>> +		udelay(10);
>> +
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MDIO_COMM);
>> +		if (!(val & EMAC_MDIO_COMM_START_BUSY)) {
> 
> No udelay(5) here, like in write above?
There is a udelay.

> 
>> +			val &= EMAC_MDIO_COMM_DATA;
>> +			break;
>> +		}
>> +	}
> 
>> +static int bnx2x_mdio45_vwrite(struct bnx2x *bp, u32 reg, u32 addr, u32 val)
>> +{
>> +	int i;
>> +	u32 rd_val;
>> +
>> +	for (i = 0; i < 10; i++) {
>> +		bnx2x_mdio45_write(bp, reg, addr, val);
>> +		mdelay(5);
> 
> Can you msleep(5) here?
Can't sleep, this can happen from a "sleepless" context.
Maybe we can break it down to smaller mdelays, will ask the HW guys.
> 
>> +		bnx2x_mdio45_read(bp, reg, addr, &rd_val);
>> +		/* if the read value is not the same as the value we wrote,
>> +		   we should write it again */
>> +		if (rd_val == val) {
>> +			return 0;
>> +		}
>> +	}
>> +	BNX2X_ERR("MDIO write in CL45 failed\n");
>> +	return -EBUSY;
>> +}
> 
>> +/* DMAE command positions used
>> + * Port0 14
>> + * Port1 15
>> + */
>> +static void bnx2x_wb_write_dmae(struct bnx2x *bp, u32 wb_addr, u32 *wb_write,
>> +				u32 wb_len)
>> +{
>> +	struct dmae_command *dmae = &bp->dmae;
>> +	int port = bp->port;
>> +	u32 *wb_comp = bnx2x_sp(bp, wb_comp);
>> +
>> +	memcpy(bnx2x_sp(bp, wb_write[0]), wb_write, wb_len * 4);
>> +	memset(dmae, 0, sizeof(struct dmae_command));
>> +
>> +	dmae->opcode = (DMAE_CMD_SRC_PCI | DMAE_CMD_DST_GRC |
>> +			DMAE_CMD_C_DST_PCI | DMAE_CMD_C_ENABLE |
>> +			DMAE_CMD_SRC_RESET | DMAE_CMD_DST_RESET |
>> +			DMAE_CMD_ENDIANITY_DW_SWAP |
>> +			(port? DMAE_CMD_PORT_1 : DMAE_CMD_PORT_0));
>> +	dmae->src_addr_lo = U64_LO(bnx2x_sp_mapping(bp, wb_write));
>> +	dmae->src_addr_hi = U64_HI(bnx2x_sp_mapping(bp, wb_write));
>> +	dmae->dst_addr_lo = wb_addr >> 2;
>> +	dmae->dst_addr_hi = 0;
>> +	dmae->len = wb_len;
>> +	dmae->comp_addr_lo = U64_LO(bnx2x_sp_mapping(bp, wb_comp));
>> +	dmae->comp_addr_hi = U64_HI(bnx2x_sp_mapping(bp, wb_comp));
>> +	dmae->comp_val = BNX2X_WB_COMP_VAL;
>> +	bnx2x_post_dmae(bp, port? 15 : 14);
>> +
>> +	*wb_comp = 0;
> 
> wmb();
Right.

> 
>> +	REG_WR32(bp, GRCBASE_DMAE,
>> +		 (bp->port? DMAE_REGISTERS_GO_C15 :
>> +			    DMAE_REGISTERS_GO_C14), 1);
>> +	udelay(5);
>> +	while (*wb_comp != BNX2X_WB_COMP_VAL) {
>> +		udelay(5);
> 
> Not sure what this is doing here.
> Do we need some DMA syncing here?
> 
> Anyway, we _do_ need a timeout here.
> Probably also some kind of memory barrier, if no DMA syncing is needed.
This causes the chip to read from pci consistent memory, so I guess we 
can use an wmb().
Will add timeout.

> 
>> +	}
>> +}
> 
>> +static void bnx2x_emac_enable(struct bnx2x *bp)
>> +{
>> +	u32 val;
>> +	int port = bp->port;
>> +	u32 emac_base = port ? GRCBASE_EMAC1 : GRCBASE_EMAC0;
> 
>> +	while (val & EMAC_MODE_RESET) {
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_MODE);
>> +		DP(NETIF_MSG_LINK, "EMAC reset reg is %u\n", val);
>> +	}
> 
> A timeout, please.
OK
> 
>> +	/* reset tx part */
>> +	EMAC_WR(EMAC_REG_EMAC_TX_MODE, EMAC_TX_MODE_RESET);
>> +
>> +	while (val & EMAC_TX_MODE_RESET) {
>> +		val = REG_RD(bp, emac_base, EMAC_REG_EMAC_TX_MODE);
>> +		DP(NETIF_MSG_LINK, "EMAC reset reg is %u\n", val);
>> +	}
> 
> A timeout, please.
OK
> 
>> +static void bnx2x_pbf_update(struct bnx2x *bp)
>> +{
>> +	int port = bp->port;
>> +	u32 init_crd, crd;
>> +	u32 count = 1000;
>> +	u32 pause = 0;
>> +
>> +	/* disable port */
>> +	REG_WR(bp, GRCBASE_PBF,
>> +	       PBF_REGISTERS_DISABLE_NEW_TASK_PROC_P0 + port*4, 0x1);
>> +
>> +	/* wait for init credit */
>> +	init_crd = REG_RD(bp, GRCBASE_PBF,
>> +			  PBF_REGISTERS_P0_INIT_CRD + port*4);
>> +	crd = REG_RD(bp, GRCBASE_PBF, PBF_REGISTERS_P0_CREDIT + port*8);
>> +	DP(NETIF_MSG_LINK, "init_crd 0x%x  crd 0x%x\n", init_crd, crd);
>> +
>> +	while ((init_crd != crd) && count) {
>> +		mdelay(5);
> 
> msleep(5)?
Can't sleep in link change code since it can be called from the slowpath 
task.
> 
>> +		crd = REG_RD(bp, GRCBASE_PBF,
>> +			     PBF_REGISTERS_P0_CREDIT + port*8);
>> +		count--;
>> +	}
> 
>> +	/* probe the credit changes */
>> +	REG_WR(bp, GRCBASE_PBF, PBF_REGISTERS_INIT_P0 + port*4, 0x1);
>> +	mdelay(5);
> 
> msleep(5)?
> 
>> +	REG_WR(bp, GRCBASE_PBF, PBF_REGISTERS_INIT_P0 + port*4, 0x0);
>> +
>> +	/* enable port */
>> +	REG_WR(bp, GRCBASE_PBF,
>> +	       PBF_REGISTERS_DISABLE_NEW_TASK_PROC_P0 + port*4, 0x0);
>> +}
> 
>> +/* This function is called upon link interrupt */
>> +static void bnx2x_link_update(struct bnx2x *bp)
>> +{
>> +	u32 gp_status;
>> +	int port = bp->port;
>> +	int i;
>> +	int link_10g;
>> +
>> +	DP(NETIF_MSG_LINK, "port %x, is xgxs %x, stat_mask 0x%x, "
>> +	   "int_mask 0x%x, saved_mask 0x%x, MI_INT %x, SERDES_LINK %x,"
>> +	   " 10G %x, XGXS_LINK %x\n",
>> +	   port, (bp->phy_flags & PHY_XGSX_FLAG),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_STATUS_INTERRUPT_PORT0 + port*4),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_MASK_INTERRUPT_PORT0 + port*4),
>> +	   bp->nig_mask,
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_EMAC0_STATUS_MISC_MI_INT + port*0x18),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_SERDES0_STATUS_LINK_STATUS + port*0x3c),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_XGXS0_STATUS_LINK10G + port*0x68),
>> +	   REG_RD(bp, GRCBASE_NIG,
>> +		  NIG_REGISTERS_XGXS0_STATUS_LINK_STATUS + port*0x68)
>> +	);
>> +
>> +	MDIO_SET_REG_BANK(bp, MDIO_REG_BANK_GP_STATUS);
>> +	/* avoid fast toggling */
>> +	for (i = 0 ; i < 10 ; i++) {
>> +		mdelay(10);
> 
> msleep(10)?
> 
>> +		bnx2x_mdio22_read(bp, MDIO_GP_STATUS_TOP_AN_STATUS1,
>> +				  &gp_status);
>> +	}
>> +
>> +	bnx2x_link_settings_status(bp, gp_status);
>> +
>> +	/* anything 10 and over uses the bmac */
>> +	link_10g = ((bp->line_speed >= SPEED_10000) &&
>> +		    (bp->line_speed <= SPEED_16000));
>> +
>> +	bnx2x_link_int_ack(bp, link_10g);
>> +
>> +	/* link is up only if both local phy and external phy are up */
>> +	if (bp->link_up && bnx2x_ext_phy_is_link_up(bp)) {
>> +		if (link_10g) {
>> +			bnx2x_bmac_enable(bp, 0);
>> +			bnx2x_leds_set(bp, SPEED_10000);
>> +
>> +		} else {
>> +			bnx2x_emac_enable(bp);
>> +			bnx2x_emac_program(bp);
>> +
>> +			/* AN complete? */
>> +			if (gp_status & MDIO_AN_CL73_OR_37_COMPLETE) {
>> +				if (!(bp->phy_flags & PHY_SGMII_FLAG)) {
>> +					bnx2x_set_sgmii_tx_driver(bp);
>> +				}/* Not SGMII */
>> +			}
>> +		}
>> +		bnx2x_link_up(bp);
>> +
>> +	} else { /* link down */
>> +		bnx2x_leds_unset(bp);
>> +		bnx2x_link_down(bp);
>> +	}
>> +}
> 
>> +static inline u16 bnx2x_update_dsb_idx(struct bnx2x *bp)
> 
> Too big for inlining.
> If this func is only used in one place, gcc will inline it automatically.

This is the slowpath status block so the inline can be removed.

> 
>> +{
>> +	struct host_def_status_block *dsb = bp->def_status_blk;
>> +	u16 rc = 0;
>> +
>> +	if (bp->def_att_idx != dsb->atten_status_block.attn_bits_index) {
>> +		bp->def_att_idx = dsb->atten_status_block.attn_bits_index;
>> +		rc |= 1;
>> +	}
>> +	if (bp->def_c_idx != dsb->c_def_status_block.status_block_index) {
>> +		bp->def_c_idx = dsb->c_def_status_block.status_block_index;
>> +		rc |= 2;
>> +	}
>> +	if (bp->def_u_idx != dsb->u_def_status_block.status_block_index) {
>> +		bp->def_u_idx = dsb->u_def_status_block.status_block_index;
>> +		rc |= 4;
>> +	}
>> +	if (bp->def_x_idx != dsb->x_def_status_block.status_block_index) {
>> +		bp->def_x_idx = dsb->x_def_status_block.status_block_index;
>> +		rc |= 8;
>> +	}
>> +	if (bp->def_t_idx != dsb->t_def_status_block.status_block_index) {
>> +		bp->def_t_idx = dsb->t_def_status_block.status_block_index;
>> +		rc |= 16;
>> +	}
>> +	rmb(); /* TBD chack this */
>> +	return rc;
>> +}
>> +
>> +static inline u16 bnx2x_update_fpsb_idx(struct bnx2x_fastpath *fp)
> 
> Too big for inlining.
>
This is fastpath code, I will have to see if removing the inline impacts 
performance.


>> +{
>> +	struct host_status_block *fpsb = fp->status_blk;
>> +	u16 rc = 0;
>> +
>> +	if (fp->fp_c_idx != fpsb->c_status_block.status_block_index) {
>> +		fp->fp_c_idx = fpsb->c_status_block.status_block_index;
>> +		rc |= 1;
>> +	}
>> +	if (fp->fp_u_idx != fpsb->u_status_block.status_block_index) {
>> +		fp->fp_u_idx = fpsb->u_status_block.status_block_index;
>> +		rc |= 2;
>> +	}
>> +	rmb(); /* TBD chack this */
>> +	return rc;
>> +}
>> +
>> +static inline u32 bnx2x_tx_avail(struct bnx2x_fastpath *fp)
> 
> Too big for inlining.

this is in the fast-fastpath I think it should be inlined.

> 
>> +{
>> +	u16 used;
>> +	u32 prod = fp->tx_bd_prod;
>> +	u32 cons = fp->tx_bd_cons;
>> +
>> +	smp_mb();
> 
> This barrier needs a comment. Why is it there? And why SMP only?
> 
>> +
>> +	used = (NUM_TX_BD - NUM_TX_RINGS + prod - cons +
>> +		(cons / TX_DESC_CNT) - (prod / TX_DESC_CNT));
>> +
>> +	if (prod >= cons) {
>> +		/* used = prod - cons - prod/size + cons/size */
>> +		used -= NUM_TX_BD - NUM_TX_RINGS;
>> +	}
>> +
>> +	BUG_TRAP(used <= fp->bp->tx_ring_size);
>> +	BUG_TRAP((fp->bp->tx_ring_size - used) <= MAX_TX_AVAIL);
>> +	return(fp->bp->tx_ring_size - used);
>> +}
> 
>> +/*========================================================================*/
>> +
>> +/* acquire split MCP access lock register */
>> +static int bnx2x_lock_alr( struct bnx2x *bp)
>> +{
>> +	int rc = 0;
>> +	u32 i, j, val;
>> +
>> +	i = 100;
>> +	val = 1UL << 31;
>> +
>> +	REG_WR(bp, GRCBASE_MCP, 0x9c, val);
>> +	for (j = 0; j < i*10; j++) {
>> +		val = REG_RD(bp, GRCBASE_MCP, 0x9c);
>> +		if (val & (1L << 31)) {
>> +			break;
>> +		}
>> +
>> +		mdelay(5);
> 
> msleep(5)?
called from link change event, can't sleep.
> 
>> +	}
>> +
>> +	if (!(val & (1L << 31))) {
>> +		BNX2X_ERR("Cannot acquire nvram interface.\n");
>> +
>> +		rc = -EBUSY;
>> +	}
>> +
>> +	return rc;
>> +}
> 
>> +static void bnx2x_free_mem(struct bnx2x *bp)
>> +{
>> +
>> +#define BNX2X_PCI_FREE(x, y, size) do { \
>> +	if (x) { \
>> +		pci_free_consistent(bp->pdev, size, x, y); \
>> +		x = NULL; \
>> +	} \
>> +} while (0)
>> +
>> +#define BNX2X_KFREE(x) do { \
>> +	if (x) { \
>> +		vfree(x); \
>> +		x = NULL; \
>> +	} \
>> +} while (0)
>> +
>> +	int i;
>> +
>> +	/* fastpath */
>> +	for_each_queue(bp, i) {
>> +
>> +		/* Status blocks */
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, status_blk),
>> +			       bnx2x_fp(bp, i, status_blk_mapping),
>> +			       sizeof(struct host_status_block)
>> +			       + sizeof(struct eth_tx_db_data));
>> +
>> +		/* fast path rings: tx_buf tx_desc rx_buf rx_desc rx_comp */
>> +		BNX2X_KFREE(bnx2x_fp(bp, i, tx_buf_ring));
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, tx_desc_ring),
>> +			       bnx2x_fp(bp, i, tx_desc_mapping),
>> +			       sizeof(struct eth_tx_bd) * NUM_TX_BD);
>> +
>> +		BNX2X_KFREE(bnx2x_fp(bp, i, rx_buf_ring));
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, rx_desc_ring),
>> +			       bnx2x_fp(bp, i, rx_desc_mapping),
>> +			       sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +
>> +		BNX2X_PCI_FREE(bnx2x_fp(bp, i, rx_comp_ring),
>> +			       bnx2x_fp(bp, i, rx_comp_mapping),
>> +			       sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +	}
>> +
>> +	BNX2X_KFREE( bp->fp);
>> +	/* end of fast path*/
>> +
>> +	BNX2X_PCI_FREE(bp->def_status_blk, bp->def_status_blk_mapping,
>> +		       (sizeof(struct host_def_status_block)));
>> +
>> +	BNX2X_PCI_FREE(bp->slowpath, bp->slowpath_mapping,
>> +		       (sizeof(struct bnx2x_slowpath)));
>> +
>> +	if (iscsi_active) {
>> +		BNX2X_PCI_FREE(bp->t1, bp->t1_mapping, 64*1024);
>> +		BNX2X_PCI_FREE(bp->t2, bp->t2_mapping, 16*1024);
>> +		BNX2X_PCI_FREE(bp->timers, bp->timers_mapping, 8*1024);
>> +		BNX2X_PCI_FREE(bp->qm, bp->qm_mapping, 128*1024);
>> +	}
>> +
>> +	BNX2X_PCI_FREE(bp->spq, bp->spq_mapping, PAGE_SIZE);
>> +
>> +#undef BNX2X_PCI_FREE
>> +#undef BNX2X_KFREE
>> +}
>> +
>> +static int bnx2x_alloc_mem(struct bnx2x *bp)
>> +{
>> +	int i;
>> +
>> +#define BNX2X_PCI_ALLOC(x, y, size) do { \
>> +	x = pci_alloc_consistent(bp->pdev, size, y); \
>> +	if (x == NULL) \
>> +		goto alloc_mem_err; \
>> +	memset(x, 0, size); \
>> +} while (0)
>> +
>> +#define BNX2X_ALLOC(x, size) do { \
>> +	x = vmalloc(size); \
> 
> Why not zalloc()?

We want to allow allocating chunks bigger that zalloc will allow.

> 
>> +	if (x == NULL) \
>> +		goto alloc_mem_err; \
>> +	memset(x, 0, size); \
>> +} while (0)
>> +
>> +	/* fastpath */
>> +	BNX2X_ALLOC(bp->fp, sizeof(struct bnx2x_fastpath)*bp->num_queues);
>> +
>> +	for_each_queue(bp, i) {
>> +
>> +		bnx2x_fp(bp, i, bp) = bp;
>> +
>> +		/* Status blocks */
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, status_blk),
>> +				&bnx2x_fp(bp, i, status_blk_mapping),
>> +				sizeof(struct host_status_block) +
>> +				sizeof(struct eth_tx_db_data));
>> +
>> +		bnx2x_fp(bp, i, tx_prods_mapping) =
>> +				bnx2x_fp(bp, i, status_blk_mapping) +
>> +				sizeof(struct host_status_block);
>> +
>> +		bnx2x_fp(bp, i, hw_tx_prods) =
>> +				(void *)(bnx2x_fp(bp, i, status_blk) + 1);
>> +
>> +		/* fast path rings: tx_buf tx_desc rx_buf rx_desc rx_comp */
>> +		BNX2X_ALLOC(bnx2x_fp(bp, i, tx_buf_ring),
>> +				sizeof(struct sw_tx_bd) * NUM_TX_BD);
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, tx_desc_ring),
>> +				&bnx2x_fp(bp, i, tx_desc_mapping),
>> +				sizeof(struct eth_tx_bd) * NUM_TX_BD);
>> +
>> +		BNX2X_ALLOC(bnx2x_fp(bp, i, rx_buf_ring),
>> +				sizeof(struct sw_rx_bd) * NUM_RX_BD);
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, rx_desc_ring),
>> +				&bnx2x_fp(bp, i, rx_desc_mapping),
>> +				sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +
>> +		BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, rx_comp_ring),
>> +				&bnx2x_fp(bp, i, rx_comp_mapping),
>> +				sizeof(struct eth_rx_bd) * NUM_RX_BD);
>> +	}
>> +	/* end of fast path*/
>> +
>> +	BNX2X_PCI_ALLOC(bp->def_status_blk, &bp->def_status_blk_mapping,
>> +			sizeof(struct host_def_status_block));
>> +
>> +	BNX2X_PCI_ALLOC(bp->slowpath, &bp->slowpath_mapping,
>> +			sizeof(struct bnx2x_slowpath));
>> +
>> +	if (iscsi_active) {
>> +		BNX2X_PCI_ALLOC(bp->t1, &bp->t1_mapping, 64*1024);
>> +
>> +		/* Initialize T1 */
>> +		for (i = 0; i < 64*1024; i += 64) {
>> +			*(u64 *)((char *)bp->t1 + i+56) = 0x0UL;
>> +			*(u64 *)((char *)bp->t1 + i+3) = 0x0UL;
>> +		}
>> +
>> +		/* allocate searcher T2 table
>> +		   we allocate 1/4 of alloc num for T2
>> +		  (which is not entered into the ILT) */
>> +		BNX2X_PCI_ALLOC(bp->t2, &bp->t2_mapping, 16*1024);
>> +
>> +		/* Initialize T2 */
>> +		for (i = 0; i < 16*1024; i += 64) {
>> +			*(u64 *)((char *)bp->t2 + i+56) =
>> +				bp->t2_mapping + i + 64;
>> +		}
>> +
>> +		/* now sixup the last line in the block
>> +		 * to point to the next block
>> +		 */
>> +		*(u64 *)((char *)bp->t2 + 1024*16-8) = bp->t2_mapping;
>> +
>> +		/* Timer block array (MAX_CONN*8)
>> +		 * phys uncached for now 1024 conns
>> +		 */
>> +		BNX2X_PCI_ALLOC(bp->timers, &bp->timers_mapping, 8*1024);
>> +
>> +		/* QM queues (128*MAX_CONN) */
>> +		BNX2X_PCI_ALLOC(bp->qm, &bp->qm_mapping, 128*1024);
>> +	}
>> +
>> +	/* Slow path ring */
>> +	BNX2X_PCI_ALLOC(bp->spq, &bp->spq_mapping, PAGE_SIZE);
>> +
>> +	return 0;
>> +
>> +alloc_mem_err:
>> +	bnx2x_free_mem(bp);
>> +	return -ENOMEM;
>> +
>> +#undef BNX2X_PCI_ALLOC
>> +#undef BNX2X_ALLOC
>> +}
>> +
>> +
>> +
>> +static void bnx2x_set_mac_addr(struct bnx2x *bp)
>> +{
>> +	struct mac_configuration_cmd *config = bnx2x_sp(bp, mac_config);
>> +
>> +	/* CAM allocation
>> +	 * unicasts 0-31:port0 32-63:port1
>> +	 * multicast 64-127:port0 128-191:port1
>> +	 */
>> +	config->hdr.length = 2;
>> +	config->hdr.offset = bp->port ? 31 : 0;
>> +	config->hdr.reserved0 = 0;
>> +	config->hdr.reserved1 = 0;
>> +
>> +	/* primary MAC */
>> +	config->config_table[0].cam_entry.msb_mac_addr =
>> +		ntohl(*(u32 *)&bp->dev->dev_addr[0]);
>> +	config->config_table[0].cam_entry.lsb_mac_addr =
>> +		ntohs(*(u16 *)&bp->dev->dev_addr[4]);
> 
> Better use xx_to_cpu()
> anyway, ntohX is defined as beX_to_cpu().
> Are you sure this is correct? A byte array is little endian.
This is a HW quirk, will change to cpu_to_le().

> 
>> +static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
>> +				     struct bnx2x_fastpath *fp, u16 index)
> 
> Too big for inlining.
> If this is only called in one place, gcc will inline it automatically.
this is fast-fast-path (called once for each rx packet) I think it 
should be inlined.

> 
>> +{
>> +	struct sk_buff *skb;
>> +	struct sw_rx_bd *rx_buf = &fp->rx_buf_ring[index];
>> +	struct eth_rx_bd *rxbd = &fp->rx_desc_ring[index];
>> +	dma_addr_t mapping;
>> +
>> +	skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
>> +
>> +	if (skb == NULL) {
>> +		return -ENOMEM;
>> +	}
>> +
>> +	mapping = pci_map_single(bp->pdev, skb->data, bp->rx_buf_use_size,
>> +				 PCI_DMA_FROMDEVICE);
> 
> You need to check if the mapping succeed.
> There's a macro for that: dma_mapping_error()
OK

> 
>> +	rx_buf->skb = skb;
>> +	pci_unmap_addr_set(rx_buf, mapping, mapping);
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	rxbd->reserved = fp->last_alloc++;
>> +#else
>> +	rxbd->reserved = 0;
>> +#endif
>> +	rxbd->len = bp->rx_buf_use_size;
>> +	rxbd->addr_hi = U64_HI(mapping);
>> +	rxbd->addr_lo = U64_LO(mapping);
>> +	return 0;
>> +}
> 
>> +static void bnx2x_tx_int(struct bnx2x_fastpath *fp, int work)
>> +{
>> +	struct bnx2x *bp = fp->bp;
>> +	u16 hw_cons, sw_cons, bd_cons = fp->tx_bd_cons;
>> +	int done = 0;
>> +
>> +	hw_cons = *fp->tx_cons_sb;
>> +	sw_cons = fp->tx_pkt_cons;
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return;
>> +	}
>> +#endif
>> +
>> +	while (sw_cons != hw_cons) {
>> +		u16 pkt_cons;
>> +
>> +		pkt_cons = TX_BD(sw_cons);
>> +
>> +		/* prefetch(bp->tx_buf_ring[pkt_cons].skb); */
>> +
>> +		DP(NETIF_MSG_TX_DONE, "sw_cons=%u  hw_cons=%u pkt_cons=%d\n",
>> +		   sw_cons, hw_cons, pkt_cons);
>> +
>> +/*        if (NEXT_TX_IDX(sw_cons)!=hw_cons){
>> +	    rmb();
>> +	    prefetch(fp->tx_buf_ring[NEXT_TX_IDX(sw_cons)].skb);
>> +	}
>> +*/
>> +
>> +		bd_cons = bnx2x_free_tx_pkt(bp, fp, pkt_cons);
>> +		sw_cons++;
>> +		done++;
>> +
>> +		if (done == work) {
>> +			break;
>> +		}
>> +	}
>> +
>> +	fp->tx_pkt_cons = sw_cons;
>> +	fp->tx_bd_cons = bd_cons;
>> +
>> +	smp_mb();
> 
> Please add a comment why we need a SMP MB here.
> 
>> +
>> +	/* TBD need a thresh?  */
>> +	if (unlikely(netif_queue_stopped(bp->dev))) {
>> +		netif_tx_lock(bp->dev);
>> +		if ((netif_queue_stopped(bp->dev)) &&
>> +		    (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)) {
>> +			netif_wake_queue(bp->dev);
>> +		}
>> +		netif_tx_unlock(bp->dev);
>> +	}
>> +}
> 
>> +static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
>> +				      struct sk_buff *skb, u16 cons, u16 prod)
> 
> Too big for inlining.
Now that I think about it, this is better off out-of-line.
Will change.

> 
>> +{
>> +	struct bnx2x *bp = fp->bp;
>> +	struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
>> +	struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
>> +	struct eth_rx_bd *cons_bd = &fp->rx_desc_ring[cons];
>> +	struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
>> +
>> +	pci_dma_sync_single_for_device(bp->pdev,
>> +				       pci_unmap_addr(cons_rx_buf, mapping),
>> +				       bp->rx_offset + RX_COPY_THRESH,
>> +				       PCI_DMA_FROMDEVICE);
>> +
>> +	prod_rx_buf->skb = cons_rx_buf->skb;
>> +	pci_unmap_addr_set(prod_rx_buf, mapping,
>> +			   pci_unmap_addr(cons_rx_buf, mapping));
>> +	*prod_bd = *cons_bd;
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	prod_bd->reserved = fp->last_alloc++;
>> +#endif
>> +}
> 
>> +static void bnx2x_attn_int(struct bnx2x *bp)
>> +{
>> +	int port = bp->port;
>> +
>> +	/* read local copy of bits */
>> +	u16 attn_bits = bp->def_status_blk->atten_status_block.attn_bits;
>> +	u16 attn_ack = bp->def_status_blk->atten_status_block.attn_bits_ack;
>> +	u16 attn_state = bp->attn_state;
>> +
>> +	/* look for changed bits */
>> +	u16 asserted   =  attn_bits & ~attn_ack & ~attn_state;
>> +	u16 deasserted = ~attn_bits &  attn_ack &  attn_state;
> 
> Do you probably want | instead of & here?
This is code that came from the HW guys, it works.

> 
>> +
>> +	DP(NETIF_MSG_HW,
>> +	   "attn_bits %x, attn_ack %x, asserted %x, deasserted %x \n",
>> +	   attn_bits, attn_ack, asserted, deasserted);
>> +
>> +	if (~(attn_bits ^ attn_ack) & (attn_bits ^ attn_state)) {
> 
> Do you want && or | here, instead of & ?
> 
>> +		BNX2X_ERR("bad attention state\n");
>> +	}
> 
>> +}
>> +
>> +static inline int bnx2x_has_work(struct bnx2x_fastpath *fp)
> 
> Probably better not inline.
this is also fastpath, I will test to see if removing the inline impacts 
performance.

> 
>> +{
>> +	u16 rx_cons_sb = *fp->rx_cons_sb;
>> +
>> +	if ((rx_cons_sb & MAX_RX_DESC_CNT) == MAX_RX_DESC_CNT)
>> +		rx_cons_sb++;
>> +
>> +	if ((rx_cons_sb != fp->rx_comp_cons) ||
>> +	    (*fp->tx_cons_sb != fp->tx_pkt_cons))
>> +		return 1;
>> +
>> +	return 0;
>> +}
> 
>> +static irqreturn_t bnx2x_msix_sp_int(int irq, void *dev_instance)
>> +{
>> +	struct net_device *dev = dev_instance;
> 
> You need to check if dev==NULL and bail out.
> Another driver sharing the IRQ with this might choose to pass the dev
> pointer as NULL.
This is the MSI-X slowpath interrupt handler, it does not share it's 
interrupt.
Same for the MSI-X fastpath handler.

(snipping some lines)

>> +static irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
>> +{
>> +	struct net_device *dev = dev_instance;
> 
> Check if dev==NULL
OK

> 
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	u16 status = bnx2x_ack_int(bp);
>> +
>> +	if (unlikely(status == 0)) {
> 
> That's not unlikely.
We are optimizing for the most common case.
I think we should even think about adding a warn_once if we find out our 
line is shared since this is an extremely sub optimal configuration.
> 
>> +		DP(NETIF_MSG_INTR, "not our interrupt!\n");
>> +		return IRQ_NONE;
>> +	}
>> +
>> +	DP(NETIF_MSG_INTR, "got an interrupt status is %u\n", status);
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return IRQ_HANDLED;
>> +	}
>> +#endif
>> +
>> +	/* Return here if interrupt is shared and is disabled. */
>> +	if (unlikely(atomic_read(&bp->intr_sem) != 0)) {
>> +		DP(NETIF_MSG_INTR, "called but int_sem not 0, returning. \n");
>> +		return IRQ_HANDLED;
>> +	}
>> +
>> +	if (status & 0x2) {
>> +
>> +		struct bnx2x_fastpath *fp = &bp->fp[0];
>> +
>> +		prefetch(fp->rx_cons_sb);
>> +		prefetch(fp->tx_cons_sb);
>> +		prefetch(&fp->status_blk->c_status_block.status_block_index);
>> +		prefetch(&fp->status_blk->u_status_block.status_block_index);
>> +
>> +		netif_rx_schedule(dev);
>> +
>> +		status &= ~0x2;
>> +
>> +		if (!status) {
>> +			return IRQ_HANDLED;
>> +		}
>> +	}
>> +
>> +	if (unlikely(status & 0x1)) {
>> +
>> +		tasklet_schedule(&bp->sp_task);
>> +		status &= ~0x1;
>> +		if (!status) {
>> +			return IRQ_HANDLED;
>> +		}
>> +	}
>> +
>> +	DP(NETIF_MSG_INTR, "got an unknown interrupt! (status is %u)\n",
>> +	   status);
>> +
>> +	return IRQ_HANDLED;
>> +}
> 
>> +/* some of the internal memories are not directly readable from the driver
>> +   to test them we send debug packets
>> + */
>> +static int bnx2x_int_mem_test(struct bnx2x *bp)
>> +{
>> +	u32 val;
>> +	u32 count;
>> +	u8 hw;
>> +	u8 port;
>> +	u8 i;
>> +	u32 factor;
>> +
>> +	switch (CHIP_REV(bp)) {
>> +	case CHIP_REV_EMUL:
>> +		hw = INIT_EMULATION;
>> +		factor = 2000;
>> +		break;
>> +	case CHIP_REV_FPGA:
>> +		hw = INIT_FPGA;
>> +		factor = 120;
>> +		break;
>> +	default:
>> +		hw = INIT_ASIC;
>> +		factor = 1;
>> +		break;
>> +	}
>> +
>> +	port = PORT0 << bp->port;
>> +
>> +	DP(NETIF_MSG_HW, "mem_wrk start part1\n");
>> +
>> +	/* Disable inputs of parser neighbor blocks */
>> +	REG_WR(bp, GRCBASE_TSDM, TSDM_REGISTERS_ENABLE_IN1, 0x0);
>> +	REG_WR(bp, GRCBASE_TCM, TCM_REGISTERS_PRS_IFEN, 0x0);
>> +	REG_WR(bp, GRCBASE_CFC, CFC_REGISTERS_DEBUG0, 0x1);
>> +	NIG_WR(NIG_REGISTERS_PRS_REQ_IN_EN, 0x0);
>> +
>> +	/*  Write 0 to parser credits for CFC search request */
>> +	REG_WR(bp, GRCBASE_PRS, PRS_REGISTERS_CFC_SEARCH_INITIAL_CREDIT, 0x0);
>> +
>> +	/* send Ethernet packet */
>> +	bnx2x_lb_pckt(bp);
>> +
>> +	/* TODO do i reset NIG statistic ?*/
>> +	/* Wait until NIG register shows 1 packet of size 0x10 */
>> +	count = 1000;
>> +	while (count) {
>> +		val = REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET);
>> +		REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET + 4);
>> +
>> +		if (val == 0x10) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> That would be msleep(20000) for CHIP_REV_EMUL.
> besides the sleep being pretty big (does it really need to wait
> 20 sec?), some arches don't support such a big msleep value.
> use ssleep(). It would take 5 and a half hours for the
> timeout count to hit in.
The timeout is too long, will change the factor.
(However CHIP_REV_EMUL means we are not running on a real chip, we are 
running on a simulated chip.)

> 
>> +		count--;
>> +	}
>> +	if (val != 0x10) {
>> +		BNX2X_ERR("NIG timeout val = 0x%x\n", val);
>> +		return -1;
>> +	}
>> +
>> +	/* Wait until PRS register shows 1 packet */
>> +	count = 1000;
>> +	while (count) {
>> +		val = REG_RD(bp, GRCBASE_PRS, PRS_REGISTERS_NUM_OF_PACKETS);
>> +
>> +		if (val == 0x1) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> Same here.
> 
>> +		count--;
>> +	}
>> +	if (val != 0x1) {
>> +		BNX2X_ERR("PRS timeout val = 0x%x\n", val);
>> +		return -2;
>> +	}
> 
>> +	/* Wait until NIG register shows 10 + 1
>> +	   packets of size 11*0x10 = 0xb0 */
>> +	count = 1000;
>> +	while (count) {
>> +
>> +		val = REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET);
>> +		REG_RD(bp, GRCBASE_NIG, NIG_REGISTERS_STAT2_BRB_OCTET + 4);
>> +
>> +		if (val == 0xb0) {
>> +			break;
>> +		}
>> +		msleep(10 * factor);
> 
> Same here.
> 
>> +		count--;
>> +	}
>> +	if (val != 0xb0) {
>> +		BNX2X_ERR("NIG timeout val = 0x%x\n", val);
>> +		return -3;
>> +	}
>> +
>> +	/* Wait until PRS register shows 2 packet */
>> +	val = REG_RD(bp, GRCBASE_PRS, PRS_REGISTERS_NUM_OF_PACKETS);
>> +
>> +	if (val != 0x2) {
>> +		BNX2X_ERR("PRS timeout val = 0x%x\n", val);
>> +	}
>> +
>> +	/* Write 1 to parser credits for CFC search request */
>> +	REG_WR(bp, GRCBASE_PRS, PRS_REGISTERS_CFC_SEARCH_INITIAL_CREDIT, 0x1);
>> +
>> +	/* Wait until PRS register shows 3 packet */
>> +	msleep(10 * factor);
> 
> Same.
> 
>> +	DP(NETIF_MSG_HW, "done\n");
>> +	return 0; /* OK */
>> +}
> 
>> +static void bnx2x_stop_stats(struct bnx2x *bp)
>> +{
>> +	atomic_set(&bp->stats_state, STATS_STATE_STOP);
>> +	DP(BNX2X_MSG_STATS, "stats_state - STOP\n");
>> +	while (atomic_read(&bp->stats_state) != STATS_STATE_DISABLE) {
> 
> This seems to need a memory barrier (on this side _and_ on the write side,
> wherever that is (comment!)).
> There are special mbs for atomic variables.
This does not need to be an atomic_t will change to int.

> 
>> +		msleep(100);
>> +	}
>> +	DP(BNX2X_MSG_STATS, "stats_state - DISABLE\n");
>> +}
> 
>> +static void bnx2x_update_stats(struct bnx2x *bp)
>> +{
>> +
>> +	if (!bnx2x_update_storm_stats(bp)) {
>> +
>> +		if (bp->phy_flags & PHY_BMAC_FLAG) {
>> +			bnx2x_update_bmac_stats(bp);
>> +
>> +		} else if (bp->phy_flags & PHY_EMAC_FLAG) {
>> +			bnx2x_update_emac_stats(bp);
>> +
>> +		} else { /* unreached */
>> +			BNX2X_ERR("no MAC active\n");
>> +			return;
>> +		}
>> +
>> +		bnx2x_update_net_stats(bp);
>> +	}
>> +
>> +	if (bp->msglevel & NETIF_MSG_TIMER) {
>> +		printk(KERN_DEBUG "%s:\n"
>> +		       KERN_DEBUG "tx avail(%x)   rx usage(%x)\n"
>> +		       KERN_DEBUG "tx pkt (%lx)   rx pkt(%lx)\n"
>> +		       KERN_DEBUG "tx hc idx(%x)  rx hc index(%x)\n"
>> +		       KERN_DEBUG "%s (Xoff events %u)\n"
>> +		       KERN_DEBUG "brb drops                     %u\n"
>> +		       KERN_DEBUG "tstats->no_buff_discard,      %u\n"
>> +		       KERN_DEBUG "tstats->errors_discard,       %u\n"
>> +		       KERN_DEBUG "tstats->mac_filter_discard,   %u\n"
>> +		       KERN_DEBUG "tstats->xxoverflow_discard,   %u\n"
>> +		       KERN_DEBUG "tstats->ttl0_discard,         %u\n",
>> +		       bp->dev->name,
>> +		       bnx2x_tx_avail(bp->fp),
>> +		       (u16)(*bp->fp->rx_cons_sb - bp->fp->rx_comp_cons),
>> +		       bp->net_stats.tx_packets, bp->net_stats.rx_packets,
>> +		       *bp->fp->tx_cons_sb, *bp->fp->rx_cons_sb,
>> +		       netif_queue_stopped(bp->dev)? "Xoff" : "Xon ",
>> +		       bp->slowpath->eth_stats.driver_xoff,
>> +		       bp->slowpath->eth_stats.brb_discard,
>> +		       bp->slowpath->eth_stats.no_buff_discard,
>> +		       bp->slowpath->eth_stats.errors_discard,
>> +		       bp->slowpath->eth_stats.mac_filter_discard,
>> +		       bp->slowpath->eth_stats.xxoverflow_discard,
>> +		       bp->slowpath->eth_stats.ttl0_discard);
>> +	}
>> +
>> +	if (bp->state != BNX2X_STATE_OPEN) {
>> +		DP(BNX2X_MSG_STATS, "state is %x, returning\n", bp->state);
>> +		return;
>> +	}
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return;
>> +	}
>> +#endif
>> +
>> +	if (bp->fw_mb) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C13 :
>> +				    DMAE_REGISTERS_GO_C12), 1);
>> +	}
>> +
>> +	if (atomic_read(&bp->stats_state) != STATS_STATE_ENABLE) {
>> +		atomic_set(&bp->stats_state, STATS_STATE_DISABLE);
> 
> Is that the write side? If yes, you need a mb here, too.
> And a comment.
>
>> +		return;
>> +	}
>> +
>> +	if (bp->phy_flags & PHY_BMAC_FLAG) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C6 :
>> +				    DMAE_REGISTERS_GO_C0), 1);
>> +
>> +	} else if (bp->phy_flags & PHY_EMAC_FLAG) {
>> +		REG_WR32(bp, GRCBASE_DMAE,
>> +			 (bp->port? DMAE_REGISTERS_GO_C8 :
>> +				    DMAE_REGISTERS_GO_C2), 1);
>> +	}
>> +
>> +	if (bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_STAT_QUERY, 0, 0, 0, 0) == 0) {
>> +		/* stats ramrod has it's own slot on the spe */
>> +		bp->spq_left++;
>> +		bp->stat_pending = 1;
>> +	}
>> +}
> 
>> +/* Called with netif_tx_lock.
>> + * bnx2x_tx_int() runs without netif_tx_lock unless it needs to call
>> + * netif_wake_queue().
>> + */
>> +static int bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	dma_addr_t mapping;
>> +	struct eth_tx_bd *txbd;
>> +	struct eth_tx_parse_bd *pbd = NULL;
>> +	struct sw_tx_bd *tx_buf;
>> +	u16 pkt_prod, bd_prod;
>> +	int nbd, fp_index = 0;
>> +	struct bnx2x_fastpath *fp;
>> +
>> +	fp = &bp->fp[fp_index];
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	if (unlikely(bp->panic)) {
>> +		return NETDEV_TX_BUSY;
>> +	}
>> +#endif
>> +
>> +	if (unlikely(bnx2x_tx_avail(bp->fp) <
>> +		     (skb_shinfo(skb)->nr_frags + 3))) {
>> +		bp->slowpath->eth_stats.driver_xoff++,
>> +		netif_stop_queue(dev);
>> +		BNX2X_ERR("BUG! Tx ring full when queue awake!\n");
>> +		return NETDEV_TX_BUSY;
>> +	}
>> +
>> +	/*
>> +	This is a bit ugly. First we use one BD which we mark as start,
>> +	then for TSO or xsum we have a parsing info BD,
>> +	and only then we have the rest of the TSO bds.
>> +	(don't forget to mark the last one as last,
>> +	and to unmap only AFTER you write to the BD ...)
>> +	I would like to thank Dov Hirshfeld for this mess.
>> +	*/
>> +
>> +	pkt_prod = fp->tx_pkt_prod++;
>> +	bd_prod = fp->tx_bd_prod;
>> +
>> +	pkt_prod = TX_BD(pkt_prod);
>> +	bd_prod = TX_BD(bd_prod);
>> +
>> +	/* get a tx_buff and first bd */
>> +	tx_buf = &fp->tx_buf_ring[pkt_prod];
>> +	txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +	txbd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_START_BD;
>> +
>> +	/* remeber the first bd of the packet */
>> +	tx_buf->first_bd = bd_prod;
>> +
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "sending pkt=%u @%p, next_idx=%u, bd=%u @%p\n",
>> +	   pkt_prod, tx_buf, fp->tx_pkt_prod, bd_prod, txbd);
>> +
>> +	if (skb->ip_summed == CHECKSUM_PARTIAL) {
>> +		u8 len;
>> +		struct iphdr *iph = ip_hdr(skb);
>> +
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_IP_CSUM ;
>> +
>> +		/* turn on parsing and get a bd */
>> +		bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +		pbd = (void *)&fp->tx_desc_ring[bd_prod];
>> +		
>> +		len = ((u8 *)ip_hdr(skb) - (u8 *)skb->data)/2;
>> +
>> +		/* for now NS flag is not used in Linux */
>> +		pbd->global_data =
>> +			len | ((skb->protocol == ETH_P_8021Q)
>> +			       << ETH_TX_PARSE_BD_LLC_SNAP_EN_SHIFT);
>> +
>> +		pbd->ip_hlen = ip_hdrlen(skb)/2;
>> +		pbd->total_hlen = len + pbd->ip_hlen;
>> +
>> +		if (iph->protocol == IPPROTO_TCP) {
>> +			struct tcphdr *th = tcp_hdr(skb);
>> +			txbd->bd_flags.as_bitfield |=
>> +				ETH_TX_BD_FLAGS_TCP_CSUM;
>> +			pbd->tcp_flags = htonl(tcp_flag_word(skb))&0xFFFF;
>> +			pbd->total_hlen += tcp_hdrlen(skb)/2;
>> +
>> +			pbd->tcp_pseudo_csum = ntohs(th->check);
>> +		} else if (iph->protocol == IPPROTO_UDP) {
>> +			struct udphdr *uh = udp_hdr(skb);
>> +			txbd->bd_flags.as_bitfield |=
>> +				ETH_TX_BD_FLAGS_TCP_CSUM;
>> +			pbd->total_hlen += 4;
>> +			pbd->global_data |= ETH_TX_PARSE_BD_UDP_FLG;
>> +
>> +			/* HW bug: we need to subtract 10 bytes before the
>> +			 * UDP header from the csum
>> +			 */
>> +			uh->check = (u16) ~csum_fold(csum_sub(uh->check,
>> +				csum_partial(((u8 *)(uh)-10), 10, 0)));
>> +		}
>> +	}
>> +
>> +	if (bp->vlgrp != 0 && vlan_tx_tag_present(skb)) {
>> +		txbd->vlan = vlan_tx_tag_get(skb);
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_VLAN_TAG;
>> +	}
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	else {
>> +		txbd->vlan = pkt_prod;
>> +	}
>> +#endif
>> +
>> +	mapping = pci_map_single(bp->pdev, skb->data,
>> +				 skb->len, PCI_DMA_TODEVICE);
> 
> dma_mapping_error()
OK

> 
>> +
>> +	txbd->addr_hi = U64_HI(mapping);
>> +	txbd->addr_lo = U64_LO(mapping);
>> +	txbd->hdr_nbds = 1;
>> +	txbd->nbd = nbd = skb_shinfo(skb)->nr_frags + ((pbd == NULL)? 1 : 2);
>> +	txbd->nbytes = skb_headlen(skb);
>> +
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "first bd @%p, addr(%x:%x) hdr_nbds(%d) nbd(%d)"
>> +	   " nbytes(%d) flags(%x) vlan(%u)\n",
>> +	   txbd, txbd->addr_hi, txbd->addr_lo, txbd->hdr_nbds, txbd->nbd,
>> +	   txbd->nbytes, txbd->bd_flags.as_bitfield, txbd->vlan);
>> +
>> +	if ((skb_shinfo(skb)->gso_size) &&
>> +	    (skb->len > (bp->dev->mtu + ETH_HLEN))) {
>> +
>> +		int hlen = 2*pbd->total_hlen;
>> +	DP(NETIF_MSG_TX_QUEUED,
>> +	   "TSO packet len %d, hlen %d, total len %d, tso size %d\n",
>> +	   skb->len, hlen, skb_headlen(skb), skb_shinfo(skb)->gso_size);
>> +
>> +		txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_SW_LSO;
>> +
>> +		if (txbd->nbytes > hlen) {
>> +		/* we split the first bd into headers and data bds
>> +		 * to ease the pain of our fellow micocode engineers
>> +		 * we use one mapping for both bds
>> +		 * So far this has only been observed to happen
>> +		 * in Other Operating Systems(TM)
>> +		 */
>> +
>> +			/* first fix first bd */
>> +			nbd++;
>> +			txbd->nbd++;
>> +			txbd->nbytes = hlen;
>> +
>> +			/* we only print this as an error
>> +			 * because we don't think this will ever happen.
>> +			 */
>> +			BNX2X_ERR("TSO split headr size is %d (%x:%x)"
>> +				  " nbd %d\n", txbd->nbytes, txbd->addr_hi,
>> +				  txbd->addr_lo, txbd->nbd);
>> +
>> +			/* now get a new data bd
>> +			 * (after the pbd) and fill it */
>> +			bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +			txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +			txbd->vlan = pkt_prod;
>> +#endif
>> +			txbd->addr_hi = U64_HI(mapping);
>> +			txbd->addr_lo = U64_LO(mapping) + hlen;
>> +			txbd->nbytes = skb_headlen(skb) - hlen;
>> +			/* this marks the bd
>> +			 * as one that has no individual mapping
>> +			 * the FW ignors this flag in a bd not maked start
>> +			 */
>> +			txbd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_SW_LSO;
>> +			DP(NETIF_MSG_TX_QUEUED,
>> +			   "TSO split data size is %d (%x:%x)\n",
>> +			   txbd->nbytes, txbd->addr_hi, txbd->addr_lo);
>> +		}
>> +
>> +		if (!pbd) {
>> +			/* supposed to be unreached
>> +			 * (and therefore not handled properly...)
>> +			 */
>> +			BNX2X_ERR("LSO with no PBD\n");
>> +			BUG();
>> +		}
>> +
>> +		pbd->lso_mss = skb_shinfo(skb)->gso_size;
>> +		pbd->tcp_send_seq = ntohl(tcp_hdr(skb)->seq);
>> +		pbd->ip_id  = ntohs(ip_hdr(skb)->id);
>> +		pbd->tcp_pseudo_csum =
>> +			ntohs(~csum_tcpudp_magic(ip_hdr(skb)->saddr,
>> +						 ip_hdr(skb)->daddr,
>> +						 0, IPPROTO_TCP, 0));
>> +		pbd->global_data |= ETH_TX_PARSE_BD_PSEUDO_CS_WITHOUT_LEN;
>> +	}
>> +
>> +	{
>> +		int i;
>> +		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>> +			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>> +
>> +			bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +			txbd = &fp->tx_desc_ring[bd_prod];
>> +
>> +			mapping = pci_map_page(bp->pdev, frag->page,
>> +					       frag->page_offset,
>> +					       frag->size, PCI_DMA_TODEVICE);
>> +
>> +			txbd->addr_hi = U64_HI(mapping);
>> +			txbd->addr_lo = U64_LO(mapping);
>> +			txbd->nbytes = frag->size;
>> +			txbd->bd_flags.as_bitfield = 0;
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +			txbd->vlan = pkt_prod;
>> +#endif
>> +			DP(NETIF_MSG_TX_QUEUED, "frag (%d) bd @%p,"
>> +			   " addr(%x:%x) nbytes(%d) flags(%x)\n",
>> +			   i, txbd, txbd->addr_hi, txbd->addr_lo,
>> +			   txbd->nbytes, txbd->bd_flags.as_bitfield);
>> +
>> +		}  /* for */
>> +	}
>> +
>> +	/* now at last mark the bd as the last bd */
>> +	txbd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_END_BD;
>> +
>> +	DP(NETIF_MSG_TX_QUEUED, "last bd @%p flags(%x)\n",
>> +	   txbd, txbd->bd_flags.as_bitfield);
>> +
>> +	tx_buf->skb = skb;
>> +
>> +	bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
>> +
>> +	/* now send a tx doorbell, counting the next bd
>> +	 * if the packet contains or ends with it
>> +	 */
>> +	if (TX_BD_POFF(bd_prod) < nbd)
>> +		nbd++;
>> +
>> +	if (pbd) {
>> +		DP(NETIF_MSG_TX_QUEUED,
>> +		   "PBD @%p, ip_data=%x , ip_hlen %u, ip_id %u, lso_mss %u,"
>> +		   " tcp_flags %x, xsum %x, seq %u, hlen %u)\n",
>> +		   pbd, pbd->global_data, pbd->ip_hlen, pbd->ip_id,
>> +		   pbd->lso_mss, pbd->tcp_flags, pbd->tcp_pseudo_csum,
>> +		   pbd->tcp_send_seq, pbd->total_hlen);
>> +	}
>> +
>> +	DP(NETIF_MSG_TX_QUEUED, "doorbell: nbd=%u, bd=%d\n", nbd, bd_prod);
>> +
>> +	fp->hw_tx_prods->bds_prod += nbd;
>> +	fp->hw_tx_prods->packets_prod++;
>> +	DOORBELL(bp, fp_index, 0);
>> +
>> +	mmiowb();
>> +
>> +	fp->tx_bd_prod = bd_prod;
>> +	dev->trans_start = jiffies;
>> +
>> +	if (unlikely(bnx2x_tx_avail(fp) < MAX_SKB_FRAGS + 3)) {
>> +		netif_stop_queue(dev);
>> +		bp->slowpath->eth_stats.driver_xoff++;
>> +		if (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)
>> +			netif_wake_queue(dev);
>> +	}
>> +	fp->tx_pkt++;
>> +	return NETDEV_TX_OK;
>> +}
> 
>> +static int bnx2x_nic_unload(struct bnx2x *bp, int fre_irq)
>> +{
>> +	u32 reset_code = 0;
>> +	int rc /*, j */;
>> +	bp->state = BNX2X_STATE_CLOSING_WAIT4_HALT;
>> +
>> +	/* Calling flush_scheduled_work() may deadlock because
>> +	 * linkwatch_event() may be on the workqueue and it will try to get
>> +	 * the rtnl_lock which we are holding.
>> +	 */
>> +	while (bp->in_reset_task)
>> +		msleep(1);
> 
> Memory barrier?
Now that cancel_work is available we will use that instead of the 
in_rest_task flag.

> 
>> +
>> +	/* Delete the timer: do it before disabling interrupts, as it
>> +	   may be stil STAT_QUERY ramrod pending after stopping the timer */
>> +	del_timer_sync(&bp->timer);
>> +
>> +	/* Wait until stat ramrod returns and all SP tasks complete */
>> +	while (bp->stat_pending && (bp->spq_left != MAX_SPQ_PENDING)) {
> 
> Memory barrier?

OK
> 
>> +		msleep(1);
>> +	}
>> +
>> +	/* Stop fast path, disable MAC, disable interrupts */
>> +	bnx2x_netif_stop(bp);
> 
>> +	/* Remember cstorm producer in DSB to track it */
>> +	bp->dsb_prod_sp_idx = *bp->dsb_sp_prod;
>> +	/* Send CFC_DELETE ramrod */
>> +	bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_LEADING_CFC_DEL, 0, 0, 0, 1);
>> +	/* Wait for completion to arrive. Do nothing as we are going to reset
>> +	   the chip a few lines later */
>> +	while ( bp->dsb_prod_sp_idx == *bp->dsb_sp_prod ) {
> 
> What's that doing? Poking some DMA?
> Barrier needed? DMA sync needed?
sp_post send a command on the slowpath ring.
I will add a barrier inside it.

> 
>> +		msleep(1);
>> +	}
>> +
> 
>> +static void bnx2x_reset_task(struct work_struct *work)
>> +{
>> +	struct bnx2x *bp = container_of(work, struct bnx2x, reset_task);
>> +
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +	BNX2X_ERR("reset taks called but STOP_ON_ERROR defined"
>> +		  " so reset not done to allow debug dump,\n"
>> +		  " you will need to reboot when done\n");
>> +	return;
>> +#endif
>> +
>> +	if (!netif_running(bp->dev))
>> +		return;
>> +
>> +	bp->in_reset_task = 1;
>> +	bnx2x_netif_stop(bp);
>> +
>> +	bnx2x_nic_unload(bp, 0);
>> +	bnx2x_nic_load(bp, 0);
> 
> 
>> +	bp->in_reset_task = 0;
> 
> smp_wmb()?
will be removed.
> 
>> +}
> 
>> +static int bnx2x_phys_id(struct net_device *dev, u32 data)
>> +{
>> +	struct bnx2x *bp = netdev_priv(dev);
>> +	int i;
>> +
>> +	if (data == 0)
>> +		data = 2;
>> +
>> +	for (i = 0; i < (data * 2); i++) {
>> +		if ((i % 2) == 0) {
>> +			bnx2x_leds_set(bp, SPEED_1000);
>> +		} else {
>> +			bnx2x_leds_unset(bp);
>> +		}
>> +		msleep_interruptible(500);
> 
> Check the return value of msleep_interruptible and drop the
> following check for signals.
OK

> 
>> +		if (signal_pending(current))
>> +			break;
>> +	}
>> +
>> +	if (bp->link_up) {
>> +		bnx2x_leds_set(bp, bp->line_speed);
>> +	}
>> +	return 0;
>> +}
> 
>> +#ifdef BNX2X_STOP_ON_ERROR
>> +#warning stop on error defined
>> +#define bnx2x_panic() \
>> +	do { bp->panic = 1; \
>> +		BNX2X_ERR("driver assert\n"); \
>> +		bnx2x_disable_int(bp); \
>> +		bnx2x_panic_dump(bp); \
>> +	} while (0);
> 
> Drop the last semicolon
Opps... ;)

> 
>> +#else
>> +#define bnx2x_panic()
>> +#endif
> 
> 
> Puh, that was a lot :)
> 
> 




^ permalink raw reply

* [GIT PULL] sctp updates
From: Vlad Yasevich @ 2007-08-02 17:55 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, lksctp-developers

Hi David

Please pull the following changes since commit fc34f6c617bf2a845d793af12b96bcc0afd472c4:
 Andrew Morton (1):
       Fix up "remove the arm26 port"

which are found in branch 'master' of the git repository at:

 master.kernel.org:/pub/scm/linux/kernel/git/vxy/lksctp-dev.git

Dave Johnson (1):
      SCTP: IPv4 mapped addr not returned in SCTPv6 accept()

Sebastian Siewior (2):
      sctp: try to fix readlock
      sctp: fix shadow symbol in net/sctp/tsnmap.c

Vlad Yasevich (1):
      SCTP: IPv4 mapped addr not returned in SCTPv6 accept()

Wei Yongjun (2):
      SCTP: drop SACK if ctsn is not less than the next tsn of assoc
      SCTP: remove useless code in function sctp_init_cause

sebastian@breakpoint.cc (3):
      sctp: make locally used function static
      sctp: move global declaration to header file.
      sctp: remove shadowed symbols

 include/net/sctp/sctp.h  |   10 ++++
 net/sctp/input.c         |    2 +-
 net/sctp/ipv6.c          |    2 +
 net/sctp/sm_make_chunk.c |    6 ---
 net/sctp/sm_statefuns.c  |  103 ++++++++++++++++++++++++++++++++++-----------
 net/sctp/socket.c        |   45 +++++++++++---------
 net/sctp/tsnmap.c        |   14 +++---
 7 files changed, 123 insertions(+), 59 deletions(-)

Thanks
-vlad



^ permalink raw reply

* Re: b44 compile error on !PCI
From: Michael Buesch @ 2007-08-02 17:17 UTC (permalink / raw)
  To: Meelis Roos; +Cc: netdev
In-Reply-To: <Pine.SOC.4.64.0708021943510.7589@math.ut.ee>

On Thursday 02 August 2007, Meelis Roos wrote:
> Tryng to compile todays git on SBus-only Sparc64 (Ultra 1), no PCI. b44 
> is selectable but fails to compile:
> 
>   CC [M]  drivers/net/b44.o
> drivers/net/b44.c: In function 'b44_sync_dma_desc_for_device':
> drivers/net/b44.c:134: error: implicit declaration of function 'dma_sync_single_range_for_device'
> drivers/net/b44.c: In function 'b44_sync_dma_desc_for_cpu':
> drivers/net/b44.c:144: error: implicit declaration of function 'dma_sync_single_range_for_cpu'

Are you sure this is related to !PCI ?
If I grep include/asm-sparc64/dma-mapping.h I don't
find dma_sync_single_range_for_*
So I'd rather call this a bug in the arch code of sparc64.

^ permalink raw reply

* Re: strange tcp behavior
From: Simon Arlott @ 2007-08-02 17:15 UTC (permalink / raw)
  To: john; +Cc: johnpol, netdev
In-Reply-To: <48738.simon.1186056932@5ec7c279.invalid>

On 02/08/07 13:15, Simon Arlott wrote:
> (Don't remove CC:s, don't top post)
>>> On Thu, August 2, 2007 11:16, Evgeniy Polyakov wrote:
>>>> On Thu, Aug 02, 2007 at 01:55:50PM +0400, Evgeniy Polyakov
>>>> (johnpol@2ka.mipt.ru) wrote:
>>>>> On Thu, Aug 02, 2007 at 09:19:06AM +0300, john@screen.lv
>>>>> (john@screen.lv) wrote:
>>>>> > 1186035057.207629    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [SYN]
>>>>> > Seq=0 Len=0
>>>>> > 1186035057.207632    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [SYN,
>>>>> ACK]
>>>>> > Seq=0 Ack=1 Win=32792 Len=0 MSS=16396
>>>>> > 1186035057.207666    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [ACK]
>>>>> > Seq=1 Ack=1 Win=1500 Len=0
>>>>> > 1186035057.207699    127.0.0.1 -> 127.0.0.1    SMTP Command: EHLO
>>>>> localhost
>>>>> > 1186035057.207718    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [ACK]
>>>>> > Seq=1 Ack=17 Win=32792 Len=0
>>>>> > 1186035057.207736    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [RST]
>>>>> > Seq=17 Len=0
>>>>> > 1186035057.223934    127.0.0.1 -> 127.0.0.1    TCP 33787 > 50000
>>>>> [RST,
>>>>> > ACK] Seq=0 Ack=0 Win=32792 Len=0
>>>>> >
>>>>> > Can someone please comment as to why, tcp  stack sends rst packet
>>>>> from the
>>>>> > wrong source port in this situation.

> I don't know where that extra RST is coming from.
> This test would be more convincing between two hosts, since your bizarre
> client is using raw sockets as root and could be doing anything.

Server 192.168.7.8 (2.6.23)
Client 192.168.7.4 (2.6.20)

17:33:45.326246 IP 192.168.7.4.50000 > 192.168.7.8.2500: S 1385353579:1385353579(0) win 1500
17:33:45.326418 IP 192.168.7.8.2500 > 192.168.7.4.50000: S 1388203102:1388203102(0) ack 1385353580 win 14360 <mss 7180>
17:33:45.348833 IP 192.168.7.4.50000 > 192.168.7.8.2500: . ack 1 win 1500
17:33:45.349977 IP 192.168.7.4.50000 > 192.168.7.8.2500: P 1:17(16) ack 1 win 1500
17:33:45.350117 IP 192.168.7.8.2500 > 192.168.7.4.50000: . ack 17 win 14360
17:33:45.351273 IP 192.168.7.4.50000 > 192.168.7.8.2500: R 1385353596:1385353596(0) win 1500
17:33:45.360878 IP 192.168.7.8.48186 > 192.168.7.4.50000: R 1388203103:1388203103(0) ack 1385353596 win 14360

Seems to be losing the source port information when it decides to send 
that final RST|ACK. It's going through the "TCPAbortOnClose" path:

tcp_close:
	-> tcp_set_state(sk, TCP_CLOSE)
		-> inet_put_port(&tcp_hashinfo, sk)
		Perhaps it's losing the port information here?
	-> tcp_send_active_reset(sk, GFP_KERNEL)

"TCP_CLOSE       socket is finished"
Should these two calls be the other way round?


Also, I don't think it should be sending a RST after the other side has 
sent one - the connection no longer exists so there is nothing on the 
other side to reset.

-- 
Simon Arlott

^ permalink raw reply

* Re: [patch] genirq: temporary fix for level-triggered IRQ resend
From: Gabriel C @ 2007-08-02 17:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Jarek Poplawski, Thomas Gleixner,
	Jean-Baptiste Vignaud, linux-kernel, shemminger, linux-net,
	netdev, Andrew Morton, Alan Cox, marcin.slusarz
In-Reply-To: <20070731155843.GA7033@elte.hu>

Ingo Molnar wrote:
> Linus,
> 
> with -rc2 approaching i think we should apply the minimal fix below to 
> get Marcin's ne2k-pci networking back in working order. The 
> WARN_ON_ONCE() will not prevent the system from working and it will be a 
> reminder.
> 
> a better workaround would be to inhibit the resent vector via the 
> IO-APIC irqchip - but i'd still like to have the patch below because the 
> ne2k driver _should_ be able to survive the spurious irq that happens. 
> (even on Marcin's system that ne2k-pci irq line is shared with another 
> networking card, so an irq could happen at any moment - it's just that 
> with the delayed-disable logic it happens _all the time_.)
> 

I get a warning on each boot now with this patch .. 

[   63.686613] WARNING: at kernel/irq/resend.c:70 check_irq_resend()
[   63.686636]  [<c013c55c>] check_irq_resend+0x8c/0xa0
[   63.686653]  [<c013c15f>] enable_irq+0xad/0xb3
[   63.686662]  [<e886481e>] vortex_timer+0x20c/0x3d5 [3c59x]
[   63.686675]  [<c01164b9>] scheduler_tick+0x154/0x273
[   63.686685]  [<c012fed1>] getnstimeofday+0x34/0xe3
[   63.686697]  [<c0121f4a>] run_timer_softirq+0x137/0x197
[   63.686709]  [<e8864612>] vortex_timer+0x0/0x3d5 [3c59x]
[   63.686720]  [<c011ed09>] __do_softirq+0x75/0xe1
[   63.686729]  [<c011edac>] do_softirq+0x37/0x3d
[   63.686735]  [<c011ef85>] irq_exit+0x7c/0x7e
[   63.686740]  [<c010e013>] smp_apic_timer_interrupt+0x59/0x84
[   63.686751]  [<c0103428>] apic_timer_interrupt+0x28/0x30
[   63.686759]  [<c0101355>] default_idle+0x0/0x3f
[   63.686767]  [<c0101385>] default_idle+0x30/0x3f
[   63.686773]  [<c0100c19>] cpu_idle+0x5e/0x8e
[   63.686779]  [<c03fdc5f>] start_kernel+0x2d7/0x368


That means ?:)


> 	Ingo
> 


Gabriel

^ permalink raw reply

* Re: TCP SACK issue, hung connection, tcpdump included
From: Darryl Miles @ 2007-08-02 16:58 UTC (permalink / raw)
  To: Ilpo Järvinen; +Cc: LKML, Netdev
In-Reply-To: <Pine.LNX.4.64.0708021056560.8788@kivilampi-30.cs.helsinki.fi>

Ilpo Järvinen wrote:
> On Tue, 31 Jul 2007, Darryl L. Miles wrote:
> 
>> I've been able to capture a tcpdump from both ends during the problem and its
>> my belief there is a bug in 2.6.20.1 (at the client side) in that it issues a
>> SACK option for an old sequence which the current window being advertised is
>> beyond it.  This is the most concerning issue as the integrity of the sequence
>> numbers doesn't seem right (to my limited understanding anyhow).
> 
> You probably didn't check the reference I explicitly gave to those who 
> are not familiar how DSACK works, just in case you didn't pick it up last 
> time, here it is again for you: RFC2883... 

I've now squinted the D-SACK RFC and understand a little about this,
however the RFC does make the claim "This extension is compatible with
current implementations of the SACK option in TCP.  That is, if one of
the TCP end-nodes does not implement this D-SACK extension and the other
TCP end-node does, we believe that this use of the D-SACK extension by
one of the end nodes will not introduce problems."

What if it turns out that is not true for a large enough number of SACK
implementations out there; in the timeframe that SACK was supported but
D-SACK was not supported.  Would it be possible to clearly catagorise an
implementation to be:

  * 100% SACK RFC compliant.  SACK works and by virtue of the mandatory
requirements written into the previous SACK RFCs then this
implementation would never see a problem with receiving D-SACK even
through the stack itself does not support D-SACK.

  * Mostly SACK RFC compliant.  SACK works but if it saw D-SACK it would
have a problems dealing with it, possibly resulting in fatal TCP
lockups.  Are there SACK implementation mandatory requirements in place
for to be able to clearly draw the line and state that the 2.6.9 SACK
implementation was not RFC compliant.

  * 100% SACK and D-DACK RFC compliant.  Such an implementation was 
written to support D-SACK on top of SACK.

So if there is a problem whos fault would it be:

  * The original SACK RFCs for not specifying a mandatory course of
action to take which D-SACK exploits.  Thus making the claim in RFC2883 
unsound.

  * The older linux kernel for not being 100% SACK RFC compliant in its
implementation ?  Not a lot we can do about this now, but if we're able
to identify there maybe backward compatibility issues with the same
implementation thats a useful point to take forward.

  * The newer linux kernel for enabling D-SACK by default when RFC2883
doesn't even claim a cast iron case for D-SACK to be compatible with any
100% RFC compliant SACK implementation.

Does TCP support the concept of vendor dependent options, that would be
TCP options which are in a special range that would both identify the
vendor and the vendors-specific option id.  Such a system would allow
Linux to implement a <D-SACK Ok> option, even if the RFC claims one is
not needed.  This would allow moving forward through this era until such
point in time when it was officially agreed it was just a linux problem 
or an RFC problem.  If its an RFC problem then IANA (or whoever) would 
issue a generic TCP option for it.

If the dump on this problem really does identify a risk/problem when as
its between 2 version of linux a vendor specific option also makes sense.

I don't really want to switch new useful stuff off by default (so it
never gets used), I'm all for experimentation but not to the point of
failure between default configurations of widely distributed version of 
the kernel.

So thats the technical approaches I can come up with to discuss.  Does
Ilpo have a particular vested interest in D-SACK that should be disclosed?

> However, if DSACKs really
> bother you still (though it shouldn't :-)), IIRC I also told you how
> you're able to turn it off (tcp_dsack sysctl) but I assure you that it's
> not a bug but feature called DSACK [RFC2883], there's _absolutely_ 
nothing
> wrong with it, instead, it would be wrong to _not_ send the below 
snd_una
> SACK in this scenario when tcp_dsack set to 1.

So it is necessary to turn off a TCP option (that is enabled by default)
to be sure to have reliable TCP connections (that don't lock up) in the
bugfree Linux networking stack ?  This is absurd.

If such an option causes such a problem; then that option should not be
enabled by default.  If however the problem is because of a bug then let
us continue to try to isolate the cause rather than wallpaper over the
cracks with the voodoo of turning things that are enabled by default off.

It only makes sense to turn options off when there is a 3rd party
involved (or other means beyond your control) which is affecting
function, the case here is that two Linux kernel stacks are affected and
no 3rd party device has been shown to be affecting function.

>> There is another concern of why the SERVER performed a retransmission in the
>> first place, when the tcpdump shows the ack covering it has been seen.
> 
> There are only three possible reasons to this thing:
> 1) The ACK didn't reach the SERVER (your logs prove this to not be the 
> case)
> 2) The ACK got discarded by the SERVER

I'd thought about that one, its a possibility.  The server in question
does have period of high UDP load on a fair number of UDP sockets at
once (a few 1000).  Both systems have 2Gb of RAM.  The server has maybe
just 250Mb of RSS of all apps combined.

> 3) The SERVER (not the client) is buggy and sends an incorrect 
> retransmission

This was my initial stab at the cause, simply due to the sequence like
this (from when the lockup starts) :

03:58:56.731637 IP (tos 0x10, ttl  64, id 53311, offset 0, flags [DF],
proto 6, length: 64) CLIENT.43726 > SERVER.ssh: . [tcp sum ok]
2634113543:2634113543(0) ack 617735600 win 501 <nop,nop,timestamp
819458962 16345815,nop,nop,sack sack 1 {617733440:617734888} >

03:58:57.322800 IP (tos 0x0, ttl  50, id 28689, offset 0, flags [DF],
proto 6, length: 1500) SERVER.ssh > CLIENT.43726: .
617733440:617734888(1448) ack 2634113543 win 2728 <nop,nop,timestamp
16346718 819458864>

The client sent a SACK.  But from understanding more about D-SACK, this
is a valid D-SACK response, but it appears to confuse the older Linux
kernel at the server end.

> ...So we have just two options remaining...
> 
>> I have made available the full dumps at:
>>
>> http://darrylmiles.org/snippets/lkml/20070731/
> 
> Thanks about these... Based on a quick check, it is rather clear that the 
> SERVER is for some reason discarding the packets it's receiving:
> 
> 04:11:26.833935 IP CLIENT.43726 > SERVER.ssh: P 4239:4287(48) ack 28176 win 501 <nop,nop,timestamp 819646456 16345815>
> 04:11:27.132425 IP SERVER.ssh > CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 <nop,nop,timestamp 17096579 819458864>
> 04:11:27.230081 IP CLIENT.43726 > SERVER.ssh: . ack 28176 win 501 <nop,nop,timestamp 819646555 16345815,nop,nop
> 
> Notice, (cumulative) ack field didn't advance though new data arrived, and 
> for the record, it's in advertised window too. There are no DSACK in here 
> so your theory about below snd_una SACK won't help to explain this one 
> at all... We'll just have to figure out why it's discarding it. And 
> there's even more to prove this...

Agreed on this.  However discarding data is allowed (providing it is
intentional discarding not a bug where the TCP stack is discarding 
segments it shouldn't), TCP should recover providing sufficient packets 
get through.

So the SNMP data would show up intentional discards (due to 
memory/resource issues).  So I'll get some of those too.

>> ...SNIPPED...MORE SIGNS OF UNEXPLAINED DISCARDING BY THE SERVER...
> 
> There was one bad checksum btw:
> 
>> 03:58:56.365662 IP (tos 0x10, ttl  64, id 28685, offset 0, flags [DF], 
>> proto 6, length: 764) SERVER.ssh > CLIENT.43726: P [bad tcp cksum 6662 
>> (->ef2b)!] 617734888:617735600(712) ack 2634113543 win 2728 
>> <nop,nop,timestamp 16345815 819458859>

I noticed this one too, but discarded the "[bad tcp cksum 6662
->ef2b)!]" as bogus on the basis of the following packet turning up at
the client:

03:58:56.422019 IP (tos 0x0, ttl  50, id 28685, offset 0, flags [DF],
proto 6, length: 764) SERVER.ssh > CLIENT.43726: P [tcp sum ok]
617734888:617735600(712) ack 2634113543 win 2728 <nop,nop,timestamp
16345815 819458859>

Forgive me if I am mistaken, but while the server reports a checksum
error, the client did not.  I took this to be a misreporting by tcpdump
at the server, probably due to the "e1000" network card checksum
offloading (I'd guess this level of card does offloading, I've never 
audited the driver before).  If you search both dumps for the timestamps 
"16345815 819458859" two packets were sent by the server and two 
received by the server with those timestamps.

>> There are some changes in 2.6.22 that appear to affect TCP SACK handling
>> does this fix a known issue ?
> 
> There is no such "known issue" :-)... This issue has nothing to do with 
> TCP SACK handling, since that code _won't_ be reached... We could verify 
> that from the timestamps. But if you still insist that SACK under snd_una 
> is the issue, please turn tcp_dsack to 0 on the CLIENT, you will not get 
> them after that and you can be happy as your out-of-window SACK "issue"
> is then fixed :-)... 

I had thrown up one interpretation of events for others to comment, so
thanks for your comments and viewpoint on the issue.

> ...Seriously, somebody else than me is probably better in suggesting what 
> could cause the discarding at the SERVER in this case. SNMP stuff Dave was 
> asking could help, you can find them from /proc/net/{netstat,snmp}...

The SNMP stats aren't so useful right now as
the box has been rebooted since then but I shall attempt to capture
/proc/net/* data, cause the problem, then capture /proc/net/* data again
if those numbers can help.

Darryl

^ permalink raw reply

* b44 compile error on !PCI
From: Meelis Roos @ 2007-08-02 16:45 UTC (permalink / raw)
  To: netdev

Tryng to compile todays git on SBus-only Sparc64 (Ultra 1), no PCI. b44 
is selectable but fails to compile:

  CC [M]  drivers/net/b44.o
drivers/net/b44.c: In function 'b44_sync_dma_desc_for_device':
drivers/net/b44.c:134: error: implicit declaration of function 'dma_sync_single_range_for_device'
drivers/net/b44.c: In function 'b44_sync_dma_desc_for_cpu':
drivers/net/b44.c:144: error: implicit declaration of function 'dma_sync_single_range_for_cpu'

-- 
Meelis Roos (mroos@linux.ee)

^ permalink raw reply

* Re: [PATCH] Merge the Sonics Silicon Backplane subsystem
From: Michael Buesch @ 2007-08-02 16:18 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Andrew Morton, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA, linux-kernel,
	Gary Zambrano, bcm43xx-dev-0fE9KPoRgkgATYTw5x5z8w
In-Reply-To: <Pine.LNX.4.64.0708021812260.5150@anakin>

On Thursday 02 August 2007, Geert Uytterhoeven wrote:
> On Thu, 2 Aug 2007, Michael Buesch wrote:
> > On Thursday 02 August 2007, Geert Uytterhoeven wrote:
> > > On Fri, 27 Jul 2007, Michael Buesch wrote:
> > > > The Sonics Silicon Backplane is a mini-bus used on
> > > > various Broadcom chips and embedded devices.
> > > > Devices using the SSB include b44, bcm43xx and various
> > > > Broadcom based wireless routers.
> > > > A b44 and bcm43xx port and a SSB based OHCI driver is available.
> > > 
> > > > --- a/drivers/Kconfig
> > > > +++ b/drivers/Kconfig
> > > > @@ -58,6 +58,8 @@ source "drivers/power/Kconfig"
> > > >  
> > > >  source "drivers/hwmon/Kconfig"
> > > >  
> > > > +source "drivers/ssb/Kconfig"
> > > > +
> > > >  source "drivers/mfd/Kconfig"
> > > >  
> > > >  source "drivers/media/Kconfig"
> > > 
> > > > --- /dev/null
> > > > +++ b/drivers/ssb/Kconfig
> > > > @@ -0,0 +1,92 @@
> > > > +menu "Sonics Silicon Backplane"
> > > > +
> > > > +config SSB
> > > > +	tristate "Sonics Silicon Backplane support"
> > > > +	depends on EXPERIMENTAL
> > > 
> > > Hence this will show up on all platforms?
> > 
> > So?
> 
> Shouldn't you add a dependency for platforms where it make sense to have SSB?

Well, that's everything where you can stick a PCI, PCMCIA, PC-Card or CF-Card
into, plus the MIPS platform, where we have the embedded SSB.
That's basically everything, no? Except these strange !HAS_IOMEM platforms,
which we already take care of in a followup patch.

^ permalink raw reply

* Re: [PATCH] Merge the Sonics Silicon Backplane subsystem
From: Geert Uytterhoeven @ 2007-08-02 16:12 UTC (permalink / raw)
  To: Michael Buesch
  Cc: Andrew Morton, linux-kernel, bcm43xx-dev, netdev, linux-wireless,
	Gary Zambrano
In-Reply-To: <200708021624.59701.mb@bu3sch.de>

On Thu, 2 Aug 2007, Michael Buesch wrote:
> On Thursday 02 August 2007, Geert Uytterhoeven wrote:
> > On Fri, 27 Jul 2007, Michael Buesch wrote:
> > > The Sonics Silicon Backplane is a mini-bus used on
> > > various Broadcom chips and embedded devices.
> > > Devices using the SSB include b44, bcm43xx and various
> > > Broadcom based wireless routers.
> > > A b44 and bcm43xx port and a SSB based OHCI driver is available.
> > 
> > > --- a/drivers/Kconfig
> > > +++ b/drivers/Kconfig
> > > @@ -58,6 +58,8 @@ source "drivers/power/Kconfig"
> > >  
> > >  source "drivers/hwmon/Kconfig"
> > >  
> > > +source "drivers/ssb/Kconfig"
> > > +
> > >  source "drivers/mfd/Kconfig"
> > >  
> > >  source "drivers/media/Kconfig"
> > 
> > > --- /dev/null
> > > +++ b/drivers/ssb/Kconfig
> > > @@ -0,0 +1,92 @@
> > > +menu "Sonics Silicon Backplane"
> > > +
> > > +config SSB
> > > +	tristate "Sonics Silicon Backplane support"
> > > +	depends on EXPERIMENTAL
> > 
> > Hence this will show up on all platforms?
> 
> So?

Shouldn't you add a dependency for platforms where it make sense to have SSB?

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
							    -- Linus Torvalds

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Matt Mackall @ 2007-08-02 15:59 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Gabriel C, Andrew Morton, linux-kernel, netdev, jason.wessel,
	amitkale, Sam Ravnborg
In-Reply-To: <20070802090008.GA1699@ff.dom.local>

On Thu, Aug 02, 2007 at 11:00:08AM +0200, Jarek Poplawski wrote:
> On Wed, Aug 01, 2007 at 09:02:19PM -0500, Matt Mackall wrote:
> > On Wed, Aug 01, 2007 at 11:59:21AM +0200, Jarek Poplawski wrote:
> > > On Tue, Jul 31, 2007 at 05:05:00PM +0200, Gabriel C wrote:
> > > > Jarek Poplawski wrote:
> > > > > On Tue, Jul 31, 2007 at 12:14:36PM +0200, Gabriel C wrote:
> > > > >> Jarek Poplawski wrote:
> > > > >>> On 28-07-2007 20:42, Gabriel C wrote:
> > > > >>>> Andrew Morton wrote:
> > > > >>>>> On Sat, 28 Jul 2007 17:44:45 +0200 Gabriel C <nix.or.die@googlemail.com> wrote:
> ...
> > > > >>>>>> net/core/netpoll.c:155: error: 'struct net_device' has no member named 'poll_controller'
> ...
> > > > >>>>>> I think is because KGDBOE selects just NETPOLL.
> > > > >>>>>>
> > > > >>>>> Looks like it.
> > > > >>>>>
> > > > >>>>> Select went and selected NETPOLL and NETPOLL_TRAP but things like
> > > > >>>>> CONFIG_NETDEVICES and CONFIG_NET_POLL_CONTROLLER remain unset.  `select'
> > > > >>>>> remains evil.
> ...
> > > seems to select NET_POLL_CONTROLLER after selecting NETPOLL, but
> > > still doesn't check for NETDEVICES dependency.
> > 
> > That's odd. Adding Sam to the cc:.
> 
> Looks right, but after reading Andrew's opinion about select I'd be
> astonished if he doesn't know this problem already.
> 
> > 
> > > > >> Now KGDBOE just selects NETPOLL and NETPOLL_TRAP.
> > > > >> Adding 'select CONFIG_NET_POLL_CONTROLLER' let kgdboe compiles but the question is does it work without any ethernet card ?
> > > > > 
> > > > > Why kgdboe should care what netpoll needs? So, I hope, you are adding
> > > > > this select under config NETPOLL. On the other hand, if NETPOLL should
> > > > > depend on NET_POLL_CONTROLLER there is probably no reason to have them
> > > > > both.
> > > > 
> > > > NET_POLL_CONTROLLER has def_bool NETPOLL if NETDEVICES .
> > > > 
> > > > Net peoples ping ?:)
> > 
> > How about cc:ing the netpoll maintainer?
> 
> Is there a new one or do you suggest possibility of abusing the
> authority of the netpoll's author with such trifles...?!

I'm just subtly suggesting that if you're going to have a discussion
about netpoll, you ought to cc: me.

> There are some notions about "other diagnostic tools" in some
> net drivers, eg. 3c509.c, so there would be a little bit of
> work if, after changing this, they really exist (and even if
> not - maybe it's reasonable to save such possibility for the
> future?).

I created it for netpoll, only netpoll clients have ever cared.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply

* Re: [PATCH] SCTP: drop SACK if ctsn is not less than the next tsn of assoc
From: Vlad Yasevich @ 2007-08-02 14:40 UTC (permalink / raw)
  To: Wei Yongjun; +Cc: netdev, Neil Horman, lksctp-developers, Sridhar Samudrala
In-Reply-To: <46B19C88.8070104@cn.fujitsu.com>

Wei Yongjun wrote:
> Patch has been modified base on comment.
> Thanks.
> 
> Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
> 

Ok, I've applied this patch, but in the future, please
generate patches so that they can be applied
with a -p1 flag.

Please see Documentation/SubmittingPatches for proper
format.

Thanks
-vlad

^ permalink raw reply

* Re: [PATCH] Merge the Sonics Silicon Backplane subsystem
From: Michael Buesch @ 2007-08-02 14:24 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Andrew Morton, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA, linux-kernel,
	Gary Zambrano, bcm43xx-dev-0fE9KPoRgkgATYTw5x5z8w
In-Reply-To: <Pine.LNX.4.64.0708021557280.5150@anakin>

On Thursday 02 August 2007, Geert Uytterhoeven wrote:
> On Fri, 27 Jul 2007, Michael Buesch wrote:
> > The Sonics Silicon Backplane is a mini-bus used on
> > various Broadcom chips and embedded devices.
> > Devices using the SSB include b44, bcm43xx and various
> > Broadcom based wireless routers.
> > A b44 and bcm43xx port and a SSB based OHCI driver is available.
> 
> > --- a/drivers/Kconfig
> > +++ b/drivers/Kconfig
> > @@ -58,6 +58,8 @@ source "drivers/power/Kconfig"
> >  
> >  source "drivers/hwmon/Kconfig"
> >  
> > +source "drivers/ssb/Kconfig"
> > +
> >  source "drivers/mfd/Kconfig"
> >  
> >  source "drivers/media/Kconfig"
> 
> > --- /dev/null
> > +++ b/drivers/ssb/Kconfig
> > @@ -0,0 +1,92 @@
> > +menu "Sonics Silicon Backplane"
> > +
> > +config SSB
> > +	tristate "Sonics Silicon Backplane support"
> > +	depends on EXPERIMENTAL
> 
> Hence this will show up on all platforms?

So?

^ permalink raw reply

* Re: [PATCH] Merge the Sonics Silicon Backplane subsystem
From: Geert Uytterhoeven @ 2007-08-02 13:58 UTC (permalink / raw)
  To: Michael Buesch
  Cc: Andrew Morton, linux-kernel, bcm43xx-dev, netdev, linux-wireless,
	Gary Zambrano
In-Reply-To: <200707271857.24162.mb@bu3sch.de>

On Fri, 27 Jul 2007, Michael Buesch wrote:
> The Sonics Silicon Backplane is a mini-bus used on
> various Broadcom chips and embedded devices.
> Devices using the SSB include b44, bcm43xx and various
> Broadcom based wireless routers.
> A b44 and bcm43xx port and a SSB based OHCI driver is available.

> --- a/drivers/Kconfig
> +++ b/drivers/Kconfig
> @@ -58,6 +58,8 @@ source "drivers/power/Kconfig"
>  
>  source "drivers/hwmon/Kconfig"
>  
> +source "drivers/ssb/Kconfig"
> +
>  source "drivers/mfd/Kconfig"
>  
>  source "drivers/media/Kconfig"

> --- /dev/null
> +++ b/drivers/ssb/Kconfig
> @@ -0,0 +1,92 @@
> +menu "Sonics Silicon Backplane"
> +
> +config SSB
> +	tristate "Sonics Silicon Backplane support"
> +	depends on EXPERIMENTAL

Hence this will show up on all platforms?

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
							    -- Linus Torvalds

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Jarek Poplawski @ 2007-08-02 12:52 UTC (permalink / raw)
  To: Satyam Sharma
  Cc: Sam Ravnborg, Matt Mackall, Gabriel C, Andrew Morton,
	linux-kernel, netdev, jason.wessel, amitkale
In-Reply-To: <alpine.LFD.0.999.0708021721230.8258@enigma.security.iitk.ac.in>

On Thu, Aug 02, 2007 at 05:26:12PM +0530, Satyam Sharma wrote:
...
> Whoops, I only said that in humour, probably should've snuck in a
> smiley or two. Definitely not blaming anybody. Apologies to anyone
> who felt offended, sorry, nothing such was intended, I assure.

I see you probably didn't notice my smileys too. I need them so often
that I've to abbreviate them with something like this: ",.?!"
But, I'm also sorry if you felt confused I felt offended etc...

Jarek P.

^ permalink raw reply

* Re: strange tcp behavior
From: Evgeniy Polyakov @ 2007-08-02 12:28 UTC (permalink / raw)
  To: Simon Arlott; +Cc: john, netdev
In-Reply-To: <20070802120452.GB9975@2ka.mipt.ru>

On Thu, Aug 02, 2007 at 04:04:53PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> On Thu, Aug 02, 2007 at 12:38:59PM +0100, Simon Arlott (simon@fire.lp0.eu) wrote:
> > I just got multiple RSTs instead of a connection too. The second RST looks
> > like it's from another connection - and a RST for a RST is wrong...
> 
> You should use iptables rule to block non-raw access:
> iptables -I INPUT -p tcp --dport 50000 -j DROP
> 
> but even in that case I got valid session.

Ok, I can now reproduce the problem.
I will try to debug it further.

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: strange tcp behavior
From: Simon Arlott @ 2007-08-02 12:15 UTC (permalink / raw)
  To: john; +Cc: johnpol, netdev
In-Reply-To: <46860.212.93.96.73.1186055105.squirrel@mail.screen.lv>

(Don't remove CC:s, don't top post)
>> On Thu, August 2, 2007 11:16, Evgeniy Polyakov wrote:
>>> On Thu, Aug 02, 2007 at 01:55:50PM +0400, Evgeniy Polyakov
>>> (johnpol@2ka.mipt.ru) wrote:
>>>> On Thu, Aug 02, 2007 at 09:19:06AM +0300, john@screen.lv
>>>> (john@screen.lv) wrote:
>>>> > 1186035057.207629    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [SYN]
>>>> > Seq=0 Len=0
>>>> > 1186035057.207632    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [SYN,
>>>> ACK]
>>>> > Seq=0 Ack=1 Win=32792 Len=0 MSS=16396
>>>> > 1186035057.207666    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [ACK]
>>>> > Seq=1 Ack=1 Win=1500 Len=0
>>>> > 1186035057.207699    127.0.0.1 -> 127.0.0.1    SMTP Command: EHLO
>>>> localhost
>>>> > 1186035057.207718    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [ACK]
>>>> > Seq=1 Ack=17 Win=32792 Len=0
>>>> > 1186035057.207736    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [RST]
>>>> > Seq=17 Len=0
>>>> > 1186035057.223934    127.0.0.1 -> 127.0.0.1    TCP 33787 > 50000
>>>> [RST,
>>>> > ACK] Seq=0 Ack=0 Win=32792 Len=0
>>>> >
>>>> > Can someone please comment as to why, tcp  stack sends rst packet
>>>> from the
>>>> > wrong source port in this situation.
>>>>
>>>> Besides the fact, that test applications do not run if started not as
>>>> root, I got this:
>>>
>>> And it actually does not initializes a session, since tird line below
>>> shows RST, but not ack. The same with sendmail smtp server (i.e. 25 port
>>> like in your server) and unmodified client.
>>> Please provide application which can trigger the issue and I will help
>>> to debug this issue. If it will help you to debug client, I can run
>>> tcpdump on public server (say 194.85.82.65, please tell me your source
>>> address) to collect dumps. Current code does not trigger the issue on my
>>> machines (and works not like was intended by you). Ugh, and code really
>>> looks horrible...
>>>
>>
>> I just got multiple RSTs instead of a connection too. The second RST looks
>> like it's from another connection - and a RST for a RST is wrong...

On Thu, August 2, 2007 12:45, john@screen.lv wrote:
> you need to add iptables rule for this to
> work, or else the tcp resets connection too early because it does not know
> that something is listening on 50000 port.
>
> iptables -I INPUT -p tcp --dport 50000 -j DROP should do the job.

You didn't mention this before.

Without the server running:

13:02:23.314352 IP 127.0.0.1.50000 > 127.0.0.1.2500: S 53123695:53123695(0) win 1500
13:02:23.314442 IP 127.0.0.1.2500 > 127.0.0.1.50000: R 0:0(0) ack 53123696 win 0
13:02:25.906975 IP 127.0.0.1.3315 > 127.0.0.1.49197: P 1285306902:1285307318(416) ack 1267361915 win 1024
<nop,nop,timestamp 3575709021 3575672670>
13:02:25.907060 IP 127.0.0.1.49197 > 127.0.0.1.3315: . ack 416 win 1541 <nop,nop,timestamp 3575709021
3575709021>

With the server running:

13:05:55.234696 IP 127.0.0.1.50000 > 127.0.0.1.2500: S 1960601450:1960601450(0) win 1500
13:05:55.234799 IP 127.0.0.1.2500 > 127.0.0.1.50000: S 2171862150:2171862150(0) ack 1960601451 win 32792
<mss 16396>
13:05:55.238271 IP 127.0.0.1.50000 > 127.0.0.1.2500: . ack 1 win 1500
13:05:55.240034 IP 127.0.0.1.50000 > 127.0.0.1.2500: P 1:17(16) ack 1 win 1500
13:05:55.240132 IP 127.0.0.1.2500 > 127.0.0.1.50000: . ack 17 win 32792
13:05:55.242251 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 1960601467:1960601467(0) win 1500
13:05:55.253884 IP 127.0.0.1.56434 > 127.0.0.1.50000: R 2171862151:2171862151(0) ack 1960601467 win 32792

Weird. I resent your final RST a few times with a delay:

13:13:05.199275 IP 127.0.0.1.50000 > 127.0.0.1.2500: S 83018811:83018811(0) win 1500
13:13:05.199378 IP 127.0.0.1.2500 > 127.0.0.1.50000: S 2627922927:2627922927(0) ack 83018812 win 32792 <mss
16396>
13:13:05.203368 IP 127.0.0.1.50000 > 127.0.0.1.2500: . ack 1 win 1500
13:13:05.205049 IP 127.0.0.1.50000 > 127.0.0.1.2500: P 1:17(16) ack 1 win 1500
13:13:05.205173 IP 127.0.0.1.2500 > 127.0.0.1.50000: . ack 17 win 32792
13:13:05.206463 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 83018828:83018828(0) win 1500
13:13:05.207656 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 83018828:83018828(0) win 1500
13:13:05.217664 IP 127.0.0.1.55271 > 127.0.0.1.50000: R 2627922928:2627922928(0) ack 83018828 win 32792
13:13:05.510239 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 83018828:83018828(0) win 1500
13:13:05.511644 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 83018828:83018828(0) win 1500
13:13:05.512764 IP 127.0.0.1.50000 > 127.0.0.1.2500: R 83018828:83018828(0) win 1500

I don't know where that extra RST is coming from.
This test would be more convincing between two hosts, since your bizarre
client is using raw sockets as root and could be doing anything.

-- 
Simon Arlott

^ permalink raw reply

* Re: strange tcp behavior
From: Evgeniy Polyakov @ 2007-08-02 12:04 UTC (permalink / raw)
  To: Simon Arlott; +Cc: john, netdev
In-Reply-To: <36758.simon.1186054739@5ec7c279.invalid>

On Thu, Aug 02, 2007 at 12:38:59PM +0100, Simon Arlott (simon@fire.lp0.eu) wrote:
> I just got multiple RSTs instead of a connection too. The second RST looks
> like it's from another connection - and a RST for a RST is wrong...

You should use iptables rule to block non-raw access:
iptables -I INPUT -p tcp --dport 50000 -j DROP

but even in that case I got valid session.

> -- 
> Simon Arlott

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Satyam Sharma @ 2007-08-02 11:56 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Sam Ravnborg, Matt Mackall, Gabriel C, Andrew Morton,
	linux-kernel, netdev, jason.wessel, amitkale
In-Reply-To: <20070802114042.GB1699@ff.dom.local>



On Thu, 2 Aug 2007, Jarek Poplawski wrote:

> On Thu, Aug 02, 2007 at 04:02:21PM +0530, Satyam Sharma wrote:
> [...]
> How often "common" developer has to make such decisions in Kconfig?
> Probably no more than once per year. So, it's fair to blame anybody
> for not reading lkml to find if there are some bugs or
> recommendations before using apparently simple tool? I think there
> is usually some README for such things (maybe in Documentation/)?

Whoops, I only said that in humour, probably should've snuck in a
smiley or two. Definitely not blaming anybody. Apologies to anyone
who felt offended, sorry, nothing such was intended, I assure.

Satyam

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Satyam Sharma @ 2007-08-02 11:40 UTC (permalink / raw)
  To: Sam Ravnborg
  Cc: Matt Mackall, Jarek Poplawski, Gabriel C, Andrew Morton,
	Linux Kernel Mailing List, netdev, jason.wessel, amitkale, zippel,
	jengelh
In-Reply-To: <alpine.LFD.0.999.0708021547420.8258@enigma.security.iitk.ac.in>

[ Read through the thread, looked at Kconfig files,
  did some tests. Adding Kconfig experts to Cc: list. ]

> On Thu, 2 Aug 2007, Sam Ravnborg wrote:
> 
> > > > 
> > > > ...
> > > > endif # NETDEVICES
> > > > 
> > > > config NETPOLL
> > > >         depends on NETDEVICES
> > > >         def_bool NETCONSOLE
> > > > 
> > > > config NETPOLL_TRAP
> > > >         bool "Netpoll traffic trapping"
> > > >         default n
> > > >         depends on NETPOLL
> > > > 
> > > > config NET_POLL_CONTROLLER
> > > >         def_bool NETPOLL
> > > >         depends on NETPOLL

Gargh, what we're seeing here is a whole bunch of bugs, I think. First
I thought this must be one of those randconfig-producing-wrong-configs
issues, but surprisingly, running "make oldconfig" on this .config on
a fresh 2.6.23-rc1-mm1 tree didn't change anything in the .config.

Kconfig bug #1:
===============

Which means, although:

*****
menuconfig BAZ

if BAZ

config BAR

endif
*****

is widely believed (by most folks, I've heard this on several threads,
and as written in the comment in drivers/net/Kconfig) to be equivalent to:

*****
menuconfig BAZ

if BAZ
endif

config BAR
	depends on BAZ
*****

this is *not* enforced by "make oldconfig"! And hence, the NETPOLL &&
!NETDEVICES situation we're seeing here.

[ We could also categorize this as a bug in Kconfig's "if", fwiw. ]

Kconfig bug #2:
===============

config FOO
	def_bool BAR

is supposed to ensure that FOO == BAR (as Matt mentioned earlier).

However, even this is *not* enforced by "make oldconfig". And hence,
the NETPOLL && !NET_POLL_CONTROLLER situation we're seeing here.

In fact, I believe it's possible to even pass a NETCONSOLE but
!NETPOLL kind of .config through "make oldconfig" but it still won't
catch it, and build breakages *will* occur.

[ We could also categorize this as a bug in Kconfig's "def_bool", fwiw. ]

Possibly, we could also decide to just blame "randconfig" for the whole
issue, and forget about these, because I think it's highly unlikely
(though not impossible) for people with "real" .configs to hit the
problems we saw above.

KGDBOE bug #1:
==============

config KGDBOE in lib/Kconfig.kgdb must also "depend on" NETDEVICES,
and select NET_POLL_CONTROLLER also.

KGDBOE bug #2:
==============

config KGDBOE_NOMODULE is a sad, sad option, and must be killed. The
"if !KGDBOE_NOMODULE" in KGDBOE must be removed, and it must lose its
dependency on "m".

Satyam

^ permalink raw reply

* Re: strange tcp behavior
From: Simon Arlott @ 2007-08-02 11:38 UTC (permalink / raw)
  To: Evgeniy Polyakov; +Cc: john, netdev
In-Reply-To: <20070802101655.GA14749@2ka.mipt.ru>


On Thu, August 2, 2007 11:16, Evgeniy Polyakov wrote:
> On Thu, Aug 02, 2007 at 01:55:50PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
>> On Thu, Aug 02, 2007 at 09:19:06AM +0300, john@screen.lv (john@screen.lv) wrote:
>> > 1186035057.207629    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [SYN]
>> > Seq=0 Len=0
>> > 1186035057.207632    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [SYN, ACK]
>> > Seq=0 Ack=1 Win=32792 Len=0 MSS=16396
>> > 1186035057.207666    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [ACK]
>> > Seq=1 Ack=1 Win=1500 Len=0
>> > 1186035057.207699    127.0.0.1 -> 127.0.0.1    SMTP Command: EHLO localhost
>> > 1186035057.207718    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [ACK]
>> > Seq=1 Ack=17 Win=32792 Len=0
>> > 1186035057.207736    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [RST]
>> > Seq=17 Len=0
>> > 1186035057.223934    127.0.0.1 -> 127.0.0.1    TCP 33787 > 50000 [RST,
>> > ACK] Seq=0 Ack=0 Win=32792 Len=0
>> >
>> > Can someone please comment as to why, tcp  stack sends rst packet from the
>> > wrong source port in this situation.
>>
>> Besides the fact, that test applications do not run if started not as
>> root, I got this:
>
> And it actually does not initializes a session, since tird line below
> shows RST, but not ack. The same with sendmail smtp server (i.e. 25 port
> like in your server) and unmodified client.
> Please provide application which can trigger the issue and I will help
> to debug this issue. If it will help you to debug client, I can run
> tcpdump on public server (say 194.85.82.65, please tell me your source
> address) to collect dumps. Current code does not trigger the issue on my
> machines (and works not like was intended by you). Ugh, and code really
> looks horrible...
>

I just got multiple RSTs instead of a connection too. The second RST looks
like it's from another connection - and a RST for a RST is wrong...

-- 
Simon Arlott

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Jarek Poplawski @ 2007-08-02 11:40 UTC (permalink / raw)
  To: Satyam Sharma
  Cc: Sam Ravnborg, Matt Mackall, Gabriel C, Andrew Morton,
	linux-kernel, netdev, jason.wessel, amitkale
In-Reply-To: <alpine.LFD.0.999.0708021547420.8258@enigma.security.iitk.ac.in>

On Thu, Aug 02, 2007 at 04:02:21PM +0530, Satyam Sharma wrote:
> Hi,
> 
> 
> On Thu, 2 Aug 2007, Sam Ravnborg wrote:
> 
> > > > 
> > > > ...
> > > > endif # NETDEVICES
> > > > 
> > > > config NETPOLL
> > > >         depends on NETDEVICES
> > > >         def_bool NETCONSOLE
> > > > 
> > > > config NETPOLL_TRAP
> > > >         bool "Netpoll traffic trapping"
> > > >         default n
> > > >         depends on NETPOLL
> > > > 
> > > > config NET_POLL_CONTROLLER
> > > >         def_bool NETPOLL
> > > >         depends on NETPOLL
> > > > 
> > > > 
> > > > seems to select NET_POLL_CONTROLLER after selecting NETPOLL, but
> > > > still doesn't check for NETDEVICES dependency.
> > > 
> > > That's odd. Adding Sam to the cc:.
> 
> I just noticed this thread, but I wonder what the fuss is all
> about :-) Kconfig dependencies are easy, really -- any code that
> pulls in code from elsewhere, must explicitly "depends on" it.
> It is possible to use "select" as well, but could lead to breakages
> as discussed to death on at least 64592 other threads on LKML already
> and hence should only be used for library-like code that does not
> have any dependencies itself.

So, it seems at least one time not enough (or maybe it would be better
to write this 1 time only, but in Documentation/).

> 
> 
> > select is evil....
> > select will by brute force set a symbol equal to 'y' without
> > visiting the dependencies.
> > So abusing select you are able to select a symbol FOO even 
> > if FOO depends on BAR that is not set.
> > 
> > In general use select only for non-visible symbols (no promts anywhere)
> > and for symbols with no dependencies.
> > That will limit the suefullness but on the other hand avoid the illegal
> > configurations all over.
> 
> The problem with using "depends on" is that your config symbol becomes
> invisible unless the dependency has already been selected.
> 
> So, there's a workaround: make the ultimate config symbol itself depend
> upon the grand-dependency (excuse the nomenclature) and just "select"
> the immediate-parent-dependency, i.e. the following:
> 
> CONFIG_BAZ
> 	...
> 
> CONFIG BAR
> 	depends on BAZ
> 
> CONFIG_FOO
> 	depends on BAZ
> 	select BAR
> 
> is perfectly legal, and doesn't cause any build problems. Perhaps such a
> solution makes sense here as well?
> 
> 
> > kconfig should one day warn about such things but I have not fel inclined
> > to dive into the matters hoping that Roman does one day.
> 
> Yup, I've wanted to do this myself, in fact I wanted to implement an idea
> I had in mind ( http://lkml.org/lkml/2007/5/16/257 ) but for some reason
> I tend to stay away from stuff in scripts/ :-)

How often "common" developer has to make such decisions in Kconfig?
Probably no more than once per year. So, it's fair to blame anybody
for not reading lkml to find if there are some bugs or
recommendations before using apparently simple tool? I think there
is usually some README for such things (maybe in Documentation/)?

Thanks,
Jarek P.

PS: if it's so easy and it's enough to read only 64592 lkml messages,
I wonder why Andrew, who knows all lkml, and reads more messages per
hour, cared to remember mainly one short conclusion...

^ permalink raw reply

* Re: [PATCH 1/2] [TCP]: Also handle snd_una changes in tcp_cwnd_down
From: Ilpo Järvinen @ 2007-08-02 11:18 UTC (permalink / raw)
  To: David Miller; +Cc: Netdev
In-Reply-To: <Pine.LNX.4.64.0708012054180.27657@kivilampi-30.cs.helsinki.fi>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1948 bytes --]

On Wed, 1 Aug 2007, Ilpo Järvinen wrote:

> 
> tcp_cwnd_down must check for it too as it should be conservative
> in case of collapse stuff and also when receiver is trying to
> lie (though that wouldn't be very successful/useful anyway).
> 
> Note:
> - Separated also is_dupack and do_lost in fast_retransalert
> 	* Much cleaner look-and-feel now
> 	* This time it really fixes cumulative ACK with many new
> 	  SACK blocks recovery entry (I claimed this fixes with
> 	  last patch but it wasn't). TCP will now call
> 	  tcp_update_scoreboard regardless of is_dupack when
> 	  in recovery as long as there is enough fackets_out.
> - Introduce FLAG_SND_UNA_ADVANCED
> 	* Some prior_snd_una arguments are unnecessary after it
> - Added helper FLAG_ANY_PROGRESS to avoid long FLAG...|FLAG...
>   constructs
> 
> Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
> ---
> 
> Dave, BEWARE, I wasn't able to do anything else but compile test
> because Linus' tree didn't seem to boot on the machine I was
> trying to test it... :-(
> 
> I think that to stable version only a small part of this change
> is necessary, not the full changeset. That should keep stable
> folks much happier... :-) I'll soon put my reduced proposal to:
>   http://www.cs.helsinki.fi/u/ijjarvin/patches/stable-0001.patch
> The other patch (DSACK) can go to stable as is.

I placed those two earlier sent bidir fixes and these two additional fixes 
on top of 2.6.22 and was finally able to have them tested on a bootable 
kernel (I had a boot failure on another machine too with 2.6.23-rc1 
stuff). FACK&NewReno/bidir and FACK/unidir tested, time-seq graphs were 
ok.

Dave, please put these two patches to net-2.6 to complete bidir fix 
series. ...And please push to stable as well, take just the minimized 
"fix" portion of this "[TCP]: Also handle snd_una changes in 
tcp_cwnd_down" patch as I described above. Other cleanups in it can be
put just to net-2.6.

-- 
 i.

^ permalink raw reply

* Re: NETPOLL=y , NETDEVICES=n compile error ( Re: 2.6.23-rc1-mm1 )
From: Satyam Sharma @ 2007-08-02 10:32 UTC (permalink / raw)
  To: Sam Ravnborg
  Cc: Matt Mackall, Jarek Poplawski, Gabriel C, Andrew Morton,
	linux-kernel, netdev, jason.wessel, amitkale
In-Reply-To: <20070802093659.GB27748@uranus.ravnborg.org>

Hi,

On Thu, 2 Aug 2007, Sam Ravnborg wrote:

> > > 
> > > ...
> > > endif # NETDEVICES
> > > 
> > > config NETPOLL
> > >         depends on NETDEVICES
> > >         def_bool NETCONSOLE
> > > 
> > > config NETPOLL_TRAP
> > >         bool "Netpoll traffic trapping"
> > >         default n
> > >         depends on NETPOLL
> > > 
> > > config NET_POLL_CONTROLLER
> > >         def_bool NETPOLL
> > >         depends on NETPOLL
> > > 
> > > 
> > > seems to select NET_POLL_CONTROLLER after selecting NETPOLL, but
> > > still doesn't check for NETDEVICES dependency.
> > 
> > That's odd. Adding Sam to the cc:.

I just noticed this thread, but I wonder what the fuss is all
about :-) Kconfig dependencies are easy, really -- any code that
pulls in code from elsewhere, must explicitly "depends on" it.
It is possible to use "select" as well, but could lead to breakages
as discussed to death on at least 64592 other threads on LKML already
and hence should only be used for library-like code that does not
have any dependencies itself.

> select is evil....
> select will by brute force set a symbol equal to 'y' without
> visiting the dependencies.
> So abusing select you are able to select a symbol FOO even 
> if FOO depends on BAR that is not set.
> 
> In general use select only for non-visible symbols (no promts anywhere)
> and for symbols with no dependencies.
> That will limit the suefullness but on the other hand avoid the illegal
> configurations all over.

The problem with using "depends on" is that your config symbol becomes
invisible unless the dependency has already been selected.

So, there's a workaround: make the ultimate config symbol itself depend
upon the grand-dependency (excuse the nomenclature) and just "select"
the immediate-parent-dependency, i.e. the following:

CONFIG_BAZ
	...

CONFIG BAR
	depends on BAZ

CONFIG_FOO
	depends on BAZ
	select BAR

is perfectly legal, and doesn't cause any build problems. Perhaps such a
solution makes sense here as well?

> kconfig should one day warn about such things but I have not fel inclined
> to dive into the matters hoping that Roman does one day.

Yup, I've wanted to do this myself, in fact I wanted to implement an idea
I had in mind ( http://lkml.org/lkml/2007/5/16/257 ) but for some reason
I tend to stay away from stuff in scripts/ :-)

Satyam

^ permalink raw reply

* Re: strange tcp behavior
From: Evgeniy Polyakov @ 2007-08-02 10:16 UTC (permalink / raw)
  To: john; +Cc: netdev
In-Reply-To: <20070802095550.GA27226@2ka.mipt.ru>

On Thu, Aug 02, 2007 at 01:55:50PM +0400, Evgeniy Polyakov (johnpol@2ka.mipt.ru) wrote:
> On Thu, Aug 02, 2007 at 09:19:06AM +0300, john@screen.lv (john@screen.lv) wrote:
> > 1186035057.207629    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [SYN]
> > Seq=0 Len=0
> > 1186035057.207632    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [SYN, ACK]
> > Seq=0 Ack=1 Win=32792 Len=0 MSS=16396
> > 1186035057.207666    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [ACK]
> > Seq=1 Ack=1 Win=1500 Len=0
> > 1186035057.207699    127.0.0.1 -> 127.0.0.1    SMTP Command: EHLO localhost
> > 1186035057.207718    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [ACK]
> > Seq=1 Ack=17 Win=32792 Len=0
> > 1186035057.207736    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [RST]
> > Seq=17 Len=0
> > 1186035057.223934    127.0.0.1 -> 127.0.0.1    TCP 33787 > 50000 [RST,
> > ACK] Seq=0 Ack=0 Win=32792 Len=0
> > 
> > Can someone please comment as to why, tcp  stack sends rst packet from the
> > wrong source port in this situation.
> 
> Besides the fact, that test applications do not run if started not as
> root, I got this:

And it actually does not initializes a session, since tird line below
shows RST, but not ack. The same with sendmail smtp server (i.e. 25 port
like in your server) and unmodified client.
Please provide application which can trigger the issue and I will help
to debug this issue. If it will help you to debug client, I can run
tcpdump on public server (say 194.85.82.65, please tell me your source 
address) to collect dumps. Current code does not trigger the issue on my
machines (and works not like was intended by you). Ugh, and code really
looks horrible...

-- 
	Evgeniy Polyakov

^ permalink raw reply

* Re: strange tcp behavior
From: Evgeniy Polyakov @ 2007-08-02  9:55 UTC (permalink / raw)
  To: john; +Cc: netdev
In-Reply-To: <56697.212.93.96.73.1186035546.squirrel@mail.screen.lv>

On Thu, Aug 02, 2007 at 09:19:06AM +0300, john@screen.lv (john@screen.lv) wrote:
> 1186035057.207629    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [SYN]
> Seq=0 Len=0
> 1186035057.207632    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [SYN, ACK]
> Seq=0 Ack=1 Win=32792 Len=0 MSS=16396
> 1186035057.207666    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [ACK]
> Seq=1 Ack=1 Win=1500 Len=0
> 1186035057.207699    127.0.0.1 -> 127.0.0.1    SMTP Command: EHLO localhost
> 1186035057.207718    127.0.0.1 -> 127.0.0.1    TCP smtp > 50000 [ACK]
> Seq=1 Ack=17 Win=32792 Len=0
> 1186035057.207736    127.0.0.1 -> 127.0.0.1    TCP 50000 > smtp [RST]
> Seq=17 Len=0
> 1186035057.223934    127.0.0.1 -> 127.0.0.1    TCP 33787 > 50000 [RST,
> ACK] Seq=0 Ack=0 Win=32792 Len=0
> 
> Can someone please comment as to why, tcp  stack sends rst packet from the
> wrong source port in this situation.

Besides the fact, that test applications do not run if started not as
root, I got this:

13:51:12.180241 IP localhost.localdomain.50000 > localhost.localdomain.10250: S 906222067:906222067(0) win 1500
13:51:12.180279 IP localhost.localdomain.10250 > localhost.localdomain.50000: S 2011233747:2011233747(0) ack 906222068
win 32792 <mss 16396>
13:51:12.180293 IP localhost.localdomain.50000 > localhost.localdomain.10250: R 906222068:906222068(0) win 0
13:51:12.180320 IP localhost.localdomain.50000 > localhost.localdomain.10250: . ack 1 win 1500
13:51:12.180329 IP localhost.localdomain.10250 > localhost.localdomain.50000: R 2011233748:2011233748(0) win 0
13:51:12.180341 IP localhost.localdomain.50000 > localhost.localdomain.10250: P 1:17(16) ack 1 win 1500
13:51:12.180349 IP localhost.localdomain.10250 > localhost.localdomain.50000: R 2011233748:2011233748(0) win 0
13:51:12.180361 IP localhost.localdomain.50000 > localhost.localdomain.10250: R 906222084:906222084(0) win 1500

I.e. there is no bug in this session.
FC7 2.6.22.1-27.fc7 kernel.

Here is vanilla (with my patches, unrelated to the problem though)
2.6.22-rc5:

09:33:37.650279 IP localhost.50000 > localhost.10250: S 1326688203:1326688203(0) win 1500
09:33:37.664391 IP localhost.10250 > localhost.50000: S 3637551175:3637551175(0) ack 1326688204 win 32792 <mss 16396>
09:33:37.664417 IP localhost.50000 > localhost.10250: R 1326688204:1326688204(0) win 0
09:33:37.650451 IP localhost.50000 > localhost.10250: . ack 1 win 1500
09:33:37.650467 IP localhost.10250 > localhost.50000: R 3637551176:3637551176(0) win 0
09:33:37.650481 IP localhost.50000 > localhost.10250: P 1:17(16) ack 1 win 1500
09:33:37.650493 IP localhost.10250 > localhost.50000: R 3637551176:3637551176(0) win 0
09:33:37.650507 IP localhost.50000 > localhost.10250: R 1326688220:1326688220(0) win 1500


Is it possible that your tcpdump is screwed?

-- 
	Evgeniy Polyakov

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox