Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/7] Topcliff GbE: Add The Parameter check code
From: Masayuki Ohtake @ 2010-04-23 11:59 UTC (permalink / raw)
  To: NETDEV; +Cc: Wang, Yong Y, Wang, Qi, Andrew, Intel OTC

From: Masayuki Ohtake <masa-korg@dsn.okisemi.com>

This patch adds the parameter check code of the GbE driver for Topcliff.
The GbE driver needs all of the patches [1/7 to 7/7].

Signed-off-by: Masayuki Ohtake <masa-korg@dsn.okisemi.com>
---
 drivers/net/pch_gbe/pch_gbe_param.c        | 594 +++++++++++++++++++++++++++++++++
 1 files changed, 594 insertions(+)
diff -urN linux-2.6.33.1/drivers/net/pch_gbe/pch_gbe_param.c topcliff-2.6.33.1/drivers/net/pch_gbe/pch_gbe_param.c
--- linux-2.6.33.1/drivers/net/pch_gbe/pch_gbe_param.c 1970-01-01 09:00:00.000000000 +0900
+++ topcliff-2.6.33.1/drivers/net/pch_gbe/pch_gbe_param.c 2010-04-13 18:18:01.000000000 +0900
@@ -0,0 +1,594 @@
+/*!
+ * @file pch_gbe_param.c
+ * @brief Linux PCH Gigabit Ethernet Driver parameter check source file
+ *
+ * @version 1.00
+ *
+ * @section
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+/*
+ * History:
+ * Copyright (C) 2010 OKI SEMICONDUCTOR CO., LTD.
+ *
+ * created:
+ *   OKI SEMICONDUCTOR 04/13/2010
+ * modified:
+ *
+ */
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+
+#include "pch_debug.h"
+#include "pch_gbe_osdep.h"
+#include "pch_gbe_defines.h"
+#include "pch_gbe_hw.h"
+#include "pch_gbe.h"
+
+/* This is the only thing that needs to be changed to adjust the
+ * maximum number of ports that the driver can manage.
+ */
+
+#define PCH_GBE_MAX_NIC 1
+
+#define OPTION_UNSET   -1
+#define OPTION_DISABLED 0
+#define OPTION_ENABLED  1
+
+/* Each parameter X is an int array of PCH_GBE_MAX_NIC+1 entries preset to
+ * OPTION_UNSET, plus num_X, the count handed to module_param_array_named().
+ * PCH_GBE_PARAM() emits both; it expands to nothing under PCH_GBE_QAC.
+ */
+
+#define PCH_GBE_PARAM_INIT { [0 ... PCH_GBE_MAX_NIC] = OPTION_UNSET }
+#ifdef PCH_GBE_QAC
+#define PCH_GBE_PARAM(X, desc)
+#else
+#define PCH_GBE_PARAM(X, desc) \
+ static int X[PCH_GBE_MAX_NIC+1] = PCH_GBE_PARAM_INIT; \
+ static int num_##X; \
+ module_param_array_named(X, X, int, &num_##X, 0); \
+ MODULE_PARM_DESC(X, desc);
+#endif
+
+/*
+ * Transmit Descriptor Count
+ *    Valid Range:   PCH_GBE_MIN_TXD - PCH_GBE_MAX_TXD
+ *    Default Value: PCH_GBE_DEFAULT_TXD
+ */
+PCH_GBE_PARAM(TxDescriptors, "Number of transmit descriptors");
+
+/*
+ * Receive Descriptor Count
+ *    Valid Range:   PCH_GBE_MIN_RXD - PCH_GBE_MAX_RXD
+ *    Default Value: PCH_GBE_DEFAULT_RXD
+ */
+PCH_GBE_PARAM(RxDescriptors, "Number of receive descriptors");
+
+/* User Specified Speed Override
+ *
+ * Valid Range: 0, 10, 100, 1000
+ *  - 0    - auto-negotiate at all supported speeds
+ *  - 10   - only link at 10 Mbps
+ *  - 100  - only link at 100 Mbps
+ *  - 1000 - only link at 1000 Mbps
+ *
+ * Default Value: 0
+ */
+PCH_GBE_PARAM(Speed, "Speed setting");
+
+/* User Specified Duplex Override
+ *
+ * Valid Range: 0-2
+ *  - 0 - auto-negotiate for duplex
+ *  - 1 - only link at half duplex
+ *  - 2 - only link at full duplex
+ *
+ * Default Value: 0
+ */
+PCH_GBE_PARAM(Duplex, "Duplex setting");
+
+/*
+ * Auto-negotiation Advertisement Override
+ *    Valid Range: 0x01-0x0F, 0x20-0x2F
+ *
+ *       The AutoNeg value is a bit mask describing which speed and duplex
+ *       combinations should be advertised during auto-negotiation.
+ *       The supported speed and duplex modes are listed below
+ *
+ *       Bit           7     6     5      4      3     2     1      0
+ *       Speed (Mbps)  N/A   N/A   1000   N/A    100   100   10     10
+ *       Duplex                    Full          Full  Half  Full   Half
+ *
+ *    Default Value: 0x2F (copper)
+ */
+PCH_GBE_PARAM(AutoNeg, "Advertised auto-negotiation setting");
+#define AUTONEG_ADV_DEFAULT  0x2F
+
+/*
+ * User Specified Flow Control Override
+ *    Valid Range: 0-3
+ *     - 0 - No Flow Control
+ *     - 1 - Rx only, respond to PAUSE frames but do not generate them
+ *     - 2 - Tx only, generate PAUSE frames but ignore them on receive
+ *     - 3 - Full Flow Control Support
+ *    Default Value: Read flow control settings from the EEPROM
+ */
+PCH_GBE_PARAM(FlowControl, "Flow Control setting");
+
+/*
+ * XsumRX - Receive Checksum Offload Enable/Disable
+ *    Valid Range: 0, 1
+ *     - 0 - disables all checksum offload
+ *     - 1 - enables receive IP/TCP/UDP checksum offload
+ *    Default Value: PCH_GBE_DEFAULT_RX_CSUM
+ */
+PCH_GBE_PARAM(XsumRX, "Disable or enable Receive Checksum offload");
+
+/*
+ * XsumTX - Transmit Checksum Offload Enable/Disable
+ *    Valid Range: 0, 1
+ *     - 0 - disables all checksum offload
+ *     - 1 - enables transmit IP/TCP/UDP checksum offload
+ *    Default Value: PCH_GBE_DEFAULT_TX_CSUM
+ */
+PCH_GBE_PARAM(XsumTX, "Disable or enable Transmit Checksum offload");
+
+struct pch_gbe_option {
+ enum { enable_option, range_option, list_option } type; /* kind of check */
+ const char *name; /* parameter name used in the log messages */
+ const char *err;  /* text appended when a value is rejected */
+ int  def;         /* value substituted for unset or invalid input */
+ union {
+  struct { /* range_option info */
+   int min;
+   int max;
+  } r;
+  struct { /* list_option info */
+   int nr; /* number of entries in p[] */
+   struct pch_gbe_opt_list { int i; const char *str; } *p;
+  } l;
+ } arg;
+};
+
+/* ----------------------------------------------------------------------------
+ Function prototype
+---------------------------------------------------------------------------- */
+static void pch_gbe_check_copper_options(struct pch_gbe_adapter *adapter);
+static int pch_gbe_validate_option(int *value,
+     struct pch_gbe_option *opt,
+     struct pch_gbe_adapter *adapter);
+
+/* ----------------------------------------------------------------------------
+ Function
+---------------------------------------------------------------------------- */
+
+/*!
+ * @ingroup Linux driver internal function
+ * @fn      static int pch_gbe_validate_option(int *value,
+ *                                             struct pch_gbe_option *opt,
+ *                                             struct pch_gbe_adapter *adapter)
+ * @brief   Check one parameter value against its option descriptor
+ * @param   value   [IN/OUT] value to check; set to opt->def if unset or invalid
+ * @param   opt     [IN] option descriptor (type, name, default, range or list)
+ * @param   adapter [IN] Board private structure (presumably used by DPRINTK)
+ * @return  0:   value accepted, or OPTION_UNSET replaced by the default
+ * @return  -1:  value rejected and replaced by the default
+ */
+static int
+pch_gbe_validate_option(int *value, struct pch_gbe_option *opt,
+  struct pch_gbe_adapter *adapter)
+{
+ if (*value == OPTION_UNSET) { /* nothing supplied: silently take the default */
+  *value = opt->def;
+  return 0;
+ }
+
+ switch (opt->type) {
+ case enable_option:
+  switch (*value) {
+  case OPTION_ENABLED:
+   DPRINTK(PROBE, INFO, "%s Enabled\n", opt->name);
+   return 0;
+  case OPTION_DISABLED:
+   DPRINTK(PROBE, INFO, "%s Disabled\n", opt->name);
+   return 0;
+  }
+  break; /* neither OPTION_ENABLED nor OPTION_DISABLED: reject below */
+ case range_option:
+  if (*value >= opt->arg.r.min && *value <= opt->arg.r.max) {
+   DPRINTK(PROBE, INFO,
+     "%s set to %i\n", opt->name, *value);
+   return 0;
+  }
+  break; /* outside [min, max]: reject below */
+ case list_option: {
+  int i;
+  struct pch_gbe_opt_list *ent;
+
+  for (i = 0; i < opt->arg.l.nr; i++) {
+   ent = &opt->arg.l.p[i];
+   if (*value == ent->i) {
+    if (ent->str[0] != '\0') /* empty description: accept quietly */
+     DPRINTK(PROBE, INFO, "%s\n", ent->str);
+    return 0;
+   }
+  }
+ }
+  break; /* value not found in the list: reject below */
+ default:
+  BUG(); /* opt->type is none of the three known kinds */
+ }
+
+ DPRINTK(PROBE, INFO, "Invalid %s value specified (%i) %s\n",
+   opt->name, *value, opt->err);
+ *value = opt->def; /* rejected input is replaced by the default */
+ return -1;
+}
+
+/*!
+ * @ingroup Linux driver internal function
+ * @fn      void pch_gbe_check_options(struct pch_gbe_adapter *adapter)
+ * @brief   Range Checking for Command Line Parameters
+ * @param   adapter  [IN/OUT] Board private structure
+ * @return  None
+ * @remarks
+ *  This routine checks all command line parameters for valid user
+ *  input.  If an invalid value is given, or if no user specified
+ *  value exists, a default value is used.  The final value is stored
+ *  in the adapter structure (ring counts, checksum flags, hw->mac.fc).
+ */
+void
+pch_gbe_check_options(struct pch_gbe_adapter *adapter)
+{
+ struct pch_gbe_hw *hw = &adapter->hw;
+ int bd = adapter->bd_number;
+
+ PCH_DEBUG("pch_gbe_check_options\n");
+
+ if (bd >= PCH_GBE_MAX_NIC) {
+  DPRINTK(PROBE, NOTICE,
+         "Warning: no configuration for board #%i\n", bd);
+  DPRINTK(PROBE, NOTICE, "Using defaults for all values\n");
+ }
+
+ { /* Transmit Descriptor Count */
+  struct pch_gbe_option opt = {
+   .type = range_option,
+   .name = "Transmit Descriptors",
+   .err  = "using default of "
+    __MODULE_STRING(PCH_GBE_DEFAULT_TXD),
+   .def  = PCH_GBE_DEFAULT_TXD,
+   .arg  = { .r = { .min = PCH_GBE_MIN_TXD, /* one .arg initializer: a */
+      .max = PCH_GBE_MAX_TXD } }       /* repeated one resets .min */
+  };
+  struct pch_gbe_tx_ring *tx_ring = adapter->tx_ring;
+  if (num_TxDescriptors > bd) {
+   tx_ring->count = TxDescriptors[bd];
+   pch_gbe_validate_option(&tx_ring->count, &opt, adapter);
+   PCH_GBE_ROUNDUP(tx_ring->count,
+     PCH_GBE_TX_DESC_MULTIPLE);
+  } else {
+   tx_ring->count = opt.def;
+  }
+ }
+ { /* Receive Descriptor Count */
+  struct pch_gbe_option opt = {
+   .type = range_option,
+   .name = "Receive Descriptors",
+   .err  = "using default of "
+    __MODULE_STRING(PCH_GBE_DEFAULT_RXD),
+   .def  = PCH_GBE_DEFAULT_RXD,
+   .arg  = { .r = { .min = PCH_GBE_MIN_RXD, /* same single-initializer */
+      .max = PCH_GBE_MAX_RXD } }       /* form as for Tx above */
+  };
+  struct pch_gbe_rx_ring *rx_ring = adapter->rx_ring;
+  if (num_RxDescriptors > bd) {
+   rx_ring->count = RxDescriptors[bd];
+   pch_gbe_validate_option(&rx_ring->count, &opt, adapter);
+   PCH_GBE_ROUNDUP(rx_ring->count,
+     PCH_GBE_RX_DESC_MULTIPLE);
+  } else {
+   rx_ring->count = opt.def;
+  }
+ }
+ { /* Receive Checksum Offload Enable/Disable */
+  struct pch_gbe_option opt = {
+   .type = enable_option,
+   .name = "Checksum Offload",
+   .err  = "defaulting to Enabled",
+   .def  = PCH_GBE_DEFAULT_RX_CSUM
+  };
+
+  if (num_XsumRX > bd) {
+   adapter->rx_csum = XsumRX[bd];
+   pch_gbe_validate_option((int *)(&adapter->rx_csum),
+      &opt, adapter);
+  } else {
+   adapter->rx_csum = opt.def;
+  }
+ }
+ { /* Transmit Checksum Offload Enable/Disable */
+  struct pch_gbe_option opt = {
+   .type = enable_option,
+   .name = "Checksum Offload",
+   .err  = "defaulting to Enabled",
+   .def  = PCH_GBE_DEFAULT_TX_CSUM
+  };
+
+  if (num_XsumTX > bd) {
+   adapter->tx_csum = XsumTX[bd];
+   pch_gbe_validate_option((int *)(&adapter->tx_csum),
+      &opt, adapter);
+  } else {
+   adapter->tx_csum = opt.def;
+  }
+ }
+ { /* Flow Control */
+
+  struct pch_gbe_opt_list fc_list[] = {
+   {pch_gbe_fc_none, "Flow Control Disabled"},
+   {pch_gbe_fc_rx_pause, "Flow Control Receive Only"},
+   {pch_gbe_fc_tx_pause, "Flow Control Transmit Only"},
+   {pch_gbe_fc_full, "Flow Control Enabled"} };
+
+  struct pch_gbe_option opt = {
+   .type = list_option,
+   .name = "Flow Control",
+   .err  = "reading default settings from EEPROM",
+   .def  = PCH_GBE_FC_DEFAULT,
+   .arg  = { .l = { .nr = (int)ARRAY_SIZE(fc_list),
+      .p = fc_list } }
+  };
+
+  if (num_FlowControl > bd) {
+   hw->mac.fc = FlowControl[bd];
+   pch_gbe_validate_option((int *)(&hw->mac.fc),
+       &opt, adapter);
+  } else {
+   hw->mac.fc = opt.def;
+  }
+ }
+
+ pch_gbe_check_copper_options(adapter); /* Speed, Duplex and AutoNeg */
+}
+
+/*!
+ * @ingroup Linux driver internal function
+ * @fn      static void pch_gbe_check_copper_options(
+ *                      struct pch_gbe_adapter *adapter)
+ * @brief   Range Checking for Link Options, Copper Version
+ * @param   adapter  [IN] Board private structure
+ * @return  None
+ * @remarks
+ *  Handles speed and duplex options on copper adapters
+ */
+static void
+pch_gbe_check_copper_options(struct pch_gbe_adapter *adapter)
+{
+ struct pch_gbe_hw *hw = &adapter->hw;
+ int speed, dplx; /* validated Speed and Duplex parameter values */
+ int bd = adapter->bd_number;
+
+ { /* Speed */
+  struct pch_gbe_opt_list speed_list[] = {
+    {0, "" },
+    {SPEED_10, ""},
+    {SPEED_100, ""},
+    {SPEED_1000, ""} };
+
+  struct pch_gbe_option opt = {
+   .type = list_option,
+   .name = "Speed",
+   .err  = "parameter ignored",
+   .def  = 0,
+   .arg  = { .l = { .nr = (int)ARRAY_SIZE(speed_list),
+      .p = speed_list } }
+  };
+
+  if (num_Speed > bd) {
+   speed = Speed[bd];
+   pch_gbe_validate_option(&speed, &opt, adapter);
+  } else {
+   speed = opt.def;
+  }
+ }
+ { /* Duplex */
+  struct pch_gbe_opt_list dplx_list[] = {
+    {0, ""},
+    {PHY_HALF_DUPLEX, ""},
+    {PHY_FULL_DUPLEX, ""} };
+
+  struct pch_gbe_option opt = {
+   .type = list_option,
+   .name = "Duplex",
+   .err  = "parameter ignored",
+   .def  = 0,
+   .arg  = { .l = { .nr = (int)ARRAY_SIZE(dplx_list),
+      .p = dplx_list } }
+  };
+
+  if (num_Duplex > bd) {
+   dplx = Duplex[bd];
+   pch_gbe_validate_option(&dplx, &opt, adapter);
+  } else {
+   dplx = opt.def;
+  }
+ }
+
+ { /* Autoneg */
+  struct pch_gbe_opt_list an_list[] =
+   #define AA "AutoNeg advertising "
+   {{ 0x01, AA "10/HD" },
+    { 0x02, AA "10/FD" },
+    { 0x03, AA "10/FD, 10/HD" },
+    { 0x04, AA "100/HD" },
+    { 0x05, AA "100/HD, 10/HD" },
+    { 0x06, AA "100/HD, 10/FD" },
+    { 0x07, AA "100/HD, 10/FD, 10/HD" },
+    { 0x08, AA "100/FD" },
+    { 0x09, AA "100/FD, 10/HD" },
+    { 0x0a, AA "100/FD, 10/FD" },
+    { 0x0b, AA "100/FD, 10/FD, 10/HD" },
+    { 0x0c, AA "100/FD, 100/HD" },
+    { 0x0d, AA "100/FD, 100/HD, 10/HD" },
+    { 0x0e, AA "100/FD, 100/HD, 10/FD" },
+    { 0x0f, AA "100/FD, 100/HD, 10/FD, 10/HD" },
+    { 0x20, AA "1000/FD" },
+    { 0x21, AA "1000/FD, 10/HD" },
+    { 0x22, AA "1000/FD, 10/FD" },
+    { 0x23, AA "1000/FD, 10/FD, 10/HD" },
+    { 0x24, AA "1000/FD, 100/HD" },
+    { 0x25, AA "1000/FD, 100/HD, 10/HD" },
+    { 0x26, AA "1000/FD, 100/HD, 10/FD" },
+    { 0x27, AA "1000/FD, 100/HD, 10/FD, 10/HD" },
+    { 0x28, AA "1000/FD, 100/FD" },
+    { 0x29, AA "1000/FD, 100/FD, 10/HD" },
+    { 0x2a, AA "1000/FD, 100/FD, 10/FD" },
+    { 0x2b, AA "1000/FD, 100/FD, 10/FD, 10/HD" },
+    { 0x2c, AA "1000/FD, 100/FD, 100/HD" },
+    { 0x2d, AA "1000/FD, 100/FD, 100/HD, 10/HD" },
+    { 0x2e, AA "1000/FD, 100/FD, 100/HD, 10/FD" },
+    { 0x2f, AA "1000/FD, 100/FD, 100/HD, 10/FD, 10/HD" } };
+
+  struct pch_gbe_option opt = {
+   .type = list_option,
+   .name = "AutoNeg",
+   .err  = "parameter ignored",
+   .def  = AUTONEG_ADV_DEFAULT,
+   .arg  = { .l = { .nr = (int)ARRAY_SIZE(an_list),
+      .p = an_list} }
+  };
+
+  if (num_AutoNeg > bd) {
+   if (speed != 0 || dplx != 0) { /* an explicit Speed/Duplex wins */
+    DPRINTK(PROBE, INFO,
+    "AutoNeg specified along with Speed or Duplex, "
+    "parameter ignored\n");
+    hw->phy.autoneg_advertised = opt.def;
+   } else {
+    hw->phy.autoneg_advertised = AutoNeg[bd];
+    pch_gbe_validate_option(
+     (int *)(&hw->phy.autoneg_advertised),
+     &opt, adapter);
+   }
+  } else {
+   hw->phy.autoneg_advertised = opt.def;
+  }
+ }
+
+ switch (speed + dplx) { /* assumes each speed/duplex pair has a distinct sum - TODO confirm */
+ case 0: /* NOTE(review): with a zero sum the test below looks never true - confirm */
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1;
+  if ((num_Speed > bd) && (speed != 0 || dplx != 0))
+   DPRINTK(PROBE, INFO,
+    "Speed and duplex autonegotiation enabled\n");
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_HALF_DUPLEX:
+  DPRINTK(PROBE, INFO, "Half Duplex specified without Speed\n");
+  DPRINTK(PROBE, INFO, "Using Autonegotiation at "
+   "Half Duplex only\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1;
+  hw->phy.autoneg_advertised = PHY_ADVERTISE_10_HALF |
+      PHY_ADVERTISE_100_HALF;
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_FULL_DUPLEX:
+  DPRINTK(PROBE, INFO, "Full Duplex specified without Speed\n");
+  DPRINTK(PROBE, INFO, "Using Autonegotiation at "
+   "Full Duplex only\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1;
+  hw->phy.autoneg_advertised = PHY_ADVERTISE_10_FULL |
+      PHY_ADVERTISE_100_FULL |
+      PHY_ADVERTISE_1000_FULL;
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_FULL;
+  break;
+ case PHY_SPEED_10:
+  DPRINTK(PROBE, INFO, "10 Mbps Speed specified "
+   "without Duplex\n");
+  DPRINTK(PROBE, INFO, "Using Autonegotiation at 10 Mbps only\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1;
+  hw->phy.autoneg_advertised = PHY_ADVERTISE_10_HALF |
+      PHY_ADVERTISE_10_FULL;
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_SPEED_10 + PHY_HALF_DUPLEX:
+  DPRINTK(PROBE, INFO, "Forcing to 10 Mbps Half Duplex\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 0; /* forced link: no autoneg */
+  hw->phy.autoneg_advertised = 0;
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_SPEED_10 + PHY_FULL_DUPLEX:
+  DPRINTK(PROBE, INFO, "Forcing to 10 Mbps Full Duplex\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 0;
+  hw->phy.autoneg_advertised = 0;
+  hw->mac.link_speed = SPEED_10;
+  hw->mac.link_duplex = DUPLEX_FULL;
+  break;
+ case PHY_SPEED_100:
+  DPRINTK(PROBE, INFO, "100 Mbps Speed specified "
+   "without Duplex\n");
+  DPRINTK(PROBE, INFO, "Using Autonegotiation at "
+   "100 Mbps only\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1;
+  hw->phy.autoneg_advertised = PHY_ADVERTISE_100_HALF |
+      PHY_ADVERTISE_100_FULL;
+  hw->mac.link_speed = SPEED_100;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_SPEED_100 + PHY_HALF_DUPLEX:
+  DPRINTK(PROBE, INFO, "Forcing to 100 Mbps Half Duplex\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 0;
+  hw->phy.autoneg_advertised = 0;
+  hw->mac.link_speed = SPEED_100;
+  hw->mac.link_duplex = DUPLEX_HALF;
+  break;
+ case PHY_SPEED_100 + PHY_FULL_DUPLEX:
+  DPRINTK(PROBE, INFO, "Forcing to 100 Mbps Full Duplex\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 0;
+  hw->phy.autoneg_advertised = 0;
+  hw->mac.link_speed = SPEED_100;
+  hw->mac.link_duplex = DUPLEX_FULL;
+  break;
+ case PHY_SPEED_1000:
+  DPRINTK(PROBE, INFO, "1000 Mbps Speed specified without "
+   "Duplex\n");
+  goto full_duplex_only; /* all 1000 Mbps requests share one setup */
+ case PHY_SPEED_1000 + PHY_HALF_DUPLEX:
+  DPRINTK(PROBE, INFO,
+   "Half Duplex is not supported at 1000 Mbps\n");
+  /* fall through */
+ case PHY_SPEED_1000 + PHY_FULL_DUPLEX:
+full_duplex_only:
+  DPRINTK(PROBE, INFO,
+         "Using Autonegotiation at 1000 Mbps Full Duplex only\n");
+  hw->mac.autoneg = hw->mac.fc_autoneg = 1; /* 1000 Mbps is autonegotiated, not forced */
+  hw->phy.autoneg_advertised = PHY_ADVERTISE_1000_FULL;
+  hw->mac.link_speed = SPEED_1000;
+  hw->mac.link_duplex = DUPLEX_FULL;
+  break;
+ default:
+  BUG(); /* any sum not listed above */
+ }
+}
+



^ permalink raw reply

* [PATCH 3/7] Topcliff GbE: Add The Ethtool code [2/2]
From: Masayuki Ohtake @ 2010-04-23 12:00 UTC (permalink / raw)
  To: NETDEV; +Cc: Wang, Yong Y, Wang, Qi, Intel OTC, Andrew

[-- Attachment #1: Type: message/partial, Size: 7634 bytes --]

^ permalink raw reply

* [PATCH 1/7] Topcliff GbE: Add The Main code [2/3]
From: Masayuki Ohtake @ 2010-04-23 11:56 UTC (permalink / raw)
  To: NETDEV; +Cc: Wang, Yong Y, Wang, Qi, Intel OTC, Andrew

[-- Attachment #1: Type: message/partial, Size: 39085 bytes --]

^ permalink raw reply

* [PATCH 1/7] Topcliff GbE: Add The Main code [1/3]
From: Masayuki Ohtake @ 2010-04-23 11:56 UTC (permalink / raw)
  To: NETDEV; +Cc: Wang, Yong Y, Wang, Qi, Intel OTC, Andrew

[-- Attachment #1: Type: message/partial, Size: 39156 bytes --]

^ permalink raw reply

* [PATCH 1/7] Topcliff GbE: Add The Main code [3/3]
From: Masayuki Ohtake @ 2010-04-23 11:56 UTC (permalink / raw)
  To: NETDEV; +Cc: Wang, Yong Y, Wang, Qi, Intel OTC, Andrew

[-- Attachment #1: Type: message/partial, Size: 18536 bytes --]

^ permalink raw reply

* nfs41: potential null deref in xprt_reserve_xprt()?
From: Dan Carpenter @ 2010-04-23 12:00 UTC (permalink / raw)
  To: iyer-HgOvQuBEEgTQT0dZR+AlfA
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA

I'm going through some Smatch results and had a question.  

Until commit 343952fa5a: "nfs41: Get the rpc_xprt * from the rpc_rqst 
instead of the rpc_clnt." we assumed that "task->tk_rqstp" can be NULL.  
But that patch dereferences it unconditionally.

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 0eea2bf..c144611 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -195,8 +195,8 @@ EXPORT_SYMBOL_GPL(xprt_load_transport);
  */
 int xprt_reserve_xprt(struct rpc_task *task)
 {
-       struct rpc_xprt *xprt = task->tk_xprt;
        struct rpc_rqst *req = task->tk_rqstp;
+       struct rpc_xprt *xprt = req->rq_xprt;
	                        ^^^^^^^^^^^^^

Can "req" be null here?  The patch is a year old, so presumably it
isn't null very often.

If you would like, I can remove the checks for null from the rest of the
function.

regards,
dan carpenter
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [patch] sctp: cleanup: remove unneeded null check
From: Dan Carpenter @ 2010-04-23 11:59 UTC (permalink / raw)
  To: Vlad Yasevich
  Cc: Sridhar Samudrala, David S. Miller, Wei Yongjun, Chris Dischino,
	linux-sctp, netdev, kernel-janitors

"chunk" can never be null here.  We dereferenced it earlier in the
function and also at the start of the function we passed it to 
sctp_pack_cookie() which dereferences it.

This code has been around since the dawn of git history so if "chunk"
were ever null someone would have complained about it.

Signed-off-by: Dan Carpenter <error27@gmail.com>

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 17cb400..52352fc 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -470,8 +470,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 	 *
 	 * [INIT ACK back to where the INIT came from.]
 	 */
-	if (chunk)
-		retval->transport = chunk->transport;
+	retval->transport = chunk->transport;
 
 nomem_chunk:
 	kfree(cookie);

^ permalink raw reply related

* [PATCH net-next-2.6] net: disallow to use net_assign_generic externally
From: Jiri Pirko @ 2010-04-23 11:40 UTC (permalink / raw)
  To: netdev; +Cc: davem, ebiederm

Now there's no need to use this function directly because it's handled by
register_pernet_device. So to keep things simple and easy to understand,
make it static so as not to tempt potential users.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index ff4982a..81a31c0 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -14,11 +14,8 @@
  * The rules are simple:
  * 1. set pernet_operations->id.  After register_pernet_device you
  *    will have the id of your private pointer.
- * 2. Either set pernet_operations->size (to have the code allocate and
- *    free a private structure pointed to from struct net ) or 
- *    call net_assign_generic() to put the private data on the struct
- *    net (most preferably this should be done in the ->init callback
- *    of the ops registered);
+ * 2. set pernet_operations->size to have the code allocate and free
+ *    a private structure pointed to from struct net.
  * 3. do not change this pointer while the net is alive;
  * 4. do not try to have any private reference on the net_generic object.
  *
@@ -46,6 +43,4 @@ static inline void *net_generic(struct net *net, int id)
 
 	return ptr;
 }
-
-extern int net_assign_generic(struct net *net, int id, void *data);
 #endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index bd8c471..777477c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -27,6 +27,51 @@ EXPORT_SYMBOL(init_net);
 
 #define INITIAL_NET_GEN_PTRS	13 /* +1 for len +2 for rcu_head */
 
+static void net_generic_release(struct rcu_head *rcu)	/* call_rcu() callback */
+{
+	struct net_generic *ng;
+
+	ng = container_of(rcu, struct net_generic, rcu);	/* enclosing array */
+	kfree(ng);
+}
+
+static int net_assign_generic(struct net *net, int id, void *data)
+{
+	struct net_generic *ng, *old_ng;
+
+	BUG_ON(!mutex_is_locked(&net_mutex));	/* caller must hold net_mutex */
+	BUG_ON(id == 0);	/* ids are 1-based: the slot is ptr[id - 1] */
+
+	ng = old_ng = net->gen;
+	if (old_ng->len >= id)	/* current array already has a slot for id */
+		goto assign;
+
+	ng = kzalloc(sizeof(struct net_generic) +
+			id * sizeof(void *), GFP_KERNEL);	/* room for id pointers */
+	if (ng == NULL)
+		return -ENOMEM;
+
+	/*
+	 * Some synchronisation notes:
+	 *
+	 * The net_generic explores the net->gen array inside rcu
+	 * read section. Besides once set the net->gen->ptr[x]
+	 * pointer never changes (see rules in netns/generic.h).
+	 *
+	 * That said, we simply duplicate this array and schedule
+	 * the old copy for kfree after a grace period.
+	 */
+
+	ng->len = id;
+	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
+
+	rcu_assign_pointer(net->gen, ng);	/* publish the enlarged copy */
+	call_rcu(&old_ng->rcu, net_generic_release);	/* old copy freed by the callback */
+assign:
+	ng->ptr[id - 1] = data;
+	return 0;
+}
+
 static int ops_init(const struct pernet_operations *ops, struct net *net)
 {
 	int err;
@@ -526,49 +571,3 @@ void unregister_pernet_device(struct pernet_operations *ops)
 	mutex_unlock(&net_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
-
-static void net_generic_release(struct rcu_head *rcu)
-{
-	struct net_generic *ng;
-
-	ng = container_of(rcu, struct net_generic, rcu);
-	kfree(ng);
-}
-
-int net_assign_generic(struct net *net, int id, void *data)
-{
-	struct net_generic *ng, *old_ng;
-
-	BUG_ON(!mutex_is_locked(&net_mutex));
-	BUG_ON(id == 0);
-
-	ng = old_ng = net->gen;
-	if (old_ng->len >= id)
-		goto assign;
-
-	ng = kzalloc(sizeof(struct net_generic) +
-			id * sizeof(void *), GFP_KERNEL);
-	if (ng == NULL)
-		return -ENOMEM;
-
-	/*
-	 * Some synchronisation notes:
-	 *
-	 * The net_generic explores the net->gen array inside rcu
-	 * read section. Besides once set the net->gen->ptr[x]
-	 * pointer never changes (see rules in netns/generic.h).
-	 *
-	 * That said, we simply duplicate this array and schedule
-	 * the old copy for kfree after a grace period.
-	 */
-
-	ng->len = id;
-	memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
-
-	rcu_assign_pointer(net->gen, ng);
-	call_rcu(&old_ng->rcu, net_generic_release);
-assign:
-	ng->ptr[id - 1] = data;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(net_assign_generic);

^ permalink raw reply related

* [PATCH/RFC Resubmission] cdc_ether: Identify MBM devices by GUID in MDLM descriptor
From: Jonas Sjoquist @ 2010-04-23 11:07 UTC (permalink / raw)
  To: oneukum, davem; +Cc: netdev

From: Jonas Sjöquist <jonas.sjoquist@ericsson.com>

This patch removes vid/pid for Ericsson MBM devices from the whitelist set of
devices. The MBM devices are instead identified by GUID.

In order for cdc_ether to handle these devices the GUID in the MDLM descriptor
is tested. All MBM devices currently handled by cdc_ether as well as future
CDC Ethernet MBM devices can be identified by the GUID.

This is the same solution used in Carl Nordbeck's mbm driver,
http://kerneltrap.org/mailarchive/linux-usb/2008/11/17/4141384/thread

I post this as an RFC to get feedback on whether cdc_ether is the correct place to
do the binding, or if it should be done in a separate driver, e.g. zaurus.

Signed-off-by: Jonas Sjöquist <jonas.sjoquist@ericsson.com>
---
diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c
index c8cdb7f..811b2dc 100644
--- a/drivers/net/usb/cdc_ether.c
+++ b/drivers/net/usb/cdc_ether.c
@@ -64,6 +64,11 @@ static int is_wireless_rndis(struct usb_interface_descriptor *desc)
 
 #endif
 
+static const u8 mbm_guid[16] = {	/* compared with the MDLM descriptor's bGUID */
+	0xa3, 0x17, 0xa8, 0x8b, 0x04, 0x5e, 0x4f, 0x01,
+	0xa6, 0x07, 0xc0, 0xff, 0xcb, 0x7e, 0x39, 0x2a,
+};
+
 /*
  * probes control interface, claims data interface, collects the bulk
  * endpoints, activates data interface (if needed), maybe sets MTU.
@@ -79,6 +84,8 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf)
 	int				status;
 	int				rndis;
 	struct usb_driver		*driver = driver_of(intf);
+	struct usb_cdc_mdlm_desc	*desc = NULL;
+	struct usb_cdc_mdlm_detail_desc *detail = NULL;
 
 	if (sizeof dev->data < sizeof *info)
 		return -EDOM;
@@ -229,6 +236,34 @@ int usbnet_generic_cdc_bind(struct usbnet *dev, struct usb_interface *intf)
 			 * side link address we were given.
 			 */
 			break;
+		case USB_CDC_MDLM_TYPE:
+			if (desc) {
+				dev_dbg(&intf->dev, "extra MDLM descriptor\n");
+				goto bad_desc;
+			}
+
+			desc = (void *)buf;
+
+			if (desc->bLength != sizeof(*desc))
+				goto bad_desc;
+
+			if (memcmp(&desc->bGUID, mbm_guid, 16))
+				goto bad_desc;
+			break;
+		case USB_CDC_MDLM_DETAIL_TYPE:
+			if (detail) {
+				dev_dbg(&intf->dev, "extra MDLM detail descriptor\n");
+				goto bad_desc;
+			}
+
+			detail = (void *)buf;
+
+			if (detail->bGuidDescriptorType == 0) {
+				if (detail->bLength < (sizeof(*detail) + 1))
+					goto bad_desc;
+			} else
+				goto bad_desc;
+			break;
 		}
 next_desc:
 		len -= buf [0];	/* bLength */
@@ -542,80 +577,10 @@ static const struct usb_device_id	products [] = {
 			USB_CDC_PROTO_NONE),
 	.driver_info = (unsigned long) &cdc_info,
 }, {
-	/* Ericsson F3507g */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1900, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3507g ver. 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1902, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3607gw */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1904, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3607gw ver 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1905, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3607gw ver 3 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1906, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3307 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x190a, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson F3307 ver 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1909, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson C3607w */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x1049, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Ericsson C3607w ver 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0bdb, 0x190b, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Toshiba F3507g */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0930, 0x130b, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Toshiba F3607gw */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0930, 0x130c, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Toshiba F3607gw ver 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x0930, 0x1311, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Dell F3507g */
-	USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x8147, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Dell F3607gw */
-	USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x8183, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
-}, {
-	/* Dell F3607gw ver 2 */
-	USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x8184, USB_CLASS_COMM,
-			USB_CDC_SUBCLASS_MDLM, USB_CDC_PROTO_NONE),
-	.driver_info = (unsigned long) &mbm_info,
+	USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MDLM,
+			USB_CDC_PROTO_NONE),
+	.driver_info = (unsigned long)&mbm_info,
+
 },
 	{ },		// END
 };


^ permalink raw reply related

* Re: DDoS attack causing bad effect on conntrack searches
From: Patrick McHardy @ 2010-04-23 11:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, paulmck, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <1272020717.7895.7974.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le vendredi 23 avril 2010 à 12:55 +0200, Patrick McHardy a écrit :
>> Eric Dumazet wrote:
>>> OK but a lookup last a fraction of a micro second, unless interrupted by
>>> hard irq.
>>>
>>> Probability of a change during a lookup should be very very small.
>>>
>>> Note that the scenario for a restart is :
>>>
>>> The lookup go through the chain.
>>> While it is examining one object, this object is deleted.
>>> The object is re-allocated by another cpu and inserted to a new chain.
>> I think another scenario that seems a bit more likely would be
>> that a new entry is added to the chain after it was fully searched.
>> Perhaps we could continue searching at the last position if the
>> last entry is not a nulls entry to improve this.
> 
> But the last entry is always a nulls entry, what do you mean exactly ?
> 
> When an insert (of a fresh object, not a reused one) is done, this
> doesn't affect lookups in any way, since it's done at the head of list.

Right, I missed that :)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Eric Dumazet @ 2010-04-23 11:06 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Jesper Dangaard Brouer, paulmck, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <4BD1784A.6010306@trash.net>

Le vendredi 23 avril 2010 à 12:36 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > Le jeudi 22 avril 2010 à 23:03 +0200, Eric Dumazet a écrit :
> >>> Guess I have to reproduce the DoS attack in a testlab (I will first have 
> >>> time Tuesday).  So we can determine if its bad hashing or restart of the 
> >>> search loop.
> >>>
> > 
> > Or very long chains, if attacker managed to find a jhash flaw.
> 
> That should be visible in the "searched" statistic.
> 
> > You could add a lookup_restart counter :
> 
> I've applied Jespers equivalent patch.

Yes of course, I missed it or I would not have cooked it ;)

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Eric Dumazet @ 2010-04-23 11:05 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Jesper Dangaard Brouer, paulmck, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <4BD17CAA.4090708@trash.net>

Le vendredi 23 avril 2010 à 12:55 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > 
> > OK but a lookup last a fraction of a micro second, unless interrupted by
> > hard irq.
> > 
> > Probability of a change during a lookup should be very very small.
> > 
> > Note that the scenario for a restart is :
> > 
> > The lookup go through the chain.
> > While it is examining one object, this object is deleted.
> > The object is re-allocated by another cpu and inserted to a new chain.
> 
> I think another scenario that seems a bit more likely would be
> that a new entry is added to the chain after it was fully searched.
> Perhaps we could continue searching at the last position if the
> last entry is not a nulls entry to improve this.

But the last entry is always a nulls entry, what do you mean exactly ?

When an insert (of a fresh object, not a reused one) is done, this
doesn't affect lookups in any way, since it's done at the head of list.




^ permalink raw reply

* [PATCH net-next-2.6] l2tp_eth: fix memory allocation
From: Jiri Pirko @ 2010-04-23 11:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, kleptog, jchapman

Since .size is set properly in "struct pernet_operations l2tp_eth_net_ops",
allocating space for "struct l2tp_eth_net" by hand is not correct, even causes
memory leakage.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index ca1164a..58c6c4c 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -276,43 +276,16 @@ out:
 
 static __net_init int l2tp_eth_init_net(struct net *net)
 {
-	struct l2tp_eth_net *pn;
-	int err;
-
-	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
-	if (!pn)
-		return -ENOMEM;
+	struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id);
 
 	INIT_LIST_HEAD(&pn->l2tp_eth_dev_list);
 	spin_lock_init(&pn->l2tp_eth_lock);
 
-	err = net_assign_generic(net, l2tp_eth_net_id, pn);
-	if (err)
-		goto out;
-
 	return 0;
-
-out:
-	kfree(pn);
-	return err;
-}
-
-static __net_exit void l2tp_eth_exit_net(struct net *net)
-{
-	struct l2tp_eth_net *pn;
-
-	pn = net_generic(net, l2tp_eth_net_id);
-	/*
-	 * if someone has cached our net then
-	 * further net_generic call will return NULL
-	 */
-	net_assign_generic(net, l2tp_eth_net_id, NULL);
-	kfree(pn);
 }
 
 static __net_initdata struct pernet_operations l2tp_eth_net_ops = {
 	.init = l2tp_eth_init_net,
-	.exit = l2tp_eth_exit_net,
 	.id   = &l2tp_eth_net_id,
 	.size = sizeof(struct l2tp_eth_net),
 };

^ permalink raw reply related

* Re: DDoS attack causing bad effect on conntrack searches
From: Patrick McHardy @ 2010-04-23 10:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, hawk, Linux Kernel Network Hackers, netfilter-devel,
	Paul E McKenney
In-Reply-To: <1271946961.7895.5665.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le jeudi 22 avril 2010 à 15:17 +0200, Patrick McHardy a écrit :
>> Changli Gao wrote:
>>>> struct nf_conntrack_tuple_hash *
>>>> __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
>>>> ...
>>> We should add a retry limit there.
>> We can't do that since that would allow false negatives.
> 
> If one hash slot is under attack, then there is a bug somewhere.
> 
> If we cannot avoid this, we can fallback to a secure mode at the second
> retry, and take the spinlock.
> 
> This way, most of lookups stay lockless (one pass), and some might take
> the slot lock to avoid the possibility of a loop.

That sounds like a good idea. But let's wait for Jesper's test results
before we start fixing this problem :)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Patrick McHardy @ 2010-04-23 10:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, paulmck, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <1271970199.7895.6482.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le jeudi 22 avril 2010 à 22:38 +0200, Jesper Dangaard Brouer a écrit :
>> On Thu, 22 Apr 2010, Eric Dumazet wrote:
>>
>>> Le jeudi 22 avril 2010 à 08:51 -0700, Paul E. McKenney a écrit :
>>>> On Thu, Apr 22, 2010 at 04:53:49PM +0200, Eric Dumazet wrote:
>>>>> Le jeudi 22 avril 2010 à 16:36 +0200, Eric Dumazet a écrit :
>>>>>
>>>>> If we can do the 'retry' a 10 times, it means the attacker was really
>>>>> clever enough to inject new packets (new conntracks) at the right
>>>>> moment, in the right hash chain, and this sounds so highly incredible
>>>>> that I cannot believe it at all :)
>>>> Or maybe the DoS attack is injecting so many new conntracks that a large
>>>> fraction of the hash chains are being modified at any given time?
>>>>
>> I think it's plausible, there is a lot of modification going on.
>> Approx 40.000 deletes/sec and 40.000 inserts/sec.
>> The hash bucket size is 300032, and with 80000 modifications/sec, we are 
>> (potentially) changing 26.6% of the hash chains each second.
>>
> 
> OK but a lookup last a fraction of a micro second, unless interrupted by
> hard irq.
> 
> Probability of a change during a lookup should be very very small.
> 
> Note that the scenario for a restart is :
> 
> The lookup go through the chain.
> While it is examining one object, this object is deleted.
> The object is re-allocated by another cpu and inserted to a new chain.

I think another scenario that seems a bit more likely would be
that a new entry is added to the chain after it was fully searched.
Perhaps we could continue searching at the last position if the
last entry is not a nulls entry to improve this.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net-next-2.6] l2tp: fix memory allocation
From: Jiri Pirko @ 2010-04-23 10:53 UTC (permalink / raw)
  To: netdev; +Cc: davem, kleptog, jchapman

Since .size is set properly in "struct pernet_operations l2tp_net_ops",
allocating space for "struct l2tp_net" by hand is not correct, even causes
memory leakage.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index ecc7aea..1712af1 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1617,14 +1617,9 @@ EXPORT_SYMBOL_GPL(l2tp_session_create);
 
 static __net_init int l2tp_init_net(struct net *net)
 {
-	struct l2tp_net *pn;
-	int err;
+	struct l2tp_net *pn = net_generic(net, l2tp_net_id);
 	int hash;
 
-	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
-	if (!pn)
-		return -ENOMEM;
-
 	INIT_LIST_HEAD(&pn->l2tp_tunnel_list);
 	spin_lock_init(&pn->l2tp_tunnel_list_lock);
 
@@ -1633,33 +1628,11 @@ static __net_init int l2tp_init_net(struct net *net)
 
 	spin_lock_init(&pn->l2tp_session_hlist_lock);
 
-	err = net_assign_generic(net, l2tp_net_id, pn);
-	if (err)
-		goto out;
-
 	return 0;
-
-out:
-	kfree(pn);
-	return err;
-}
-
-static __net_exit void l2tp_exit_net(struct net *net)
-{
-	struct l2tp_net *pn;
-
-	pn = net_generic(net, l2tp_net_id);
-	/*
-	 * if someone has cached our net then
-	 * further net_generic call will return NULL
-	 */
-	net_assign_generic(net, l2tp_net_id, NULL);
-	kfree(pn);
 }
 
 static struct pernet_operations l2tp_net_ops = {
 	.init = l2tp_init_net,
-	.exit = l2tp_exit_net,
 	.id   = &l2tp_net_id,
 	.size = sizeof(struct l2tp_net),
 };

^ permalink raw reply related

* Re: DDoS attack causing bad effect on conntrack searches
From: Patrick McHardy @ 2010-04-23 10:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, paulmck, Changli Gao, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <1271970893.7895.6507.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le jeudi 22 avril 2010 à 23:03 +0200, Eric Dumazet a écrit :
>>> Guess I have to reproduce the DoS attack in a testlab (I will first have 
>>> time Tuesday).  So we can determine if its bad hashing or restart of the 
>>> search loop.
>>>
> 
> Or very long chains, if attacker managed to find a jhash flaw.

That should be visible in the "searched" statistic.

> You could add a lookup_restart counter :

I've applied Jespers equivalent patch.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Patrick McHardy @ 2010-04-23 10:35 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Changli Gao, Eric Dumazet, Linux Kernel Network Hackers,
	netfilter-devel, Paul E McKenney
In-Reply-To: <1271943066.14501.194.camel@jdb-workstation>

Jesper Dangaard Brouer wrote:
> I have added a stats counter to prove my case, which I think we should add to the kernel (to detect the case in the future).
> The DDoS attack has disappeared, so I guess I'll try to see if I can reproduce the problem in my testlab.
> 
> 
> 
> [PATCH] net: netfilter conntrack extended with extra stat counter.
> 
> From: Jesper Dangaard Brouer <hawk@comx.dk>
> 
> I suspect an unfortunate series of events occurring under a DDoS
> attack, in function __nf_conntrack_find() nf_conntrack_core.c.
> 
> Adding a stats counter to see if the search is restarted too often.

Applied, thanks Jesper.

^ permalink raw reply

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: Eric Dumazet @ 2010-04-23 10:26 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev
In-Reply-To: <1272010378-2955-1-git-send-email-xiaosuo@gmail.com>

Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue.
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled.
> 
> Note: in the worst case, the number of packets in a softnet_data may be double
> of netdev_max_backlog.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----

Oops, reading it again, I found process_backlog() was still taking the
lock twice, if only one packet is waiting in input_pkt_queue.

Possible fix, on top of your patch :

diff --git a/net/core/dev.c b/net/core/dev.c
index 0eddd23..0569be7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3296,8 +3296,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
 #endif
 	napi->weight = weight_p;
 	local_irq_disable();
-	while (1) {
+	while (work < quota) {
 		struct sk_buff *skb;
+		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
 			local_irq_enable();
@@ -3308,13 +3309,15 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		}
 
 		rps_lock(sd);
-		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
-		skb_queue_splice_tail_init(&sd->input_pkt_queue,
-					   &sd->process_queue);
-		if (skb_queue_empty(&sd->process_queue)) {
+		qlen = skb_queue_len(&sd->input_pkt_queue);
+		if (qlen) {
+			input_queue_head_add(sd, qlen);
+			skb_queue_splice_tail_init(&sd->input_pkt_queue,
+						   &sd->process_queue);
+		}
+		if (qlen < quota - work) {
 			__napi_complete(napi);
-			rps_unlock(sd);
-			break;
+			quota = work + qlen;
 		}
 		rps_unlock(sd);
 	}



^ permalink raw reply related

* Re: [RFC 2/2] phylib: Convert MDIO bitbang to new MDIO 45 format
From: Ben Hutchings @ 2010-04-23 10:22 UTC (permalink / raw)
  To: Andy Fleming; +Cc: davem, netdev
In-Reply-To: <1271997497-6896-3-git-send-email-afleming@freescale.com>

On Thu, 2010-04-22 at 23:38 -0500, Andy Fleming wrote:
> Now that we've added somewhat more complete MDIO 45 support to the PHY
> Lib, convert the MDIO bitbang driver to use this new infrastructure.
> 
> Signed-off-by: Andy Fleming <afleming@freescale.com>
> ---
>  drivers/net/phy/mdio-bitbang.c |   23 +++++++++++------------
>  1 files changed, 11 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/phy/mdio-bitbang.c b/drivers/net/phy/mdio-bitbang.c
> index 2f6f02e..4c0c89b 100644
> --- a/drivers/net/phy/mdio-bitbang.c
> +++ b/drivers/net/phy/mdio-bitbang.c
[...]
> @@ -157,9 +154,10 @@ static int mdiobb_read(struct mii_bus *bus, int phy, int devad, int reg)
>  	struct mdiobb_ctrl *ctrl = bus->priv;
>  	int ret, i;
>  
> -	if (reg & MII_ADDR_C45) {
> -		reg = mdiobb_cmd_addr(ctrl, phy, reg);
> -		mdiobb_cmd(ctrl, MDIO_C45_READ, phy, reg);
> +	/* Clause 22 PHYs only use devad = 0, and Clause 45 only use nonzero */
> +	if (devad) {
> +		mdiobb_cmd_addr(ctrl, phy, devad, reg);
> +		mdiobb_cmd(ctrl, MDIO_C45_READ, phy, devad);
>  	} else
>  		mdiobb_cmd(ctrl, MDIO_READ, phy, reg);
>  
[...]

I don't believe there's any protocol requirement in clause 45 that
devad != 0 (although the address is not allocated).  In the mdio module
I played safe and defined MDIO_DEVAD_NONE == -1 to indicate a clause 22
request.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH linux-next 1/2] irq: Add CPU mask affinity hint callback framework
From: John Fastabend @ 2010-04-23  9:27 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Waskiewicz Jr, Peter P, tglx@linutronix.de, davem@davemloft.net,
	arjan@linux.jf.intel.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1271950900.2095.25.camel@achroite.uk.solarflarecom.com>

Ben Hutchings wrote:
> On Thu, 2010-04-22 at 05:11 -0700, Peter P Waskiewicz Jr wrote:
>> On Wed, 21 Apr 2010, Ben Hutchings wrote:
>>
>>> On Tue, 2010-04-20 at 11:01 -0700, Peter P Waskiewicz Jr wrote:
>>>> This patch adds a callback function pointer to the irq_desc
>>>> structure, along with a registration function and a read-only
>>>> proc entry for each interrupt.
>>>>
>>>> This affinity_hint handle for each interrupt can be used by
>>>> underlying drivers that need a better mechanism to control
>>>> interrupt affinity.  The underlying driver can register a
>>>> callback for the interrupt, which will allow the driver to
>>>> provide the CPU mask for the interrupt to anything that
>>>> requests it.  The intent is to extend the userspace daemon,
>>>> irqbalance, to help hint to it a preferred CPU mask to balance
>>>> the interrupt into.
>>> Doesn't it make more sense to have the driver follow affinity decisions
>>> made from user-space?  I realise that reallocating queues is disruptive
>>> and we probably don't want irqbalance to trigger that, but there should
>>> be a mechanism for the administrator to trigger it.
>> The driver here would be assisting userspace (irqbalance) to provide 
>> better details how the HW is laid out with respect to flows.  As it stands 
>> today, irqbalance is almost guaranteed to move interrupts to CPUs that are 
>> not aligned with where applications are running for network adapters. 
>> This is very apparent when running at speeds in the 10 Gigabit range, or 
>> even multiple 1 Gigabit ports running at the same time.
> 
> I'm well aware that irqbalance isn't making good decisions at the
> moment.  The question is whether this will really help irqbalance to do
> better.
> 

FCoE is one example where these hints can really help irqbalance make 
good decisions.  By aligning the interrupt affinity with the FCoE 
receive processing thread we can avoid context switching from the NET_RX
softirq to the receive processing thread.

Because the base driver knows which rx rings are being used for FCoE in 
a particular configuration and their corresponding vectors it seems to 
be in the best position to provide good hints to irqbalance.  Also if 
the mapping changes at some point the base driver will be aware of it.

> [...]
>>> This just assigns IRQs to the first n CPU threads.  Depending on the
>>> enumeration order, this might result in assigning an IRQ to each of 2
>>> threads on a core while leaving other cores unused!
>> This ixgbe patch is only meant to be an example of how you could use it. 
>> I didn't hammer out all the corner cases of interrupt alignment in it yet. 
>> However, ixgbe is already aligning Tx flows onto the CPU/queue pair the Tx 
>> occurred (i.e. Tx session from CPU 4 will be queued on Tx queue 4),
> [...]
> 
> OK, now I remember ixgbe has this odd select_queue() implementation.
> But this behaviour can result in reordering whenever a user thread
> migrates, and in any case Dave discourages people from setting
> select_queue().  So I see that these changes would be useful for ixgbe
> (together with an update to irqbalance), but they don't seem to fit the
> general direction of multiqueue networking on Linux.

For DCB setting select_queue() is useful because we want to map traffic 
types to specific tx queues not hash them across all queues.  In this 
case where we are placing specific traffic on specific queues it also 
makes sense to align the interrupts for some types such as FCoE.  There 
shouldn't be any issues with user thread migration in this specific example.

> 
> (Actually, the hints seem to be incomplete.  If there are more than 16
> CPU threads then multiple CPU threads can map to the same queues, but it
> looks like you only include the first in the queue's hint.)
> 
> An alternate approach is to use the RX queue index to drive TX queue
> selection.  I posted a patch to do that earlier this week.  However I
> haven't yet had a chance to try that on a suitably large system.
>

I'll post an FCoE example patch soon and take a closer look at your 
patch, but mapping TX/RX queues in sock's won't help for cases like FCoE.

Thanks,
John.

^ permalink raw reply

* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: Eric Dumazet @ 2010-04-23  9:27 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, jamal, Tom Herbert, Stephen Hemminger, netdev
In-Reply-To: <1272010378-2955-1-git-send-email-xiaosuo@gmail.com>

Le vendredi 23 avril 2010 à 16:12 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue.
> 
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention when RPS is enabled.
> 
> Note: in the worst case, the number of packets in a softnet_data may be double
> of netdev_max_backlog.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Very good patch Changli, thanks !

Let's see how it improves things for Jamal's benchmarks ;)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

> ----
>  include/linux/netdevice.h |    6 +++--
>  net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
>  2 files changed, 38 insertions(+), 18 deletions(-)
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 3c5ed5f..6ae9f2b 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1387,6 +1387,7 @@ struct softnet_data {
>  	struct Qdisc		*output_queue;
>  	struct list_head	poll_list;
>  	struct sk_buff		*completion_queue;
> +	struct sk_buff_head	process_queue;
>  
>  #ifdef CONFIG_RPS
>  	struct softnet_data	*rps_ipi_list;
> @@ -1401,10 +1402,11 @@ struct softnet_data {
>  	struct napi_struct	backlog;
>  };
>  
> -static inline void input_queue_head_incr(struct softnet_data *sd)
> +static inline void input_queue_head_add(struct softnet_data *sd,
> +					unsigned int len)
>  {
>  #ifdef CONFIG_RPS
> -	sd->input_queue_head++;
> +	sd->input_queue_head += len;
>  #endif
>  }
>  
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a4a7c36..c1585f9 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
>  	rps_lock(sd);
> -	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
> -		if (sd->input_pkt_queue.qlen) {
> +	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
> +		if (skb_queue_len(&sd->input_pkt_queue)) {
>  enqueue:
>  			__skb_queue_tail(&sd->input_pkt_queue, skb);
>  #ifdef CONFIG_RPS
> -			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
> +			*qtail = sd->input_queue_head +
> +					skb_queue_len(&sd->input_pkt_queue);
>  #endif
>  			rps_unlock(sd);
>  			local_irq_restore(flags);
> @@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
>  	struct sk_buff *skb, *tmp;
>  
>  	rps_lock(sd);
> -	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
> +	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
>  		if (skb->dev == dev) {
>  			__skb_unlink(skb, &sd->input_pkt_queue);
>  			kfree_skb(skb);
> -			input_queue_head_incr(sd);
> +			input_queue_head_add(sd, 1);
>  		}
> +	}
>  	rps_unlock(sd);
> +
> +	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
> +		if (skb->dev == dev) {
> +			__skb_unlink(skb, &sd->process_queue);
> +			kfree_skb(skb);
> +		}
> +	}
>  }
>  
>  static int napi_gro_complete(struct sk_buff *skb)
> @@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
>  	}
>  #endif
>  	napi->weight = weight_p;
> -	do {
> +	local_irq_disable();
> +	while (1) {
>  		struct sk_buff *skb;
>  
> -		local_irq_disable();
> +		while ((skb = __skb_dequeue(&sd->process_queue))) {
> +			local_irq_enable();
> +			__netif_receive_skb(skb);
> +			if (++work >= quota)
> +				return work;
> +			local_irq_disable();
> +		}
> +
>  		rps_lock(sd);
> -		skb = __skb_dequeue(&sd->input_pkt_queue);
> -		if (!skb) {
> +		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
> +		skb_queue_splice_tail_init(&sd->input_pkt_queue,
> +					   &sd->process_queue);
> +		if (skb_queue_empty(&sd->process_queue)) {
>  			__napi_complete(napi);
>  			rps_unlock(sd);
> -			local_irq_enable();
>  			break;
>  		}
> -		input_queue_head_incr(sd);
>  		rps_unlock(sd);
> -		local_irq_enable();
> -
> -		__netif_receive_skb(skb);
> -	} while (++work < quota);
> +	}
> +	local_irq_enable();
>  
>  	return work;
>  }
> @@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
>  	/* Process offline CPU's input_pkt_queue */
>  	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
>  		netif_rx(skb);
> -		input_queue_head_incr(oldsd);
> +		input_queue_head_add(oldsd, 1);
>  	}
> +	while ((skb = __skb_dequeue(&oldsd->process_queue)))
> +		netif_rx(skb);
>  
>  	return NOTIFY_OK;
>  }
> @@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
>  		struct softnet_data *sd = &per_cpu(softnet_data, i);
>  
>  		skb_queue_head_init(&sd->input_pkt_queue);
> +		skb_queue_head_init(&sd->process_queue);
>  		sd->completion_queue = NULL;
>  		INIT_LIST_HEAD(&sd->poll_list);
>  
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Eric Dumazet @ 2010-04-23  9:23 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Jesper Dangaard Brouer, Patrick McHardy, hawk,
	Linux Kernel Network Hackers, Netfilter Developers
In-Reply-To: <alpine.LSU.2.01.1004230955030.26168@obet.zrqbmnf.qr>

Le vendredi 23 avril 2010 à 09:55 +0200, Jan Engelhardt a écrit :
> On Friday 2010-04-23 09:46, Eric Dumazet wrote:
> >Years ago, we had to manually change PAGE_OFFSET, and I remember some
> >machines with PAGE_OFFSET 0xA0000000  (1.5 GB LOWMEM), 
> >or 0xB0000000 (1.25 GB), (PAE off)
> 
> I notice that 0xB0000000, which is now known as LOWMEM_3G_OPT,
> is only available when PAE is off. Would you know the reason for
> that decision? Are some values unsuitable for PAE?
> 

If PAE is on, PAGE_OFFSET must be a multiple of 1GB.
This is because of hardware limitations.



^ permalink raw reply

* Re: DDoS attack causing bad effect on conntrack searches
From: Jesper Dangaard Brouer @ 2010-04-23  8:40 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, paulmck, Patrick McHardy, xiaosuo, netdev,
	Netfilter Developers
In-Reply-To: <20100423.011845.254684857.davem@davemloft.net>

On Fri, 23 Apr 2010, David Miller wrote:

> This all reminds me of the namespace bug we dealt with
> a month or two ago.
>
> Jesper, you don't happen to be using network namespaces are you?

No, I don't use network namespaces.
(In .config CONFIG_NAMESPACES is not set.)

Cheers,
   Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------

^ permalink raw reply

* [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: Changli Gao @ 2010-04-23  8:12 UTC (permalink / raw)
  To: David S. Miller
  Cc: jamal, Tom Herbert, Eric Dumazet, Stephen Hemminger, netdev,
	Changli Gao

batch skb dequeueing from softnet input_pkt_queue.

batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
contention when RPS is enabled.

Note: in the worst case, the number of packets in a softnet_data may be double
of netdev_max_backlog.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 include/linux/netdevice.h |    6 +++--
 net/core/dev.c            |   50 +++++++++++++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3c5ed5f..6ae9f2b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1387,6 +1387,7 @@ struct softnet_data {
 	struct Qdisc		*output_queue;
 	struct list_head	poll_list;
 	struct sk_buff		*completion_queue;
+	struct sk_buff_head	process_queue;
 
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
@@ -1401,10 +1402,11 @@ struct softnet_data {
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
+static inline void input_queue_head_add(struct softnet_data *sd,
+					unsigned int len)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head++;
+	sd->input_queue_head += len;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a4a7c36..c1585f9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2409,12 +2409,13 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	__get_cpu_var(netdev_rx_stat).total++;
 
 	rps_lock(sd);
-	if (sd->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (sd->input_pkt_queue.qlen) {
+	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
+		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
 #ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head + sd->input_pkt_queue.qlen;
+			*qtail = sd->input_queue_head +
+					skb_queue_len(&sd->input_pkt_queue);
 #endif
 			rps_unlock(sd);
 			local_irq_restore(flags);
@@ -2934,13 +2935,21 @@ static void flush_backlog(void *arg)
 	struct sk_buff *skb, *tmp;
 
 	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp)
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev == dev) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			input_queue_head_add(sd, 1);
 		}
+	}
 	rps_unlock(sd);
+
+	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+		if (skb->dev == dev) {
+			__skb_unlink(skb, &sd->process_queue);
+			kfree_skb(skb);
+		}
+	}
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3286,24 +3295,30 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	}
 #endif
 	napi->weight = weight_p;
-	do {
+	local_irq_disable();
+	while (1) {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		while ((skb = __skb_dequeue(&sd->process_queue))) {
+			local_irq_enable();
+			__netif_receive_skb(skb);
+			if (++work >= quota)
+				return work;
+			local_irq_disable();
+		}
+
 		rps_lock(sd);
-		skb = __skb_dequeue(&sd->input_pkt_queue);
-		if (!skb) {
+		input_queue_head_add(sd, skb_queue_len(&sd->input_pkt_queue));
+		skb_queue_splice_tail_init(&sd->input_pkt_queue,
+					   &sd->process_queue);
+		if (skb_queue_empty(&sd->process_queue)) {
 			__napi_complete(napi);
 			rps_unlock(sd);
-			local_irq_enable();
 			break;
 		}
-		input_queue_head_incr(sd);
 		rps_unlock(sd);
-		local_irq_enable();
-
-		__netif_receive_skb(skb);
-	} while (++work < quota);
+	}
+	local_irq_enable();
 
 	return work;
 }
@@ -5631,8 +5646,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		input_queue_head_add(oldsd, 1);
 	}
+	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+		netif_rx(skb);
 
 	return NOTIFY_OK;
 }
@@ -5851,6 +5868,7 @@ static int __init net_dev_init(void)
 		struct softnet_data *sd = &per_cpu(softnet_data, i);
 
 		skb_queue_head_init(&sd->input_pkt_queue);
+		skb_queue_head_init(&sd->process_queue);
 		sd->completion_queue = NULL;
 		INIT_LIST_HEAD(&sd->poll_list);
 

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox