* Re: [PATCH net-next-2.6 v4 1/5] sctp: Add Auto-ASCONF support
From: Wei Yongjun @ 2011-04-21 4:52 UTC (permalink / raw)
To: Michio Honda; +Cc: netdev, lksctp-developers
In-Reply-To: <0F65CB0F-7945-4E22-8003-A30EC2C653B5@sfc.wide.ad.jp>
comment inline.
> SCTP reconfigure the IP addresses in the association by using ASCONF chunks as mentioned in RFC5061.
> For example, we can start to use the newly configured IP address in the existing association.
> ASCONF operation is invoked in two ways:
> First is done by the application to call sctp_bindx() system call.
> Second is automatic operation in the SCTP stack with address events in the host computer (called auto_asconf) .
> The former is already implemented, and this patch implement the latter.
>
> Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
> ---
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index 505845d..8678cbd 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -121,6 +121,7 @@ extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
> int flags);
> extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
> extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
> +void sctp_addr_wq_mgmt(struct sctp_sockaddr_entry *, int);
>
> /*
> * sctp/socket.c
> @@ -135,6 +136,7 @@ void sctp_sock_rfree(struct sk_buff *skb);
> void sctp_copy_sock(struct sock *newsk, struct sock *sk,
> struct sctp_association *asoc);
> extern struct percpu_counter sctp_sockets_allocated;
> +int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
>
> /*
> * sctp/primitive.c
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index cc9185c..80974c3 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -205,6 +205,11 @@ extern struct sctp_globals {
> * It is a list of sctp_sockaddr_entry.
> */
> struct list_head local_addr_list;
> + int auto_asconf_enable;
> + struct list_head addr_waitq;
> + struct timer_list addr_wq_timer;
> + struct list_head auto_asconf_splist;
> + spinlock_t addr_wq_lock;
>
> /* Lock that protects the local_addr_list writers */
> spinlock_t addr_list_lock;
> @@ -264,6 +269,11 @@ extern struct sctp_globals {
> #define sctp_port_hashtable (sctp_globals.port_hashtable)
> #define sctp_local_addr_list (sctp_globals.local_addr_list)
> #define sctp_local_addr_lock (sctp_globals.addr_list_lock)
> +#define sctp_auto_asconf_splist (sctp_globals.auto_asconf_splist)
> +#define sctp_addr_waitq (sctp_globals.addr_waitq)
> +#define sctp_addr_wq_timer (sctp_globals.addr_wq_timer)
> +#define sctp_addr_wq_lock (sctp_globals.addr_wq_lock)
> +#define sctp_auto_asconf_enable (sctp_globals.auto_asconf_enable)
> #define sctp_scope_policy (sctp_globals.ipv4_scope_policy)
> #define sctp_addip_enable (sctp_globals.addip_enable)
> #define sctp_addip_noauth (sctp_globals.addip_noauth_enable)
> @@ -341,6 +351,8 @@ struct sctp_sock {
> atomic_t pd_mode;
> /* Receive to here while partial delivery is in effect. */
> struct sk_buff_head pd_lobby;
> + struct list_head auto_asconf_list;
> + int do_auto_asconf;
> };
>
> static inline struct sctp_sock *sctp_sk(const struct sock *sk)
> @@ -796,6 +808,10 @@ struct sctp_sockaddr_entry {
> __u8 valid;
> };
>
> +#define SCTP_NEWADDR 1
> +#define SCTP_DELADDR 2
We can use exist SCTP_ADDR_NEW and SCTP_ADDR_DEL.
> +#define SCTP_ADDRESS_TICK_DELAY 500
> +
> typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *);
>
> /* This structure holds lists of chunks as we are assembling for
> @@ -1239,6 +1255,7 @@ sctp_scope_t sctp_scope(const union sctp_addr *);
> int sctp_in_scope(const union sctp_addr *addr, const sctp_scope_t scope);
> int sctp_is_any(struct sock *sk, const union sctp_addr *addr);
> int sctp_addr_is_valid(const union sctp_addr *addr);
> +int sctp_is_ep_boundall(struct sock *sk);
>
>
> /* What type of endpoint? */
> diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
> index faf71d1..869267b 100644
> --- a/net/sctp/bind_addr.c
> +++ b/net/sctp/bind_addr.c
> @@ -536,6 +536,21 @@ int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope)
> return 0;
> }
>
> +int sctp_is_ep_boundall(struct sock *sk)
> +{
> + struct sctp_bind_addr *bp;
> + struct sctp_sockaddr_entry *addr;
> +
> + bp = &sctp_sk(sk)->ep->base.bind_addr;
> + if (sctp_list_single_entry(&bp->address_list)) {
> + addr = list_entry(bp->address_list.next,
> + struct sctp_sockaddr_entry, list);
> + if (sctp_is_any(sk, &addr->a))
> + return 1;
> + }
> + return 0;
> +}
> +
> /********************************************************************
> * 3rd Level Abstractions
> ********************************************************************/
> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
> index 865ce7b..0ba27b9 100644
> --- a/net/sctp/ipv6.c
> +++ b/net/sctp/ipv6.c
> @@ -105,6 +105,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
> addr->valid = 1;
> spin_lock_bh(&sctp_local_addr_lock);
> list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
> + sctp_addr_wq_mgmt(addr, SCTP_NEWADDR);
> spin_unlock_bh(&sctp_local_addr_lock);
> }
> break;
> @@ -115,6 +116,7 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev,
> if (addr->a.sa.sa_family == AF_INET6 &&
> ipv6_addr_equal(&addr->a.v6.sin6_addr,
> &ifa->addr)) {
> + sctp_addr_wq_mgmt(addr, SCTP_DELADDR);
> found = 1;
> addr->valid = 0;
> list_del_rcu(&addr->list);
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 152976e..d8442a4 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -636,6 +636,164 @@ static void sctp_v4_ecn_capable(struct sock *sk)
> INET_ECN_xmit(sk);
> }
>
> +void sctp_addr_wq_timeout_handler(unsigned long arg)
> +{
> + struct sctp_sockaddr_entry *addrw = NULL;
> + union sctp_addr *addr = NULL;
> + struct sctp_sock *sp = NULL;
> +
> + spin_lock_bh(&sctp_addr_wq_lock);
> +retry_wq:
> + if (list_empty(&sctp_addr_waitq)) {
> + SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: nothing in addr waitq\n");
> + spin_unlock_bh(&sctp_addr_wq_lock);
> + return;
> + }
> + addrw = list_first_entry(&sctp_addr_waitq, struct sctp_sockaddr_entry,
> + list);
> + addr = &addrw->a;
> + SCTP_DEBUG_PRINTK_IPADDR("sctp_addrwq_timo_handler: the first ent in wq %p is ",
> + " for cmd %d at entry %p\n", &sctp_addr_waitq, addr, addrw->state,
> + addrw);
> +
> + /* Now we send an ASCONF for each association */
> + /* Note. we currently don't handle link local IPv6 addressees */
> + if (addr->sa.sa_family == AF_INET6) {
> + struct in6_addr *in6 = (struct in6_addr *)&addr->v6.sin6_addr;
> +
> + if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
> + SCTP_DEBUG_PRINTK("sctp_timo_handler: link local, hence don't tell sockets\n");
> + list_del(&addrw->list);
> + kfree(addrw);
> + goto retry_wq;
> + }
> + if (ipv6_chk_addr(&init_net, in6, NULL, 0) == 0 &&
> + addrw->state == SCTP_NEWADDR) {
> + unsigned long timeo_val;
> +
> + SCTP_DEBUG_PRINTK("sctp_timo_handler: this is on DAD, trying %d sec later\n",
> + SCTP_ADDRESS_TICK_DELAY);
> + timeo_val = jiffies;
> + timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> + mod_timer(&sctp_addr_wq_timer, timeo_val);
> + spin_unlock_bh(&sctp_addr_wq_lock);
> + return;
> + }
> + }
> + list_for_each_entry(sp, &sctp_auto_asconf_splist, auto_asconf_list) {
> + struct sock *sk;
> +
> + if (sp == NULL) {
> + SCTP_DEBUG_PRINTK("addrwq_timo_handler: no socket\n");
> + continue;
> + }
sp should never be NULL?
> + sk = sctp_opt2sk(sp);
> + if (!sctp_is_ep_boundall(sk))
> + /* ignore bound-specific endpoints */
> + continue;
better to comment before if (!sctp_is_ep_boundall(sk))
/* ignore bound-specific endpoints */
if (!sctp_is_ep_boundall(sk))
continue;
> + sctp_bh_lock_sock(sk);
> + if (sctp_asconf_mgmt(sp, addrw) < 0) {
> + SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: sctp_asconf_mgmt failed\n");
> + sctp_bh_unlock_sock(sk);
> + continue;
> + }
> + sctp_bh_unlock_sock(sk);
> + }
> +
> + list_del(&addrw->list);
> + kfree(addrw);
> +
> + if (!list_empty(&sctp_addr_waitq))
> + goto retry_wq;
> +
> + spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
> +static void sctp_free_addr_wq()
> +{
> + struct sctp_sockaddr_entry *addrw = NULL;
> + struct sctp_sockaddr_entry *temp = NULL;
> +
> + spin_lock_bh(&sctp_addr_wq_lock);
> + del_timer(&sctp_addr_wq_timer);
> + list_for_each_entry_safe(addrw, temp, &sctp_addr_waitq, list) {
> + list_del(&addrw->list);
> + kfree(addrw);
> + }
> + spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
> +/* lookup the entry for the same address in the addr_waitq
> + * sctp_addr_wq MUST be locked
> + */
> +static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct sctp_sockaddr_entry *addr)
> +{
> + struct sctp_sockaddr_entry *addrw;
> +
> + list_for_each_entry(addrw, &sctp_addr_waitq, list) {
> + if (addrw->a.sa.sa_family != addr->a.sa.sa_family)
> + continue;
> + if (addrw->a.sa.sa_family == AF_INET) {
> + if (addrw->a.v4.sin_addr.s_addr ==
> + addr->a.v4.sin_addr.s_addr)
> + return addrw;
> + } else if (addrw->a.sa.sa_family == AF_INET6) {
> + if (ipv6_addr_equal(&addrw->a.v6.sin6_addr,
> + &addr->a.v6.sin6_addr))
> + return addrw;
> + }
> + }
> + return NULL;
> +}
> +
> +void sctp_addr_wq_mgmt(struct sctp_sockaddr_entry *addr, int cmd)
> +{
> + struct sctp_sockaddr_entry *addrw = NULL;
> + unsigned long timeo_val;
> + union sctp_addr *tmpaddr;
> +
> + /* first, we check if an opposite message already exist in the queue.
> + * If we found such message, it is removed.
> + * This operation is a bit stupid, but the DHCP client attaches the
> + * new address after a couple of addition and deletion of that address
> + */
> +
> + spin_lock_bh(&sctp_addr_wq_lock);
> + /* Offsets existing events in addr_wq */
> + addrw = sctp_addr_wq_lookup(addr);
> + tmpaddr = &addrw->a;
> + if (addrw) {
> + if (addrw->state != cmd) {
> + SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt offsets existing entry for %d ",
> + " in wq %p\n", addrw->state, tmpaddr,
> + &sctp_addr_waitq);
> + list_del(&addrw->list);
> + kfree(addrw);
> + }
> + spin_unlock_bh(&sctp_addr_wq_lock);
> + return;
> + }
> +
> + /* OK, we have to add the new address to the wait queue */
> + addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> + if (addrw == NULL) {
> + spin_unlock_bh(&sctp_addr_wq_lock);
> + return;
> + }
> + addrw->state = cmd;
> + list_add_tail(&addrw->list, &sctp_addr_waitq);
> + tmpaddr = &addrw->a;
> + SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt add new entry for cmd:%d ",
> + " in wq %p\n", addrw->state, tmpaddr, &sctp_addr_waitq);
> +
> + if (!timer_pending(&sctp_addr_wq_timer)) {
> + timeo_val = jiffies;
> + timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> + mod_timer(&sctp_addr_wq_timer, timeo_val);
> + }
> + spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
> /* Event handler for inet address addition/deletion events.
> * The sctp_local_addr_list needs to be protocted by a spin lock since
> * multiple notifiers (say IPv4 and IPv6) may be running at the same
> @@ -663,6 +821,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
> addr->valid = 1;
> spin_lock_bh(&sctp_local_addr_lock);
> list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
> + sctp_addr_wq_mgmt(addr, SCTP_NEWADDR);
> spin_unlock_bh(&sctp_local_addr_lock);
> }
> break;
> @@ -673,6 +832,7 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
> if (addr->a.sa.sa_family == AF_INET &&
> addr->a.v4.sin_addr.s_addr ==
> ifa->ifa_local) {
> + sctp_addr_wq_mgmt(addr, SCTP_DELADDR);
> found = 1;
> addr->valid = 0;
> list_del_rcu(&addr->list);
> @@ -1256,6 +1416,7 @@ SCTP_STATIC __init int sctp_init(void)
> /* Disable ADDIP by default. */
> sctp_addip_enable = 0;
> sctp_addip_noauth = 0;
> + sctp_auto_asconf_enable = 0;
>
> /* Enable PR-SCTP by default. */
> sctp_prsctp_enable = 1;
> @@ -1280,6 +1441,13 @@ SCTP_STATIC __init int sctp_init(void)
> spin_lock_init(&sctp_local_addr_lock);
> sctp_get_local_addr_list();
>
> + /* Initialize the address event list */
> + INIT_LIST_HEAD(&sctp_addr_waitq);
> + INIT_LIST_HEAD(&sctp_auto_asconf_splist);
> + spin_lock_init(&sctp_addr_wq_lock);
> + sctp_addr_wq_timer.expires = 0;
> + setup_timer(&sctp_addr_wq_timer, sctp_addr_wq_timeout_handler, 0);
> +
> status = sctp_v4_protosw_init();
>
> if (status)
> @@ -1344,6 +1512,7 @@ err_chunk_cachep:
> /* Exit handler for the SCTP protocol. */
> SCTP_STATIC __exit void sctp_exit(void)
> {
> + sctp_free_addr_wq();
Not sure whether is the best place to free wq, NETDEV
event may not disabled?
> /* BUG. This should probably do something useful like clean
> * up all the remaining associations and all that memory.
> */
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 3951a10..29f2b1f 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -807,6 +807,37 @@ out:
> return retval;
> }
>
> +/* set addr events to assocs in the endpoint. ep and addr_wq must be locked */
> +int
> +sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw)
> +{
> + struct sock *sk = sctp_opt2sk(sp);
> + union sctp_addr *addr = NULL;
> + int cmd;
> + int error = 0;
> +
> + addr = &addrw->a;
> + addr->v4.sin_port = htons(sp->ep->base.bind_addr.port);
> + cmd = addrw->state;
> +
> + SCTP_DEBUG_PRINTK("sctp_asconf_mgmt sp:%p\n", sp);
> + if (cmd == SCTP_NEWADDR) {
> + error = sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
> + if (error) {
> + SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_add_ip returns %d\n", error);
> + return error;
> + }
> + } else if (cmd == SCTP_DELADDR) {
> + error = sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
> + if (error) {
> + SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_del_ip returns %d\n", error);
> + return error;
> + }
> + }
> +
> + return 0;
> +}
> +
> /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
> *
> * API 8.1
> @@ -3770,6 +3801,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
> local_bh_disable();
> percpu_counter_inc(&sctp_sockets_allocated);
> sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
> + if (sctp_auto_asconf_enable) {
> + list_add_tail(&sp->auto_asconf_list,
> + &sctp_auto_asconf_splist);
> + sp->do_auto_asconf = 1;
> + } else
> + sp->do_auto_asconf = 0;
> + SCTP_DEBUG_PRINTK("sctp_init_sk sk:%p ep:%p\n", sk, ep);
> local_bh_enable();
>
> return 0;
> @@ -3779,11 +3817,17 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk)
> SCTP_STATIC void sctp_destroy_sock(struct sock *sk)
> {
> struct sctp_endpoint *ep;
> + struct sctp_sock *sp;
>
> SCTP_DEBUG_PRINTK("sctp_destroy_sock(sk: %p)\n", sk);
>
> /* Release our hold on the endpoint. */
> - ep = sctp_sk(sk)->ep;
> + sp = sctp_sk(sk);
> + ep = sp->ep;
> + if (sp->do_auto_asconf) {
> + sp->do_auto_asconf = 0;
> + list_del(&sp->auto_asconf_list);
> + }
> sctp_endpoint_free(ep);
> local_bh_disable();
> percpu_counter_dec(&sctp_sockets_allocated);
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* Re: r8169 : always copying the rx buffer to new skb
From: John Lumby @ 2011-04-21 3:52 UTC (permalink / raw)
To: Francois Romieu; +Cc: netdev, Ben Hutchings, nic_swsd
In-Reply-To: <20110420191316.GA18805@electric-eye.fr.zoreil.com>
On 04/20/11 15:13, Francois Romieu wrote:
>
> Why don't you send the patch through the mailing list ?
>
> (hint, hint)
>
based on 2.6.39-rc2.
also has changes for ethtool -
. get and set ring parms (suggested by Ben)
. get and set rx_copybreak - not sure if this is a good idea
or not,
as it's a driver parm, not NIC setting,
but there are 22 net drivers that have the parm so I thought
maybe useful.
-------------------------------------------------------------------------------------
--- linux-2.6.39-rc2FCrtl/drivers/net/r8169.c.orig 2011-04-05
21:30:43.000000000 -0400
+++ linux-2.6.39-rc2FCrtl/drivers/net/r8169.c 2011-04-20
21:34:42.000000000 -0400
@@ -56,7 +56,7 @@
(NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN)
#define TX_BUFFS_AVAIL(tp) \
- (tp->dirty_tx + NUM_TX_DESC - tp->cur_tx - 1)
+ (tp->dirty_tx + tp->num_tx_allocd - tp->cur_tx - 1)
/* Maximum number of multicast addresses to filter (vs. Rx-all-multicast).
The RTL chips use a 64 element hash table based on the Ethernet CRC. */
@@ -74,11 +74,19 @@ static const int multicast_filter_limit
#define R8169_REGS_SIZE 256
#define R8169_NAPI_WEIGHT 64
-#define NUM_TX_DESC 64 /* Number of Tx descriptor registers */
-#define NUM_RX_DESC 256 /* Number of Rx descriptor registers */
-#define RX_BUF_SIZE 1536 /* Rx Buffer size */
-#define R8169_TX_RING_BYTES (NUM_TX_DESC * sizeof(struct TxDesc))
-#define R8169_RX_RING_BYTES (NUM_RX_DESC * sizeof(struct RxDesc))
+/* #define NUM_TX_DESC 64 Number of Tx descriptor registers is
now based on variable num_tx_allocd */
+/* #define NUM_RX_DESC 256 Number of in-use Rx descriptor
registers is now based on variable num_rx_allocd :
+ see comments attached to definition of
that variable */
+#define MIN_NUM_RX_DESC 16 /* minimum number of Rx descriptor
registers with which the chip can operate ? */
+#define MAX_NUM_RX_DESC 256 /* maximum number of Rx descriptor
registers with which the chip can operate ? */
+#define MIN_NUM_TX_DESC 16 /* minimum number of Tx descriptor
registers with which the chip can operate ? */
+#define MAX_NUM_TX_DESC 64 /* maximum number of Tx descriptor
registers with which the chip can operate ? */
+
+ /* number of in-use Rx descriptors is based on
variable num_rx_allocd
+ ** and num_rx_allocd is always <= num_rx_requested value
+ */
+#define R8169_RX_RING_BYTES (tp->num_rx_requested * sizeof(struct
RxDesc))
+#define R8169_TX_RING_BYTES (tp->num_tx_requested * sizeof(struct
TxDesc))
#define RTL8169_TX_TIMEOUT (6*HZ)
#define RTL8169_PHY_TIMEOUT (10*HZ)
@@ -198,12 +206,23 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8169_p
MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
-static int rx_buf_sz = 16383;
+static const int rx_buf_sz = 16383;
+/*
+ * we set our default copybreak very high to eliminate
+ * the possibility of running out of receive buffers.
+ * HOWEVER lowering it will reduce memcpying
+ * and may improve performance significantly.
+ */
+static int rx_copybreak = 16383;
static int use_dac;
static struct {
u32 msg_enable;
-} debug = { -1 };
+} debug = {
+-1};
+#ifdef RTL8169_DEBUG
+static int simulate_alloc_fail = 0; /* set to (P-1) to fail alloc
on all except every P attempts */
+#endif /* RTL8169_DEBUG */
enum rtl_registers {
MAC0 = 0, /* Ethernet hardware address. */
MAC4 = 4,
@@ -522,16 +541,50 @@ struct rtl8169_private {
u32 msg_enable;
int chipset;
int mac_version;
- u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */
- u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */
- u32 dirty_rx;
- u32 dirty_tx;
+
+ /* Note - re number of Rx/Tx descriptor buffers allocated :
+ ** we maintain two values per ring - requested and allocd.
+ ** requested can be set by ethtool and defaults to the max
permitted
+ ** allocd is the number actually obtained at open and may be
less than
+ ** requested, but provided it is at least the minimum
required, we'll continue.
+ ** ethtool setting is asynchronous and takes effect at next open.
+ ** The num_xx_allocd count is used as modulus for
+ ** locating active entries in the array using logic like this
snippet
+ ** in rtl8169_rx_interrupt :
+ ** entry = cur_rx % num_rx_allocd;
+ ** The size of each array of per-ring-element thingy is always
the maximum.
+ **
+ ** at present, with the tx ring info embedded in private,
+ ** it is a bit silly pretending to provide a settable tx_requested,
+ ** but if desired, at expense of extra ptr deref,
+ ** could change it to an array of pointers
+ */
+ u32 num_tx_requested; /* num Tx buffers requested */
+ u32 num_rx_requested; /* num Rx buffers requested */
+ u32 num_tx_allocd; /* num Tx descriptor buffers allocated */
+ u32 num_rx_allocd; /* num Rx descriptor buffers allocated */
+
+ /* note - the following two counters are monotonically-ascending
- can be thought of
+ ** as the count of number of buffers which the hardware
has accessed.
+ */
+ u32 cur_rx; /* Index of next Rx pkt. */
+ u32 cur_tx; /* Index of next Tx pkt. */
+
+ u32 totl_rx_replenished; /* monotonically-ascending count of
replenished buffers */
+ u32 replenish_rx_cursor; /* Index of next Rx pkt. to replenish
(modulo, not monotonic) */
+ /* the following counts pkts copied as opposed to uncopied
(unhooked) */
+ /* note - count of uncopied packets = cur_rx
- copied_rx_pkt_count */
+ u32 copied_rx_pkt_count; /* total pkts copied to new skb */
+ u32 totl_rx_alloc_fail; /* rx alloc failures */
+ u32 dirty_tx; /* monotonic count of transmitted packets (or
fragments?) */
struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */
struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */
dma_addr_t TxPhyAddr;
dma_addr_t RxPhyAddr;
- void *Rx_databuff[NUM_RX_DESC]; /* Rx data buffers */
- struct ring_info tx_skb[NUM_TX_DESC]; /* Tx data buffers */
+ struct sk_buff *Rx_skbuff[MAX_NUM_RX_DESC]; /* Rx data buffers */
+ struct ring_info tx_skb[MAX_NUM_TX_DESC]; /* Tx data buffers */
+
+ unsigned align;
struct timer_list timer;
u16 cp_cmd;
u16 intr_event;
@@ -569,6 +622,14 @@ struct rtl8169_private {
MODULE_AUTHOR("Realtek and the Linux r8169 crew
<netdev@vger.kernel.org>");
MODULE_DESCRIPTION("RealTek RTL-8169 Gigabit Ethernet driver");
+module_param(rx_copybreak, int, 0);
+MODULE_PARM_DESC(rx_copybreak, "Copy breakpoint for
copy-only-tiny-frames");
+#ifdef RTL8169_DEBUG
+module_param(simulate_alloc_fail, int, 0);
+MODULE_PARM_DESC(simulate_alloc_fail,
+ "set to (2**P - 1) eg 15, to fail alloc rx skb on all except
every 2**P attempts");
+#endif /* RTL8169_DEBUG */
+
module_param(use_dac, int, 0);
MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot.");
module_param_named(debug, debug.msg_enable, int, 0);
@@ -583,7 +644,7 @@ static int rtl8169_open(struct net_devic
static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
struct net_device *dev);
static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance);
-static int rtl8169_init_ring(struct net_device *dev);
+static int rtl8169_init_ring(struct rtl8169_private *tp);
static void rtl_hw_start(struct net_device *dev);
static int rtl8169_close(struct net_device *dev);
static void rtl_set_rx_mode(struct net_device *dev);
@@ -1242,6 +1303,15 @@ static int rtl8169_set_settings(struct n
spin_lock_irqsave(&tp->lock, flags);
ret = rtl8169_set_speed(dev,
cmd->autoneg, cmd->speed, cmd->duplex, cmd->advertising);
+
+ /* check that ethtool has set a copybreak value before accepting
it */
+ if ( (cmd->supported & (SUPPORTED_cmd_extension |
+ SUPPORTED_cmd_extension_rx_copybreak))
+ && (cmd->rx_copybreak <= rx_buf_sz) ) {
+ rx_copybreak = cmd->rx_copybreak;
+ netif_info(tp, drv, dev, "set rx_copybreak to %d\n",
+ rx_copybreak);
+ }
spin_unlock_irqrestore(&tp->lock, flags);
return ret;
@@ -1254,6 +1324,49 @@ static u32 rtl8169_get_rx_csum(struct ne
return tp->cp_cmd & RxChkSum;
}
+static void rtl8169_get_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
+{
+ struct rtl8169_private *tp = netdev_priv(netdev);
+
+ ring->rx_max_pending = MAX_NUM_RX_DESC;
+ ring->tx_max_pending = MAX_NUM_TX_DESC;
+ ring->rx_mini_max_pending = 0;
+ ring->rx_jumbo_max_pending = 0;
+ ring->rx_pending = tp->num_rx_allocd;
+ ring->tx_pending = tp->num_tx_allocd;
+ ring->rx_mini_pending = 0;
+ ring->rx_jumbo_pending = 0;
+}
+
+static int rtl8169_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring)
+{
+ struct rtl8169_private *tp = netdev_priv(netdev);
+
+ if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
+ return -EINVAL;
+
+ /* I am not sure about closing and opening the NIC here
+ * so will leave the change pending for next open
+ */
+
+ tp->num_rx_requested = ((ring->rx_pending < MIN_NUM_RX_DESC) ?
+ MIN_NUM_RX_DESC :
+ ((ring->rx_pending > MAX_NUM_RX_DESC) ?
+ MAX_NUM_RX_DESC : ring->rx_pending));
+ tp->num_tx_requested = ((ring->tx_pending < MIN_NUM_TX_DESC) ?
+ MIN_NUM_TX_DESC :
+ ((ring->tx_pending > MAX_NUM_TX_DESC) ?
+ MAX_NUM_TX_DESC : ring->tx_pending));
+
+ netif_info(tp, drv, netdev,
+ "Ring sizes to be requested at next open: num rx: %d, num tx
%d\n",
+ tp->num_rx_requested, tp->num_tx_requested);
+
+ return 0;
+}
+
static int rtl8169_set_rx_csum(struct net_device *dev, u32 data)
{
struct rtl8169_private *tp = netdev_priv(dev);
@@ -1351,6 +1464,13 @@ static int rtl8169_get_settings(struct n
rc = tp->get_settings(dev, cmd);
+ /* inform about returning extended info - rx_copybreak
+ * and initialize so we can detect if set to new val by ethtool
+ */
+ cmd->rx_copybreak = rx_copybreak;
+ cmd->supported |= SUPPORTED_cmd_extension;
+ cmd->supported &= ~SUPPORTED_cmd_extension_rx_copybreak;
+
spin_unlock_irqrestore(&tp->lock, flags);
return rc;
}
@@ -1397,6 +1517,11 @@ static const char rtl8169_gstrings[][ETH
"multicast",
"tx_aborted",
"tx_underrun",
+ /* extras maintained in driver code */
+ "tot rx intrpts",
+ "tot rx copied",
+ "tot rx replenished",
+ "tot rx alloc_fail"
};
static int rtl8169_get_sset_count(struct net_device *dev, int sset)
@@ -1472,9 +1597,15 @@ static void rtl8169_get_ethtool_stats(st
data[10] = le32_to_cpu(tp->counters.rx_multicast);
data[11] = le16_to_cpu(tp->counters.tx_aborted);
data[12] = le16_to_cpu(tp->counters.tx_underun);
+ /* extras maintained in driver code */
+ data[13] = tp->cur_rx;
+ data[14] = tp->copied_rx_pkt_count;
+ data[15] = tp->totl_rx_replenished;
+ data[16] = tp->totl_rx_alloc_fail;
}
-static void rtl8169_get_strings(struct net_device *dev, u32 stringset,
u8 *data)
+static void rtl8169_get_strings(struct net_device *dev, u32 stringset,
+ u8 * data)
{
switch(stringset) {
case ETH_SS_STATS:
@@ -1516,6 +1647,8 @@ static const struct ethtool_ops rtl8169_
.get_rx_csum = rtl8169_get_rx_csum,
.set_rx_csum = rtl8169_set_rx_csum,
.set_tx_csum = ethtool_op_set_tx_csum,
+ .get_ringparam = rtl8169_get_ringparam,
+ .set_ringparam = rtl8169_set_ringparam,
.set_sg = ethtool_op_set_sg,
.set_tso = ethtool_op_set_tso,
.get_regs = rtl8169_get_regs,
@@ -3060,6 +3193,10 @@ rtl8169_init_one(struct pci_dev *pdev, c
tp->pci_dev = pdev;
tp->msg_enable = netif_msg_init(debug.msg_enable, R8169_MSG_DEFAULT);
+ tp->num_rx_allocd = tp->num_tx_allocd = 0;
+ tp->num_rx_requested = MAX_NUM_RX_DESC;
+ tp->num_tx_requested = MAX_NUM_TX_DESC;
+
mii = &tp->mii;
mii->dev = dev;
mii->mdio_read = rtl_mdio_read;
@@ -3229,6 +3366,7 @@ rtl8169_init_one(struct pci_dev *pdev, c
dev->features |= NETIF_F_HW_VLAN_TX_RX | NETIF_F_GRO;
tp->intr_mask = 0xffff;
+ tp->align = cfg->align;
tp->hw_start = cfg->hw_start;
tp->intr_event = cfg->intr_event;
tp->napi_event = cfg->napi_event;
@@ -3326,7 +3464,7 @@ static int rtl8169_open(struct net_devic
if (!tp->RxDescArray)
goto err_free_tx_0;
- retval = rtl8169_init_ring(dev);
+ retval = rtl8169_init_ring(tp);
if (retval < 0)
goto err_free_rx_1;
@@ -4071,14 +4209,15 @@ static inline void rtl8169_make_unusable
desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask);
}
-static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
- void **data_buff, struct RxDesc *desc)
+static void rtl8169_free_rx_skb(struct rtl8169_private *tp,
+ struct sk_buff **sk_buff, struct RxDesc *desc)
{
- dma_unmap_single(&tp->pci_dev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
- DMA_FROM_DEVICE);
+ struct pci_dev *pdev = tp->pci_dev;
- kfree(*data_buff);
- *data_buff = NULL;
+ dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), rx_buf_sz,
+ PCI_DMA_FROMDEVICE);
+ dev_kfree_skb(*sk_buff); /* also frees the data buffer! */
+ *sk_buff = NULL;
rtl8169_make_unusable_by_asic(desc);
}
@@ -4102,28 +4241,25 @@ static inline void *rtl8169_align(void *
return (void *)ALIGN((long)data, 16);
}
-static struct sk_buff *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
- struct RxDesc *desc)
+static struct sk_buff *rtl8169_alloc_rx_skb(struct rtl8169_private *tp,
+ struct RxDesc *desc, gfp_t gfp)
{
- void *data;
+ struct sk_buff *skb;
dma_addr_t mapping;
struct device *d = &tp->pci_dev->dev;
struct net_device *dev = tp->dev;
- int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
+ unsigned int pad;
- data = kmalloc_node(rx_buf_sz, GFP_KERNEL, node);
- if (!data)
- return NULL;
+ pad = tp->align ? tp->align : NET_IP_ALIGN;
- if (rtl8169_align(data) != data) {
- kfree(data);
- data = kmalloc_node(rx_buf_sz + 15, GFP_KERNEL, node);
- if (!data)
- return NULL;
- }
+ skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
+ if (!skb)
+ goto err_out;
+
+ skb_reserve(skb,
+ tp->align ? ((pad - 1) & (unsigned long)skb->data) : pad);
- mapping = dma_map_single(d, rtl8169_align(data), rx_buf_sz,
- DMA_FROM_DEVICE);
+ mapping = dma_map_single(d, skb->data, rx_buf_sz, DMA_FROM_DEVICE);
if (unlikely(dma_mapping_error(d, mapping))) {
if (net_ratelimit())
netif_err(tp, drv, tp->dev, "Failed to map RX DMA!\n");
@@ -4131,23 +4267,25 @@ static struct sk_buff *rtl8169_alloc_rx_
}
rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
- return data;
+out:
+ return skb;
err_out:
- kfree(data);
- return NULL;
+ rtl8169_make_unusable_by_asic(desc);
+ goto out;
}
static void rtl8169_rx_clear(struct rtl8169_private *tp)
{
unsigned int i;
- for (i = 0; i < NUM_RX_DESC; i++) {
- if (tp->Rx_databuff[i]) {
- rtl8169_free_rx_databuff(tp, tp->Rx_databuff + i,
+ for (i = 0; i < tp->num_rx_allocd; i++) {
+ if (tp->Rx_skbuff[i]) {
+ rtl8169_free_rx_skb(tp, tp->Rx_skbuff + i,
tp->RxDescArray + i);
}
}
+ tp->num_rx_allocd = 0; /* no rx descriptors allocated any more ! */
}
static inline void rtl8169_mark_as_last_descriptor(struct RxDesc *desc)
@@ -4155,47 +4293,92 @@ static inline void rtl8169_mark_as_last_
desc->opts1 |= cpu_to_le32(RingEnd);
}
-static int rtl8169_rx_fill(struct rtl8169_private *tp)
+/* rtl8169_rx_fill :allocate num_to_alloc rx skb buffers to rx
descriptors
+ * starting with descriptor first_desc.
+ * this function operates in one of two slightly different modes,
+ * depending on whether the num_replenished parm is zero or not :
+ * zero - traverse a fixed number of buffers specified by
num_to_alloc,
+ * allocating those which are empty;
+ * non-zero - traverse as many buffers as needed
+ * to replenish num_replenished empty buffers,
+ * and update the parm with number actually replenished.
+ * in each case, stop if unable to allocate,
+ * and in each case return number of buffers traversed.
+ */
+static u32 rtl8169_rx_fill(struct rtl8169_private *tp, u32 first_desc,
+ u32 num_to_alloc, u32 * num_replenished, gfp_t gfp)
{
- unsigned int i;
+ unsigned int this_desc_index; /* loop through on this */
+ u32 count_allocd; /* count allocd */
+ u32 num_traversed; /* count num traversed */
+
+ for (count_allocd = 0, num_traversed = 0, this_desc_index = first_desc;
+ ((num_replenished && (count_allocd < *num_replenished))
+ || (num_traversed < num_to_alloc)
+ ); num_traversed++) {
+ struct sk_buff *skb;
- for (i = 0; i < NUM_RX_DESC; i++) {
- void *data;
+ if (tp->Rx_skbuff[this_desc_index] == (struct sk_buff *)0) {
/* bypass if allocd */
- if (tp->Rx_databuff[i])
- continue;
+ skb =
+ rtl8169_alloc_rx_skb(tp,
+ tp->RxDescArray +
+ this_desc_index, gfp);
+ if (!skb)
+ break;
- data = rtl8169_alloc_rx_data(tp, tp->RxDescArray + i);
- if (!data) {
- rtl8169_make_unusable_by_asic(tp->RxDescArray + i);
- goto err_out;
- }
- tp->Rx_databuff[i] = data;
+ tp->Rx_skbuff[this_desc_index] = skb;
+ count_allocd++;
}
- rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
- return 0;
+ /* increment this_desc_index allowing for modulo num_rx_allocd
if latter is > 0
+ * also ensuring we stop after one complete circuit
+ */
+ this_desc_index++;
+ if (this_desc_index == tp->num_rx_allocd) {
+ this_desc_index = 0;
+ }
+ if (this_desc_index == first_desc) {
+ break;
+ }
+ }
-err_out:
- rtl8169_rx_clear(tp);
- return -ENOMEM;
+ if (num_replenished)
+ *num_replenished = count_allocd;
+ return num_traversed;
}
static void rtl8169_init_ring_indexes(struct rtl8169_private *tp)
{
- tp->dirty_tx = tp->dirty_rx = tp->cur_tx = tp->cur_rx = 0;
+ tp->dirty_tx = tp->totl_rx_replenished = tp->totl_rx_alloc_fail =
+ tp->cur_tx = tp->cur_rx = tp->replenish_rx_cursor = 0;
}
-static int rtl8169_init_ring(struct net_device *dev)
+static int rtl8169_init_ring(struct rtl8169_private *tp)
{
- struct rtl8169_private *tp = netdev_priv(dev);
rtl8169_init_ring_indexes(tp);
- memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
- memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
+ memset(tp->tx_skb, 0x0, MAX_NUM_TX_DESC * sizeof(struct ring_info));
+ memset(tp->Rx_skbuff, 0x0, MAX_NUM_RX_DESC * sizeof(struct sk_buff *));
+ tp->copied_rx_pkt_count = 0;
+
+ /* see comment preceding defn of num_tx_requested */
+ tp->num_tx_allocd = tp->num_tx_requested;
+ tp->num_rx_allocd =
+ rtl8169_rx_fill(tp, 0, (u32) tp->num_rx_requested, 0, GFP_KERNEL);
+ printk(KERN_INFO "%s num_rx_requested= %d num_rx_allocd= %d\n",
+ MODULENAME, (u32) tp->num_rx_requested, tp->num_rx_allocd);
+ if (tp->num_rx_allocd < MIN_NUM_RX_DESC)
+ goto err_out;
+
+ rtl8169_mark_as_last_descriptor(tp->RxDescArray + tp->num_rx_allocd
- 1);
- return rtl8169_rx_fill(tp);
+ return 0;
+
+err_out:
+ rtl8169_rx_clear(tp);
+ return -ENOMEM;
}
static void rtl8169_unmap_tx_skb(struct device *d, struct ring_info
*tx_skb,
@@ -4217,7 +4400,7 @@ static void rtl8169_tx_clear_range(struc
unsigned int i;
for (i = 0; i < n; i++) {
- unsigned int entry = (start + i) % NUM_TX_DESC;
+ unsigned int entry = (start + i) % tp->num_tx_allocd;
struct ring_info *tx_skb = tp->tx_skb + entry;
unsigned int len = tx_skb->len;
@@ -4237,7 +4420,7 @@ static void rtl8169_tx_clear_range(struc
static void rtl8169_tx_clear(struct rtl8169_private *tp)
{
- rtl8169_tx_clear_range(tp, tp->dirty_tx, NUM_TX_DESC);
+ rtl8169_tx_clear_range(tp, tp->dirty_tx, tp->num_tx_allocd);
tp->cur_tx = tp->dirty_tx = 0;
}
@@ -4310,7 +4493,7 @@ static void rtl8169_reset_task(struct wo
rtl8169_rx_interrupt(dev, tp, tp->mmio_addr, ~(u32)0);
rtl8169_tx_clear(tp);
- if (tp->dirty_rx == tp->cur_rx) {
+ if (tp->totl_rx_replenished == tp->cur_rx) {
rtl8169_init_ring_indexes(tp);
rtl_hw_start(dev);
netif_wake_queue(dev);
@@ -4350,7 +4533,7 @@ static int rtl8169_xmit_frags(struct rtl
u32 status, len;
void *addr;
- entry = (entry + 1) % NUM_TX_DESC;
+ entry = (entry + 1) % tp->num_tx_allocd;
txd = tp->TxDescArray + entry;
len = frag->size;
@@ -4364,7 +4547,9 @@ static int rtl8169_xmit_frags(struct rtl
}
/* anti gcc 2.95.3 bugware (sic) */
- status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
+ status =
+ opts1 | len | (RingEnd *
+ !((entry + 1) % tp->num_tx_allocd));
txd->opts1 = cpu_to_le32(status);
txd->addr = cpu_to_le64(mapping);
@@ -4408,7 +4593,7 @@ static netdev_tx_t rtl8169_start_xmit(st
struct net_device *dev)
{
struct rtl8169_private *tp = netdev_priv(dev);
- unsigned int entry = tp->cur_tx % NUM_TX_DESC;
+ unsigned int entry = tp->cur_tx % tp->num_tx_allocd;
struct TxDesc *txd = tp->TxDescArray + entry;
void __iomem *ioaddr = tp->mmio_addr;
struct device *d = &tp->pci_dev->dev;
@@ -4418,7 +4603,8 @@ static netdev_tx_t rtl8169_start_xmit(st
int frags;
if (unlikely(TX_BUFFS_AVAIL(tp) < skb_shinfo(skb)->nr_frags)) {
- netif_err(tp, drv, dev, "BUG! Tx Ring full when queue awake!\n");
+ netif_err(tp, drv, dev,
+ "BUG! Tx Ring full when queue awake!\n");
goto err_stop_0;
}
@@ -4452,7 +4638,7 @@ static netdev_tx_t rtl8169_start_xmit(st
wmb();
/* anti gcc 2.95.3 bugware (sic) */
- status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
+ status = opts1 | len | (RingEnd * !((entry + 1) % tp->num_tx_allocd));
txd->opts1 = cpu_to_le32(status);
tp->cur_tx += frags + 1;
@@ -4512,11 +4698,13 @@ static void rtl8169_pcierr_interrupt(str
pci_write_config_word(pdev, PCI_STATUS,
pci_status & (PCI_STATUS_DETECTED_PARITY |
- PCI_STATUS_SIG_SYSTEM_ERROR | PCI_STATUS_REC_MASTER_ABORT |
- PCI_STATUS_REC_TARGET_ABORT | PCI_STATUS_SIG_TARGET_ABORT));
+ PCI_STATUS_SIG_SYSTEM_ERROR |
+ PCI_STATUS_REC_MASTER_ABORT |
+ PCI_STATUS_REC_TARGET_ABORT |
+ PCI_STATUS_SIG_TARGET_ABORT));
/* The infamous DAC f*ckup only happens at boot time */
- if ((tp->cp_cmd & PCIDAC) && !tp->dirty_rx && !tp->cur_rx) {
+ if ((tp->cp_cmd & PCIDAC) && !tp->totl_rx_replenished && !tp->cur_rx) {
void __iomem *ioaddr = tp->mmio_addr;
netif_info(tp, intr, dev, "disabling PCI DAC\n");
@@ -4541,7 +4729,7 @@ static void rtl8169_tx_interrupt(struct
tx_left = tp->cur_tx - dirty_tx;
while (tx_left > 0) {
- unsigned int entry = dirty_tx % NUM_TX_DESC;
+ unsigned int entry = dirty_tx % tp->num_tx_allocd;
struct ring_info *tx_skb = tp->tx_skb + entry;
u32 status;
@@ -4597,29 +4785,110 @@ static inline void rtl8169_rx_csum(struc
skb_checksum_none_assert(skb);
}
-static struct sk_buff *rtl8169_try_rx_copy(void *data,
- struct rtl8169_private *tp,
- int pkt_size,
- dma_addr_t addr)
+/* rtl8169_rx_deliver : delivers one rx skb up to higher netif layer
+ * and copies or replenishes the skb as needed.
+ * @tp -> private cb for this NIC
+ * @entry == index of rx descriptor in ring
+ * @polling == whether polling or not (see comments for rx_interrupt)
+ * we guarantee that the received packet will be passed up to the
higher layer.
+ * we also try to ensure that a buffer is available for next receive
on this skb,
+ * but do not guarantee that.
+ * This function does not write or read to the asic registers
+ * and does not return any return code - work is reported via the
descriptors.
+ * "original" skb means the one previously in the ring
+ * "returned" skb means the one passed up
+ * these may be the same or different :
+ * if packet size sufficiently small relative to rx_copybreak mod
parm,
+ * then try to copy the entire active skb to a new one, and,
+ * if successful, return the new and leave the original as active.
+ * otherwise, return the original and try to replenish the ring.
+ */
+
+void rtl8169_rx_deliver(struct rtl8169_private *tp, unsigned int entry,
+ int polling)
{
- struct sk_buff *skb;
- struct device *d = &tp->pci_dev->dev;
+ struct RxDesc *desc;
+ u32 opts1;
+ struct sk_buff *original_skb;
+ struct sk_buff *returned_skb;
+ dma_addr_t addr;
+ int pkt_size;
+ struct pci_dev *pdev;
+
+ desc = tp->RxDescArray + entry;
+ opts1 = le32_to_cpu(desc->opts1);
+ original_skb = tp->Rx_skbuff[entry];
+ addr = le64_to_cpu(desc->addr);
+ pkt_size = (opts1 & 0x00001FFF) - 4;
+ pdev = tp->pci_dev;
+
+ dprintk
+ ("rtl8169_rx_deliver entry= %d opts1= 0x%X pkt_size= %d
polling= 0x%X\n",
+ entry, opts1, pkt_size, polling);
+
+ if (pkt_size < rx_copybreak) {
+ returned_skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
+ if (returned_skb) {
+ dma_sync_single_for_cpu(&pdev->dev, addr, pkt_size,
+ PCI_DMA_FROMDEVICE);
+ prefetch(original_skb->data);
+ memcpy(returned_skb->data, original_skb->data,
+ pkt_size);
+ dma_sync_single_for_device(&pdev->dev, addr, pkt_size,
+ PCI_DMA_FROMDEVICE);
+ rtl8169_mark_to_asic(desc, rx_buf_sz);
+ tp->totl_rx_replenished++;
+ tp->copied_rx_pkt_count++;
+ } else {
+ /* can't replenish (out of storage ) */
+ rtl8169_make_unusable_by_asic(desc);
+ dma_unmap_single(&pdev->dev, addr, rx_buf_sz,
+ PCI_DMA_FROMDEVICE);
+ tp->Rx_skbuff[entry] = NULL;
+ returned_skb = original_skb;
+ tp->totl_rx_alloc_fail++;
+ }
+ } else {
+ returned_skb = original_skb;
+ dma_unmap_single(&pdev->dev, addr, rx_buf_sz,
+ PCI_DMA_FROMDEVICE);
+ /* following may fail in which case it sets the skbuff ptr to 0 */
+#ifdef RTL8169_DEBUG
+ /* to simulate alloc failure every n attempts */
+ if (simulate_alloc_fail && ((simulate_alloc_fail & entry) != 0))
+ tp->Rx_skbuff[entry] = 0;
+ else
+#endif /* RTL8169_DEBUG */
+ tp->Rx_skbuff[entry] =
+ rtl8169_alloc_rx_skb(tp, desc, GFP_ATOMIC);
+ if (tp->Rx_skbuff[entry]) {
+ tp->totl_rx_replenished++;
+ } else {
+ rtl8169_make_unusable_by_asic(desc);
+ tp->totl_rx_alloc_fail++;
+ }
+ }
- data = rtl8169_align(data);
- dma_sync_single_for_cpu(d, addr, pkt_size, DMA_FROM_DEVICE);
- prefetch(data);
- skb = netdev_alloc_skb_ip_align(tp->dev, pkt_size);
- if (skb)
- memcpy(skb->data, data, pkt_size);
- dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
+ rtl8169_rx_csum(returned_skb, opts1);
+ skb_put(returned_skb, pkt_size);
+ returned_skb->protocol = eth_type_trans(returned_skb, tp->dev);
+
+ rtl8169_rx_vlan_tag(desc, returned_skb);
+
+ if (likely(polling)) {
+ napi_gro_receive(&tp->napi, returned_skb);
+ dprintk("rtl8169_rx_deliver explicit napi_gro_receive\n");
+ } else {
+ netif_rx(returned_skb);
+ dprintk("rtl8169_rx_deliver explicit netif_rx\n");
+ }
- return skb;
}
/*
* Warning : rtl8169_rx_interrupt() might be called :
* 1) from NAPI (softirq) context
- * (polling = 1 : we should call netif_receive_skb())
+ * (polling = 1 : we should call napi_gro_receive())
* 2) from process context (rtl8169_reset_task())
* (polling = 0 : we must call netif_rx() instead)
*/
@@ -4628,71 +4897,55 @@ static int rtl8169_rx_interrupt(struct n
void __iomem *ioaddr, u32 budget)
{
unsigned int cur_rx, rx_left;
- unsigned int count;
+
+ unsigned int replenish_rx_cursor_delta; /* amount by which to
advance cursor */
+ unsigned int count; /* number of completed buffers handled in
this call */
+ unsigned int number_to_replenish; /* num buffers to replenish after
delivering */
int polling = (budget != ~(u32)0) ? 1 : 0;
cur_rx = tp->cur_rx;
- rx_left = NUM_RX_DESC + tp->dirty_rx - cur_rx;
+ rx_left = tp->num_rx_allocd + tp->totl_rx_replenished - cur_rx;
rx_left = min(rx_left, budget);
for (; rx_left > 0; rx_left--, cur_rx++) {
- unsigned int entry = cur_rx % NUM_RX_DESC;
+ unsigned int entry = cur_rx % tp->num_rx_allocd;
struct RxDesc *desc = tp->RxDescArray + entry;
- u32 status;
+ u32 opts1;
rmb();
- status = le32_to_cpu(desc->opts1);
+ opts1 = le32_to_cpu(desc->opts1);
- if (status & DescOwn)
+ if (opts1 & DescOwn)
break;
- if (unlikely(status & RxRES)) {
- netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
- status);
+ if (unlikely(opts1 & RxRES)) {
+ netif_info(tp, rx_err, dev, "Rx ERROR. opts1 = %08x\n",
+ opts1);
dev->stats.rx_errors++;
- if (status & (RxRWT | RxRUNT))
+ if (opts1 & (RxRWT | RxRUNT))
dev->stats.rx_length_errors++;
- if (status & RxCRC)
+ if (opts1 & RxCRC)
dev->stats.rx_crc_errors++;
- if (status & RxFOVF) {
+ if (opts1 & RxFOVF) {
rtl8169_schedule_work(dev, rtl8169_reset_task);
dev->stats.rx_fifo_errors++;
}
rtl8169_mark_to_asic(desc, rx_buf_sz);
} else {
- struct sk_buff *skb;
- dma_addr_t addr = le64_to_cpu(desc->addr);
- int pkt_size = (status & 0x00001FFF) - 4;
+ int pkt_size = (opts1 & 0x00001FFF) - 4;
/*
* The driver does not support incoming fragmented
* frames. They are seen as a symptom of over-mtu
* sized frames.
*/
- if (unlikely(rtl8169_fragmented_frame(status))) {
+ if (unlikely(rtl8169_fragmented_frame(opts1))) {
dev->stats.rx_dropped++;
dev->stats.rx_length_errors++;
rtl8169_mark_to_asic(desc, rx_buf_sz);
continue;
}
- skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
- tp, pkt_size, addr);
- rtl8169_mark_to_asic(desc, rx_buf_sz);
- if (!skb) {
- dev->stats.rx_dropped++;
- continue;
- }
-
- rtl8169_rx_csum(skb, status);
- skb_put(skb, pkt_size);
- skb->protocol = eth_type_trans(skb, dev);
-
- rtl8169_rx_vlan_tag(desc, skb);
-
- if (likely(polling))
- napi_gro_receive(&tp->napi, skb);
- else
- netif_rx(skb);
+ rtl8169_rx_deliver(tp, entry, polling);
dev->stats.rx_bytes += pkt_size;
dev->stats.rx_packets++;
@@ -4706,10 +4959,36 @@ static int rtl8169_rx_interrupt(struct n
}
}
- count = cur_rx - tp->cur_rx;
+ replenish_rx_cursor_delta = count = cur_rx - tp->cur_rx;
tp->cur_rx = cur_rx;
- tp->dirty_rx += count;
+ /* try to replenish buffers that any previous rtl8169_rx_deliver
+ * failed to. Note that these may not be contiguous -
+ * alloc success and fail may be interleaved.
+ * replenish_rx_cursor marks the earliest unreplenished.
+ */
+
+ number_to_replenish = (tp->cur_rx - tp->totl_rx_replenished);
+
+ if (number_to_replenish > 0) {
+ replenish_rx_cursor_delta =
+ rtl8169_rx_fill(tp, tp->replenish_rx_cursor, 0,
+ &number_to_replenish, GFP_ATOMIC);
+ if (!replenish_rx_cursor_delta)
+ netif_info(tp, intr, dev, "no Rx buffer allocated\n");
+ tp->totl_rx_replenished += number_to_replenish;
+ }
+ tp->replenish_rx_cursor =
+ ((tp->replenish_rx_cursor +
+ replenish_rx_cursor_delta) % tp->num_rx_allocd);
+
+ /*
+ * exhaustion of available buffers may kill the Rx process.
+ * the previous code tries to replenish but may fail. To prevent that,
+ * set or let default rx_copybreak to maximum value to copy every
buffer.
+ */
+ if ((tp->totl_rx_replenished + tp->num_rx_allocd) == tp->cur_rx)
+ netif_emerg(tp, intr, dev, "Rx buffers exhausted\n");
return count;
}
^ permalink raw reply
* Re: r8169 : always copying the rx buffer to new skb
From: John Lumby @ 2011-04-21 3:41 UTC (permalink / raw)
To: Francois Romieu; +Cc: netdev, Ben Hutchings, nic_swsd
In-Reply-To: <20110420191316.GA18805@electric-eye.fr.zoreil.com>
On 04/20/11 15:13, Francois Romieu wrote:
> John Lumby<johnlumby@hotmail.com> :
> [...]
>> I've verified that [...] and everything works just
>> fine,
> Did your testing account for some memory pressure ?
No, something I need to do. In my tests paging rate was light.
I will try a squeeze test to see what happens.
>> So do we really need to be that concerned about occasional
>> allocation failure?
> See $search_engine +r8169 high order memory allocation failure.
The bug reports I see are all related to the problem that occurred at
open in the days when the init_ring() requested GFP_ATOMIC *and*
insisted that *all* 256 buffers must be obtained or else failed the open.
I take your point, but I think it is different in the interrupt case
- the rx_interrupt does not consider a single alloc failure to trigger
any visible failure. In the old code (and my patched), it just tries
again next time. In the current code, it can't do that since it has
no skb to pass up, so it drops the packet.
Have I missed some other bug reports on alloc failures?
But I also realize that (I guess) few sysadmins may have ever set the
rx_copybreak down low in the days when the parameter was there, so
maybe we just don't really know how many problems would arise with
it. So I would propose leaving it to default to 16383 as before.
> Why don't you send the patch through the mailing list ?
>
> (hint, hint)
In my next post. Still on 2.6.39-rc2 and too late for me to try merge
to latest right now, hope still useful.
>>
>> I'm just not sure I
>> see why that has to imply the always_copy.
>>
>>
>> Because of high-order memory allocation failure under memory pressure and
>> memory wastage.
I think memory allocation failure can occur regardless of code design
- old, current and my patched code all do the same amount of dynamic
alloc'ing of skbs. It's just the initial set in the ring which is
different. In the current code, alloc failure => we drop packets.
Maybe dropping many packets might cause more trouble owing to
re-transmits than not replenishing the rx buffers temporarily.
Although I guess the NIC stays up with the current code. Not really sure.
>>
>> Btw several 816x have limited jumbo frames abilities.
Is that point that there is some special significance for memory
allocation? Not sure about that - I mean the total data rate per sec
is limited to 1Gbit nominal in each direction regardless of size of data
buffers (I think). So naiively jumbo will not put more memory
pressure than 1500 will it? Or are you thinking about testing?
John Lumby
^ permalink raw reply
* suspect locking in net/irda/iriap.c
From: Dave Jones @ 2011-04-21 3:40 UTC (permalink / raw)
To: netdev; +Cc: Samuel Ortiz
I just hit this..
=============================================
[ INFO: possible recursive locking detected ]
2.6.39-rc4+ #13
---------------------------------------------
trinity/11336 is trying to acquire lock:
(&(&hashbin->hb_spinlock)->rlock){......}, at: [<ffffffffa0653074>] irias_seq_show+0x4f/0x13b [irda]
but task is already holding lock:
(&(&hashbin->hb_spinlock)->rlock){......}, at: [<ffffffffa0653669>] irias_seq_start+0x1e/0x59 [irda]
other info that might help us debug this:
2 locks held by trinity/11336:
#0: (&p->lock){+.+.+.}, at: [<ffffffff811562b6>] seq_read+0x3d/0x367
#1: (&(&hashbin->hb_spinlock)->rlock){......}, at: [<ffffffffa0653669>] irias_seq_start+0x1e/0x59 [irda]
stack backtrace:
Pid: 11336, comm: trinity Not tainted 2.6.39-rc4+ #13
Call Trace:
[<ffffffff8108a7fd>] __lock_acquire+0x89b/0xc81
[<ffffffffa0653074>] ? irias_seq_show+0x4f/0x13b [irda]
[<ffffffff8108b0e3>] lock_acquire+0x108/0x133
[<ffffffffa0653074>] ? irias_seq_show+0x4f/0x13b [irda]
[<ffffffff814cc14b>] _raw_spin_lock+0x40/0x73
[<ffffffffa0653074>] ? irias_seq_show+0x4f/0x13b [irda]
[<ffffffffa0653074>] irias_seq_show+0x4f/0x13b [irda]
[<ffffffff811564fe>] seq_read+0x285/0x367
[<ffffffff81156279>] ? seq_lseek+0xe8/0xe8
[<ffffffff8118a192>] proc_reg_read+0x90/0xaf
[<ffffffff8113ab26>] vfs_read+0xac/0xf3
[<ffffffff8113c043>] ? fget_light+0x3a/0xa1
[<ffffffff8113abba>] sys_read+0x4d/0x74
[<ffffffff814d2d02>] system_call_fastpath+0x16/0x1b
irias_seq_start does this ..
996 {
997 spin_lock_irq(&irias_objects->hb_spinlock);
998
999 return *pos ? irias_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1000 }
and then unlocks it in irias_seq_stop.
meanwhile in the seq_show iterator ...
1029 /* Careful for priority inversions here !
1030 * All other uses of attrib spinlock are independent of
1031 * the object spinlock, so we are safe. Jean II */
1032 spin_lock(&obj->attribs->hb_spinlock);
My reading of that comment suggests that the two locks aren't the same,
so is this just missing a lockdep annotation ?
Dave
^ permalink raw reply
* Re: [RFC PATCH 0/2] Multiqueue support for qemu(virtio-net)
From: Jason Wang @ 2011-04-21 3:33 UTC (permalink / raw)
To: Krishna Kumar2; +Cc: kvm, mst, netdev, Jason Wang, rusty, qemu-devel
In-Reply-To: <OF7CC46028.9A206852-ON65257878.00307271-65257878.00308D38@in.ibm.com>
Krishna Kumar2 writes:
> Thanks Jason!
>
> So I can use my virtio-net guest driver and test with this patch?
> Please provide the script you use to start MQ guest.
>
Yes and thanks. Following is a simple script may help you start macvtap mq
guest.
qemu_path=./qemu-system-x86_64
img_path=/home/kvm_autotest_root/images/mq.qcow2
vtap_dev=/dev/tap104
mac=96:88:12:1C:27:83
smp=2
mq=4
for i in `seq $mq`
do
vtap+=" -netdev tap,id=hn$i,fd=$((i+100)) $((i+100))<>$vtap_dev"
netdev+="hn$i#"
done
eval "$qemu_path $img_path $vtap -device virtio-net-pci,queues=$mq,netdev=$netdev,mac=$mac,vectors=32 -enable-kvm -smp $smp"
> Regards,
>
> - KK
>
> Jason Wang <jasowang@redhat.com> wrote on 04/20/2011 02:03:07 PM:
>
> > Jason Wang <jasowang@redhat.com>
> > 04/20/2011 02:03 PM
> >
> > To
> >
> > Krishna Kumar2/India/IBM@IBMIN, kvm@vger.kernel.org, mst@redhat.com,
> > netdev@vger.kernel.org, rusty@rustcorp.com.au, qemu-
> > devel@nongnu.org, anthony@codemonkey.ws
> >
> > cc
> >
> > Subject
> >
> > [RFC PATCH 0/2] Multiqueue support for qemu(virtio-net)
> >
> > Inspired by Krishna's patch
> (http://www.spinics.net/lists/kvm/msg52098.html
> > ) and
> > Michael's suggestions. The following series adds the multiqueue support
> for
> > qemu and enable it for virtio-net (both userspace and vhost).
> >
> > The aim for this series is to simplified the management and achieve the
> same
> > performacne with less codes.
> >
> > Follows are the differences between this series and Krishna's:
> >
> > - Add the multiqueue support for qemu and also for userspace virtio-net
> > - Instead of hacking the vhost module to manipulate kthreads, this patch
> just
> > implement the userspace based multiqueues and thus can re-use the
> > existed vhost kernel-side codes without any modification.
> > - Use 1:1 mapping between TX/RX pairs and vhost kthread because the
> > implementation is based on usersapce.
> > - The cli is also changed to make the mgmt easier, the -netdev option of
> qdev
> > can now accpet more than one ids. You can start a multiqueue virtio-net
> device
> > through:
> > ./qemu-system-x86_64 -netdev tap,id=hn0,vhost=on,fd=X -netdev
> > tap,id=hn0,vhost=on,fd=Y -device
> virtio-net-pci,netdev=hn0#hn1,queues=2 ...
> >
> > The series is very primitive and still need polished.
> >
> > Suggestions are welcomed.
> > ---
> >
> > Jason Wang (2):
> > net: Add multiqueue support
> > virtio-net: add multiqueue support
> >
> >
> > hw/qdev-properties.c | 37 ++++-
> > hw/qdev.h | 3
> > hw/vhost.c | 26 ++-
> > hw/vhost.h | 1
> > hw/vhost_net.c | 7 +
> > hw/vhost_net.h | 2
> > hw/virtio-net.c | 409 +++++++++++++++++++++++++++++++
> > +------------------
> > hw/virtio-net.h | 2
> > hw/virtio-pci.c | 1
> > hw/virtio.h | 1
> > net.c | 34 +++-
> > net.h | 15 +-
> > 12 files changed, 353 insertions(+), 185 deletions(-)
> >
> > --
> > Jason Wang
>
^ permalink raw reply
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-21 3:25 UTC (permalink / raw)
To: netdev
Cc: Peter Zijlstra, Eric Dumazet, Alexander Duyck, Kirsher, Jeffrey T,
Mike Galbraith, Thomas Gleixner, Jesse Brandeburg
In-Reply-To: <BANLkTin_OgBo3N8qi5cF5rOd__pRKifwsg@mail.gmail.com>
Okay, I see the magic in the code:
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
skb->queue_mapping = rx_queue + 1;
}
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
return skb->queue_mapping - 1;
}
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
return (skb->queue_mapping != 0);
}
Anyway it seems strange that we have different meaning of skb->queue_mapping for Tx and Rx. Is it better to using higher 4 bit Or even FFFF to indicate rx_queue not recorded?
-----Original Message-----
From: Wei Gu
Sent: Thursday, April 21, 2011 10:57 AM
To: 'netdev'
Cc: 'Peter Zijlstra'; 'Eric Dumazet'; 'Alexander Duyck'; 'Kirsher, Jeffrey T'; 'Mike Galbraith'; 'Thomas Gleixner'; 'Jesse Brandeburg'
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
A quick question, regarding the skb->queue_mapping?
Do you know who will put the queue number for the received skb on the receving path? Cause I found it has a value in the recevied skb, but it seems over the range of rx/tx queues. Like if only have 8 rx and 8 tx queues on this netdev, then I can see the queue_mapping in the receving skb will be in [1-8], which I expect is [0-7].
Thanks
WeiGu
-----Original Message-----
From: Wei Gu
Sent: Tuesday, April 19, 2011 12:09 PM
To: 'Jesse Brandeburg'
Cc: Peter Zijlstra; Eric Dumazet; Alexander Duyck; netdev; Kirsher, Jeffrey T; Mike Galbraith; Thomas Gleixner
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
Hi,
This is the result that I running the turbostat via 2.6.35.3 and perform the same load test on eth10 (8 tx/rx queue binding to core 24-31) according you instruction.
I was add the processor.max_cstate=1 in the boot params, and also disabled the cstate in the BIOS. But looks like the kernel does take them.
Thanks
WeiGu
-----Original Message-----
From: Jesse Brandeburg [mailto:jesse.brandeburg@gmail.com]
Sent: Tuesday, April 19, 2011 5:12 AM
To: Wei Gu
Cc: Peter Zijlstra; Eric Dumazet; Alexander Duyck; netdev; Kirsher, Jeffrey T; Mike Galbraith; Thomas Gleixner
Subject: Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
On Fri, Apr 15, 2011 at 2:14 AM, Wei Gu <wei.gu@ericsson.com> wrote:
> Is there something that I can provide in order to identify the problem?
for power state concerns you may want to try running turbostat (available in recent kernels, runs also on older kernels) during the workload in question. Capture the results via something like:
cd /home/jbrandeb/linux-2.6.38.2/tools/power/x86/turbostat
make
for i in `seq 1 10` ; do ./turbostat -v sleep 5 >> turbostat.txt 2>&1 ; done
Jesse
^ permalink raw reply
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-21 2:57 UTC (permalink / raw)
To: netdev
Cc: Peter Zijlstra, Eric Dumazet, Alexander Duyck, Kirsher, Jeffrey T,
Mike Galbraith, Thomas Gleixner, Jesse Brandeburg
In-Reply-To: <BANLkTin_OgBo3N8qi5cF5rOd__pRKifwsg@mail.gmail.com>
A quick question, regarding the skb->queue_mapping?
Do you know who will put the queue number for the received skb on the receving path? Cause I found it has a value in the recevied skb, but it seems over the range of rx/tx queues. Like if only have 8 rx and 8 tx queues on this netdev, then I can see the queue_mapping in the receving skb will be in [1-8], which I expect is [0-7].
Thanks
WeiGu
-----Original Message-----
From: Wei Gu
Sent: Tuesday, April 19, 2011 12:09 PM
To: 'Jesse Brandeburg'
Cc: Peter Zijlstra; Eric Dumazet; Alexander Duyck; netdev; Kirsher, Jeffrey T; Mike Galbraith; Thomas Gleixner
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
Hi,
This is the result that I running the turbostat via 2.6.35.3 and perform the same load test on eth10 (8 tx/rx queue binding to core 24-31) according you instruction.
I was add the processor.max_cstate=1 in the boot params, and also disabled the cstate in the BIOS. But looks like the kernel does take them.
Thanks
WeiGu
-----Original Message-----
From: Jesse Brandeburg [mailto:jesse.brandeburg@gmail.com]
Sent: Tuesday, April 19, 2011 5:12 AM
To: Wei Gu
Cc: Peter Zijlstra; Eric Dumazet; Alexander Duyck; netdev; Kirsher, Jeffrey T; Mike Galbraith; Thomas Gleixner
Subject: Re: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
On Fri, Apr 15, 2011 at 2:14 AM, Wei Gu <wei.gu@ericsson.com> wrote:
> Is there something that I can provide in order to identify the problem?
for power state concerns you may want to try running turbostat (available in recent kernels, runs also on older kernels) during the workload in question. Capture the results via something like:
cd /home/jbrandeb/linux-2.6.38.2/tools/power/x86/turbostat
make
for i in `seq 1 10` ; do ./turbostat -v sleep 5 >> turbostat.txt 2>&1 ; done
Jesse
^ permalink raw reply
* Re: Suspend/resume - slow resume
From: Ciprian Docan @ 2011-04-21 2:07 UTC (permalink / raw)
To: Francois Romieu
Cc: Linus Torvalds, netdev, Linux Kernel Mailing List, Len Brown,
Pavel Machek, Rafael, J. Wysocki, Greg KH, nic_swsd
In-Reply-To: <20110420195330.GB18897@electric-eye.fr.zoreil.com>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 316 bytes --]
Hello Francois,
>> I tried to unload the module, but it crashed. I re-booted the
>> machine and tried to unload the module again. I got the output from
>> dmesg; please see attached.
>
> Hmmm...
I adapted your patch and fixed the module unload problem; please let me
know what you think. Thank you.
--
Ciprian
[-- Attachment #2: Type: TEXT/PLAIN, Size: 5545 bytes --]
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 493b0de..397c368 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -170,6 +170,16 @@ static const struct {
};
#undef _R
+static const struct rtl_firmware_info {
+ int mac_version;
+ const char *fw_name;
+} rtl_firmware_infos[] = {
+ { .mac_version = RTL_GIGA_MAC_VER_25, .fw_name = FIRMWARE_8168D_1 },
+ { .mac_version = RTL_GIGA_MAC_VER_26, .fw_name = FIRMWARE_8168D_2 },
+ { .mac_version = RTL_GIGA_MAC_VER_29, .fw_name = FIRMWARE_8105E_1 },
+ { .mac_version = RTL_GIGA_MAC_VER_30, .fw_name = FIRMWARE_8105E_1 }
+};
+
enum cfg_version {
RTL_CFG_0 = 0x00,
RTL_CFG_1,
@@ -565,6 +575,7 @@ struct rtl8169_private {
u32 saved_wolopts;
const struct firmware *fw;
+#define RTL_FIRMWARE_UNKNOWN ERR_PTR(-EAGAIN);
};
MODULE_AUTHOR("Realtek and the Linux r8169 crew <netdev@vger.kernel.org>");
@@ -1789,25 +1800,26 @@ rtl_phy_write_fw(struct rtl8169_private *tp, const struct firmware *fw)
static void rtl_release_firmware(struct rtl8169_private *tp)
{
- release_firmware(tp->fw);
- tp->fw = NULL;
+ if (!IS_ERR_OR_NULL(tp->fw))
+ release_firmware(tp->fw);
+ tp->fw = RTL_FIRMWARE_UNKNOWN;
}
-static int rtl_apply_firmware(struct rtl8169_private *tp, const char *fw_name)
+static void rtl_apply_firmware(struct rtl8169_private *tp)
{
- const struct firmware **fw = &tp->fw;
- int rc = !*fw;
-
- if (rc) {
- rc = request_firmware(fw, fw_name, &tp->pci_dev->dev);
- if (rc < 0)
- goto out;
- }
+ const struct firmware *fw = tp->fw;
/* TODO: release firmware once rtl_phy_write_fw signals failures. */
- rtl_phy_write_fw(tp, *fw);
-out:
- return rc;
+ if (!IS_ERR_OR_NULL(fw))
+ rtl_phy_write_fw(tp, fw);
+}
+
+static void rtl_apply_firmware_cond(struct rtl8169_private *tp, u8 reg, u16 val)
+{
+ if (rtl_readphy(tp, reg) != val)
+ netif_warn(tp, hw, tp->dev, "chipset not ready for firmware\n");
+ else
+ rtl_apply_firmware(tp);
}
static void rtl8169s_hw_phy_config(struct rtl8169_private *tp)
@@ -2246,10 +2258,8 @@ static void rtl8168d_1_hw_phy_config(struct rtl8169_private *tp)
rtl_writephy(tp, 0x1f, 0x0005);
rtl_writephy(tp, 0x05, 0x001b);
- if ((rtl_readphy(tp, 0x06) != 0xbf00) ||
- (rtl_apply_firmware(tp, FIRMWARE_8168D_1) < 0)) {
- netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
- }
+
+ rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xbf00);
rtl_writephy(tp, 0x1f, 0x0000);
}
@@ -2351,10 +2361,8 @@ static void rtl8168d_2_hw_phy_config(struct rtl8169_private *tp)
rtl_writephy(tp, 0x1f, 0x0005);
rtl_writephy(tp, 0x05, 0x001b);
- if ((rtl_readphy(tp, 0x06) != 0xb300) ||
- (rtl_apply_firmware(tp, FIRMWARE_8168D_2) < 0)) {
- netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
- }
+
+ rtl_apply_firmware_cond(tp, MII_EXPANSION, 0xb300);
rtl_writephy(tp, 0x1f, 0x0000);
}
@@ -2474,8 +2482,7 @@ static void rtl8105e_hw_phy_config(struct rtl8169_private *tp)
rtl_writephy(tp, 0x18, 0x0310);
msleep(100);
- if (rtl_apply_firmware(tp, FIRMWARE_8105E_1) < 0)
- netif_warn(tp, probe, tp->dev, "unable to apply firmware patch\n");
+ rtl_apply_firmware(tp);
rtl_writephy_batch(tp, phy_reg_init, ARRAY_SIZE(phy_reg_init));
}
@@ -3237,6 +3244,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
tp->timer.data = (unsigned long) dev;
tp->timer.function = rtl8169_phy_timer;
+ tp->fw = RTL_FIRMWARE_UNKNOWN;
+
rc = register_netdev(dev);
if (rc < 0)
goto err_out_msi_4;
@@ -3288,10 +3297,10 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
cancel_delayed_work_sync(&tp->task);
- rtl_release_firmware(tp);
-
unregister_netdev(dev);
+ rtl_release_firmware(tp);
+
if (pci_dev_run_wake(pdev))
pm_runtime_get_noresume(&pdev->dev);
@@ -3303,6 +3312,37 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
}
+static void rtl_request_firmware(struct rtl8169_private *tp)
+{
+ int i;
+
+ /* Return early if the firmware is already loaded / cached. */
+ if (!IS_ERR(tp->fw))
+ goto out;
+
+ for (i = 0; i < ARRAY_SIZE(rtl_firmware_infos); i++) {
+ const struct rtl_firmware_info *info = rtl_firmware_infos + i;
+
+ if (info->mac_version == tp->mac_version) {
+ const char *name = info->fw_name;
+ int rc;
+
+ rc = request_firmware(&tp->fw, name, &tp->pci_dev->dev);
+ if (rc < 0) {
+ netif_warn(tp, ifup, tp->dev, "unable to load "
+ "firmware patch %s (%d)\n", name, rc);
+ goto out_disable_request_firmware;
+ }
+ goto out;
+ }
+ }
+
+out_disable_request_firmware:
+ tp->fw = NULL;
+out:
+ return;
+}
+
static int rtl8169_open(struct net_device *dev)
{
struct rtl8169_private *tp = netdev_priv(dev);
@@ -3334,11 +3374,13 @@ static int rtl8169_open(struct net_device *dev)
smp_mb();
+ rtl_request_firmware(tp);
+
retval = request_irq(dev->irq, rtl8169_interrupt,
(tp->features & RTL_FEATURE_MSI) ? 0 : IRQF_SHARED,
dev->name, dev);
if (retval < 0)
- goto err_release_ring_2;
+ goto err_release_fw_2;
napi_enable(&tp->napi);
@@ -3359,7 +3401,8 @@ static int rtl8169_open(struct net_device *dev)
out:
return retval;
-err_release_ring_2:
+err_release_fw_2:
+ rtl_release_firmware(tp);
rtl8169_rx_clear(tp);
err_free_rx_1:
dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
^ permalink raw reply related
* [PATCH net-next] net: Allow ethtool to set interface in loopback mode.
From: Mahesh Bandewar @ 2011-04-21 0:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev, Mahesh Bandewar
This patch enables ethtool to set the loopback mode on a given interface.
By configuring the interface in loopback mode in conjunction with a policy
route / rule, a userland application can stress the egress / ingress path
exposing the flows of the change in progress and potentially help developer(s)
understand the impact of those changes without even sending a packet out
on the network.
Following set of commands illustrates one such example -
a) ip -4 addr add 192.168.1.1/24 dev eth1
b) ip -4 rule add from all iif eth1 lookup 250
c) ip -4 route add local 0/0 dev lo proto kernel scope host table 250
d) arp -Ds 192.168.1.100 eth1
e) arp -Ds 192.168.1.200 eth1
f) sysctl -w net.ipv4.ip_nonlocal_bind=1
g) sysctl -w net.ipv4.conf.all.accept_local=1
# Assuming that the machine has 8 cores
h) taskset 000f netserver -L 192.168.1.200
i) taskset 00f0 netperf -t TCP_CRR -L 192.168.1.100 -H 192.168.1.200 -l 30
Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
include/linux/netdevice.h | 3 ++-
net/core/ethtool.c | 2 +-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb8178a..c4d9dd7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1067,6 +1067,7 @@ struct net_device {
#define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */
#define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */
#define NETIF_F_NOCACHE_COPY (1 << 30) /* Use no-cache copyfromuser */
+#define NETIF_F_LOOPBACK (1 << 31) /* Enable loopback */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
@@ -1082,7 +1083,7 @@ struct net_device {
/* = all defined minus driver/device-class-related */
#define NETIF_F_NEVER_CHANGE (NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | \
NETIF_F_LLTX | NETIF_F_NETNS_LOCAL)
-#define NETIF_F_ETHTOOL_BITS (0x7f3fffff & ~NETIF_F_NEVER_CHANGE)
+#define NETIF_F_ETHTOOL_BITS (0xff3fffff & ~NETIF_F_NEVER_CHANGE)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 13d79f5..d177ce9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -362,7 +362,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS
/* NETIF_F_RXHASH */ "rx-hashing",
/* NETIF_F_RXCSUM */ "rx-checksum",
/* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy"
- "",
+ /* NETIF_F_LOOPBACK */ "loopback",
};
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
--
1.7.3.1
^ permalink raw reply related
* Re: [PATCH] net: tun: convert to hw_features
From: Michał Mirosław @ 2011-04-20 23:15 UTC (permalink / raw)
To: Rusty Russell; +Cc: netdev
In-Reply-To: <871v0xip3j.fsf@rustcorp.com.au>
On Wed, Apr 20, 2011 at 12:52:24PM +0930, Rusty Russell wrote:
> On Tue, 19 Apr 2011 18:13:10 +0200 (CEST), Michał Mirosław <mirq-linux@rere.qmqm.pl> wrote:
> > This changes offload setting behaviour to what I think is correct:
> > - offloads set via ethtool mean what admin wants to use (by default
> > he wants 'em all)
> > - offloads set via ioctl() mean what userspace is expecting to get
> > (this limits which admin wishes are granted)
> > - TUN_NOCHECKSUM is ignored, as it might cause broken packets when
> > forwarded (ip_summed == CHECKSUM_UNNECESSARY means that checksum
> > was verified, not that it can be ignored)
> >
> > If TUN_NOCHECKSUM is implemented, it should set skb->csum_* and
> > skb->ip_summed (= CHECKSUM_PARTIAL) for known protocols and let others
> > be verified by kernel when necessary.
> >
> > TUN_NOCHECKSUM handling was introduced by commit
> > f43798c27684ab925adde7d8acc34c78c6e50df8:
> >
> > tun: Allow GSO using virtio_net_hdr
> Err, not in my git tree! It predates git in fact.
Ah! Sorry! Your patch just moved the buggy line around.
> Since tap requires privs, I wouldn't worry about invalid packets too
> much.
I prefer correctness over uncertainty. ;)
Best Regards,
Michał Mirosław
^ permalink raw reply
* Re: [PATCH] net: tun: convert to hw_features
From: Rusty Russell @ 2011-04-20 3:22 UTC (permalink / raw)
To: Michał Mirosław, netdev
In-Reply-To: <20110419161310.7508513909@rere.qmqm.pl>
On Tue, 19 Apr 2011 18:13:10 +0200 (CEST), Michał Mirosław <mirq-linux@rere.qmqm.pl> wrote:
> This changes offload setting behaviour to what I think is correct:
> - offloads set via ethtool mean what admin wants to use (by default
> he wants 'em all)
> - offloads set via ioctl() mean what userspace is expecting to get
> (this limits which admin wishes are granted)
> - TUN_NOCHECKSUM is ignored, as it might cause broken packets when
> forwarded (ip_summed == CHECKSUM_UNNECESSARY means that checksum
> was verified, not that it can be ignored)
>
> If TUN_NOCHECKSUM is implemented, it should set skb->csum_* and
> skb->ip_summed (= CHECKSUM_PARTIAL) for known protocols and let others
> be verified by kernel when necessary.
>
> TUN_NOCHECKSUM handling was introduced by commit
> f43798c27684ab925adde7d8acc34c78c6e50df8:
>
> tun: Allow GSO using virtio_net_hdr
Err, not in my git tree! It predates git in fact.
Since tap requires privs, I wouldn't worry about invalid packets too
much.
Thanks,
Rusty.
^ permalink raw reply
* Re: r8169 doesn't report link state correctly.
From: Ben Greear @ 2011-04-20 21:56 UTC (permalink / raw)
To: Francois Romieu; +Cc: netdev
In-Reply-To: <20110420191403.GC18805@electric-eye.fr.zoreil.com>
On 04/20/2011 12:14 PM, Francois Romieu wrote:
> François Romieu<romieu@fr.zoreil.com> :
>> On Mon, Apr 11, 2011 at 01:09:19PM -0700, Ben Greear wrote:
>>> I notice that in kernel 2.6.38-wl, the realtek 8169 NIC doesn't
>>> report link down when in fact there is no cable connected. Instead,
> [...]
>> Thanks for the report. I'll try it tomorrow or friday.
>
> I have not been able to notice it with a current kernel.
>
> I'd welcome the XID of the 8169 NIC (see dmesg) and a short explanation
> (no cable from boot ? cable removed after ifconfig up ? brand / ability
> of the switch / hub ?).
Well, as luck would have it, my system will boot today's upstream
kernel (39-rc4+). And, I no longer see the problem in that release,
so it seems it is fixed (or harder to reproduce that I thought).
Basically, I was seeing it claim to have link in 'ethtool' output
when there was no cable connected. It did go to 10Mbps/half duplex
link speed when un-plugged. It showed full 1Gbps link when plugged
in.
I'll let you know if I see this again.
Thanks,
Ben
>
> Thanks.
>
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: [v2 PATCH 0/6] IPVS: init and cleanup.
From: Julian Anastasov @ 2011-04-20 21:36 UTC (permalink / raw)
To: Hans Schillstrom
Cc: horms, ebiederm, lvs-devel, netdev, netfilter-devel,
hans.schillstrom
In-Reply-To: <1303324434-14024-1-git-send-email-hans@schillstrom.com>
Hello,
On Wed, 20 Apr 2011, Hans Schillstrom wrote:
> This patch series handles exit from a network name space.
>
> REVISION
>
> This is version 2
>
> OVERVIEW
> Basically there was three faults in the netns implementation.
> - Kernel threads hold devices and preventing an exit.
> - dst cache holds references to devices.
> - Services was not always released.
>
> Patch 1 & 3 contains the functionality
> 4 renames funcctions
> 5 removes empty functions
> 6 Debuging.
>
> IMPLEMENTATION
> - Avoid to increment the usage counter for kernel threads.
> this is done in the first patch.
> - Patch 3 tries to restore the cleanup order.
> Add NETDEV_UNREGISTER notification for dst_reset
>
> >From Julian -
> "For __ip_vs_service_cleanup: it still has to use mutex.
> Or we can avoid it by introducing ip_vs_unlink_service_nolock:
> ip_vs_flush will look like your __ip_vs_service_cleanup and
> will call ip_vs_unlink_service_nolock. ip_vs_unlink_service_nolock
> will be called by ip_vs_flush and by ip_vs_unlink_service."
>
> I will give above another try later on see if I can get it to work.
> Right now ip_vs_service_net_cleanup() seems to work.
>
>
> An netns exit could look like this
> IPVS: Enter: __ip_vs_dev_cleanup, net/netfilter/ipvs/ip_vs_core.c
> IPVS: stopping master sync thread 1286 ...
> IPVS: stopping backup sync thread 1294 ...
> IPVS: Leave: __ip_vs_dev_cleanup, net/netfilter/ipvs/ip_vs_core.c
> IPVS: Enter: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Leave: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> ...
> IPVS: Enter: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Leave: ip_vs_dst_event, net/netfilter/ipvs/ip_vs_ctl.c line
> IPVS: Enter: ip_vs_service_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: __ip_vs_del_service: enter
> IPVS: Moving dest 192.168.1.6:0 into trash, dest->refcnt=43450
> ...
> IPVS: Moving dest 192.168.1.3:0 into trash, dest->refcnt=43449
> IPVS: __ip_vs_del_service: enter
> IPVS: Removing destination 0/[2003:0000:0000:0000:0000:0002:0000:0006]:80
> ...
> IPVS: Removing destination 0/[2003:0000:0000:0000:0000:0002:0000:0003]:80
> IPVS: Removing service 0/[2003:0000:0000:0000:0000:0002:0004:0100]:80 usecnt=0
> IPVS: Leave: ip_vs_service_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: Enter: ip_vs_control_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: Removing service 80/0.0.0.0:0 usecnt=0
> IPVS: Leave: ip_vs_control_net_cleanup, net/netfilter/ipvs/ip_vs_ctl.c
> IPVS: ipvs netns 8 released
Notes only about PATCH 3/6:
- I was worried that throttle was used during cleanup
with the idea to stop traffic. But as now it is only an
optimization to avoid traffic to spend cycles in IPVS
code I'll suggest to use it without atomic operations.
It is not fatal if some packet does not see the disabling
immediately. So, may be using 'if (net_ipvs(net)->enable)'
is enough.
As result, there is no need to disable traffic in
__ip_vs_dev_cleanup, it is going to stop during
_device cleanup anyways.
- Please move check for 'enable' in ip_vs_in() before
ip_vs_fill_iphdr, this is the preferred place:
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
Note that only ip_vs_in, ip_vs_out and ip_vs_forward_icmp*
need check for 'enable'. Then we can remove them from
ip_vs_in_icmp*
- As device can change its name space it was mandatory we
to add support for NETDEV_UNREGISTER. As we are using
_subsys pernet operations I expect we to see NETDEV_UNREGISTER
while the _device pernet methods are called, so before
our _subsys cleanup.
We can see in log that ip_vs_dst_event() is called before
ip_vs_service_net_cleanup, i.e. before our subsys cleanup.
net/core/dev.c uses register_pernet_device(&default_device_ops)
to call NETDEV_UNREGISTER* during _device cleanup and
to move devices to init_net. So, it should be safe
to rely on NETDEV_UNREGISTER event while we are subsys.
- __ip_vs_service_cleanup needs to call ip_vs_flush after
converting to ip_vs_unlink_service_nolock.
- For ip_vs_dst_event: I prefer to put everything in this
function, the ip_vs_svc_reset is not needed (name is not
good too). For example:
struct ip_vs_dest *dest;
unsigned int hash;
mutex_lock(&__ip_vs_mutex);
/* No need to use rs_lock, the mutex protects the list */
for (hash = 0; hash < IP_VS_RTAB_SIZE; hash++) {
list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
__ip_vs_dev_reset(dest, dev);
}
}
/* The mutex protects the trash list */
list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
__ip_vs_dev_reset(dest, dev);
}
mutex_unlock(&__ip_vs_mutex);
No need to use __ip_vs_svc_lock or rs_lock because we
do not change the lists, __ip_vs_dev_reset has the needed
dst_cache locking (dst_lock). I assume we can safely use our
__ip_vs_mutex from netdevice notifier.
Regards
--
Julian Anastasov <ja@ssi.bg>
^ permalink raw reply
* Re: [PATCHv4] usbnet: Resubmit interrupt URB once if halted
From: Paul Stewart @ 2011-04-20 21:17 UTC (permalink / raw)
To: Alan Stern
Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA,
davem-fT/PcQaiUtIeIZ0/mPfg9Q, greg-U8xfFu+wG4EAvxtiuMwx3w
In-Reply-To: <Pine.LNX.4.44L0.1104201658280.1686-100000-IYeN2dnnYyZXsRXLowluHWD2FQJk+8+b@public.gmane.org>
On Wed, Apr 20, 2011 at 2:08 PM, Alan Stern <stern-nwvwT67g6+6dFdvTe/nMLpVzexx5G7lz@public.gmane.org> wrote:
> On Tue, 19 Apr 2011, Paul Stewart wrote:
>
>> Set a flag if the interrupt URB completes with ENOENT as this
>> occurs legitimately during system suspend. When the usbnet_bh
>> is called after resume, test this flag and try once to resubmit
>> the interrupt URB.
>
> No doubt there's a good reason for doing things this way, but it isn't
> clear. Why wait until usbnet_bh() is called after resume? Why not
> resubmit the interrupt URB _during_ usbnet_resume()?
Actually, I was doing this in the bh because of feedback I had gained
early in this process about not doing submit_urb in the resume(). If
that issue doesn't exist, that makes my work a lot easier. In testing
I found that just setting this to happen in the bh might be problematic
due to firing too early, so this is good news.
> This would seem
> to be the logical approach, seeing as how usbnet_suspend() kills the
> interrupt URB.
Aha! But you'll see from the current version of my patch that we don't
actually ever kill the interrupt URB. It gets killed all on its own (by the
hcd?) and handed back to us in intr_complete(). This last bit about the
complete function being called was lost on me for a while which is why
in a previous iteration of the patch I was trying to kill the urb in suspend().
> For that matter, where does the interrupt URB get deallocated? It
> obviously gets allocated in init_status(), but it doesn't seem to be
> freed anywhere.
That's a very interesting question and one that I noticed as well.
I was going to tackle that as a separate issue.
>
> Alan Stern
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Shirley Ma @ 2011-04-20 21:15 UTC (permalink / raw)
To: Dimitris Michailidis
Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
netdev, kvm, linux-kernel
In-Reply-To: <1303333134.19336.70.camel@localhost.localdomain>
On Wed, 2011-04-20 at 13:58 -0700, Shirley Ma wrote:
> This flag can be ON when HIGHDMA and scatter/gather support. I will
> modify the patch to make it conditionally.
Double checked, it only needs HIGHDMA condition, not scatter/gather.
thanks
Shirley
^ permalink raw reply
* Re: [PATCHv4] usbnet: Resubmit interrupt URB once if halted
From: Alan Stern @ 2011-04-20 21:08 UTC (permalink / raw)
To: Paul Stewart; +Cc: netdev, linux-usb, davem, greg
In-Reply-To: <20110420195015.20A96201B8@glenhelen.mtv.corp.google.com>
On Tue, 19 Apr 2011, Paul Stewart wrote:
> Set a flag if the interrupt URB completes with ENOENT as this
> occurs legitimately during system suspend. When the usbnet_bh
> is called after resume, test this flag and try once to resubmit
> the interrupt URB.
No doubt there's a good reason for doing things this way, but it isn't
clear. Why wait until usbnet_bh() is called after resume? Why not
resubmit the interrupt URB _during_ usbnet_resume()? This would seem
to be the logical approach, seeing as how usbnet_suspend() kills the
interrupt URB.
For that matter, where does the interrupt URB get deallocated? It
obviously gets allocated in init_status(), but it doesn't seem to be
freed anywhere.
Alan Stern
^ permalink raw reply
* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Shirley Ma @ 2011-04-20 20:58 UTC (permalink / raw)
To: Dimitris Michailidis
Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
netdev, kvm, linux-kernel
In-Reply-To: <4DAF479D.7090309@chelsio.com>
On Wed, 2011-04-20 at 13:52 -0700, Dimitris Michailidis wrote:
> The features handling has been reworked in net-next and patches like
> this
> won't apply as the code you're patching has changed. Also core code
> now
> does a lot of the related work and you'll need to tell it what to do
> with
> any new flags.
Ok, will do.
> What properties does a device or driver need to meet to set this flag?
> It
> seems to be set a bit too unconditionally. For example, does it work
> if one
> disables scatter/gather?
This flag can be ON when HIGHDMA and scatter/gather support. I will
modify the patch to make it conditionally.
thanks
Shirley
^ permalink raw reply
* Re: [PATCH V3 5/8] Enable cxgb3 to support zerocopy
From: Dimitris Michailidis @ 2011-04-20 20:52 UTC (permalink / raw)
To: Shirley Ma
Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
netdev, kvm, linux-kernel
In-Reply-To: <1303330438.19336.57.camel@localhost.localdomain>
On 04/20/2011 01:13 PM, Shirley Ma wrote:
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>
> drivers/net/cxgb3/cxgb3_main.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
> index 9108931..93a1101 100644
> --- a/drivers/net/cxgb3/cxgb3_main.c
> +++ b/drivers/net/cxgb3/cxgb3_main.c
> @@ -3313,7 +3313,7 @@ static int __devinit init_one(struct pci_dev *pdev,
> netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
> netdev->features |= NETIF_F_GRO;
> if (pci_using_dac)
> - netdev->features |= NETIF_F_HIGHDMA;
> + netdev->features |= NETIF_F_HIGHDMA | NETIF_F_ZEROCOPY;
>
> netdev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
> netdev->netdev_ops = &cxgb_netdev_ops;
The features handling has been reworked in net-next and patches like this
won't apply as the code you're patching has changed. Also core code now
does a lot of the related work and you'll need to tell it what to do with
any new flags.
What properties does a device or driver need to meet to set this flag? It
seems to be set a bit too unconditionally. For example, does it work if one
disables scatter/gather?
^ permalink raw reply
* [PATCH net-next-2.6 v5 5/5] sctp: Add ADD/DEL ASCONF handling at the receiver
From: Michio Honda @ 2011-04-20 20:42 UTC (permalink / raw)
To: netdev; +Cc: lksctp-developers
This patch fixes the problem that the original code cannot delete the remote address where the corresponding transport is currently directed, even when the ASCONF is sent from the other address (this situation happens when the single-homed sender transmits ASCONF with ADD and DEL.)
Signed-off-by: Michio Honda <micchie@sfc.wide.ad.jp>
---
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index de98665..a9f25d7 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2990,7 +2990,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc,
* an Error Cause TLV set to the new error code 'Request to
* Delete Source IP Address'
*/
- if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
+ if (sctp_cmp_addr_exact(&asconf->source, &addr))
return SCTP_ERROR_DEL_SRC_IP;
/* Section 4.2.2
^ permalink raw reply related
* Re: [Bugme-new] [Bug 33502] New: Caught 64-bit read from uninitialized memory in __alloc_skb
From: Eric Dumazet @ 2011-04-20 20:32 UTC (permalink / raw)
To: Christian Casteyde
Cc: Christoph Lameter, Pekka Enberg, Andrew Morton, netdev,
bugzilla-daemon, bugme-daemon, Vegard Nossum, Changli Gao
In-Reply-To: <1303329300.2690.25.camel@edumazet-laptop>
Le mercredi 20 avril 2011 à 21:55 +0200, Eric Dumazet a écrit :
> Le mercredi 20 avril 2011 à 21:36 +0200, Christian Casteyde a écrit :
>
> > I'm not sure it's the same problem anyway as I've said previously.
> > I append my config file used to build this kernel.
>
> This is not the same problem, and is a known false positive from skb
> reallocation. (skb_reserve() doesnt mark memory as initialized)
>
>
Please try following patch. It's a bit tricky, because network stack has
different functions to fill bytes in skb and change pointers/offsets in
it.
Alternative would be to change pskb_expand_head() to not copy zone
between skb->head and skb->data (might contain unitialized data)
Thanks
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 79aafbb..3f2cba4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1252,6 +1252,7 @@ static inline int skb_tailroom(const struct sk_buff *skb)
*/
static inline void skb_reserve(struct sk_buff *skb, int len)
{
+ kmemcheck_mark_initialized(skb->data, len);
skb->data += len;
skb->tail += len;
}
^ permalink raw reply related
* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:30 UTC (permalink / raw)
To: Dimitris Michailidis
Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <4DAF40EB.20303@chelsio.com>
Resubmit it with 31 bit.
Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
include/linux/netdevice.h | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0249fe7..0998d3d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1067,6 +1067,9 @@ struct net_device {
#define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */
#define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */
+/* Bit 31 is for device to map userspace buffers -- zerocopy */
+#define NETIF_F_ZEROCOPY (1 << 31)
+
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0x00ff0000
^ permalink raw reply related
* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:28 UTC (permalink / raw)
To: Dimitris Michailidis
Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <4DAF40EB.20303@chelsio.com>
On Wed, 2011-04-20 at 13:24 -0700, Dimitris Michailidis wrote:
> Bit 30 is also taken in net-next.
How about 31?
Thanks
Shirley
^ permalink raw reply
* [PATCH V3 6/8] macvtap/vhost TX zero copy support
From: Shirley Ma @ 2011-04-20 20:27 UTC (permalink / raw)
To: David Miller, Arnd Bergmann, mst
Cc: Eric Dumazet, Avi Kivity, netdev, kvm, linux-kernel
In-Reply-To: <1303328216.19336.18.camel@localhost.localdomain>
Only when buffer size is greater than GOODCOPY_LEN (128), macvtap
enables zero-copy.
Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
drivers/net/macvtap.c | 124
++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56..b4e6656 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
*/
static dev_t macvtap_major;
#define MACVTAP_NUM_DEVS 65536
+#define GOODCOPY_LEN (L1_CACHE_BYTES < 128 ? 128 : L1_CACHE_BYTES)
static struct class *macvtap_class;
static struct cdev macvtap_cdev;
@@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct
file *file)
{
struct net *net = current->nsproxy->net_ns;
struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvlan_dev *vlan = netdev_priv(dev);
struct macvtap_queue *q;
int err;
@@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct
file *file)
q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
+ /*
+ * so far only VM uses macvtap, enable zero copy between guest
+ * kernel and host kernel when lower device supports high memory
+ * DMA
+ */
+ if (vlan) {
+ if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
+ sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+ }
+
err = macvtap_set_queue(dev, file, q);
if (err)
sock_put(&q->sk);
@@ -433,6 +445,80 @@ static inline struct sk_buff
*macvtap_alloc_skb(struct sock *sk, size_t prepad,
return skb;
}
+/* set skb frags from iovec, this can move to core network code for
reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct
iovec *from,
+ int offset, size_t count)
+{
+ int len = iov_length(from, count) - offset;
+ int copy = skb_headlen(skb);
+ int size, offset1 = 0;
+ int i = 0;
+ skb_frag_t *f;
+
+ /* Skip over from offset */
+ while (offset >= from->iov_len) {
+ offset -= from->iov_len;
+ ++from;
+ --count;
+ }
+
+ /* copy up to skb headlen */
+ while (copy > 0) {
+ size = min_t(unsigned int, copy, from->iov_len - offset);
+ if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+ size))
+ return -EFAULT;
+ if (copy > size) {
+ ++from;
+ --count;
+ }
+ copy -= size;
+ offset1 += size;
+ offset = 0;
+ }
+
+ if (len == offset1)
+ return 0;
+
+ while (count--) {
+ struct page *page[MAX_SKB_FRAGS];
+ int num_pages;
+ unsigned long base;
+
+ len = from->iov_len - offset1;
+ if (!len) {
+ offset1 = 0;
+ ++from;
+ continue;
+ }
+ base = (unsigned long)from->iov_base + offset1;
+ size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+ num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+ if ((num_pages != size) ||
+ (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+ /* put_page is in skb free */
+ return -EFAULT;
+ skb->data_len += len;
+ skb->len += len;
+ skb->truesize += len;
+ while (len) {
+ f = &skb_shinfo(skb)->frags[i];
+ f->page = page[i];
+ f->page_offset = base & ~PAGE_MASK;
+ f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
+ skb_shinfo(skb)->nr_frags++;
+ /* increase sk_wmem_alloc */
+ atomic_add(f->size, &skb->sk->sk_wmem_alloc);
+ base += f->size;
+ len -= f->size;
+ i++;
+ }
+ offset1 = 0;
+ ++from;
+ }
+ return 0;
+}
+
/*
* macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
* be shared with the tun/tap driver.
@@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const struct
sk_buff *skb,
/* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
- const struct iovec *iv, size_t count,
- int noblock)
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr
*m,
+ const struct iovec *iv, unsigned long total_len,
+ size_t count, int noblock)
{
struct sk_buff *skb;
struct macvlan_dev *vlan;
- size_t len = count;
+ unsigned long len = total_len;
int err;
struct virtio_net_hdr vnet_hdr = { 0 };
int vnet_hdr_len = 0;
+ int copylen, zerocopy;
+ zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
if (q->flags & IFF_VNET_HDR) {
vnet_hdr_len = q->vnet_hdr_sz;
@@ -552,12 +640,24 @@ static ssize_t macvtap_get_user(struct
macvtap_queue *q,
if (unlikely(len < ETH_HLEN))
goto err;
- skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
- noblock, &err);
+ if (zerocopy)
+ copylen = vnet_hdr.hdr_len;
+ else
+ copylen = len;
+
+ skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
+ vnet_hdr.hdr_len, noblock, &err);
if (!skb)
goto err;
-
- err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
+
+ if (zerocopy)
+ err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
+ else
+ err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
+ len);
+ if (sock_flag(&q->sk, SOCK_ZEROCOPY))
+ memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
+ sizeof(struct skb_ubuf_info));
if (err)
goto err_kfree;
@@ -579,7 +679,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue
*q,
kfree_skb(skb);
rcu_read_unlock_bh();
- return count;
+ return total_len;
err_kfree:
kfree_skb(skb);
@@ -601,8 +701,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb,
const struct iovec *iv,
ssize_t result = -ENOLINK;
struct macvtap_queue *q = file->private_data;
- result = macvtap_get_user(q, iv, iov_length(iv, count),
- file->f_flags & O_NONBLOCK);
+ result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
+ file->f_flags & O_NONBLOCK);
return result;
}
@@ -815,7 +915,7 @@ static int macvtap_sendmsg(struct kiocb *iocb,
struct socket *sock,
struct msghdr *m, size_t total_len)
{
struct macvtap_queue *q = container_of(sock, struct macvtap_queue,
sock);
- return macvtap_get_user(q, m->msg_iov, total_len,
+ return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
m->msg_flags & MSG_DONTWAIT);
}
^ permalink raw reply related
* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Shirley Ma @ 2011-04-20 20:05 UTC (permalink / raw)
To: Ben Hutchings
Cc: David Miller, mst, Eric Dumazet, Avi Kivity, Arnd Bergmann,
netdev, kvm, linux-kernel
In-Reply-To: <1303328906.2823.24.camel@bwh-desktop>
Thanks. I need to update it to 30 bit.
Shirley
^ permalink raw reply
* Re: [PATCH V3 2/8] Add a new zerocopy device flag
From: Dimitris Michailidis @ 2011-04-20 20:24 UTC (permalink / raw)
To: Shirley Ma
Cc: Ben Hutchings, David Miller, mst, Eric Dumazet, Avi Kivity,
Arnd Bergmann, netdev, kvm, linux-kernel
In-Reply-To: <1303330173.19336.52.camel@localhost.localdomain>
On 04/20/2011 01:09 PM, Shirley Ma wrote:
> Resubmit this patch with the new bit.
Bit 30 is also taken in net-next.
>
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>
> include/linux/netdevice.h | 3 +++
> 1 files changed, 3 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0249fe7..0998d3d 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1067,6 +1067,9 @@ struct net_device {
> #define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */
> #define NETIF_F_RXCSUM (1 << 29) /* Receive checksumming offload */
>
> +/* Bit 30 is for device to map userspace buffers -- zerocopy */
> +#define NETIF_F_ZEROCOPY (1 << 30)
> +
> /* Segmentation offload features */
> #define NETIF_F_GSO_SHIFT 16
> #define NETIF_F_GSO_MASK 0x00ff0000
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox