Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next] ipv6: generate link local address for GRE tunnel
From: Stephen Hemminger @ 2011-06-08 20:44 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Use same logic as SIT tunnel to handle link local address
for GRE tunnel. OSPFv3 requires link-local address to function.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/ipv6/addrconf.c	2011-05-23 16:56:36.000000000 -0700
+++ b/net/ipv6/addrconf.c	2011-06-08 13:41:36.447300763 -0700
@@ -1559,6 +1559,11 @@ static int addrconf_ifid_sit(u8 *eui, st
 	return -1;
 }
 
+static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
+{
+	return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
+}
+
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
 {
 	switch (dev->type) {
@@ -1572,6 +1577,8 @@ static int ipv6_generate_eui64(u8 *eui, 
 		return addrconf_ifid_infiniband(eui, dev);
 	case ARPHRD_SIT:
 		return addrconf_ifid_sit(eui, dev);
+	case ARPHRD_IPGRE:
+		return addrconf_ifid_gre(eui, dev);
 	}
 	return -1;
 }
@@ -2423,6 +2430,29 @@ static void addrconf_sit_config(struct n
 }
 #endif
 
+#if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE)
+static void addrconf_gre_config(struct net_device *dev)
+{
+	struct inet6_dev *idev;
+	struct in6_addr addr;
+
+	pr_info("ipv6: addrconf_gre_config(%s)\n", dev->name);
+
+	ASSERT_RTNL();
+
+	if ((idev = ipv6_find_idev(dev)) == NULL) {
+		printk(KERN_DEBUG "init gre: add_dev failed\n");
+		return;
+	}
+
+	ipv6_addr_set(&addr,  htonl(0xFE800000), 0, 0, 0);
+	addrconf_prefix_route(&addr, 64, dev, 0, 0);
+
+	if (!ipv6_generate_eui64(addr.s6_addr + 8, dev))
+		addrconf_add_linklocal(idev, &addr);
+}
+#endif
+
 static inline int
 ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
 {
@@ -2539,6 +2569,11 @@ static int addrconf_notify(struct notifi
 			addrconf_sit_config(dev);
 			break;
 #endif
+#if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE)
+		case ARPHRD_IPGRE:
+			addrconf_gre_config(dev);
+			break;
+#endif
 		case ARPHRD_TUNNEL6:
 			addrconf_ip6_tnl_config(dev);
 			break;

^ permalink raw reply

* Please Read and Reply
From: Peter T.S Wong @ 2011-06-08 20:46 UTC (permalink / raw)
  To: netdev

Good day to you my friend, 

my name is Peter Wong living in Hong Kong.

I seek your assistance in a business transaction/investing of the sum of 44.5MUSD.

I will explain more in my next e-mail when I get your reply. You will get 50% of the
total sum after we finish this business. 

Please reach me back via my personal address peter455wong@yahoo.com.hk  

Thank you.
Sincerely,
Mr. Peter Wong

^ permalink raw reply

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: Yuchung Cheng @ 2011-06-08 20:54 UTC (permalink / raw)
  To: Jerry Chu; +Cc: netdev@vger.kernel.org
In-Reply-To: <BANLkTimXgK03tM1e3FYe+wausHT15W6H-2eMNCWaj8LLxRvbZw@mail.gmail.com>

Acked-by: Yuchung Cheng <ycheng@google.com>

On Wed, Jun 8, 2011 at 11:04 AM, Jerry Chu <hkchu@google.com> wrote:
>
> [resent to cc netdev]
>
> This patch lowers the default initRTO from 3secs to 1sec per
> RFC2988bis. It falls back to 3secs if the SYN or SYN-ACK packet
> has been retransmitted, AND the TCP timestamp option is not on.
>
> It also adds support to take RTT sample during 3WHS on the passive
> open side, just like its active open counterpart, and uses it, if
> valid, to seed the initRTO for the data transmission phase.
>
> The patch also resets ssthresh to its initial default at the
> beginning of the data transmission phase, and reduces cwnd to 1 if
> there has been MORE THAN ONE retransmission during 3WHS per RFC5681.
>
> Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
> ---
>  include/linux/tcp.h      |    1 +
>  include/net/tcp.h        |   11 +++++++++--
>  net/ipv4/syncookies.c    |    1 +
>  net/ipv4/tcp_input.c     |   46 +++++++++++++++++++++++++---------------------
>  net/ipv4/tcp_ipv4.c      |   11 ++++++++---
>  net/ipv4/tcp_minisocks.c |    6 +++++-
>  net/ipv6/syncookies.c    |    1 +
>  net/ipv6/tcp_ipv6.c      |    5 +++++
>  8 files changed, 55 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index e64f4c6..531ede8 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -282,6 +282,7 @@ struct tcp_request_sock {
>  #endif
>        u32                             rcv_isn;
>        u32                             snt_isn;
> +       u32                             snt_synack; /* synack sent time */
>  };
>
>  static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index cda30ea..149a415 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -122,7 +122,13 @@ extern void tcp_time_wait(struct sock *sk, int
> state, int timeo);
>  #endif
>  #define TCP_RTO_MAX    ((unsigned)(120*HZ))
>  #define TCP_RTO_MIN    ((unsigned)(HZ/5))
> -#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))    /* RFC 1122 initial
> RTO value   */
> +#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))    /* RFC2988bis initial
> RTO value */
> +#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))        /* RFC 1122
> initial RTO value, now
> +                                                * used as a fallback
> RTO for the
> +                                                * initial data
> transmission if no
> +                                                * valid RTT sample
> has been acquired,
> +                                                * most likely due to
> retrans in 3WHS.
> +                                                */
>
>  #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal
> interval between probes
>                                                         * for local resources.
> @@ -295,7 +301,7 @@ static inline void tcp_synq_overflow(struct sock *sk)
>  static inline int tcp_synq_no_recent_overflow(const struct sock *sk)
>  {
>        unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
> -       return time_after(jiffies, last_overflow + TCP_TIMEOUT_INIT);
> +       return time_after(jiffies, last_overflow + TCP_TIMEOUT_FALLBACK);
>  }
>
>  extern struct proto tcp_prot;
> @@ -508,6 +514,7 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
>  extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
>  extern int tcp_mss_to_mtu(struct sock *sk, int mss);
>  extern void tcp_mtup_init(struct sock *sk);
> +extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
>
>  static inline void tcp_bound_rto(const struct sock *sk)
>  {
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index 2646149..92bb943 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -316,6 +316,7 @@ struct sock *cookie_v4_check(struct sock *sk,
> struct sk_buff *skb,
>        ireq->wscale_ok         = tcp_opt.wscale_ok;
>        ireq->tstamp_ok         = tcp_opt.saw_tstamp;
>        req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
> +       treq->snt_synack        = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
>
>        /* We throwed the options of the initial SYN away, so we hope
>         * the ACK carries the same options again (see RFC1122 4.2.3.8)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index bef9f04..ea0d218 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -880,6 +880,11 @@ static void tcp_init_metrics(struct sock *sk)
>                tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
>                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
>                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
> +       } else {
> +               /* ssthresh may have been reduced unnecessarily during.
> +                * 3WHS. Restore it back to its initial default.
> +                */
> +               tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
>        }
>        if (dst_metric(dst, RTAX_REORDERING) &&
>            tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
> @@ -887,10 +892,7 @@ static void tcp_init_metrics(struct sock *sk)
>                tp->reordering = dst_metric(dst, RTAX_REORDERING);
>        }
>
> -       if (dst_metric(dst, RTAX_RTT) == 0)
> -               goto reset;
> -
> -       if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) <
> (TCP_TIMEOUT_INIT << 3))
> +       if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
>                goto reset;
>
>        /* Initial rtt is determined from SYN,SYN-ACK.
> @@ -916,19 +918,26 @@ static void tcp_init_metrics(struct sock *sk)
>                tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
>        }
>        tcp_set_rto(sk);
> -       if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT &&
> !tp->rx_opt.saw_tstamp) {
>  reset:
> -               /* Play conservative. If timestamps are not
> -                * supported, TCP will fail to recalculate correct
> -                * rtt, if initial rto is too small. FORGET ALL AND RESET!
> +       if (tp->srtt == 0) {
> +               /* RFC2988bis: We've failed to get a valid RTT sample from
> +                * 3WHS. This is most likely due to retransmission,
> +                * including spurious one. Reset the RTO back to 3secs
> +                * from the more aggressive 1sec to avoid more spurious
> +                * retransmission.
>                 */
> -               if (!tp->rx_opt.saw_tstamp && tp->srtt) {
> -                       tp->srtt = 0;
> -                       tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
> -                       inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
> -               }
> +               tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
> +               inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
>        }
> -       tp->snd_cwnd = tcp_init_cwnd(tp, dst);
> +       /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
> +        * retransmitted. In light of RFC2988bis' more aggressive 1sec
> +        * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
> +        * retransmission has occurred.
> +        */
> +       if (tp->total_retrans > 1)
> +               tp->snd_cwnd = 1;
> +       else
> +               tp->snd_cwnd = tcp_init_cwnd(tp, dst);
>        tp->snd_cwnd_stamp = tcp_time_stamp;
>  }
>
> @@ -3112,12 +3121,13 @@ static void tcp_fastretrans_alert(struct sock
> *sk, int pkts_acked, int flag)
>        tcp_xmit_retransmit_queue(sk);
>  }
>
> -static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
> +void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
>  {
>        tcp_rtt_estimator(sk, seq_rtt);
>        tcp_set_rto(sk);
>        inet_csk(sk)->icsk_backoff = 0;
>  }
> +EXPORT_SYMBOL(tcp_valid_rtt_meas);
>
>  /* Read draft-ietf-tcplw-high-performance before mucking
>  * with this code. (Supersedes RFC1323)
> @@ -5806,12 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk,
> struct sk_buff *skb,
>                                              tp->rx_opt.snd_wscale;
>                                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
>
> -                               /* tcp_ack considers this ACK as duplicate
> -                                * and does not calculate rtt.
> -                                * Force it here.
> -                                */
> -                               tcp_ack_update_rtt(sk, 0, 0);
> -
>                                if (tp->rx_opt.tstamp_ok)
>                                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 3c8d9b6..5fb504b 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -429,8 +429,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
>                        break;
>
>                icsk->icsk_backoff--;
> -               inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
> -                                        icsk->icsk_backoff;
> +               inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
> +                       TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
>                tcp_bound_rto(sk);
>
>                skb = tcp_write_queue_head(sk);
> @@ -1384,6 +1384,7 @@ int tcp_v4_conn_request(struct sock *sk, struct
> sk_buff *skb)
>                isn = tcp_v4_init_sequence(skb);
>        }
>        tcp_rsk(req)->snt_isn = isn;
> +       tcp_rsk(req)->snt_synack = tcp_time_stamp;
>
>        if (tcp_v4_send_synack(sk, dst, req,
>                               (struct request_values *)&tmp_ext) ||
> @@ -1458,6 +1459,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock
> *sk, struct sk_buff *skb,
>                newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
>
>        tcp_initialize_rcv_mss(newsk);
> +       if (tcp_rsk(req)->snt_synack)
> +               tcp_valid_rtt_meas(newsk,
> +                   tcp_time_stamp - tcp_rsk(req)->snt_synack);
> +       newtp->total_retrans = req->retrans;
>
>  #ifdef CONFIG_TCP_MD5SIG
>        /* Copy over the MD5 key from the original socket */
> @@ -1854,7 +1859,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>         * algorithms that we must have the following bandaid to talk
>         * efficiently to them.  -DaveM
>         */
> -       tp->snd_cwnd = 2;
> +       tp->snd_cwnd = TCP_INIT_CWND;
>
>        /* See draft-stevens-tcpca-spec-01 for discussion of the
>         * initialization of these values.
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 80b1f80..d2fe4e0 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock
> *sk, struct request_sock *req,
>                 * algorithms that we must have the following bandaid to talk
>                 * efficiently to them.  -DaveM
>                 */
> -               newtp->snd_cwnd = 2;
> +               newtp->snd_cwnd = TCP_INIT_CWND;
>                newtp->snd_cwnd_cnt = 0;
>                newtp->bytes_acked = 0;
>
> @@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk,
> struct sk_buff *skb,
>                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
>                return NULL;
>        }
> +       if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
> +               tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
> +       else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
> +               tcp_rsk(req)->snt_synack = 0;
>
>        /* OK, ACK is valid, create big socket and
>         * feed this segment to it. It will repeat all
> diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
> index 8b9644a..89d5bf8 100644
> --- a/net/ipv6/syncookies.c
> +++ b/net/ipv6/syncookies.c
> @@ -223,6 +223,7 @@ struct sock *cookie_v6_check(struct sock *sk,
> struct sk_buff *skb)
>        ireq->wscale_ok         = tcp_opt.wscale_ok;
>        ireq->tstamp_ok         = tcp_opt.saw_tstamp;
>        req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
> +       treq->snt_synack        = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
>        treq->rcv_isn = ntohl(th->seq) - 1;
>        treq->snt_isn = cookie;
>
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 8683664..e7d47e4 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1341,6 +1341,7 @@ static int tcp_v6_conn_request(struct sock *sk,
> struct sk_buff *skb)
>        }
>  have_isn:
>        tcp_rsk(req)->snt_isn = isn;
> +       tcp_rsk(req)->snt_synack = tcp_time_stamp;
>
>        security_inet_conn_request(sk, skb, req);
>
> @@ -1509,6 +1510,10 @@ static struct sock *
> tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
>        tcp_sync_mss(newsk, dst_mtu(dst));
>        newtp->advmss = dst_metric_advmss(dst);
>        tcp_initialize_rcv_mss(newsk);
> +       if (tcp_rsk(req)->snt_synack)
> +               tcp_valid_rtt_meas(newsk,
> +                   tcp_time_stamp - tcp_rsk(req)->snt_synack);
> +       newtp->total_retrans = req->retrans;
>
>        newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
>        newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
> --
> 1.7.3.1
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: Eric Dumazet @ 2011-06-08 21:03 UTC (permalink / raw)
  To: Jerry Chu
  Cc: David Miller, Hagen Paul Pfeifer, tsunanet,
	netdev@vger.kernel.org
In-Reply-To: <BANLkTimXgK03tM1e3FYe+wausHT15W6H-2eMNCWaj8LLxRvbZw@mail.gmail.com>

Le mercredi 08 juin 2011 à 11:04 -0700, Jerry Chu a écrit :
> [resent to cc netdev]
> 

Well, the first copy I received this morning was OK ;)

But this second one is mangled by your mailer or something ?

> This patch lowers the default initRTO from 3secs to 1sec per
> RFC2988bis. It falls back to 3secs if the SYN or SYN-ACK packet
> has been retransmitted, AND the TCP timestamp option is not on.
> 
> It also adds support to take RTT sample during 3WHS on the passive
> open side, just like its active open counterpart, and uses it, if
> valid, to seed the initRTO for the data transmission phase.
> 
> The patch also resets ssthresh to its initial default at the
> beginning of the data transmission phase, and reduces cwnd to 1 if
> there has been MORE THAN ONE retransmission during 3WHS per RFC5681.
> 
> Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
> ---




^ permalink raw reply

* [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: H.K. Jerry Chu @ 2011-06-08 21:08 UTC (permalink / raw)
  To: davem, eric.dumazet, hagen; +Cc: tsunanet, netdev, Jerry Chu

From: Jerry Chu <hkchu@google.com>

This patch lowers the default initRTO from 3secs to 1sec per
RFC2988bis. It falls back to 3secs if the SYN or SYN-ACK packet
has been retransmitted, AND the TCP timestamp option is not on.

It also adds support to take RTT sample during 3WHS on the passive
open side, just like its active open counterpart, and uses it, if
valid, to seed the initRTO for the data transmission phase.

The patch also resets ssthresh to its initial default at the
beginning of the data transmission phase, and reduces cwnd to 1 if
there has been MORE THAN ONE retransmission during 3WHS per RFC5681.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
---
 include/linux/tcp.h      |    1 +
 include/net/tcp.h        |   11 +++++++++--
 net/ipv4/syncookies.c    |    1 +
 net/ipv4/tcp_input.c     |   46 +++++++++++++++++++++++++---------------------
 net/ipv4/tcp_ipv4.c      |   11 ++++++++---
 net/ipv4/tcp_minisocks.c |    6 +++++-
 net/ipv6/syncookies.c    |    1 +
 net/ipv6/tcp_ipv6.c      |    5 +++++
 8 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e64f4c6..531ede8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -282,6 +282,7 @@ struct tcp_request_sock {
 #endif
 	u32				rcv_isn;
 	u32				snt_isn;
+	u32				snt_synack; /* synack sent time */
 };
 
 static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cda30ea..149a415 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -122,7 +122,13 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 #endif
 #define TCP_RTO_MAX	((unsigned)(120*HZ))
 #define TCP_RTO_MIN	((unsigned)(HZ/5))
-#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value	*/
+#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC2988bis initial RTO value	*/
+#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
+						 * used as a fallback RTO for the
+						 * initial data transmission if no
+						 * valid RTT sample has been acquired,
+						 * most likely due to retrans in 3WHS.
+						 */
 
 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
 					                 * for local resources.
@@ -295,7 +301,7 @@ static inline void tcp_synq_overflow(struct sock *sk)
 static inline int tcp_synq_no_recent_overflow(const struct sock *sk)
 {
 	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
-	return time_after(jiffies, last_overflow + TCP_TIMEOUT_INIT);
+	return time_after(jiffies, last_overflow + TCP_TIMEOUT_FALLBACK);
 }
 
 extern struct proto tcp_prot;
@@ -508,6 +514,7 @@ extern void tcp_initialize_rcv_mss(struct sock *sk);
 extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 extern int tcp_mss_to_mtu(struct sock *sk, int mss);
 extern void tcp_mtup_init(struct sock *sk);
+extern void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt);
 
 static inline void tcp_bound_rto(const struct sock *sk)
 {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 2646149..92bb943 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -316,6 +316,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bef9f04..ea0d218 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -880,6 +880,11 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
 			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	} else {
+		/* ssthresh may have been reduced unnecessarily during.
+		 * 3WHS. Restore it back to its initial default.
+		 */
+		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	}
 	if (dst_metric(dst, RTAX_REORDERING) &&
 	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
@@ -887,10 +892,7 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->reordering = dst_metric(dst, RTAX_REORDERING);
 	}
 
-	if (dst_metric(dst, RTAX_RTT) == 0)
-		goto reset;
-
-	if (!tp->srtt && dst_metric_rtt(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
 		goto reset;
 
 	/* Initial rtt is determined from SYN,SYN-ACK.
@@ -916,19 +918,26 @@ static void tcp_init_metrics(struct sock *sk)
 		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
 	}
 	tcp_set_rto(sk);
-	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) {
 reset:
-		/* Play conservative. If timestamps are not
-		 * supported, TCP will fail to recalculate correct
-		 * rtt, if initial rto is too small. FORGET ALL AND RESET!
+	if (tp->srtt == 0) {
+		/* RFC2988bis: We've failed to get a valid RTT sample from
+		 * 3WHS. This is most likely due to retransmission,
+		 * including spurious one. Reset the RTO back to 3secs
+		 * from the more aggressive 1sec to avoid more spurious
+		 * retransmission.
 		 */
-		if (!tp->rx_opt.saw_tstamp && tp->srtt) {
-			tp->srtt = 0;
-			tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
-			inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
-		}
+		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
+		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
 	}
-	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+	 * retransmitted. In light of RFC2988bis' more aggressive 1sec
+	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+	 * retransmission has occurred.
+	 */
+	if (tp->total_retrans > 1)
+		tp->snd_cwnd = 1;
+	else
+		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
@@ -3112,12 +3121,13 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	tcp_xmit_retransmit_queue(sk);
 }
 
-static void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
+void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
 {
 	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 }
+EXPORT_SYMBOL(tcp_valid_rtt_meas);
 
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Supersedes RFC1323)
@@ -5806,12 +5816,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 					      tp->rx_opt.snd_wscale;
 				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
-				/* tcp_ack considers this ACK as duplicate
-				 * and does not calculate rtt.
-				 * Force it here.
-				 */
-				tcp_ack_update_rtt(sk, 0, 0);
-
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3c8d9b6..5fb504b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -429,8 +429,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			break;
 
 		icsk->icsk_backoff--;
-		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
-					 icsk->icsk_backoff;
+		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
+			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
 		tcp_bound_rto(sk);
 
 		skb = tcp_write_queue_head(sk);
@@ -1384,6 +1384,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		isn = tcp_v4_init_sequence(skb);
 	}
 	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	if (tcp_v4_send_synack(sk, dst, req,
 			       (struct request_values *)&tmp_ext) ||
@@ -1458,6 +1459,10 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(newsk);
+	if (tcp_rsk(req)->snt_synack)
+		tcp_valid_rtt_meas(newsk,
+		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
+	newtp->total_retrans = req->retrans;
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Copy over the MD5 key from the original socket */
@@ -1854,7 +1859,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 * algorithms that we must have the following bandaid to talk
 	 * efficiently to them.  -DaveM
 	 */
-	tp->snd_cwnd = 2;
+	tp->snd_cwnd = TCP_INIT_CWND;
 
 	/* See draft-stevens-tcpca-spec-01 for discussion of the
 	 * initialization of these values.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 80b1f80..d2fe4e0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		 * algorithms that we must have the following bandaid to talk
 		 * efficiently to them.  -DaveM
 		 */
-		newtp->snd_cwnd = 2;
+		newtp->snd_cwnd = TCP_INIT_CWND;
 		newtp->snd_cwnd_cnt = 0;
 		newtp->bytes_acked = 0;
 
@@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
 		return NULL;
 	}
+	if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
+		tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
+	else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
+		tcp_rsk(req)->snt_synack = 0;
 
 	/* OK, ACK is valid, create big socket and
 	 * feed this segment to it. It will repeat all
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8b9644a..89d5bf8 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -223,6 +223,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	ireq->wscale_ok		= tcp_opt.wscale_ok;
 	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
 	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
+	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
 	treq->rcv_isn = ntohl(th->seq) - 1;
 	treq->snt_isn = cookie;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8683664..e7d47e4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1341,6 +1341,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	}
 have_isn:
 	tcp_rsk(req)->snt_isn = isn;
+	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 
 	security_inet_conn_request(sk, skb, req);
 
@@ -1509,6 +1510,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	tcp_sync_mss(newsk, dst_mtu(dst));
 	newtp->advmss = dst_metric_advmss(dst);
 	tcp_initialize_rcv_mss(newsk);
+	if (tcp_rsk(req)->snt_synack)
+		tcp_valid_rtt_meas(newsk,
+		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
+	newtp->total_retrans = req->retrans;
 
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
-- 
1.7.3.1


^ permalink raw reply related

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: Hagen Paul Pfeifer @ 2011-06-08 21:16 UTC (permalink / raw)
  To: Jerry Chu; +Cc: David Miller, Eric Dumazet, tsunanet, netdev@vger.kernel.org
In-Reply-To: <BANLkTimXgK03tM1e3FYe+wausHT15W6H-2eMNCWaj8LLxRvbZw@mail.gmail.com>

Hello Jerry

* Jerry Chu | 2011-06-08 11:04:22 [-0700]:

>+       /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
>+        * retransmitted. In light of RFC2988bis' more aggressive 1sec
>+        * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
>+        * retransmission has occurred.
>+        */
>+       if (tp->total_retrans > 1)
>+               tp->snd_cwnd = 1;
>+       else
>+               tp->snd_cwnd = tcp_init_cwnd(tp, dst);

Is this behavior somewhere documented? In RFC2988bis I cannot find something
similar.


Cheers, Hagen

^ permalink raw reply

* Re: KVM induced panic on 2.6.38[2367] & 2.6.39
From: Eric Dumazet @ 2011-06-08 21:22 UTC (permalink / raw)
  To: Brad Campbell
  Cc: Patrick McHardy, Bart De Schuymer, kvm, linux-mm, linux-kernel,
	netdev, netfilter-devel
In-Reply-To: <4DEFAB15.2060905@fnarfbargle.com>

Le jeudi 09 juin 2011 à 01:02 +0800, Brad Campbell a écrit :
> On 08/06/11 11:59, Eric Dumazet wrote:
> 
> > Well, a bisection definitely should help, but needs a lot of time in
> > your case.
> 
> Yes. compile, test, crash, walk out to the other building to press 
> reset, lather, rinse, repeat.
> 
> I need a reset button on the end of a 50M wire, or a hardware watchdog!
> 
> Actually it's not so bad. If I turn off slub debugging the kernel panics 
> and reboots itself.
> 
> This.. :
> [    2.913034] netconsole: remote ethernet address 00:16:cb:a7:dd:d1
> [    2.913066] netconsole: device eth0 not up yet, forcing it
> [    3.660062] Refined TSC clocksource calibration: 3213.422 MHz.
> [    3.660118] Switching to clocksource tsc
> [   63.200273] r8169 0000:03:00.0: eth0: unable to load firmware patch 
> rtl_nic/rtl8168e-1.fw (-2)
> [   63.223513] r8169 0000:03:00.0: eth0: link down
> [   63.223556] r8169 0000:03:00.0: eth0: link down
> 
> ..is slowing down reboots considerably. 3.0-rc does _not_ like some 
> timing hardware in my machine. Having said that, at least it does not 
> randomly panic on SCSI like 2.6.39 does.
> 
> Ok, I've ruled out TCPMSS. Found out where it was being set and neutered 
> it. I've replicated it with only the single DNAT rule.
> 
> 
> > Could you try following patch, because this is the 'usual suspect' I had
> > yesterday :
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 46cbd28..9f548f9 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -792,6 +792,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> >   		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
> >   	}
> >
> > +#if 0
> >   	if (fastpath&&
> >   	size + sizeof(struct skb_shared_info)<= ksize(skb->head)) {
> >   		memmove(skb->head + size, skb_shinfo(skb),
> > @@ -802,7 +803,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> >   		off = nhead;
> >   		goto adjust_others;
> >   	}
> > -
> > +#endif
> >   	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
> >   	if (!data)
> >   		goto nodata;
> >
> >
> >
> 
> Nope.. that's not it. <sigh> That might have changed the characteristic 
> of the fault slightly, but unfortunately I got caught with a couple of 
> fsck's, so I only got to test it 3 times tonight.
> 
> It's unfortunate that this is a production system, so I can only take it 
> down between about 9pm and 1am. That would normally be pretty 
> productive, except that an fsck of a 14TB ext4 can take 30 minutes if it 
> panics at the wrong time.
> 
> I'm out of time tonight, but I'll have a crack at some bisection 
> tomorrow night. Now I just have to go back far enough that it works, and 
> be near enough not to have to futz around with /proc /sys or drivers.
> 
> I really, really, really appreciate you guys helping me with this. It 
> has been driving me absolutely bonkers. If I'm ever in the same town as 
> any of you, dinner and drinks are on me.

Hmm, I wonder if kmemcheck could help you, but its slow as hell, so not
appropriate for production :(



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Access to repo on git.linux-ipv6.org
From: Stefan BOSAK @ 2011-06-08 21:31 UTC (permalink / raw)
  To: netdev

Hello.

I would like to ask you how to access:
git://git.linux-ipv6.org/gitroot/iputils.git

IPv4 git service not running:
Nmap scan report for git.linux-ipv6.org (203.178.142.218)
Host is up (0.29s latency).
rDNS record for 203.178.142.218: nezu.linux-ipv6.org
PORT     STATE  SERVICE
9418/tcp closed git

IPv6 git service running, but what git options should i used to clone it:
Nmap scan report for git.linux-ipv6.org (2001:200:0:1c01:20f:1fff:fe67:32e9)
Host is up (0.30s latency).
PORT     STATE SERVICE
9418/tcp open  git

Thank you for your help.

^ permalink raw reply

* Re: [af-packet 2/2] Enhance af-packet to provide (near zero)lossless packet capture functionality
From: chetan loke @ 2011-06-08 22:01 UTC (permalink / raw)
  To: Joe Perches; +Cc: netdev, davem, eric.dumazet, kaber, johann.baudy, Chetan Loke
In-Reply-To: <1307506686.22272.49.camel@Joe-Laptop>

On Wed, Jun 8, 2011 at 12:18 AM, Joe Perches <joe@perches.com> wrote:
> On Tue, 2011-06-07 at 23:13 -0400, Chetan Loke wrote:
>> Signed-off-by: Chetan Loke <lokec@ccs.neu.edu>
>
> just trivia:
>
>> ---
>>  net/packet/af_packet.c |  878 +++++++++++++++++++++++++++++++++++++++++++++---
> []
>> +/* kbdq - kernel block descriptor queue */
>> +struct kbdq_core {
>> +     struct pgv      *pkbdq;
>> +     unsigned int    hdrlen;
>> +     unsigned char   reset_pending_on_curr_blk;
>> +     unsigned char   delete_blk_timer;
>> +     unsigned short  kactive_blk_num;
>> +     unsigned short  hole_bytes_size;
>> +     char            *pkblk_start;
>> +     char            *pkblk_end;
>> +     int             kblk_size;
>> +     unsigned int    knum_blocks;
>> +     uint64_t        knxt_seq_num;
>> +     char            *prev;
>> +     char            *nxt_offset;
>> +
>> +     /* last_kactive_blk_num:
>> +      * trick to see if user-space has caught up
>> +      * in order to avoid refreshing timer when every single pkt arrives.
>> +      */
>> +     unsigned short  last_kactive_blk_num;
>> +
>> +     atomic_t        blk_fill_in_prog;
>> +
>> +     /* Default is set to 8ms */
>> +#define DEFAULT_PRB_RETIRE_TOV       (8)
>> +
>> +     unsigned short  retire_blk_tov;
>> +     unsigned long   tov_in_jiffies;
>> +
>> +     /* timer to retire an outstanding block */
>> +     struct timer_list retire_blk_timer;
>> +};
>
> You could align the member entries a bit more,
> maybe move last_kactive_blk_num after retire_blk_tov
>
> []
>
>> @@ -248,8 +322,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
>>               h.h2->tp_status = status;
>>               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
>>               break;
>> +     case TPACKET_V3:
>>       default:
>> -             pr_err("TPACKET version not supported\n");
>> +             pr_err("<%s> TPACKET version not supported.Who is calling?\
>> +                     Dumping stack.\n", __func__);
>
> whitespace defect because of line continuation.  Maybe just:
>                WARN(1, "TPACKET version not supported\n");
>
>>               BUG();
>>       }
>>
>> @@ -274,8 +351,11 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
>>       case TPACKET_V2:
>>               flush_dcache_page(pgv_to_page(&h.h2->tp_status));
>>               return h.h2->tp_status;
>> +     case TPACKET_V3:
>>       default:
>> -             pr_err("TPACKET version not supported\n");
>> +             pr_err("<%s> TPACKET version:%d not supported.\
>> +                     Dumping stack.\n", __func__, po->tp_version);
>> +             dump_stack();
>
> here too.
>
>                WARN(1, "TPACKET version %d not supported\n", po->tp_version);
>
>>               BUG();
>>               return 0;
>>       }
> []
>> +static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
>> +{
>> +     pr_err("<%s> ERROR block:%p is NOT FREE status:%d\
>> +                     kactive_blk_num:%d\n",
>> +                     __func__, pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
>> +     dump_stack();
>> +     BUG();
>
> here too.  maybe just:
>        WARN(1, "%s: ERROR block:%p is not free.  status: %s kactive_blk_num:%d\n"
>             __func__, pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
>
>> +static inline void packet_increment_rx_head(struct packet_sock *po,
>> +                                         struct packet_ring_buffer *rb)
>> +{
>> +     switch (po->tp_version) {
>> +     case TPACKET_V1:
>> +     case TPACKET_V2:
>> +             return packet_increment_head(rb);
>> +     case TPACKET_V3:
>> +     default:
>> +             pr_err("<%s> TPACKET version:%d not supported.\
>> +                     Dumping stack.\n", __func__, po->tp_version);
>
> whitespace, WARN(1, etc...
>
>
>> @@ -2412,7 +3168,7 @@ out_free_pgvec:
>>       goto out;
>>  }
>>
>> -static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>> +static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
>>               int closing, int tx_ring)
>>  {
>>       struct pgv *pg_vec = NULL;
>> @@ -2421,7 +3177,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
>>       struct packet_ring_buffer *rb;
>>       struct sk_buff_head *rb_queue;
>>       __be16 num;
>> -     int err;
>> +     int err = -EINVAL;
>> +     /* Added to avoid minimal code churn */
>> +     struct tpacket_req *req = &req_u->req;
>> +
>> +     /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
>> +     if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
>> +             pr_err("<%s> Tx-ring is not supported on version:%d.\
>> +                        Dumping stack.\n", __func__, po->tp_version);
>
> whitespace, WARN(1, etc...
>
>
>

Sure, will address all the comments(sob,alignment,pr_err etc) in the
next iteration of the patch.

thanks
Chetan

^ permalink raw reply

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: Jerry Chu @ 2011-06-08 22:06 UTC (permalink / raw)
  To: Hagen Paul Pfeifer
  Cc: David Miller, Eric Dumazet, tsunanet, netdev@vger.kernel.org
In-Reply-To: <20110608211639.GA2876@nuttenaction>

Hi Hagen,

On Wed, Jun 8, 2011 at 2:16 PM, Hagen Paul Pfeifer <hagen@jauu.net> wrote:
> Hello Jerry
>
> * Jerry Chu | 2011-06-08 11:04:22 [-0700]:
>
>>+       /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
>>+        * retransmitted. In light of RFC2988bis' more aggressive 1sec
>>+        * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
>>+        * retransmission has occurred.
>>+        */
>>+       if (tp->total_retrans > 1)
>>+               tp->snd_cwnd = 1;
>>+       else
>>+               tp->snd_cwnd = tcp_init_cwnd(tp, dst);
>
> Is this behavior somewhere documented? In RFC2988bis I cannot find something
> similar.

See my 5/20/11 mail on this subject. The short answer is, no, it's not
part of RFC2988bis. The long answer is, when we (Vern, Mark and I)
were working
on RFC2988bis we shared a common design goal to minimize the negative
impact to those paths with RTT > 1sec as much as possible.

RFC3390 and later RFC5681 have the following clause:
"... Further, if the SYN or SYN/ACK is lost, the initial window used by a
sender after a correctly transmitted SYN MUST be one segment
consisting of at most SMSS bytes."

Unfortunately, with initRTO reduced to 1sec, unless the stack can reliably
detect spurious retransmissions of SYN or SYN-ACK, the paths with RTT >
1sec may be penalized unnecessarily with initcwnd = 1 if the stack
implementation follows the above clause faithfully. We can't really amend
the clause as part of RFC2988bis because it doesn't belong there, as the
latter only covers RTO computation, not the congestion control algorithm.
In the end Vern/Mark suggested me to add the amendment to the IW10
proposal I was working on (now becomes draft-ietf-tcpm-initcwnd-01.txt)
So I did, with following paragraph:
"...
   Furthermore, RFC 3390 and RFC 5681 [RFC5681] state that

         "If the SYN or SYN/ACK is lost, the initial window used by a
         sender after a correctly transmitted SYN MUST be one segment
         consisting of MSS bytes."

   The proposed change to reduce the default RTO to 1 second [PACS11]
   increases the chance for spurious SYN or SYN/ACK retransmission, thus
   unnecessarily penalizing connections with RTT > 1 second if their
   initial window is reduced to 1 segment. For this reason, it is
   RECOMMENDED that implementations refrain from resetting the initial
   window to 1 segment, unless either there have been multiple SYN or
   SYN/ACK retransmissions, or true loss detection has been made."

The patch follows the intent of the co-authors. It also moves the
implementation closer to the spec. Without the patch the existing
implementation will reduce ssthresh (active open side only), but not the
cwnd upon SYN or SYN-ACK retransmission.

Best,

Jerry

>
>
> Cheers, Hagen
>

^ permalink raw reply

* Re: [af-packet 1/2] Enhance af-packet to provide (near zero)lossless packet capture functionality.
From: chetan loke @ 2011-06-08 22:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, davem, kaber, johann.baudy, Chetan Loke
In-Reply-To: <1307507754.3102.18.camel@edumazet-laptop>

On Wed, Jun 8, 2011 at 12:35 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 07 juin 2011 à 23:13 -0400, Chetan Loke a écrit :
>>
>> +struct tpacket3_hdr {
>> +     __u32           tp_status;
>> +     __u32           tp_len;
>> +     __u32           tp_snaplen;
>> +     __u16           tp_mac;
>> +     __u16           tp_net;
>
>> +     __u32           tp_sec;
>> +     __u32           tp_nsec;
>> +     __u16           tp_vlan_tci;
>
> missing "__u16 tp_padding;" here
>
> check :
>
> http://git2.kernel.org/?p=linux/kernel/git/davem/net-2.6.git;a=commit;h=13fcb7bd322164c67926ffe272846d4860196dc6

Eric, thanks for pointing that. I will add the padding. But just out
of curiosity, how is the information being leaked in tpacket_rcv()?

If someone is capturing packets then they have access to all the data
anyways. Also, tpacket_rcv doesn't memset the frame-element to 'zero'
before calling
skb_copy_bits(). And we would never want to memset anyways.

thnx
Chetan Loke

^ permalink raw reply

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: David Miller @ 2011-06-08 22:13 UTC (permalink / raw)
  To: ycheng; +Cc: hkchu, netdev
In-Reply-To: <BANLkTi=Gzb7kVRBq3+3Sihdu8rKP_OeTE4kcw7hdfUH=9qm-UQ@mail.gmail.com>

From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 8 Jun 2011 13:54:41 -0700

> Acked-by: Yuchung Cheng <ycheng@google.com>

Please do not top-post.  Quote the commit log message portion, delete
the patch itself since you're not giving any specific feedback, and
then insert your Acked-by/Tested-by/Reviewed-by/whatever right
afterwards.

I know that people are trained not to properly put context before the
things they say in email replied, and edit out the unnecessary
portions of the quoted material, simply because many popular email
clients tend to make this difficult and make top-posting way too easy,
but that is not our problem.

^ permalink raw reply

* Re: [af-packet 1/2] Enhance af-packet to provide (near zero)lossless packet capture functionality.
From: Eric Dumazet @ 2011-06-08 22:22 UTC (permalink / raw)
  To: chetan loke; +Cc: netdev, davem, kaber, johann.baudy, Chetan Loke
In-Reply-To: <BANLkTi=_EFRaiPuT=OjwRaJ4XPw0sUhcnA@mail.gmail.com>

Le mercredi 08 juin 2011 à 18:10 -0400, chetan loke a écrit :

> Eric, thanks for pointing that. I will add the padding. But just out
> of curiosity, how is the information being leaked in tpacket_rcv()?
> 
> If someone is capturing packets then they have access to all the data
> anyways. Also, tpacket_rcv doesn't memset the frame-element to 'zero'
> before calling
> skb_copy_bits(). And we would never want to memset anyways.
> 

Its a security risk, leaking content of kernel stack or kernel memory.

capturing packets capability is not meaning "accessing full memory"

Some clever hackers can exploit these kind of leaks.

Better make sure we dont have holes in structures copied to user.
(or mapped in this case, but you never knows ;) )

^ permalink raw reply

* Re: [PATCH] RFC2988bis + taking RTT sample from 3WHS for the passive open side
From: Jerry Chu @ 2011-06-08 22:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Hagen Paul Pfeifer, tsunanet,
	netdev@vger.kernel.org
In-Reply-To: <BANLkTikpmXMpuTmgejsgisV9O2fP1i7VLnUO3bcYLSU0+iXJVg@mail.gmail.com>

Eric,

It just occurred to me now that initRTO is being reduced, both TCP_SYN_RETRIES
and TCP_SYNACK_RETRIES should be bumped up a bit, to e.g., 7 (?) to meet the
3 minutes R2 requirement per RFC1122.

If you agree, I will submit another patch.

Thanks,

Jerry

On Wed, Jun 8, 2011 at 10:04 AM, Jerry Chu <hkchu@google.com> wrote:
> +netdev (add missing cc)
>
> On Wed, Jun 8, 2011 at 9:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>> Le mardi 07 juin 2011 à 22:00 -0700, H.K. Jerry Chu a écrit :
>> > From: Jerry Chu <hkchu@google.com>
>> >
>> > This patch lowers the default initRTO from 3secs to 1sec per
>> > RFC2988bis. It falls back to 3secs if the SYN or SYN-ACK packet
>> > has been retransmitted, AND the TCP timestamp option is not on.
>> >
>> > It also adds support to take RTT sample during 3WHS on the passive
>> > open side, just like its active open counterpart, and uses it, if
>> > valid, to seed the initRTO for the data transmission phase.
>> >
>> > The patch also resets ssthresh to its initial default at the
>> > beginning of the data transmission phase, and reduces cwnd to 1 if
>> > there has been MORE THAN ONE retransmission during 3WHS per RFC5681.
>> >
>> > Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
>> > ---
>>
>> ...
>>
>> > @@ -1854,7 +1859,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>> >        * algorithms that we must have the following bandaid to talk
>> >        * efficiently to them.  -DaveM
>> >        */
>> > -     tp->snd_cwnd = 2;
>> > +     tp->snd_cwnd = TCP_INIT_CWND;
>>
>> Hmm, are you sure this belongs to this patch ?
>>
>> >
>> >       /* See draft-stevens-tcpca-spec-01 for discussion of the
>> >        * initialization of these values.
>> > diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
>> > index 80b1f80..d2fe4e0 100644
>> > --- a/net/ipv4/tcp_minisocks.c
>> > +++ b/net/ipv4/tcp_minisocks.c
>> > @@ -486,7 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
>> >                * algorithms that we must have the following bandaid to talk
>> >                * efficiently to them.  -DaveM
>> >                */
>> > -             newtp->snd_cwnd = 2;
>> > +             newtp->snd_cwnd = TCP_INIT_CWND;
>>
>> same here ?
>
> It's part of the cwnd initialization stuff, which I included some fix
> per RFC5681 (in
> tcp_init_metrics()). The field is currently not being used anyway
> until data-in-SYN is
> supported. I fixed it here just because TCP_INIT_CWND is the right value.
>
>>
>> >               newtp->snd_cwnd_cnt = 0;
>> >               newtp->bytes_acked = 0;
>> >
>> > @@ -720,6 +720,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
>> >               NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
>> >               return NULL;
>> >       }
>> > +     if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
>> > +             tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
>> > +     else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
>> > +             tcp_rsk(req)->snt_synack = 0;
>>
>> Could you clarify why this is done in this order ?
>
> What else is more appropriate? It needs to decide if we've got a valid
> RTT sample.
> If TS is on and rcv_tsec != 0 then we've got a valid RTT sample. Otherwise it
> does not take sample if there has been retrans per Karn's algorithm.
>
> Thanks,
>
> Jerry
>
>>
>> Thanks !
>>
>>
>

^ permalink raw reply

* [PATCH] v2 ethtool: remove support for ETHTOOL_GRXNTUPLE
From: Alexander Duyck @ 2011-06-08 22:35 UTC (permalink / raw)
  To: davem, jeffrey.t.kirsher, bhutchings; +Cc: netdev

This change is meant to remove all support for displaying an ntuple as
strings via ETHTOOL_GRXNTUPLE.  The reason for this change is due to the
fact that multiple issues have been found including:
 - Multiple buffer overruns for strings being displayed.
 - Incorrect filters displayed, cleared filters with ring of -2 are displayed
 - Setting get_rx_ntuple displays no rules if defined.
 - Endianess wrong on displayed values.
 - Hard limit of 1024 filters makes display functionality extremely limited

The only driver that had supported this interface was ixgbe.  Since it no
longer uses the interface and due to the issues mentioned above I am
submitting this patch to remove it.

v2:
Updated based on comments from Ben Hutchings
 - Left ETH_SS_NTUPLE_FILTERS in code but commented on it being deprecated
 - Removed ethtool_rx_ntuple_list and ethtool_rx_ntuple_flow_spec_container
 - Left ETHTOOL_GRXNTUPLE but commented it as deprecated

Also cleaned up set_rx_ntuple since there is no flow spec container to
maintain we can drop all the code for the alloc and free of it and just
return ops->set_rx_ntuple().
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---

 include/linux/ethtool.h   |   19 ---
 include/linux/netdevice.h |    3 
 net/core/dev.c            |    5 -
 net/core/ethtool.c        |  309 ---------------------------------------------
 4 files changed, 3 insertions(+), 333 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index c6a850a..dfd3493 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -287,7 +287,7 @@ enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
 	ETH_SS_STATS,
 	ETH_SS_PRIV_FLAGS,
-	ETH_SS_NTUPLE_FILTERS,
+	ETH_SS_NTUPLE_FILTERS,	/* Do not use, GRXNTUPLE is now deprecated */
 	ETH_SS_FEATURES,
 };
 
@@ -714,18 +714,6 @@ enum ethtool_sfeatures_retval_bits {
 /* needed by dev_disable_lro() */
 extern int __ethtool_set_flags(struct net_device *dev, u32 flags);
 
-struct ethtool_rx_ntuple_flow_spec_container {
-	struct ethtool_rx_ntuple_flow_spec fs;
-	struct list_head list;
-};
-
-struct ethtool_rx_ntuple_list {
-#define ETHTOOL_MAX_NTUPLE_LIST_ENTRY 1024
-#define ETHTOOL_MAX_NTUPLE_STRING_PER_ENTRY 14
-	struct list_head	list;
-	unsigned int		count;
-};
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -758,7 +746,6 @@ u32 ethtool_op_get_ufo(struct net_device *dev);
 int ethtool_op_set_ufo(struct net_device *dev, u32 data);
 u32 ethtool_op_get_flags(struct net_device *dev);
 int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported);
-void ethtool_ntuple_flush(struct net_device *dev);
 bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
 
 /**
@@ -865,7 +852,6 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
  *	error code or zero.
  * @set_rx_ntuple: Set an RX n-tuple rule.  Returns a negative error code
  *	or zero.
- * @get_rx_ntuple: Deprecated.
  * @get_rxfh_indir: Get the contents of the RX flow hash indirection table.
  *	Returns a negative error code or zero.
  * @set_rxfh_indir: Set the contents of the RX flow hash indirection table.
@@ -944,7 +930,6 @@ struct ethtool_ops {
 	int	(*reset)(struct net_device *, u32 *);
 	int	(*set_rx_ntuple)(struct net_device *,
 				 struct ethtool_rx_ntuple *);
-	int	(*get_rx_ntuple)(struct net_device *, u32 stringset, void *);
 	int	(*get_rxfh_indir)(struct net_device *,
 				  struct ethtool_rxfh_indir *);
 	int	(*set_rxfh_indir)(struct net_device *,
@@ -1017,7 +1002,7 @@ struct ethtool_ops {
 #define ETHTOOL_FLASHDEV	0x00000033 /* Flash firmware to device */
 #define ETHTOOL_RESET		0x00000034 /* Reset hardware */
 #define ETHTOOL_SRXNTUPLE	0x00000035 /* Add an n-tuple filter to device */
-#define ETHTOOL_GRXNTUPLE	0x00000036 /* Get n-tuple filters from device */
+#define ETHTOOL_GRXNTUPLE	0x00000036 /* deprecated */
 #define ETHTOOL_GSSET_INFO	0x00000037 /* Get string set info */
 #define ETHTOOL_GRXFHINDIR	0x00000038 /* Get RX flow hash indir'n table */
 #define ETHTOOL_SRXFHINDIR	0x00000039 /* Set RX flow hash indir'n table */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3362885..6469fa9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1348,9 +1348,6 @@ struct net_device {
 	/* max exchange id for FCoE LRO by ddp */
 	unsigned int		fcoe_ddp_xid;
 #endif
-	/* n-tuple filter list attached to this device */
-	struct ethtool_rx_ntuple_list ethtool_ntuple_list;
-
 	/* phy device may attach itself for hardware timestamping */
 	struct phy_device *phydev;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 9393078..b3f52d2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5867,8 +5867,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->gso_max_size = GSO_MAX_SIZE;
 
-	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
-	dev->ethtool_ntuple_list.count = 0;
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
 	INIT_LIST_HEAD(&dev->link_watch_list);
@@ -5932,9 +5930,6 @@ void free_netdev(struct net_device *dev)
 	/* Flush device addresses */
 	dev_addr_flush(dev);
 
-	/* Clear ethtool n-tuple list */
-	ethtool_ntuple_flush(dev);
-
 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
 		netif_napi_del(p);
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index fd14116..b7c12a6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -169,18 +169,6 @@ int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
 }
 EXPORT_SYMBOL(ethtool_op_set_flags);
 
-void ethtool_ntuple_flush(struct net_device *dev)
-{
-	struct ethtool_rx_ntuple_flow_spec_container *fsc, *f;
-
-	list_for_each_entry_safe(fsc, f, &dev->ethtool_ntuple_list.list, list) {
-		list_del(&fsc->list);
-		kfree(fsc);
-	}
-	dev->ethtool_ntuple_list.count = 0;
-}
-EXPORT_SYMBOL(ethtool_ntuple_flush);
-
 /* Handlers for each ethtool command */
 
 #define ETHTOOL_DEV_FEATURE_WORDS	1
@@ -865,34 +853,6 @@ out:
 	return ret;
 }
 
-static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list,
-			struct ethtool_rx_ntuple_flow_spec *spec,
-			struct ethtool_rx_ntuple_flow_spec_container *fsc)
-{
-
-	/* don't add filters forever */
-	if (list->count >= ETHTOOL_MAX_NTUPLE_LIST_ENTRY) {
-		/* free the container */
-		kfree(fsc);
-		return;
-	}
-
-	/* Copy the whole filter over */
-	fsc->fs.flow_type = spec->flow_type;
-	memcpy(&fsc->fs.h_u, &spec->h_u, sizeof(spec->h_u));
-	memcpy(&fsc->fs.m_u, &spec->m_u, sizeof(spec->m_u));
-
-	fsc->fs.vlan_tag = spec->vlan_tag;
-	fsc->fs.vlan_tag_mask = spec->vlan_tag_mask;
-	fsc->fs.data = spec->data;
-	fsc->fs.data_mask = spec->data_mask;
-	fsc->fs.action = spec->action;
-
-	/* add to the list */
-	list_add_tail_rcu(&fsc->list, &list->list);
-	list->count++;
-}
-
 /*
  * ethtool does not (or did not) set masks for flow parameters that are
  * not specified, so if both value and mask are 0 then this must be
@@ -930,8 +890,6 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 {
 	struct ethtool_rx_ntuple cmd;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
-	struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL;
-	int ret;
 
 	if (!ops->set_rx_ntuple)
 		return -EOPNOTSUPP;
@@ -944,269 +902,7 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
 
 	rx_ntuple_fix_masks(&cmd.fs);
 
-	/*
-	 * Cache filter in dev struct for GET operation only if
-	 * the underlying driver doesn't have its own GET operation, and
-	 * only if the filter was added successfully.  First make sure we
-	 * can allocate the filter, then continue if successful.
-	 */
-	if (!ops->get_rx_ntuple) {
-		fsc = kmalloc(sizeof(*fsc), GFP_ATOMIC);
-		if (!fsc)
-			return -ENOMEM;
-	}
-
-	ret = ops->set_rx_ntuple(dev, &cmd);
-	if (ret) {
-		kfree(fsc);
-		return ret;
-	}
-
-	if (!ops->get_rx_ntuple)
-		__rx_ntuple_filter_add(&dev->ethtool_ntuple_list, &cmd.fs, fsc);
-
-	return ret;
-}
-
-static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr)
-{
-	struct ethtool_gstrings gstrings;
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-	struct ethtool_rx_ntuple_flow_spec_container *fsc;
-	u8 *data;
-	char *p;
-	int ret, i, num_strings = 0;
-
-	if (!ops->get_sset_count)
-		return -EOPNOTSUPP;
-
-	if (copy_from_user(&gstrings, useraddr, sizeof(gstrings)))
-		return -EFAULT;
-
-	ret = ops->get_sset_count(dev, gstrings.string_set);
-	if (ret < 0)
-		return ret;
-
-	gstrings.len = ret;
-
-	data = kzalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER);
-	if (!data)
-		return -ENOMEM;
-
-	if (ops->get_rx_ntuple) {
-		/* driver-specific filter grab */
-		ret = ops->get_rx_ntuple(dev, gstrings.string_set, data);
-		goto copy;
-	}
-
-	/* default ethtool filter grab */
-	i = 0;
-	p = (char *)data;
-	list_for_each_entry(fsc, &dev->ethtool_ntuple_list.list, list) {
-		sprintf(p, "Filter %d:\n", i);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-
-		switch (fsc->fs.flow_type) {
-		case TCP_V4_FLOW:
-			sprintf(p, "\tFlow Type: TCP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case UDP_V4_FLOW:
-			sprintf(p, "\tFlow Type: UDP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case SCTP_V4_FLOW:
-			sprintf(p, "\tFlow Type: SCTP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case AH_ESP_V4_FLOW:
-			sprintf(p, "\tFlow Type: AH ESP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case ESP_V4_FLOW:
-			sprintf(p, "\tFlow Type: ESP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IP_USER_FLOW:
-			sprintf(p, "\tFlow Type: Raw IP\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IPV4_FLOW:
-			sprintf(p, "\tFlow Type: IPv4\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		default:
-			sprintf(p, "\tFlow Type: Unknown\n");
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			goto unknown_filter;
-		}
-
-		/* now the rest of the filters */
-		switch (fsc->fs.flow_type) {
-		case TCP_V4_FLOW:
-		case UDP_V4_FLOW:
-		case SCTP_V4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.tcp_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.tcp_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc Port: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.psrc,
-				fsc->fs.m_u.tcp_ip4_spec.psrc);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest Port: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.pdst,
-				fsc->fs.m_u.tcp_ip4_spec.pdst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.tcp_ip4_spec.tos,
-				fsc->fs.m_u.tcp_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case AH_ESP_V4_FLOW:
-		case ESP_V4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.ah_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.ah_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSPI: %d, mask: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.spi,
-				fsc->fs.m_u.ah_ip4_spec.spi);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.ah_ip4_spec.tos,
-				fsc->fs.m_u.ah_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IP_USER_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		case IPV4_FLOW:
-			sprintf(p, "\tSrc IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tSrc IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4src);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP addr: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tDest IP mask: 0x%x\n",
-				fsc->fs.m_u.usr_ip4_spec.ip4dst);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tL4 bytes: 0x%x, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.l4_4_bytes,
-				fsc->fs.m_u.usr_ip4_spec.l4_4_bytes);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tTOS: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.tos,
-				fsc->fs.m_u.usr_ip4_spec.tos);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tIP Version: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.ip_ver,
-				fsc->fs.m_u.usr_ip4_spec.ip_ver);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			sprintf(p, "\tProtocol: %d, mask: 0x%x\n",
-				fsc->fs.h_u.usr_ip4_spec.proto,
-				fsc->fs.m_u.usr_ip4_spec.proto);
-			p += ETH_GSTRING_LEN;
-			num_strings++;
-			break;
-		}
-		sprintf(p, "\tVLAN: %d, mask: 0x%x\n",
-			fsc->fs.vlan_tag, fsc->fs.vlan_tag_mask);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		sprintf(p, "\tUser-defined: 0x%Lx\n", fsc->fs.data);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		sprintf(p, "\tUser-defined mask: 0x%Lx\n", fsc->fs.data_mask);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-		if (fsc->fs.action == ETHTOOL_RXNTUPLE_ACTION_DROP)
-			sprintf(p, "\tAction: Drop\n");
-		else
-			sprintf(p, "\tAction: Direct to queue %d\n",
-				fsc->fs.action);
-		p += ETH_GSTRING_LEN;
-		num_strings++;
-unknown_filter:
-		i++;
-	}
-copy:
-	/* indicate to userspace how many strings we actually have */
-	gstrings.len = num_strings;
-	ret = -EFAULT;
-	if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
-		goto out;
-	useraddr += sizeof(gstrings);
-	if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
-		goto out;
-	ret = 0;
-
-out:
-	kfree(data);
-	return ret;
+	return ops->set_rx_ntuple(dev, &cmd);
 }
 
 static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
@@ -2101,9 +1797,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_SRXNTUPLE:
 		rc = ethtool_set_rx_ntuple(dev, useraddr);
 		break;
-	case ETHTOOL_GRXNTUPLE:
-		rc = ethtool_get_rx_ntuple(dev, useraddr);
-		break;
 	case ETHTOOL_GSSET_INFO:
 		rc = ethtool_get_sset_info(dev, useraddr);
 		break;


^ permalink raw reply related

* Re: pull request: wireless-next-2.6 2011-06-08
From: David Miller @ 2011-06-08 22:42 UTC (permalink / raw)
  To: linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20110608191246.GD27028-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>

From: "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
Date: Wed, 8 Jun 2011 15:12:47 -0400

> Dave,
> 
> Here is the first big wireless pull request for the 3.1 cycle.
> As usual, this covers a lot of ground -- mostly driver updates.
> 
> Some highlights include some math algorithm additions to lib/ from
> the Broadcom team, sizeable update series for rt2x00 and rtlwifi,
> the usual iwlagn and ath9k updates from the Intel and Atheros teams
> (and a few other folks), some more mwifiex updates from the Marvell
> folks, and a bunch of bcma, ssb and b43 updates from Rafał Miłecki.
 ...
> The following changes since commit ffbc03bc75b39c7bd412e7cc6d2185c11b0ffedd:
> 
>   net: add needed interrupt.h (2011-06-08 00:15:34 -0700)
> 
> are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git for-davem

Pulled, thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [net-next 00/12][pull request] Intel Wired LAN Driver Update
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo

The following series is a (smaller) subset of the previous 40 patch
submission.  This subset only contains updates to e1000e, igb, igbvf
and rtnetlink.

e1000e: several cleanups and fixes, as well as bump the version
igb/igbvf/ixgbevf: bump driver version
rtnetlink: Compute and store minimum ifinfo dump size

The following are changes since commit ffbc03bc75b39c7bd412e7cc6d2185c11b0ffedd:
  net: add needed interrupt.h
and are available in the git repository at:
  master.kernel.org:/pub/scm/linux/kernel/git/jkirsher/net-next-2.6 master

Bruce Allan (8):
  e1000e: disable far-end loopback mode on ESB2
  e1000e: 82579 intermittently disabled during S0->Sx
  e1000e: log when swflag is cleared unexpectedly on ICH/PCH devices
  e1000e: do not schedule the Tx queue until ready
  e1000e: access multiple PHY registers on same page at the same time
  e1000e: Clear host wakeup bit on 82577/8 without touching PHY page
    800
  e1000e: remove redundant reverse dependency on CRC32
  e1000e: update driver version

Carolyn Wyborny (1):
  igb: Change version to remove number after -k in kernel versions.

Greg Rose (2):
  rtnetlink: Compute and store minimum ifinfo dump size
  ixgbevf: Update the driver string

Williams, Mitch A (1):
  igbvf: update version number

 drivers/net/Kconfig                  |    1 -
 drivers/net/e1000e/e1000.h           |   42 +++--
 drivers/net/e1000e/es2lan.c          |    8 +
 drivers/net/e1000e/hw.h              |   21 ++-
 drivers/net/e1000e/ich8lan.c         |  191 ++++++++++++++-----
 drivers/net/e1000e/netdev.c          |  131 +++++--------
 drivers/net/e1000e/phy.c             |  352 ++++++++++++++++++++++------------
 drivers/net/igb/igb_main.c           |    3 +-
 drivers/net/igbvf/netdev.c           |    2 +-
 drivers/net/ixgbevf/ixgbevf_main.c   |    2 +-
 include/linux/netlink.h              |    6 +-
 include/net/rtnetlink.h              |    7 +-
 net/bridge/br_netlink.c              |   15 +-
 net/core/fib_rules.c                 |    6 +-
 net/core/neighbour.c                 |   11 +-
 net/core/rtnetlink.c                 |   60 +++++-
 net/dcb/dcbnl.c                      |    4 +-
 net/decnet/dn_dev.c                  |    6 +-
 net/decnet/dn_fib.c                  |    4 +-
 net/decnet/dn_route.c                |    5 +-
 net/ipv4/devinet.c                   |    6 +-
 net/ipv4/fib_frontend.c              |    6 +-
 net/ipv4/inet_diag.c                 |    2 +-
 net/ipv4/ipmr.c                      |    3 +-
 net/ipv4/route.c                     |    2 +-
 net/ipv6/addrconf.c                  |   16 +-
 net/ipv6/addrlabel.c                 |    9 +-
 net/ipv6/ip6_fib.c                   |    3 +-
 net/ipv6/ip6mr.c                     |    3 +-
 net/ipv6/route.c                     |    6 +-
 net/netfilter/ipset/ip_set_core.c    |    2 +-
 net/netfilter/nf_conntrack_netlink.c |    4 +-
 net/netlink/af_netlink.c             |   17 +-
 net/netlink/genetlink.c              |    2 +-
 net/phonet/pn_netlink.c              |   13 +-
 net/sched/act_api.c                  |    7 +-
 net/sched/cls_api.c                  |    6 +-
 net/sched/sch_api.c                  |   12 +-
 net/xfrm/xfrm_user.c                 |    3 +-
 39 files changed, 643 insertions(+), 356 deletions(-)

-- 
1.7.5.2


^ permalink raw reply

* [net-next 02/12] e1000e: 82579 intermittently disabled during S0->Sx
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

When repeatedly cycling Sx->S0 states with the network cable unplugged,
the 82579 PHY may not initialize as expected and may require a full power
cycle to recover functionality to the device.  Workaround this by testing
access of the PHY registers after resuming; if that returns unexpected
results toggle the LANPHYPC signal to power cycle the PHY.

This is implemented in the new function e1000_resume_workarounds_pchlan()
which calls another new function, e1000_toggle_lanphypc_value_ich8lan(),
which has been created to reduce code duplication (same functionality
required by a previous workaround).  Also, e1000e_disable_gig_wol_ich8lan
is now e1000_suspend_workarounds_ich8lan to better reflect what it does.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/e1000.h   |    3 +-
 drivers/net/e1000e/ich8lan.c |   86 +++++++++++++++++++++++++++++++++++------
 drivers/net/e1000e/netdev.c  |    5 ++-
 3 files changed, 79 insertions(+), 15 deletions(-)

diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
index 9549879..f8cd56a 100644
--- a/drivers/net/e1000e/e1000.h
+++ b/drivers/net/e1000e/e1000.h
@@ -533,7 +533,8 @@ extern void e1000e_set_kmrn_lock_loss_workaround_ich8lan(struct e1000_hw *hw,
 						 bool state);
 extern void e1000e_igp3_phy_powerdown_workaround_ich8lan(struct e1000_hw *hw);
 extern void e1000e_gig_downshift_workaround_ich8lan(struct e1000_hw *hw);
-extern void e1000e_disable_gig_wol_ich8lan(struct e1000_hw *hw);
+extern void e1000_suspend_workarounds_ich8lan(struct e1000_hw *hw);
+extern void e1000_resume_workarounds_pchlan(struct e1000_hw *hw);
 extern s32 e1000_configure_k1_ich8lan(struct e1000_hw *hw, bool k1_enable);
 extern s32 e1000_lv_jumbo_workaround_ich8lan(struct e1000_hw *hw, bool enable);
 extern void e1000_copy_rx_addrs_to_phy_ich8lan(struct e1000_hw *hw);
diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index 3369d1f..dcd5db5 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -275,6 +275,19 @@ static inline void __ew32flash(struct e1000_hw *hw, unsigned long reg, u32 val)
 #define ew16flash(reg,val)	__ew16flash(hw, (reg), (val))
 #define ew32flash(reg,val)	__ew32flash(hw, (reg), (val))
 
+static void e1000_toggle_lanphypc_value_ich8lan(struct e1000_hw *hw)
+{
+	u32 ctrl;
+
+	ctrl = er32(CTRL);
+	ctrl |= E1000_CTRL_LANPHYPC_OVERRIDE;
+	ctrl &= ~E1000_CTRL_LANPHYPC_VALUE;
+	ew32(CTRL, ctrl);
+	udelay(10);
+	ctrl &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
+	ew32(CTRL, ctrl);
+}
+
 /**
  *  e1000_init_phy_params_pchlan - Initialize PHY function pointers
  *  @hw: pointer to the HW structure
@@ -284,7 +297,7 @@ static inline void __ew32flash(struct e1000_hw *hw, unsigned long reg, u32 val)
 static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 {
 	struct e1000_phy_info *phy = &hw->phy;
-	u32 ctrl, fwsm;
+	u32 fwsm;
 	s32 ret_val = 0;
 
 	phy->addr                     = 1;
@@ -308,13 +321,7 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	 */
 	fwsm = er32(FWSM);
 	if (!(fwsm & E1000_ICH_FWSM_FW_VALID) && !e1000_check_reset_block(hw)) {
-		ctrl = er32(CTRL);
-		ctrl |= E1000_CTRL_LANPHYPC_OVERRIDE;
-		ctrl &= ~E1000_CTRL_LANPHYPC_VALUE;
-		ew32(CTRL, ctrl);
-		udelay(10);
-		ctrl &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
-		ew32(CTRL, ctrl);
+		e1000_toggle_lanphypc_value_ich8lan(hw);
 		msleep(50);
 
 		/*
@@ -3586,17 +3593,16 @@ void e1000e_gig_downshift_workaround_ich8lan(struct e1000_hw *hw)
 }
 
 /**
- *  e1000e_disable_gig_wol_ich8lan - disable gig during WoL
+ *  e1000_suspend_workarounds_ich8lan - workarounds needed during S0->Sx
  *  @hw: pointer to the HW structure
  *
  *  During S0 to Sx transition, it is possible the link remains at gig
  *  instead of negotiating to a lower speed.  Before going to Sx, set
  *  'LPLU Enabled' and 'Gig Disable' to force link speed negotiation
- *  to a lower speed.
- *
- *  Should only be called for applicable parts.
+ *  to a lower speed.  For PCH and newer parts, the OEM bits PHY register
+ *  (LED, GbE disable and LPLU configurations) also needs to be written.
  **/
-void e1000e_disable_gig_wol_ich8lan(struct e1000_hw *hw)
+void e1000_suspend_workarounds_ich8lan(struct e1000_hw *hw)
 {
 	u32 phy_ctrl;
 	s32 ret_val;
@@ -3616,6 +3622,60 @@ void e1000e_disable_gig_wol_ich8lan(struct e1000_hw *hw)
 }
 
 /**
+ *  e1000_resume_workarounds_pchlan - workarounds needed during Sx->S0
+ *  @hw: pointer to the HW structure
+ *
+ *  During Sx to S0 transitions on non-managed devices or managed devices
+ *  on which PHY resets are not blocked, if the PHY registers cannot be
+ *  accessed properly by the s/w toggle the LANPHYPC value to power cycle
+ *  the PHY.
+ **/
+void e1000_resume_workarounds_pchlan(struct e1000_hw *hw)
+{
+	u32 fwsm;
+
+	if (hw->mac.type != e1000_pch2lan)
+		return;
+
+	fwsm = er32(FWSM);
+	if (!(fwsm & E1000_ICH_FWSM_FW_VALID) || !e1000_check_reset_block(hw)) {
+		u16 phy_id1, phy_id2;
+		s32 ret_val;
+
+		ret_val = hw->phy.ops.acquire(hw);
+		if (ret_val) {
+			e_dbg("Failed to acquire PHY semaphore in resume\n");
+			return;
+		}
+
+		/* Test access to the PHY registers by reading the ID regs */
+		ret_val = hw->phy.ops.read_reg_locked(hw, PHY_ID1, &phy_id1);
+		if (ret_val)
+			goto release;
+		ret_val = hw->phy.ops.read_reg_locked(hw, PHY_ID2, &phy_id2);
+		if (ret_val)
+			goto release;
+
+		if (hw->phy.id == ((u32)(phy_id1 << 16) |
+				   (u32)(phy_id2 & PHY_REVISION_MASK)))
+			goto release;
+
+		e1000_toggle_lanphypc_value_ich8lan(hw);
+
+		hw->phy.ops.release(hw);
+		msleep(50);
+		e1000_phy_hw_reset(hw);
+		msleep(50);
+		return;
+	}
+
+release:
+	hw->phy.ops.release(hw);
+
+	return;
+}
+
+/**
  *  e1000_cleanup_led_ich8lan - Restore the default LED operation
  *  @hw: pointer to the HW structure
  *
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 888bd9c..c4a23c7 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -5278,7 +5278,7 @@ static int __e1000_shutdown(struct pci_dev *pdev, bool *enable_wake,
 		}
 
 		if (adapter->flags & FLAG_IS_ICH)
-			e1000e_disable_gig_wol_ich8lan(&adapter->hw);
+			e1000_suspend_workarounds_ich8lan(&adapter->hw);
 
 		/* Allow time for pending master requests to run */
 		e1000e_disable_pcie_master(&adapter->hw);
@@ -5429,6 +5429,9 @@ static int __e1000_resume(struct pci_dev *pdev)
 			return err;
 	}
 
+	if (hw->mac.type == e1000_pch2lan)
+		e1000_resume_workarounds_pchlan(&adapter->hw);
+
 	e1000e_power_up_phy(adapter);
 
 	/* report the system wakeup cause from S3/S4 */
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 01/12] e1000e: disable far-end loopback mode on ESB2
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

The ESB2 LAN includes a debug feature that enables far-end loopback (FELB)
of the SerDes/Kumeran interface.  This feature is activated when receiving
a sequence of symbols that includes a reserved codeword.  On a perfect
link, FELB would never be activated.  In the presence of bit errors, there
is a very small, but non-zero, probability of FELB being activated.

If the FELB is activated, the SerDes link becomes non-functional and must
be reset.  It could also corrupt the switching tables in the switch since
the ESB2 is transmitting packets with a different source MAC address.

This patch disables the FELB feature.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/es2lan.c |    8 ++++++++
 drivers/net/e1000e/hw.h     |    1 +
 2 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/drivers/net/e1000e/es2lan.c b/drivers/net/e1000e/es2lan.c
index f4bbeb2..c0ecb2d 100644
--- a/drivers/net/e1000e/es2lan.c
+++ b/drivers/net/e1000e/es2lan.c
@@ -836,6 +836,7 @@ static s32 e1000_init_hw_80003es2lan(struct e1000_hw *hw)
 	struct e1000_mac_info *mac = &hw->mac;
 	u32 reg_data;
 	s32 ret_val;
+	u16 kum_reg_data;
 	u16 i;
 
 	e1000_initialize_hw_bits_80003es2lan(hw);
@@ -861,6 +862,13 @@ static s32 e1000_init_hw_80003es2lan(struct e1000_hw *hw)
 	/* Setup link and flow control */
 	ret_val = e1000e_setup_link(hw);
 
+	/* Disable IBIST slave mode (far-end loopback) */
+	e1000_read_kmrn_reg_80003es2lan(hw, E1000_KMRNCTRLSTA_INBAND_PARAM,
+					&kum_reg_data);
+	kum_reg_data |= E1000_KMRNCTRLSTA_IBIST_DISABLE;
+	e1000_write_kmrn_reg_80003es2lan(hw, E1000_KMRNCTRLSTA_INBAND_PARAM,
+					 kum_reg_data);
+
 	/* Set the transmit descriptor write-back policy */
 	reg_data = er32(TXDCTL(0));
 	reg_data = (reg_data & ~E1000_TXDCTL_WTHRESH) |
diff --git a/drivers/net/e1000e/hw.h b/drivers/net/e1000e/hw.h
index 6c2fa83..dde584f 100644
--- a/drivers/net/e1000e/hw.h
+++ b/drivers/net/e1000e/hw.h
@@ -312,6 +312,7 @@ enum e1e_registers {
 #define E1000_KMRNCTRLSTA_DIAG_OFFSET	0x3    /* Kumeran Diagnostic */
 #define E1000_KMRNCTRLSTA_TIMEOUTS	0x4    /* Kumeran Timeouts */
 #define E1000_KMRNCTRLSTA_INBAND_PARAM	0x9    /* Kumeran InBand Parameters */
+#define E1000_KMRNCTRLSTA_IBIST_DISABLE	0x0200 /* Kumeran IBIST Disable */
 #define E1000_KMRNCTRLSTA_DIAG_NELPBK	0x1000 /* Nearend Loopback mode */
 #define E1000_KMRNCTRLSTA_K1_CONFIG	0x7
 #define E1000_KMRNCTRLSTA_K1_ENABLE	0x0002
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 04/12] e1000e: do not schedule the Tx queue until ready
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Start the Tx queue when the interface is brought up in e1000e_up() but do
not schedule the queue until link is up as detected in the watchdog task
which sets netif_carrier_on.

Also flush the descriptors and clean the Tx and Rx rings before resetting
the hardware when bringing the interface down otherwise there is a small
window where the watchdog task can be triggered with netif_carrier_off
and the Tx ring not yet empty which causes an additional and unnecessary
reset.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/netdev.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index c4a23c7..bc99458 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -3347,7 +3347,7 @@ int e1000e_up(struct e1000_adapter *adapter)
 		e1000_configure_msix(adapter);
 	e1000_irq_enable(adapter);
 
-	netif_wake_queue(adapter->netdev);
+	netif_start_queue(adapter->netdev);
 
 	/* fire a link change interrupt to start the watchdog */
 	if (adapter->msix_entries)
@@ -3414,17 +3414,16 @@ void e1000e_down(struct e1000_adapter *adapter)
 	e1000e_update_stats(adapter);
 	spin_unlock(&adapter->stats64_lock);
 
+	e1000e_flush_descriptors(adapter);
+	e1000_clean_tx_ring(adapter);
+	e1000_clean_rx_ring(adapter);
+
 	adapter->link_speed = 0;
 	adapter->link_duplex = 0;
 
 	if (!pci_channel_offline(adapter->pdev))
 		e1000e_reset(adapter);
 
-	e1000e_flush_descriptors(adapter);
-
-	e1000_clean_tx_ring(adapter);
-	e1000_clean_rx_ring(adapter);
-
 	/*
 	 * TODO: for power management, we could drop the link and
 	 * pci_disable_device here.
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 03/12] e1000e: log when swflag is cleared unexpectedly on ICH/PCH devices
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Since EXTCNF_CTRL.SWFLAG (used in the ownership arbitration of shared
resources, e.g. the PHY shared between the s/w, f/w, and h/w clients)
can be cleared by any of those clients, log a debug message when
software attempts to clear it and it is already cleared unexpectedly.
And since the swflag is cleared by a hardware reset, the driver does
not need to do that, but the mutex acquired when the bit is set must
still be cleared.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/ich8lan.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index dcd5db5..dddd0b3 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -889,8 +889,13 @@ static void e1000_release_swflag_ich8lan(struct e1000_hw *hw)
 	u32 extcnf_ctrl;
 
 	extcnf_ctrl = er32(EXTCNF_CTRL);
-	extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG;
-	ew32(EXTCNF_CTRL, extcnf_ctrl);
+
+	if (extcnf_ctrl & E1000_EXTCNF_CTRL_SWFLAG) {
+		extcnf_ctrl &= ~E1000_EXTCNF_CTRL_SWFLAG;
+		ew32(EXTCNF_CTRL, extcnf_ctrl);
+	} else {
+		e_dbg("Semaphore unexpectedly released by sw/fw/hw\n");
+	}
 
 	mutex_unlock(&swflag_mutex);
 }
@@ -3066,7 +3071,7 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw)
 	msleep(20);
 
 	if (!ret_val)
-		e1000_release_swflag_ich8lan(hw);
+		mutex_unlock(&swflag_mutex);
 
 	if (ctrl & E1000_CTRL_PHY_RST) {
 		ret_val = hw->phy.ops.get_cfg_done(hw);
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 06/12] e1000e: Clear host wakeup bit on 82577/8 without touching PHY page 800
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

The Host Wakeup Active bit in the PHY Port General Configuration register
(page 769 register 17) must be cleared after every PHY reset to prevent an
unexpected wake signal from the PHY. Originally, this was accomplished by
simply reading the PHY Wakeup Control register on page 800 which clears the
Host Wakeup Active bit as a side-effect. Unfortunately, a hardware bug on
the 82577 and 82578 PHY can cause unexpected behavior when registers on
page 800 are accessed while in gigabit mode.

This patch changes the remaining instances when the Host Wakeup Active bit
needs to be cleared while possibly in gigabit mode by accessing the Port
General Configuration register directly instead of accessing any register
on page 800.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/e1000.h   |    1 +
 drivers/net/e1000e/ich8lan.c |   24 +++++++++++++-----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
index 2c05b4f..c1e7f94 100644
--- a/drivers/net/e1000e/e1000.h
+++ b/drivers/net/e1000e/e1000.h
@@ -104,6 +104,7 @@ struct e1000_info;
 	 (((reg) & ~MAX_PHY_REG_ADDRESS) << (PHY_UPPER_SHIFT - PHY_PAGE_SHIFT)))
 
 /* PHY Wakeup Registers and defines */
+#define BM_PORT_GEN_CFG PHY_REG(BM_PORT_CTRL_PAGE, 17)
 #define BM_RCTL         PHY_REG(BM_WUC_PAGE, 0)
 #define BM_WUC          PHY_REG(BM_WUC_PAGE, 1)
 #define BM_WUFC         PHY_REG(BM_WUC_PAGE, 2)
diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index 1ede6e0..c175212 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -1391,14 +1391,11 @@ static s32 e1000_hv_phy_workarounds_ich8lan(struct e1000_hw *hw)
 	ret_val = hw->phy.ops.acquire(hw);
 	if (ret_val)
 		goto out;
-	ret_val = hw->phy.ops.read_reg_locked(hw,
-	                                      PHY_REG(BM_PORT_CTRL_PAGE, 17),
-	                                      &phy_data);
+	ret_val = hw->phy.ops.read_reg_locked(hw, BM_PORT_GEN_CFG, &phy_data);
 	if (ret_val)
 		goto release;
-	ret_val = hw->phy.ops.write_reg_locked(hw,
-	                                       PHY_REG(BM_PORT_CTRL_PAGE, 17),
-	                                       phy_data & 0x00FF);
+	ret_val = hw->phy.ops.write_reg_locked(hw, BM_PORT_GEN_CFG,
+					       phy_data & 0x00FF);
 release:
 	hw->phy.ops.release(hw);
 out:
@@ -1760,9 +1757,12 @@ static s32 e1000_post_phy_reset_ich8lan(struct e1000_hw *hw)
 		break;
 	}
 
-	/* Dummy read to clear the phy wakeup bit after lcd reset */
-	if (hw->mac.type >= e1000_pchlan)
-		e1e_rphy(hw, BM_WUC, &reg);
+	/* Clear the host wakeup bit after lcd reset */
+	if (hw->mac.type >= e1000_pchlan) {
+		e1e_rphy(hw, BM_PORT_GEN_CFG, &reg);
+		reg &= ~BM_WUC_HOST_WU_BIT;
+		e1e_wphy(hw, BM_PORT_GEN_CFG, reg);
+	}
 
 	/* Configure the LCD with the extended configuration region in NVM */
 	ret_val = e1000_sw_lcd_config_ich8lan(hw);
@@ -3161,11 +3161,13 @@ static s32 e1000_init_hw_ich8lan(struct e1000_hw *hw)
 
 	/*
 	 * The 82578 Rx buffer will stall if wakeup is enabled in host and
-	 * the ME.  Reading the BM_WUC register will clear the host wakeup bit.
+	 * the ME.  Disable wakeup by clearing the host wakeup bit.
 	 * Reset the phy after disabling host wakeup to reset the Rx buffer.
 	 */
 	if (hw->phy.type == e1000_phy_82578) {
-		e1e_rphy(hw, BM_WUC, &i);
+		e1e_rphy(hw, BM_PORT_GEN_CFG, &i);
+		i &= ~BM_WUC_HOST_WU_BIT;
+		e1e_wphy(hw, BM_PORT_GEN_CFG, i);
 		ret_val = e1000_phy_hw_reset_ich8lan(hw);
 		if (ret_val)
 			return ret_val;
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 07/12] e1000e: remove redundant reverse dependency on CRC32
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Commit 5d03078a6804bf4c7f943c5b68bef80468c0717f added a redundant 'select
CRC32'; remove it.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/Kconfig |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 19f04a3..fa0ea6d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2115,7 +2115,6 @@ config E1000
 
 config E1000E
 	tristate "Intel(R) PRO/1000 PCI-Express Gigabit Ethernet support"
-	select CRC32
 	depends on PCI && (!SPARC32 || BROKEN)
 	select CRC32
 	---help---
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 05/12] e1000e: access multiple PHY registers on same page at the same time
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Doing a PHY page select can take a long time, relatively speaking. This
can cause a significant delay when updating a number of PHY registers on
the same page by unnecessarily setting the page for each PHY access. For
example when going to Sx, all the PHY wakeup registers (WUC, RAR[], MTA[],
SHRAR[], IP4AT[], IP6AT[], etc.) on 82577/8/9 need to be updated which
takes a long time which can cause issues when suspending.

This patch introduces new PHY ops function pointers to allow callers to
set the page directly and do any number of PHY accesses on that page.
This feature is currently only implemented for 82577, 82578 and 82579
PHYs for both the normally addressed registers as well as the special-
case addressing of the PHY wakeup registers on page 800. For the latter
registers, the existing function for accessing the wakeup registers has
been divided up into three- 1) enable access to the wakeup register page,
2) perform the register access and 3) disable access to the wakeup register
page. The two functions that enable/disable access to the wakeup register
page are necessarily available to the caller so that the caller can restore
the value of the Port Control (a.k.a. Wakeup Enable) register after the
wakeup register accesses are done.

All instances of writing to multiple PHY registers on the same page are
updated to use this new method and to acquire any PHY locking mechanism
before setting the page and performing the register accesses, and release
the locking mechanism afterward.

Some affiliated magic number cleanup is done as well.

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/e1000.h   |   38 +++--
 drivers/net/e1000e/hw.h      |   20 +++-
 drivers/net/e1000e/ich8lan.c |   70 ++++++---
 drivers/net/e1000e/netdev.c  |  111 +++++--------
 drivers/net/e1000e/phy.c     |  352 +++++++++++++++++++++++++++---------------
 5 files changed, 366 insertions(+), 225 deletions(-)

diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
index f8cd56a..2c05b4f 100644
--- a/drivers/net/e1000e/e1000.h
+++ b/drivers/net/e1000e/e1000.h
@@ -122,20 +122,21 @@ struct e1000_info;
 #define BM_RCTL_PMCF          0x0040          /* Pass MAC Control Frames */
 #define BM_RCTL_RFCE          0x0080          /* Rx Flow Control Enable */
 
-#define HV_SCC_UPPER		PHY_REG(778, 16) /* Single Collision Count */
-#define HV_SCC_LOWER		PHY_REG(778, 17)
-#define HV_ECOL_UPPER		PHY_REG(778, 18) /* Excessive Collision Count */
-#define HV_ECOL_LOWER		PHY_REG(778, 19)
-#define HV_MCC_UPPER		PHY_REG(778, 20) /* Multiple Collision Count */
-#define HV_MCC_LOWER		PHY_REG(778, 21)
-#define HV_LATECOL_UPPER	PHY_REG(778, 23) /* Late Collision Count */
-#define HV_LATECOL_LOWER	PHY_REG(778, 24)
-#define HV_COLC_UPPER		PHY_REG(778, 25) /* Collision Count */
-#define HV_COLC_LOWER		PHY_REG(778, 26)
-#define HV_DC_UPPER		PHY_REG(778, 27) /* Defer Count */
-#define HV_DC_LOWER		PHY_REG(778, 28)
-#define HV_TNCRS_UPPER		PHY_REG(778, 29) /* Transmit with no CRS */
-#define HV_TNCRS_LOWER		PHY_REG(778, 30)
+#define HV_STATS_PAGE	778
+#define HV_SCC_UPPER	PHY_REG(HV_STATS_PAGE, 16) /* Single Collision Count */
+#define HV_SCC_LOWER	PHY_REG(HV_STATS_PAGE, 17)
+#define HV_ECOL_UPPER	PHY_REG(HV_STATS_PAGE, 18) /* Excessive Coll. Count */
+#define HV_ECOL_LOWER	PHY_REG(HV_STATS_PAGE, 19)
+#define HV_MCC_UPPER	PHY_REG(HV_STATS_PAGE, 20) /* Multiple Coll. Count */
+#define HV_MCC_LOWER	PHY_REG(HV_STATS_PAGE, 21)
+#define HV_LATECOL_UPPER PHY_REG(HV_STATS_PAGE, 23) /* Late Collision Count */
+#define HV_LATECOL_LOWER PHY_REG(HV_STATS_PAGE, 24)
+#define HV_COLC_UPPER	PHY_REG(HV_STATS_PAGE, 25) /* Collision Count */
+#define HV_COLC_LOWER	PHY_REG(HV_STATS_PAGE, 26)
+#define HV_DC_UPPER	PHY_REG(HV_STATS_PAGE, 27) /* Defer Count */
+#define HV_DC_LOWER	PHY_REG(HV_STATS_PAGE, 28)
+#define HV_TNCRS_UPPER	PHY_REG(HV_STATS_PAGE, 29) /* Transmit with no CRS */
+#define HV_TNCRS_LOWER	PHY_REG(HV_STATS_PAGE, 30)
 
 #define E1000_FCRTV_PCH     0x05F40 /* PCH Flow Control Refresh Timer Value */
 
@@ -585,6 +586,7 @@ extern s32 e1000e_check_reset_block_generic(struct e1000_hw *hw);
 extern s32 e1000e_phy_force_speed_duplex_igp(struct e1000_hw *hw);
 extern s32 e1000e_get_cable_length_igp_2(struct e1000_hw *hw);
 extern s32 e1000e_get_phy_info_igp(struct e1000_hw *hw);
+extern s32 e1000_set_page_igp(struct e1000_hw *hw, u16 page);
 extern s32 e1000e_read_phy_reg_igp(struct e1000_hw *hw, u32 offset, u16 *data);
 extern s32 e1000e_read_phy_reg_igp_locked(struct e1000_hw *hw, u32 offset,
                                           u16 *data);
@@ -605,6 +607,10 @@ extern enum e1000_phy_type e1000e_get_phy_type_from_id(u32 phy_id);
 extern s32 e1000e_determine_phy_address(struct e1000_hw *hw);
 extern s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data);
 extern s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data);
+extern s32 e1000_enable_phy_wakeup_reg_access_bm(struct e1000_hw *hw,
+						 u16 *phy_reg);
+extern s32 e1000_disable_phy_wakeup_reg_access_bm(struct e1000_hw *hw,
+						  u16 *phy_reg);
 extern s32 e1000e_read_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 *data);
 extern s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data);
 extern void e1000e_phy_force_speed_duplex_setup(struct e1000_hw *hw, u16 *phy_ctrl);
@@ -625,9 +631,13 @@ extern s32 e1000e_check_downshift(struct e1000_hw *hw);
 extern s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data);
 extern s32 e1000_read_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset,
                                         u16 *data);
+extern s32 e1000_read_phy_reg_page_hv(struct e1000_hw *hw, u32 offset,
+				      u16 *data);
 extern s32 e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data);
 extern s32 e1000_write_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset,
                                          u16 data);
+extern s32 e1000_write_phy_reg_page_hv(struct e1000_hw *hw, u32 offset,
+				       u16 data);
 extern s32 e1000_link_stall_workaround_hv(struct e1000_hw *hw);
 extern s32 e1000_copper_link_setup_82577(struct e1000_hw *hw);
 extern s32 e1000_check_polarity_82577(struct e1000_hw *hw);
diff --git a/drivers/net/e1000e/hw.h b/drivers/net/e1000e/hw.h
index dde584f..2967039 100644
--- a/drivers/net/e1000e/hw.h
+++ b/drivers/net/e1000e/hw.h
@@ -246,6 +246,7 @@ enum e1e_registers {
 #define BM_WUC_ENABLE_REG		17
 #define BM_WUC_ENABLE_BIT		(1 << 2)
 #define BM_WUC_HOST_WU_BIT		(1 << 4)
+#define BM_WUC_ME_WU_BIT		(1 << 5)
 
 #define BM_WUC	PHY_REG(BM_WUC_PAGE, 1)
 #define BM_WUFC PHY_REG(BM_WUC_PAGE, 2)
@@ -778,7 +779,21 @@ struct e1000_mac_operations {
 	s32  (*read_mac_addr)(struct e1000_hw *);
 };
 
-/* Function pointers for the PHY. */
+/*
+ * When to use various PHY register access functions:
+ *
+ *                 Func   Caller
+ *   Function      Does   Does    When to use
+ *   ~~~~~~~~~~~~  ~~~~~  ~~~~~~  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *   X_reg         L,P,A  n/a     for simple PHY reg accesses
+ *   X_reg_locked  P,A    L       for multiple accesses of different regs
+ *                                on different pages
+ *   X_reg_page    A      L,P     for multiple accesses of different regs
+ *                                on the same page
+ *
+ * Where X=[read|write], L=locking, P=sets page, A=register access
+ *
+ */
 struct e1000_phy_operations {
 	s32  (*acquire)(struct e1000_hw *);
 	s32  (*cfg_on_link_up)(struct e1000_hw *);
@@ -789,14 +804,17 @@ struct e1000_phy_operations {
 	s32  (*get_cfg_done)(struct e1000_hw *hw);
 	s32  (*get_cable_length)(struct e1000_hw *);
 	s32  (*get_info)(struct e1000_hw *);
+	s32  (*set_page)(struct e1000_hw *, u16);
 	s32  (*read_reg)(struct e1000_hw *, u32, u16 *);
 	s32  (*read_reg_locked)(struct e1000_hw *, u32, u16 *);
+	s32  (*read_reg_page)(struct e1000_hw *, u32, u16 *);
 	void (*release)(struct e1000_hw *);
 	s32  (*reset)(struct e1000_hw *);
 	s32  (*set_d0_lplu_state)(struct e1000_hw *, bool);
 	s32  (*set_d3_lplu_state)(struct e1000_hw *, bool);
 	s32  (*write_reg)(struct e1000_hw *, u32, u16);
 	s32  (*write_reg_locked)(struct e1000_hw *, u32, u16);
+	s32  (*write_reg_page)(struct e1000_hw *, u32, u16);
 	void (*power_up)(struct e1000_hw *);
 	void (*power_down)(struct e1000_hw *);
 };
diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index dddd0b3..1ede6e0 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -303,12 +303,15 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->addr                     = 1;
 	phy->reset_delay_us           = 100;
 
+	phy->ops.set_page             = e1000_set_page_igp;
 	phy->ops.read_reg             = e1000_read_phy_reg_hv;
 	phy->ops.read_reg_locked      = e1000_read_phy_reg_hv_locked;
+	phy->ops.read_reg_page        = e1000_read_phy_reg_page_hv;
 	phy->ops.set_d0_lplu_state    = e1000_set_lplu_state_pchlan;
 	phy->ops.set_d3_lplu_state    = e1000_set_lplu_state_pchlan;
 	phy->ops.write_reg            = e1000_write_phy_reg_hv;
 	phy->ops.write_reg_locked     = e1000_write_phy_reg_hv_locked;
+	phy->ops.write_reg_page       = e1000_write_phy_reg_page_hv;
 	phy->ops.power_up             = e1000_power_up_phy_copper;
 	phy->ops.power_down           = e1000_power_down_phy_copper_ich8lan;
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;
@@ -1409,17 +1412,36 @@ out:
 void e1000_copy_rx_addrs_to_phy_ich8lan(struct e1000_hw *hw)
 {
 	u32 mac_reg;
-	u16 i;
+	u16 i, phy_reg = 0;
+	s32 ret_val;
+
+	ret_val = hw->phy.ops.acquire(hw);
+	if (ret_val)
+		return;
+	ret_val = e1000_enable_phy_wakeup_reg_access_bm(hw, &phy_reg);
+	if (ret_val)
+		goto release;
 
 	/* Copy both RAL/H (rar_entry_count) and SHRAL/H (+4) to PHY */
 	for (i = 0; i < (hw->mac.rar_entry_count + 4); i++) {
 		mac_reg = er32(RAL(i));
-		e1e_wphy(hw, BM_RAR_L(i), (u16)(mac_reg & 0xFFFF));
-		e1e_wphy(hw, BM_RAR_M(i), (u16)((mac_reg >> 16) & 0xFFFF));
+		hw->phy.ops.write_reg_page(hw, BM_RAR_L(i),
+					   (u16)(mac_reg & 0xFFFF));
+		hw->phy.ops.write_reg_page(hw, BM_RAR_M(i),
+					   (u16)((mac_reg >> 16) & 0xFFFF));
+
 		mac_reg = er32(RAH(i));
-		e1e_wphy(hw, BM_RAR_H(i), (u16)(mac_reg & 0xFFFF));
-		e1e_wphy(hw, BM_RAR_CTRL(i), (u16)((mac_reg >> 16) & 0x8000));
+		hw->phy.ops.write_reg_page(hw, BM_RAR_H(i),
+					   (u16)(mac_reg & 0xFFFF));
+		hw->phy.ops.write_reg_page(hw, BM_RAR_CTRL(i),
+					   (u16)((mac_reg & E1000_RAH_AV)
+						 >> 16));
 	}
+
+	e1000_disable_phy_wakeup_reg_access_bm(hw, &phy_reg);
+
+release:
+	hw->phy.ops.release(hw);
 }
 
 /**
@@ -3897,6 +3919,7 @@ static void e1000_power_down_phy_copper_ich8lan(struct e1000_hw *hw)
 static void e1000_clear_hw_cntrs_ich8lan(struct e1000_hw *hw)
 {
 	u16 phy_data;
+	s32 ret_val;
 
 	e1000e_clear_hw_cntrs_base(hw);
 
@@ -3918,20 +3941,29 @@ static void e1000_clear_hw_cntrs_ich8lan(struct e1000_hw *hw)
 	if ((hw->phy.type == e1000_phy_82578) ||
 	    (hw->phy.type == e1000_phy_82579) ||
 	    (hw->phy.type == e1000_phy_82577)) {
-		e1e_rphy(hw, HV_SCC_UPPER, &phy_data);
-		e1e_rphy(hw, HV_SCC_LOWER, &phy_data);
-		e1e_rphy(hw, HV_ECOL_UPPER, &phy_data);
-		e1e_rphy(hw, HV_ECOL_LOWER, &phy_data);
-		e1e_rphy(hw, HV_MCC_UPPER, &phy_data);
-		e1e_rphy(hw, HV_MCC_LOWER, &phy_data);
-		e1e_rphy(hw, HV_LATECOL_UPPER, &phy_data);
-		e1e_rphy(hw, HV_LATECOL_LOWER, &phy_data);
-		e1e_rphy(hw, HV_COLC_UPPER, &phy_data);
-		e1e_rphy(hw, HV_COLC_LOWER, &phy_data);
-		e1e_rphy(hw, HV_DC_UPPER, &phy_data);
-		e1e_rphy(hw, HV_DC_LOWER, &phy_data);
-		e1e_rphy(hw, HV_TNCRS_UPPER, &phy_data);
-		e1e_rphy(hw, HV_TNCRS_LOWER, &phy_data);
+		ret_val = hw->phy.ops.acquire(hw);
+		if (ret_val)
+			return;
+		ret_val = hw->phy.ops.set_page(hw,
+					       HV_STATS_PAGE << IGP_PAGE_SHIFT);
+		if (ret_val)
+			goto release;
+		hw->phy.ops.read_reg_page(hw, HV_SCC_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_SCC_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_ECOL_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_ECOL_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_MCC_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_MCC_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_LATECOL_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_LATECOL_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_COLC_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_COLC_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_DC_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_DC_LOWER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_TNCRS_UPPER, &phy_data);
+		hw->phy.ops.read_reg_page(hw, HV_TNCRS_LOWER, &phy_data);
+release:
+		hw->phy.ops.release(hw);
 	}
 }
 
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index bc99458..51cf715 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -3833,6 +3833,8 @@ static void e1000_update_phy_info(unsigned long data)
 /**
  * e1000e_update_phy_stats - Update the PHY statistics counters
  * @adapter: board private structure
+ *
+ * Read/clear the upper 16-bit PHY registers and read/accumulate lower
  **/
 static void e1000e_update_phy_stats(struct e1000_adapter *adapter)
 {
@@ -3844,89 +3846,61 @@ static void e1000e_update_phy_stats(struct e1000_adapter *adapter)
 	if (ret_val)
 		return;
 
-	hw->phy.addr = 1;
-
-#define HV_PHY_STATS_PAGE	778
 	/*
 	 * A page set is expensive so check if already on desired page.
 	 * If not, set to the page with the PHY status registers.
 	 */
+	hw->phy.addr = 1;
 	ret_val = e1000e_read_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT,
 					   &phy_data);
 	if (ret_val)
 		goto release;
-	if (phy_data != (HV_PHY_STATS_PAGE << IGP_PAGE_SHIFT)) {
-		ret_val = e1000e_write_phy_reg_mdic(hw,
-						    IGP01E1000_PHY_PAGE_SELECT,
-						    (HV_PHY_STATS_PAGE <<
-						     IGP_PAGE_SHIFT));
+	if (phy_data != (HV_STATS_PAGE << IGP_PAGE_SHIFT)) {
+		ret_val = hw->phy.ops.set_page(hw,
+					       HV_STATS_PAGE << IGP_PAGE_SHIFT);
 		if (ret_val)
 			goto release;
 	}
 
-	/* Read/clear the upper 16-bit registers and read/accumulate lower */
-
 	/* Single Collision Count */
-	e1000e_read_phy_reg_mdic(hw, HV_SCC_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_SCC_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_SCC_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_SCC_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.scc += phy_data;
 
 	/* Excessive Collision Count */
-	e1000e_read_phy_reg_mdic(hw, HV_ECOL_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_ECOL_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_ECOL_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_ECOL_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.ecol += phy_data;
 
 	/* Multiple Collision Count */
-	e1000e_read_phy_reg_mdic(hw, HV_MCC_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_MCC_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_MCC_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_MCC_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.mcc += phy_data;
 
 	/* Late Collision Count */
-	e1000e_read_phy_reg_mdic(hw, HV_LATECOL_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_LATECOL_LOWER &
-					   MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_LATECOL_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_LATECOL_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.latecol += phy_data;
 
 	/* Collision Count - also used for adaptive IFS */
-	e1000e_read_phy_reg_mdic(hw, HV_COLC_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_COLC_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_COLC_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_COLC_LOWER, &phy_data);
 	if (!ret_val)
 		hw->mac.collision_delta = phy_data;
 
 	/* Defer Count */
-	e1000e_read_phy_reg_mdic(hw, HV_DC_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_DC_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_DC_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_DC_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.dc += phy_data;
 
 	/* Transmit with no CRS */
-	e1000e_read_phy_reg_mdic(hw, HV_TNCRS_UPPER & MAX_PHY_REG_ADDRESS,
-				 &phy_data);
-	ret_val = e1000e_read_phy_reg_mdic(hw,
-					   HV_TNCRS_LOWER & MAX_PHY_REG_ADDRESS,
-					   &phy_data);
+	hw->phy.ops.read_reg_page(hw, HV_TNCRS_UPPER, &phy_data);
+	ret_val = hw->phy.ops.read_reg_page(hw, HV_TNCRS_LOWER, &phy_data);
 	if (!ret_val)
 		adapter->stats.tncrs += phy_data;
 
@@ -5154,21 +5128,34 @@ static int e1000_init_phy_wakeup(struct e1000_adapter *adapter, u32 wufc)
 {
 	struct e1000_hw *hw = &adapter->hw;
 	u32 i, mac_reg;
-	u16 phy_reg;
+	u16 phy_reg, wuc_enable;
 	int retval = 0;
 
 	/* copy MAC RARs to PHY RARs */
 	e1000_copy_rx_addrs_to_phy_ich8lan(hw);
 
-	/* copy MAC MTA to PHY MTA */
+	retval = hw->phy.ops.acquire(hw);
+	if (retval) {
+		e_err("Could not acquire PHY\n");
+		return retval;
+	}
+
+	/* Enable access to wakeup registers on and set page to BM_WUC_PAGE */
+	retval = e1000_enable_phy_wakeup_reg_access_bm(hw, &wuc_enable);
+	if (retval)
+		goto out;
+
+	/* copy MAC MTA to PHY MTA - only needed for pchlan */
 	for (i = 0; i < adapter->hw.mac.mta_reg_count; i++) {
 		mac_reg = E1000_READ_REG_ARRAY(hw, E1000_MTA, i);
-		e1e_wphy(hw, BM_MTA(i), (u16)(mac_reg & 0xFFFF));
-		e1e_wphy(hw, BM_MTA(i) + 1, (u16)((mac_reg >> 16) & 0xFFFF));
+		hw->phy.ops.write_reg_page(hw, BM_MTA(i),
+					   (u16)(mac_reg & 0xFFFF));
+		hw->phy.ops.write_reg_page(hw, BM_MTA(i) + 1,
+					   (u16)((mac_reg >> 16) & 0xFFFF));
 	}
 
 	/* configure PHY Rx Control register */
-	e1e_rphy(&adapter->hw, BM_RCTL, &phy_reg);
+	hw->phy.ops.read_reg_page(&adapter->hw, BM_RCTL, &phy_reg);
 	mac_reg = er32(RCTL);
 	if (mac_reg & E1000_RCTL_UPE)
 		phy_reg |= BM_RCTL_UPE;
@@ -5185,31 +5172,19 @@ static int e1000_init_phy_wakeup(struct e1000_adapter *adapter, u32 wufc)
 	mac_reg = er32(CTRL);
 	if (mac_reg & E1000_CTRL_RFCE)
 		phy_reg |= BM_RCTL_RFCE;
-	e1e_wphy(&adapter->hw, BM_RCTL, phy_reg);
+	hw->phy.ops.write_reg_page(&adapter->hw, BM_RCTL, phy_reg);
 
 	/* enable PHY wakeup in MAC register */
 	ew32(WUFC, wufc);
 	ew32(WUC, E1000_WUC_PHY_WAKE | E1000_WUC_PME_EN);
 
 	/* configure and enable PHY wakeup in PHY registers */
-	e1e_wphy(&adapter->hw, BM_WUFC, wufc);
-	e1e_wphy(&adapter->hw, BM_WUC, E1000_WUC_PME_EN);
+	hw->phy.ops.write_reg_page(&adapter->hw, BM_WUFC, wufc);
+	hw->phy.ops.write_reg_page(&adapter->hw, BM_WUC, E1000_WUC_PME_EN);
 
 	/* activate PHY wakeup */
-	retval = hw->phy.ops.acquire(hw);
-	if (retval) {
-		e_err("Could not acquire PHY\n");
-		return retval;
-	}
-	e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT,
-	                         (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT));
-	retval = e1000e_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, &phy_reg);
-	if (retval) {
-		e_err("Could not read PHY page 769\n");
-		goto out;
-	}
-	phy_reg |= BM_WUC_ENABLE_BIT | BM_WUC_HOST_WU_BIT;
-	retval = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg);
+	wuc_enable |= BM_WUC_ENABLE_BIT | BM_WUC_HOST_WU_BIT;
+	retval = e1000_disable_phy_wakeup_reg_access_bm(hw, &wuc_enable);
 	if (retval)
 		e_err("Could not set PHY Host Wakeup bit\n");
 out:
diff --git a/drivers/net/e1000e/phy.c b/drivers/net/e1000e/phy.c
index 484774c..2a6ee13 100644
--- a/drivers/net/e1000e/phy.c
+++ b/drivers/net/e1000e/phy.c
@@ -36,7 +36,7 @@ static s32 e1000_set_d0_lplu_state(struct e1000_hw *hw, bool active);
 static s32 e1000_wait_autoneg(struct e1000_hw *hw);
 static u32 e1000_get_phy_addr_for_bm_page(u32 page, u32 reg);
 static s32 e1000_access_phy_wakeup_reg_bm(struct e1000_hw *hw, u32 offset,
-					  u16 *data, bool read);
+					  u16 *data, bool read, bool page_set);
 static u32 e1000_get_phy_addr_for_hv_page(u32 page);
 static s32 e1000_access_phy_debug_regs_hv(struct e1000_hw *hw, u32 offset,
                                           u16 *data, bool read);
@@ -348,6 +348,24 @@ s32 e1000e_write_phy_reg_m88(struct e1000_hw *hw, u32 offset, u16 data)
 }
 
 /**
+ *  e1000_set_page_igp - Set page as on IGP-like PHY(s)
+ *  @hw: pointer to the HW structure
+ *  @page: page to set (shifted left when necessary)
+ *
+ *  Sets PHY page required for PHY register access.  Assumes semaphore is
+ *  already acquired.  Note, this function sets phy.addr to 1 so the caller
+ *  must set it appropriately (if necessary) after this function returns.
+ **/
+s32 e1000_set_page_igp(struct e1000_hw *hw, u16 page)
+{
+	e_dbg("Setting page 0x%x\n", page);
+
+	hw->phy.addr = 1;
+
+	return e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT, page);
+}
+
+/**
  *  __e1000e_read_phy_reg_igp - Read igp PHY register
  *  @hw: pointer to the HW structure
  *  @offset: register offset to be read
@@ -2418,7 +2436,7 @@ s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data)
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
 		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, &data,
-							 false);
+							 false, false);
 		goto out;
 	}
 
@@ -2477,7 +2495,7 @@ s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data)
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
 		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, data,
-							 true);
+							 true, false);
 		goto out;
 	}
 
@@ -2535,7 +2553,7 @@ s32 e1000e_read_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 *data)
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
 		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, data,
-							 true);
+							 true, false);
 		goto out;
 	}
 
@@ -2579,7 +2597,7 @@ s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data)
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
 		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, &data,
-							 false);
+							 false, false);
 		goto out;
 	}
 
@@ -2603,104 +2621,163 @@ out:
 }
 
 /**
- *  e1000_access_phy_wakeup_reg_bm - Read BM PHY wakeup register
+ *  e1000_enable_phy_wakeup_reg_access_bm - enable access to BM wakeup registers
  *  @hw: pointer to the HW structure
- *  @offset: register offset to be read or written
- *  @data: pointer to the data to read or write
- *  @read: determines if operation is read or write
+ *  @phy_reg: pointer to store original contents of BM_WUC_ENABLE_REG
  *
- *  Acquires semaphore, if necessary, then reads the PHY register at offset
- *  and storing the retrieved information in data.  Release any acquired
- *  semaphores before exiting. Note that procedure to read the wakeup
- *  registers are different. It works as such:
- *  1) Set page 769, register 17, bit 2 = 1
- *  2) Set page to 800 for host (801 if we were manageability)
- *  3) Write the address using the address opcode (0x11)
- *  4) Read or write the data using the data opcode (0x12)
- *  5) Restore 769_17.2 to its original value
- *
- *  Assumes semaphore already acquired.
+ *  Assumes semaphore already acquired and phy_reg points to a valid memory
+ *  address to store contents of the BM_WUC_ENABLE_REG register.
  **/
-static s32 e1000_access_phy_wakeup_reg_bm(struct e1000_hw *hw, u32 offset,
-					  u16 *data, bool read)
+s32 e1000_enable_phy_wakeup_reg_access_bm(struct e1000_hw *hw, u16 *phy_reg)
 {
 	s32 ret_val;
-	u16 reg = BM_PHY_REG_NUM(offset);
-	u16 phy_reg = 0;
+	u16 temp;
 
-	/* Gig must be disabled for MDIO accesses to page 800 */
-	if ((hw->mac.type == e1000_pchlan) &&
-	   (!(er32(PHY_CTRL) & E1000_PHY_CTRL_GBE_DISABLE)))
-		e_dbg("Attempting to access page 800 while gig enabled.\n");
-
-	/* All operations in this function are phy address 1 */
+	/* All page select, port ctrl and wakeup registers use phy address 1 */
 	hw->phy.addr = 1;
 
-	/* Set page 769 */
-	e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT,
-	                          (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT));
+	/* Select Port Control Registers page */
+	ret_val = e1000_set_page_igp(hw, (BM_PORT_CTRL_PAGE << IGP_PAGE_SHIFT));
+	if (ret_val) {
+		e_dbg("Could not set Port Control page\n");
+		goto out;
+	}
 
-	ret_val = e1000e_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, &phy_reg);
+	ret_val = e1000e_read_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg);
 	if (ret_val) {
-		e_dbg("Could not read PHY page 769\n");
+		e_dbg("Could not read PHY register %d.%d\n",
+		      BM_PORT_CTRL_PAGE, BM_WUC_ENABLE_REG);
 		goto out;
 	}
 
-	/* First clear bit 4 to avoid a power state change */
-	phy_reg &= ~(BM_WUC_HOST_WU_BIT);
-	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg);
+	/*
+	 * Enable both PHY wakeup mode and Wakeup register page writes.
+	 * Prevent a power state change by disabling ME and Host PHY wakeup.
+	 */
+	temp = *phy_reg;
+	temp |= BM_WUC_ENABLE_BIT;
+	temp &= ~(BM_WUC_ME_WU_BIT | BM_WUC_HOST_WU_BIT);
+
+	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, temp);
 	if (ret_val) {
-		e_dbg("Could not clear PHY page 769 bit 4\n");
+		e_dbg("Could not write PHY register %d.%d\n",
+		      BM_PORT_CTRL_PAGE, BM_WUC_ENABLE_REG);
 		goto out;
 	}
 
-	/* Write bit 2 = 1, and clear bit 4 to 769_17 */
-	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG,
-	                                    phy_reg | BM_WUC_ENABLE_BIT);
+	/* Select Host Wakeup Registers page */
+	ret_val = e1000_set_page_igp(hw, (BM_WUC_PAGE << IGP_PAGE_SHIFT));
+
+	/* caller now able to write registers on the Wakeup registers page */
+out:
+	return ret_val;
+}
+
+/**
+ *  e1000_disable_phy_wakeup_reg_access_bm - disable access to BM wakeup regs
+ *  @hw: pointer to the HW structure
+ *  @phy_reg: pointer to original contents of BM_WUC_ENABLE_REG
+ *
+ *  Restore BM_WUC_ENABLE_REG to its original value.
+ *
+ *  Assumes semaphore already acquired and *phy_reg is the contents of the
+ *  BM_WUC_ENABLE_REG before register(s) on BM_WUC_PAGE were accessed by
+ *  caller.
+ **/
+s32 e1000_disable_phy_wakeup_reg_access_bm(struct e1000_hw *hw, u16 *phy_reg)
+{
+	s32 ret_val = 0;
+
+	/* Select Port Control Registers page */
+	ret_val = e1000_set_page_igp(hw, (BM_PORT_CTRL_PAGE << IGP_PAGE_SHIFT));
 	if (ret_val) {
-		e_dbg("Could not write PHY page 769 bit 2\n");
+		e_dbg("Could not set Port Control page\n");
 		goto out;
 	}
 
-	/* Select page 800 */
-	ret_val = e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT,
-	                                    (BM_WUC_PAGE << IGP_PAGE_SHIFT));
+	/* Restore 769.17 to its original value */
+	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, *phy_reg);
+	if (ret_val)
+		e_dbg("Could not restore PHY register %d.%d\n",
+		      BM_PORT_CTRL_PAGE, BM_WUC_ENABLE_REG);
+out:
+	return ret_val;
+}
 
-	/* Write the page 800 offset value using opcode 0x11 */
+/**
+ *  e1000_access_phy_wakeup_reg_bm - Read/write BM PHY wakeup register
+ *  @hw: pointer to the HW structure
+ *  @offset: register offset to be read or written
+ *  @data: pointer to the data to read or write
+ *  @read: determines if operation is read or write
+ *  @page_set: BM_WUC_PAGE already set and access enabled
+ *
+ *  Read the PHY register at offset and store the retrieved information in
+ *  data, or write data to PHY register at offset.  Note the procedure to
+ *  access the PHY wakeup registers is different than reading the other PHY
+ *  registers. It works as such:
+ *  1) Set 769.17.2 (page 769, register 17, bit 2) = 1
+ *  2) Set page to 800 for host (801 if we were manageability)
+ *  3) Write the address using the address opcode (0x11)
+ *  4) Read or write the data using the data opcode (0x12)
+ *  5) Restore 769.17.2 to its original value
+ *
+ *  Steps 1 and 2 are done by e1000_enable_phy_wakeup_reg_access_bm() and
+ *  step 5 is done by e1000_disable_phy_wakeup_reg_access_bm().
+ *
+ *  Assumes semaphore is already acquired.  When page_set==true, assumes
+ *  the PHY page is set to BM_WUC_PAGE (i.e. a function in the call stack
+ *  is responsible for calls to e1000_[enable|disable]_phy_wakeup_reg_bm()).
+ **/
+static s32 e1000_access_phy_wakeup_reg_bm(struct e1000_hw *hw, u32 offset,
+					  u16 *data, bool read, bool page_set)
+{
+	s32 ret_val;
+	u16 reg = BM_PHY_REG_NUM(offset);
+	u16 page = BM_PHY_REG_PAGE(offset);
+	u16 phy_reg = 0;
+
+	/* Gig must be disabled for MDIO accesses to Host Wakeup reg page */
+	if ((hw->mac.type == e1000_pchlan) &&
+	    (!(er32(PHY_CTRL) & E1000_PHY_CTRL_GBE_DISABLE)))
+		e_dbg("Attempting to access page %d while gig enabled.\n",
+		      page);
+
+	if (!page_set) {
+		/* Enable access to PHY wakeup registers */
+		ret_val = e1000_enable_phy_wakeup_reg_access_bm(hw, &phy_reg);
+		if (ret_val) {
+			e_dbg("Could not enable PHY wakeup reg access\n");
+			goto out;
+		}
+	}
+
+	e_dbg("Accessing PHY page %d reg 0x%x\n", page, reg);
+
+	/* Write the Wakeup register page offset value using opcode 0x11 */
 	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ADDRESS_OPCODE, reg);
 	if (ret_val) {
-		e_dbg("Could not write address opcode to page 800\n");
+		e_dbg("Could not write address opcode to page %d\n", page);
 		goto out;
 	}
 
 	if (read) {
-	        /* Read the page 800 value using opcode 0x12 */
+		/* Read the Wakeup register page value using opcode 0x12 */
 		ret_val = e1000e_read_phy_reg_mdic(hw, BM_WUC_DATA_OPCODE,
 		                                   data);
 	} else {
-	        /* Write the page 800 value using opcode 0x12 */
+		/* Write the Wakeup register page value using opcode 0x12 */
 		ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_DATA_OPCODE,
 						    *data);
 	}
 
 	if (ret_val) {
-		e_dbg("Could not access data value from page 800\n");
+		e_dbg("Could not access PHY reg %d.%d\n", page, reg);
 		goto out;
 	}
 
-	/*
-	 * Restore 769_17.2 to its original value
-	 * Set page 769
-	 */
-	e1000e_write_phy_reg_mdic(hw, IGP01E1000_PHY_PAGE_SELECT,
-	                          (BM_WUC_ENABLE_PAGE << IGP_PAGE_SHIFT));
-
-	/* Clear 769_17.2 */
-	ret_val = e1000e_write_phy_reg_mdic(hw, BM_WUC_ENABLE_REG, phy_reg);
-	if (ret_val) {
-		e_dbg("Could not clear PHY page 769 bit 2\n");
-		goto out;
-	}
+	if (!page_set)
+		ret_val = e1000_disable_phy_wakeup_reg_access_bm(hw, &phy_reg);
 
 out:
 	return ret_val;
@@ -2792,11 +2869,12 @@ static s32 e1000_set_d0_lplu_state(struct e1000_hw *hw, bool active)
  *  semaphore before exiting.
  **/
 static s32 __e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data,
-                                   bool locked)
+				   bool locked, bool page_set)
 {
 	s32 ret_val;
 	u16 page = BM_PHY_REG_PAGE(offset);
 	u16 reg = BM_PHY_REG_NUM(offset);
+	u32 phy_addr = hw->phy.addr = e1000_get_phy_addr_for_hv_page(page);
 
 	if (!locked) {
 		ret_val = hw->phy.ops.acquire(hw);
@@ -2806,8 +2884,8 @@ static s32 __e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data,
 
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
-		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset,
-		                                         data, true);
+		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, data,
+							 true, page_set);
 		goto out;
 	}
 
@@ -2817,26 +2895,25 @@ static s32 __e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data,
 		goto out;
 	}
 
-	hw->phy.addr = e1000_get_phy_addr_for_hv_page(page);
-
-	if (page == HV_INTC_FC_PAGE_START)
-		page = 0;
+	if (!page_set) {
+		if (page == HV_INTC_FC_PAGE_START)
+			page = 0;
 
-	if (reg > MAX_PHY_MULTI_PAGE_REG) {
-		u32 phy_addr = hw->phy.addr;
+		if (reg > MAX_PHY_MULTI_PAGE_REG) {
+			/* Page is shifted left, PHY expects (page x 32) */
+			ret_val = e1000_set_page_igp(hw,
+						     (page << IGP_PAGE_SHIFT));
 
-		hw->phy.addr = 1;
-
-		/* Page is shifted left, PHY expects (page x 32) */
-		ret_val = e1000e_write_phy_reg_mdic(hw,
-					     IGP01E1000_PHY_PAGE_SELECT,
-					     (page << IGP_PAGE_SHIFT));
-		hw->phy.addr = phy_addr;
+			hw->phy.addr = phy_addr;
 
-		if (ret_val)
-			goto out;
+			if (ret_val)
+				goto out;
+		}
 	}
 
+	e_dbg("reading PHY page %d (or 0x%x shifted) reg 0x%x\n", page,
+	      page << IGP_PAGE_SHIFT, reg);
+
 	ret_val = e1000e_read_phy_reg_mdic(hw, MAX_PHY_REG_ADDRESS & reg,
 	                                  data);
 out:
@@ -2858,7 +2935,7 @@ out:
  **/
 s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data)
 {
-	return __e1000_read_phy_reg_hv(hw, offset, data, false);
+	return __e1000_read_phy_reg_hv(hw, offset, data, false, false);
 }
 
 /**
@@ -2872,7 +2949,21 @@ s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data)
  **/
 s32 e1000_read_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset, u16 *data)
 {
-	return __e1000_read_phy_reg_hv(hw, offset, data, true);
+	return __e1000_read_phy_reg_hv(hw, offset, data, true, false);
+}
+
+/**
+ *  e1000_read_phy_reg_page_hv - Read HV PHY register
+ *  @hw: pointer to the HW structure
+ *  @offset: register offset to write to
+ *  @data: data to write at register offset
+ *
+ *  Reads the PHY register at offset and stores the retrieved information
+ *  in data.  Assumes semaphore already acquired and page already set.
+ **/
+s32 e1000_read_phy_reg_page_hv(struct e1000_hw *hw, u32 offset, u16 *data)
+{
+	return __e1000_read_phy_reg_hv(hw, offset, data, true, true);
 }
 
 /**
@@ -2886,11 +2977,12 @@ s32 e1000_read_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset, u16 *data)
  *  at the offset.  Release any acquired semaphores before exiting.
  **/
 static s32 __e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data,
-                                    bool locked)
+				    bool locked, bool page_set)
 {
 	s32 ret_val;
 	u16 page = BM_PHY_REG_PAGE(offset);
 	u16 reg = BM_PHY_REG_NUM(offset);
+	u32 phy_addr = hw->phy.addr = e1000_get_phy_addr_for_hv_page(page);
 
 	if (!locked) {
 		ret_val = hw->phy.ops.acquire(hw);
@@ -2900,8 +2992,8 @@ static s32 __e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data,
 
 	/* Page 800 works differently than the rest so it has its own func */
 	if (page == BM_WUC_PAGE) {
-		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset,
-		                                         &data, false);
+		ret_val = e1000_access_phy_wakeup_reg_bm(hw, offset, &data,
+							 false, page_set);
 		goto out;
 	}
 
@@ -2911,42 +3003,41 @@ static s32 __e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data,
 		goto out;
 	}
 
-	hw->phy.addr = e1000_get_phy_addr_for_hv_page(page);
-
-	if (page == HV_INTC_FC_PAGE_START)
-		page = 0;
-
-	/*
-	 * Workaround MDIO accesses being disabled after entering IEEE Power
-	 * Down (whenever bit 11 of the PHY Control register is set)
-	 */
-	if ((hw->phy.type == e1000_phy_82578) &&
-	    (hw->phy.revision >= 1) &&
-	    (hw->phy.addr == 2) &&
-	    ((MAX_PHY_REG_ADDRESS & reg) == 0) &&
-	    (data & (1 << 11))) {
-		u16 data2 = 0x7EFF;
-		ret_val = e1000_access_phy_debug_regs_hv(hw, (1 << 6) | 0x3,
-		                                         &data2, false);
-		if (ret_val)
-			goto out;
-	}
+	if (!page_set) {
+		if (page == HV_INTC_FC_PAGE_START)
+			page = 0;
 
-	if (reg > MAX_PHY_MULTI_PAGE_REG) {
-		u32 phy_addr = hw->phy.addr;
+		/*
+		 * Workaround MDIO accesses being disabled after entering IEEE
+		 * Power Down (when bit 11 of the PHY Control register is set)
+		 */
+		if ((hw->phy.type == e1000_phy_82578) &&
+		    (hw->phy.revision >= 1) &&
+		    (hw->phy.addr == 2) &&
+		    ((MAX_PHY_REG_ADDRESS & reg) == 0) && (data & (1 << 11))) {
+			u16 data2 = 0x7EFF;
+			ret_val = e1000_access_phy_debug_regs_hv(hw,
+								 (1 << 6) | 0x3,
+								 &data2, false);
+			if (ret_val)
+				goto out;
+		}
 
-		hw->phy.addr = 1;
+		if (reg > MAX_PHY_MULTI_PAGE_REG) {
+			/* Page is shifted left, PHY expects (page x 32) */
+			ret_val = e1000_set_page_igp(hw,
+						     (page << IGP_PAGE_SHIFT));
 
-		/* Page is shifted left, PHY expects (page x 32) */
-		ret_val = e1000e_write_phy_reg_mdic(hw,
-					     IGP01E1000_PHY_PAGE_SELECT,
-					     (page << IGP_PAGE_SHIFT));
-		hw->phy.addr = phy_addr;
+			hw->phy.addr = phy_addr;
 
-		if (ret_val)
-			goto out;
+			if (ret_val)
+				goto out;
+		}
 	}
 
+	e_dbg("writing PHY page %d (or 0x%x shifted) reg 0x%x\n", page,
+	      page << IGP_PAGE_SHIFT, reg);
+
 	ret_val = e1000e_write_phy_reg_mdic(hw, MAX_PHY_REG_ADDRESS & reg,
 	                                  data);
 
@@ -2968,7 +3059,7 @@ out:
  **/
 s32 e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data)
 {
-	return __e1000_write_phy_reg_hv(hw, offset, data, false);
+	return __e1000_write_phy_reg_hv(hw, offset, data, false, false);
 }
 
 /**
@@ -2982,7 +3073,21 @@ s32 e1000_write_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 data)
  **/
 s32 e1000_write_phy_reg_hv_locked(struct e1000_hw *hw, u32 offset, u16 data)
 {
-	return __e1000_write_phy_reg_hv(hw, offset, data, true);
+	return __e1000_write_phy_reg_hv(hw, offset, data, true, false);
+}
+
+/**
+ *  e1000_write_phy_reg_page_hv - Write HV PHY register
+ *  @hw: pointer to the HW structure
+ *  @offset: register offset to write to
+ *  @data: data to write at register offset
+ *
+ *  Writes the data to PHY register at the offset.  Assumes semaphore
+ *  already acquired and page already set.
+ **/
+s32 e1000_write_phy_reg_page_hv(struct e1000_hw *hw, u32 offset, u16 data)
+{
+	return __e1000_write_phy_reg_hv(hw, offset, data, true, true);
 }
 
 /**
@@ -3004,11 +3109,12 @@ static u32 e1000_get_phy_addr_for_hv_page(u32 page)
  *  @hw: pointer to the HW structure
  *  @offset: register offset to be read or written
  *  @data: pointer to the data to be read or written
- *  @read: determines if operation is read or written
+ *  @read: determines if operation is read or write
  *
  *  Reads the PHY register at offset and stores the retreived information
  *  in data.  Assumes semaphore already acquired.  Note that the procedure
- *  to read these regs uses the address port and data port to read/write.
+ *  to access these regs uses the address port and data port to read/write.
+ *  These accesses done with PHY address 2 and without using pages.
  **/
 static s32 e1000_access_phy_debug_regs_hv(struct e1000_hw *hw, u32 offset,
                                           u16 *data, bool read)
@@ -3028,7 +3134,7 @@ static s32 e1000_access_phy_debug_regs_hv(struct e1000_hw *hw, u32 offset,
 	/* masking with 0x3F to remove the page from offset */
 	ret_val = e1000e_write_phy_reg_mdic(hw, addr_reg, (u16)offset & 0x3F);
 	if (ret_val) {
-		e_dbg("Could not write PHY the HV address register\n");
+		e_dbg("Could not write the Address Offset port register\n");
 		goto out;
 	}
 
@@ -3039,7 +3145,7 @@ static s32 e1000_access_phy_debug_regs_hv(struct e1000_hw *hw, u32 offset,
 		ret_val = e1000e_write_phy_reg_mdic(hw, data_reg, *data);
 
 	if (ret_val) {
-		e_dbg("Could not read data value from HV data register\n");
+		e_dbg("Could not access the Data port register\n");
 		goto out;
 	}
 
-- 
1.7.5.2


^ permalink raw reply related

* [net-next 08/12] e1000e: update driver version
From: Jeff Kirsher @ 2011-06-08 22:56 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, Jeff Kirsher
In-Reply-To: <1307573800-24868-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Bruce Allan <bruce.w.allan@intel.com>

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/netdev.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 51cf715..3bf5249 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -54,9 +54,9 @@
 
 #include "e1000.h"
 
-#define DRV_EXTRAVERSION "-k2"
+#define DRV_EXTRAVERSION "-k"
 
-#define DRV_VERSION "1.3.10" DRV_EXTRAVERSION
+#define DRV_VERSION "1.3.16" DRV_EXTRAVERSION
 char e1000e_driver_name[] = "e1000e";
 const char e1000e_driver_version[] = DRV_VERSION;
 
-- 
1.7.5.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox