* [PATCH 7/7] dccp ccid-2: check Ack Ratio when reducing cwnd
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev, Samuel Jero
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
From: Samuel Jero <sj323707@ohio.edu>
This patch causes CCID-2 to check the Ack Ratio after reducing the congestion
window. If the Ack Ratio is greater than the congestion window, it is
reduced. This prevents timeouts caused by an Ack Ratio larger than the
congestion window.
In this situation, we choose to set the Ack Ratio to half the congestion window
(or one if that's zero) so that if we loose one ack we don't trigger a timeout.
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
net/dccp/ccids/ccid2.c | 26 +++++++++++++++++++++++---
1 files changed, 23 insertions(+), 3 deletions(-)
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -101,6 +101,24 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
min_t(u32, val, DCCPF_ACK_RATIO_MAX));
}
+static void ccid2_check_l_ack_ratio(struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ /*
+ * After a loss, idle period, application limited period, or RTO we
+ * need to check that the ack ratio is still less than the congestion
+ * window. Otherwise, we will send an entire congestion window of
+ * packets and got no response because we haven't sent ack ratio
+ * packets yet.
+ * If the ack ratio does need to be reduced, we reduce it to half of
+ * the congestion window (or 1 if that's zero) instead of to the
+ * congestion window. This prevents problems if one ack is lost.
+ */
+ if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
+ ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
+}
+
static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
{
dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
@@ -187,6 +205,8 @@ static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
}
hc->tx_cwnd_used = 0;
hc->tx_cwnd_stamp = now;
+
+ ccid2_check_l_ack_ratio(sk);
}
/* This borrows the code of tcp_cwnd_restart() */
@@ -205,6 +225,8 @@ static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
hc->tx_cwnd_stamp = now;
hc->tx_cwnd_used = 0;
+
+ ccid2_check_l_ack_ratio(sk);
}
static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
@@ -461,9 +483,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
- /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */
- if (dccp_sk(sk)->dccps_l_ack_ratio > hc->tx_cwnd)
- ccid2_change_l_ack_ratio(sk, hc->tx_cwnd);
+ ccid2_check_l_ack_ratio(sk);
}
static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
^ permalink raw reply
* [PATCH 5/7] dccp ccid-2: prevent cwnd > Sequence Window
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev, Samuel Jero
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
From: Samuel Jero <sj323707@ohio.edu>
Add a check to prevent CCID-2 from increasing the cwnd greater than the
Sequence Window.
When the congestion window becomes bigger than the Sequence Window, CCID-2
will attempt to keep more data in the network than the DCCP Sequence Window
code considers possible. This results in the Sequence Window code issuing
a Sync, thereby inducing needless overhead. Further, if this occurs at the
sender, CCID-2 will never detect the problem because the Acks it receives
will indicate no losses. I have seen this cause a drop of 1/3rd in throughput
for a connection.
Also add code to adjust the Sequence Window to be about 5 times the number of
packets in the network (RFC 4340, 7.5.2) and to adjust the Ack Ratio so that
the remote Sequence Window will hold about 5 times the number of packets in
the network. This allows the congestion window to increase correctly without
being limited by the Sequence Window.
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
net/dccp/ccids/ccid2.c | 50 +++++++++++++++++++++++++++++++++--------------
net/dccp/ccids/ccid2.h | 6 +++++
2 files changed, 41 insertions(+), 15 deletions(-)
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -43,6 +43,12 @@ struct ccid2_seq {
#define CCID2_SEQBUF_LEN 1024
#define CCID2_SEQBUF_MAX 128
+/*
+ * Multiple of congestion window to keep the sequence window at
+ * (RFC 4340 7.5.2)
+ */
+#define CCID2_WIN_CHANGE_FACTOR 5
+
/**
* struct ccid2_hc_tx_sock - CCID2 TX half connection
* @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -85,7 +85,6 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
{
- struct dccp_sock *dp = dccp_sk(sk);
u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
/*
@@ -98,14 +97,15 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
val = max_ratio;
}
- if (val > DCCPF_ACK_RATIO_MAX)
- val = DCCPF_ACK_RATIO_MAX;
-
- if (val == dp->dccps_l_ack_ratio)
- return;
+ dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
+ min_t(u32, val, DCCPF_ACK_RATIO_MAX));
+}
- ccid2_pr_debug("changing local ack ratio to %u\n", val);
- dp->dccps_l_ack_ratio = val;
+static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
+{
+ dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
+ clamp_val(val, DCCPF_SEQ_WMIN,
+ DCCPF_SEQ_WMAX));
}
static void ccid2_hc_tx_rto_expire(unsigned long data)
@@ -405,17 +405,37 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
unsigned int *maxincr)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if (hc->tx_cwnd < hc->tx_ssthresh) {
- if (*maxincr > 0 && ++hc->tx_packets_acked == 2) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
+
+ if (hc->tx_cwnd < dp->dccps_l_seq_win &&
+ r_seq_used < dp->dccps_r_seq_win) {
+ if (hc->tx_cwnd < hc->tx_ssthresh) {
+ if (*maxincr > 0 && ++hc->tx_packets_acked == 2) {
+ hc->tx_cwnd += 1;
+ *maxincr -= 1;
+ hc->tx_packets_acked = 0;
+ }
+ } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
hc->tx_cwnd += 1;
- *maxincr -= 1;
hc->tx_packets_acked = 0;
}
- } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
- hc->tx_cwnd += 1;
- hc->tx_packets_acked = 0;
}
+
+ /*
+ * Adjust the local sequence window and the ack ratio to allow about
+ * 5 times the number of packets in the network (RFC 4340 7.5.2)
+ */
+ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
+ else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
+
+ if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
+ else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
+
/*
* FIXME: RTT is sampled several times per acknowledgment (for each
* entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
^ permalink raw reply
* [PATCH 6/7] dccp ccid-2: increment cwnd correctly
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev, Samuel Jero
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
From: Samuel Jero <sj323707@ohio.edu>
This patch fixes an issue where CCID-2 will not increase the congestion
window for numerous RTTs after an idle period, application-limited period,
or a loss once the algorithm is in Congestion Avoidance.
What happens is that, when CCID-2 is in Congestion Avoidance mode, it will
increase hc->tx_packets_acked by one for every packet and will increment cwnd
every cwnd packets. However, if there is now an idle period in the connection,
cwnd will be reduced, possibly below the slow start threshold. This will
cause the connection to go into Slow Start. However, in Slow Start CCID-2
performs this test to increment cwnd every second ack:
++hc->tx_packets_acked == 2
Unfortunately, this will be incorrect, if cwnd previous to the idle period
was larger than 2 and if tx_packets_acked was close to cwnd. For example:
cwnd=50 and tx_packets_acked=45.
In this case, the current code, will increment tx_packets_acked until it
equals two, which will only be once tx_packets_acked (an unsigned 32-bit
integer) overflows.
My fix is simply to change that test for tx_packets_acked greater than or
equal to two in slow start.
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
net/dccp/ccids/ccid2.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -411,7 +411,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
if (hc->tx_cwnd < dp->dccps_l_seq_win &&
r_seq_used < dp->dccps_r_seq_win) {
if (hc->tx_cwnd < hc->tx_ssthresh) {
- if (*maxincr > 0 && ++hc->tx_packets_acked == 2) {
+ if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
hc->tx_cwnd += 1;
*maxincr -= 1;
hc->tx_packets_acked = 0;
^ permalink raw reply
* [PATCH 2/7] dccp: support for exchanging of NN options in established state 2/2
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
This patch adds the receiver side and the (fast-path) activation part for
dynamic changes of non-negotiable (NN) parameters in (PART)OPEN state.
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.uk>
---
net/dccp/feat.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 116 insertions(+), 0 deletions(-)
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -344,6 +344,20 @@ static int __dccp_feat_activate(struct sock *sk, const int idx,
return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
}
+/**
+ * dccp_feat_activate - Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ * For general use this function is preferable over __dccp_feat_activate().
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+ dccp_feat_val const *fval)
+{
+ return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
/* Test for "Req'd" feature (RFC 4340, 6.4) */
static inline int dccp_feat_must_be_understood(u8 feat_num)
{
@@ -1252,6 +1266,100 @@ confirmation_failed:
}
/**
+ * dccp_feat_handle_nn_established - Fast-path reception of NN options
+ * @sk: socket of an established DCCP connection
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat: NN number, one of %dccp_feature_numbers
+ * @val: NN value
+ * @len: length of @val in bytes
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ * - cleanup after receiving the Confirm;
+ * - values are directly activated after successful parsing;
+ * - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
+ */
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry;
+ u8 type = dccp_feat_type(feat);
+ dccp_feat_val fval;
+
+ dccp_feat_print_opt(opt, feat, val, len, mandatory);
+
+ /* Ignore non-mandatory unknown and non-NN features */
+ if (type == FEAT_UNKNOWN) {
+ if (local && !mandatory)
+ return 0;
+ goto fast_path_unknown;
+ } else if (type != FEAT_NN) {
+ return 0;
+ }
+
+ /*
+ * We don't accept empty Confirms, since in fast-path feature
+ * negotiation the values are enabled immediately after sending
+ * the Change option.
+ * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+ */
+ if (len == 0 || len > sizeof(fval.nn))
+ goto fast_path_unknown;
+
+ if (opt == DCCPO_CHANGE_L) {
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto fast_path_unknown;
+
+ if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+ dccp_feat_activate(sk, feat, local, &fval))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ /* set the `Ack Pending' flag to piggyback a Confirm */
+ inet_csk_schedule_ack(sk);
+
+ } else if (opt == DCCPO_CONFIRM_R) {
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL || entry->state != FEAT_CHANGING)
+ return 0;
+
+ fval.nn = dccp_decode_value_var(val, len);
+ /*
+ * Just ignore a value that doesn't match our current value.
+ * If the option changes twice within two RTTs, then at least
+ * one CONFIRM will be received for the old value after a
+ * new CHANGE was sent.
+ */
+ if (fval.nn != entry->val.nn)
+ return 0;
+
+ /* Only activate after receiving the Confirm option (6.6.1). */
+ dccp_feat_activate(sk, feat, local, &fval);
+
+ /* It has been confirmed - so remove the entry */
+ dccp_feat_list_pop(entry);
+
+ } else {
+ DCCP_WARN("Received illegal option %u\n", opt);
+ goto fast_path_failed;
+ }
+ return 0;
+
+fast_path_unknown:
+ if (!mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
+
+fast_path_failed:
+ return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
* dccp_feat_parse_options - Process Feature-Negotiation Options
* @sk: for general use and used by the client during connection setup
* @dreq: used by the server during connection setup
@@ -1286,6 +1394,14 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
val, len, server);
}
+ break;
+ /*
+ * Support for exchanging NN options on an established connection.
+ */
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+ val, len);
}
return 0; /* ignore FN options in all other states */
}
^ permalink raw reply
* [PATCH 3/7] dccp: send Confirm options only once
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev, Samuel Jero
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
From: Samuel Jero <sj323707@ohio.edu>
If a connection is in the OPEN state, remove feature negotiation Confirm
options from the list of options after sending them once; as such options
are NOT supposed to be retransmitted and are ONLY supposed to be sent in
response to a Change option (RFC 4340 6.2).
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
net/dccp/feat.c | 21 ++++++++++++++++-----
1 files changed, 16 insertions(+), 5 deletions(-)
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -665,11 +665,22 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
return -1;
if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
return -1;
- /*
- * Enter CHANGING after transmitting the Change option (6.6.2).
- */
- if (pos->state == FEAT_INITIALISING)
- pos->state = FEAT_CHANGING;
+
+ if (skb->sk->sk_state == DCCP_OPEN &&
+ (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
+ /*
+ * Confirms don't get retransmitted (6.6.3) once the
+ * connection is in state OPEN
+ */
+ dccp_feat_list_pop(pos);
+ } else {
+ /*
+ * Enter CHANGING after transmitting the Change
+ * option (6.6.2).
+ */
+ if (pos->state == FEAT_INITIALISING)
+ pos->state = FEAT_CHANGING;
+ }
}
return 0;
}
^ permalink raw reply
* [PATCH 1/7] dccp: support for the exchange of NN options in established state 1/2
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
In contrast to static feature negotiation at the begin of a connection, this
patch introduces support for exchange of dynamically changing options.
Such an update/exchange is necessary in at least two cases:
* CCID-2's Ack Ratio (RFC 4341, 6.1.2) which changes during the connection;
* Sequence Window values that, as per RFC 4340, 7.5.2, should be sent "as
the connection progresses".
Both are non-negotiable (NN) features, which means that no new capabilities
are negotiated, but rather that changes in known parameters are brought
up-to-date at either end.
Thse characteristics are reflected by the implementation:
* only NN options can be exchanged after connection setup;
* an ack is scheduled directly after activation to speed up the update;
* CCIDs may request changes to an NN feature even if a negotiation for that
feature is already underway: this is required by CCID-2, where changes in
cwnd necessitate Ack Ratio changes, such that the previous Ack Ratio (which
is still being negotiated) would cause irrecoverable RTO timeouts (thanks
to work by Samuel Jero).
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.uk>
---
net/dccp/dccp.h | 1 +
net/dccp/feat.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
net/dccp/feat.h | 1 +
3 files changed, 67 insertions(+), 0 deletions(-)
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -474,6 +474,7 @@ static inline int dccp_ack_pending(const struct sock *sk)
return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
+extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
extern int dccp_feat_finalise_settings(struct dccp_sock *dp);
extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
--- a/net/dccp/feat.h
+++ b/net/dccp/feat.h
@@ -129,6 +129,7 @@ extern int dccp_feat_clone_list(struct list_head const *, struct list_head *);
extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
extern u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+extern u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
extern int dccp_insert_option_mandatory(struct sk_buff *skb);
extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
--- a/net/dccp/feat.c
+++ b/net/dccp/feat.c
@@ -12,6 +12,7 @@
* -----------
* o Feature negotiation is coordinated with connection setup (as in TCP), wild
* changes of parameters of an established connection are not supported.
+ * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
* o All currently known SP features have 1-byte quantities. If in the future
* extensions of RFCs 4340..42 define features with item lengths larger than
* one byte, a feature-specific extension of the code will be required.
@@ -730,6 +731,70 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
0, list, len);
}
+/**
+ * dccp_feat_nn_get - Query current/pending value of NN feature
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * For a known NN feature, returns value currently being negotiated, or
+ * current (confirmed) value if no negotiation is going on.
+ */
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
+{
+ if (dccp_feat_type(feat) == FEAT_NN) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *entry;
+
+ entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
+ if (entry != NULL)
+ return entry->val.nn;
+
+ switch (feat) {
+ case DCCPF_ACK_RATIO:
+ return dp->dccps_l_ack_ratio;
+ case DCCPF_SEQUENCE_WINDOW:
+ return dp->dccps_l_seq_win;
+ }
+ }
+ DCCP_BUG("attempt to look up unsupported feature %u", feat);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
+
+/**
+ * dccp_feat_signal_nn_change - Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ * This function is used to communicate NN updates out-of-band.
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ dccp_feat_val fval = { .nn = nn_val };
+ struct dccp_feat_entry *entry;
+
+ if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+ return 0;
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ if (nn_val == dccp_feat_nn_get(sk, feat))
+ return 0; /* already set or negotiation under way */
+
+ entry = dccp_feat_list_lookup(fn, feat, 1);
+ if (entry != NULL) {
+ dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
+ (unsigned long long)entry->val.nn,
+ (unsigned long long)nn_val);
+ dccp_feat_list_pop(entry);
+ }
+
+ inet_csk_schedule_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, 0, &fval);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
/*
* Tracking features whose value depend on the choice of CCID
^ permalink raw reply
* [PATCH 4/7] dccp ccid-2: use feature-negotiation to report Ack Ratio changes
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
This uses the new feature-negotiation framework to signal Ack Ratio changes,
as required by RFC 4341, sec. 6.1.2.
That raises some problems with CCID-2, which at the moment can not cope
gracefully with Ack Ratios > 1. Since these issues are not directly related
to feature negotiation, they are marked by a FIXME.
Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Samuel Jero <sj323707@ohio.edu>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.uk>
---
net/dccp/ccids/ccid2.c | 10 +++++++++-
net/dccp/proto.c | 1 -
2 files changed, 9 insertions(+), 2 deletions(-)
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -184,7 +184,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
dp->dccps_rate_last = jiffies;
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
dccp_init_xmit_timers(sk);
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -494,8 +494,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
if (hc->tx_rpdupack >= NUMDUPACK) {
hc->tx_rpdupack = -1; /* XXX lame */
hc->tx_rpseq = 0;
-
+#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
+ /*
+ * FIXME: Ack Congestion Control is broken; in
+ * the current state instabilities occurred with
+ * Ack Ratios greater than 1; causing hang-ups
+ * and long RTO timeouts. This needs to be fixed
+ * before opening up dynamic changes. -- gerrit
+ */
ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+#endif
}
}
}
^ permalink raw reply
* net-next-2.6 [PATCH 0/7] dccp: add support for dynamic parameter updates
From: Gerrit Renker @ 2011-07-25 13:36 UTC (permalink / raw)
To: davem; +Cc: dccp, netdev
In-Reply-To: <test_tree_patch_set_update_2011-07-25>
Hi Dave,
please find attached a 2-part patch set to implement features required by the RFCs:
a) exchange of "non-negotiable" (NN) feature options (RFC 4340, 6.3.2), which are
used to dynamically update known parameters during an established connection;
b) use of this new API to improve the current state of the CCID-2 (RFC 4341)
implementation for updating Ack Ratio and Sequence Window features.
Both sets are thanks to the good work done by Samuel Jero.
General DCCP part:
Patch #1: introduces sender-signalling part for exchange of NN options.
Patch #2: implements the receiver-side and activation part for NN options.
Patch #3: bug-fix to send Confirm options in the RFC-specified manner.
CCID-2 part:
Patch #4: adds initial code for CCID-2 Ack Ratio exchange.
Patch #5: fixes issues with cwnd/Sequence Window relationship in CCID-2.
Patch #6: fixes a bug in incrementing the cwnd of CCID-2.
Patch #7: fixes a bug in updating Ack Ratio relative to cwnd in CCID-2.
I have also placed this in into a fresh (today's) copy of net-next-2.6, on
git://eden-feed.erg.abdn.ac.uk/net-next-2.6 [subtree 'dccp']
---
ccids/ccid2.c | 88 +++++++++++++++++++------
ccids/ccid2.h | 6 +
dccp.h | 1
feat.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
feat.h | 1
proto.c | 1
6 files changed, 273 insertions(+), 26 deletions(-)
^ permalink raw reply
* Re: [PATCH] net: Fix security_socket_sendmsg() bypass problem.
From: Tetsuo Handa @ 2011-07-25 13:15 UTC (permalink / raw)
To: anton; +Cc: mjt, davem, casey, netdev, linux-security-module
In-Reply-To: <20110725222010.0b284042@kryten>
Anton Blanchard wrote:
> > > When I saw recvmmsg()/sendmmsg() here, my first thought was an
> > > authoritative DNS server which can read several requests at a
> > > time and answer them all at once too - this way it all will go
> > > to different addresses.
> >
> > I don't know what application wants sendmmsg(). Since users can send
> > up to UIO_MAXIOV (= 1024) "struct iovec" blocks using sendmsg(), they
> > will use sendmsg() rather than sendmmsg() if the destination address
> > are the same.
>
> But if an application needs to maintain packet boundaries, then sendmsg
> isn't going to help is it?
Well, such application might want to use RDM or SeqPacket... but your point is
to maintain packet boundaries. You are assuming that sendmmsg() will be used
for sending as much data as possible while preserving packet boundaries.
OK. Then, the question is how to reduce performance loss by redundant
security_socket_sendmsg() calls. If sendmmsg() likely contains single (or few)
destination(s), trying to optimize security_socket_sendmsg() calls by comparing
destination address (as proposed at
http://www.spinics.net/linux/fedora/linux-security-module/msg11510.html
) would help. Otherwise, no optimization (as proposed at
http://www.spinics.net/linux/fedora/linux-security-module/msg11504.html
) would be better. Which approach do you like?
^ permalink raw reply
* Re: [PATCH] net: Fix security_socket_sendmsg() bypass problem.
From: Anton Blanchard @ 2011-07-25 12:20 UTC (permalink / raw)
To: Tetsuo Handa; +Cc: mjt, davem, casey, netdev, linux-security-module
In-Reply-To: <201107231939.FIF21882.QHOSOFtMFVLFOJ@I-love.SAKURA.ne.jp>
Hi,
> > (I noticed samba.org address in the Cc list).
>
> That's because Anton Blanchard is author of sendmmsg() system call.
Ignore the From address - I wasn't adding sendmmsg with samba in mind.
> > When I saw recvmmsg()/sendmmsg() here, my first thought was an
> > authoritative DNS server which can read several requests at a
> > time and answer them all at once too - this way it all will go
> > to different addresses.
>
> I don't know what application wants sendmmsg(). Since users can send
> up to UIO_MAXIOV (= 1024) "struct iovec" blocks using sendmsg(), they
> will use sendmsg() rather than sendmmsg() if the destination address
> are the same.
But if an application needs to maintain packet boundaries, then sendmsg
isn't going to help is it?
> Therefore, I guess users will use sendmmsg() for sending to multiple
> different destination addresses. If so, optimization based on
> destination address will do more harm than benefit; simply passing
> nosec flag down to LSM modules (so that SELinux will skip
> sock_has_perm() call and SMACK will not skip smack_netlabel_send()
> call) will be sufficient for 3.0.x stable release.
>
> Anton, how do you want to use sendmmsg()?
I was using it for packet generation, using raw sockets.
Anton
^ permalink raw reply
* Re: v3.0-rc* intermittent network failure: Test case found!
From: Richard Kennedy @ 2011-07-25 12:01 UTC (permalink / raw)
To: netdev; +Cc: Francois Romieu
In-Reply-To: <1311261527.2980.26.camel@castor.rsk>
On 21/07/11 16:18, Richard Kennedy wrote:
>> Richard Kennedy<richard@rsk.demon.co.uk> :
>>> I keep seeing a total network failure on v3.0.0-rc* , it is highly
>>> intermittent, anything from 1 hour to 12+, and I don't have a reliable
>>> test case.
>>> When it fails I lose all network comms, but there are no errors in the
>>> system log, no hung tasks reported, nothing. But after it fails the
>>> machine hangs during shutdown, it just never turns off. So I guess
>>> something is getting stuck but I can't find it.
>>
I have found a reliable test case, I can instantly trigger my problem by
starting 2 instances of rsync at the same time. [this is on x86_64 AMDX2]
e.g.
rsync -a linux-2.6 server:t1 & ;rsync -a linux-2.6 server:t2 &
If I have a ping running when I trigger the problem, it pauses then
errors with :-
ping: sendmsg: No buffer space available
But if I start a ping after, it fails with
... Destination Host Unreachable
.
I have a serial console attached but don't really understand what it's
telling me.
AFAICT -- I have no blocked tasks - sysrq w shows :-
SysRq : Show Blocked State
task PC stack pid father
Sched Debug Version: v0.10, 3.0.0 #46
ktime : 7129717.783042
sched_clk : 7126380.221722
cpu_clk : 7129711.544071
jiffies : 4301797008
sched_clock_stable : 0
.....[lots more schedule & cpu info]
But now I've got a reliable test case I can find a last know good kernel
and have a stab at bisecting this, unless anyone has got any better
suggestions?
regards
Richard
^ permalink raw reply
* Re: [PATCH net-next] skbuff: clear tx zero-copy flag
From: Michael S. Tsirkin @ 2011-07-25 10:53 UTC (permalink / raw)
To: David Miller; +Cc: herbert, mashirle, netdev, kvm, linux-kernel
In-Reply-To: <20110725.030229.1448266647053398033.davem@davemloft.net>
On Mon, Jul 25, 2011 at 03:02:29AM -0700, David Miller wrote:
> From: Herbert Xu <herbert@gondor.hengli.com.au>
> Date: Mon, 25 Jul 2011 17:57:11 +0800
>
> > However, I think we should add a WARN_ON to the splice skb path
> > so that should a packet find its way through a path that we haven't
> > thought of then at least we'll know about it.
>
> Good idea.
Another place like this is skb_split, I think.
--
MST
^ permalink raw reply
* Re: r8169 driver crashes in 2.6.32.43
From: Kasper Dupont @ 2011-07-25 10:36 UTC (permalink / raw)
To: François romieu; +Cc: ivecera, hayeswang, gregkh, netdev
In-Reply-To: <20110724201626.GB24418@zoreil.com>
On 24/07/11 22.16, François romieu wrote:
> The Sun, Jul 24, 2011 at 09:58:31PM +0200, Kasper Dupont wrote :
> [...]
> > Any idea how to fix this?
>
> Apply 1519e57fe81c14bb8fa4855579f19264d1ef63b4 as well and
> eventually f60ac8e7ab7cbb413a0131d5665b053f9f386526.
>
> Please send r8169 related lines from dmesg, especially the XID
> one and Cc: netdev.
These are the relevant lines from dmesg:
[ 1.045727] pata_sch 0000:00:1f.1: setting latency timer to 64
[ 1.045946] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[ 1.046061] r8169 0000:02:00.0: PCI INT A -> GSI 16 (level, low) -> IRQ 16
[ 1.046201] r8169 0000:02:00.0: setting latency timer to 64
[ 1.046257] alloc irq_desc for 24 on node -1
[ 1.046263] alloc kstat_irqs on node -1
[ 1.046284] r8169 0000:02:00.0: irq 24 for MSI/MSI-X
[ 1.048097] eth0: RTL8168c/8111c at 0xf8076000, 00:01:c0:09:a1:25, XID 1c4000c0 IRQ 24
[ 1.051517] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[ 1.051631] r8169 0000:03:00.0: PCI INT A -> GSI 17 (level, low) -> IRQ 17
[ 1.051764] r8169 0000:03:00.0: setting latency timer to 64
[ 1.051820] alloc irq_desc for 25 on node -1
[ 1.051825] alloc kstat_irqs on node -1
[ 1.051847] r8169 0000:03:00.0: irq 25 for MSI/MSI-X
[ 1.053159] usb 1-7: new high speed USB device using ehci_hcd and address 5
[ 1.056574] vga16fb: initializing
[ 1.056584] vga16fb: mapped to 0xc00a0000
[ 1.056819] fb0: VGA16 VGA frame buffer device
[ 1.070138] scsi0 : pata_sch
[ 1.078253] scsi1 : pata_sch
[ 1.079216] ata1: PATA max UDMA/100 cmd 0x1f0 ctl 0x3f6 bmdma 0x1800 irq 14
[ 1.079312] ata2: PATA max UDMA/100 cmd 0x170 ctl 0x376 bmdma 0x1808 irq 15
[ 1.082178] eth1: RTL8168c/8111c at 0xf8096000, 00:01:c0:09:a1:26, XID 1c4000c0 IRQ 25
[ 1.205643] usb 1-7: configuration #1 chosen from 1 choice
It works on 2.6.32.32 it crashes on 2.6.32.33. I tried to
take 2.6.32.43 and apply 1519e57fe81c14bb8fa4855579f19264d1ef63b4,
that did not help. 2.6.32.43 crashes with and without that patch.
--
Kasper Dupont -- Rigtige mænd skriver deres egne backupprogrammer
#define _(_)"d.%.4s%."_"2s" /* This is my email address */
char*_="@2kaspner"_()"%03"_("4s%.")"t\n";printf(_+11,_+6,_,11,_+2,_+7,_+6);
^ permalink raw reply
* [PATCH 7/7] atm: clip: Use device neigh support on top of "arp_tbl".
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland; +Cc: linux-rdma, netdev
Instead of instantiating an entire new neigh_table instance
just for ATM handling, use the neigh device private facility.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/atmclip.h | 5 ---
net/atm/clip.c | 86 ++++++++-----------------------------------------
net/ipv4/arp.c | 5 ---
net/ipv4/route.c | 10 +----
4 files changed, 16 insertions(+), 90 deletions(-)
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 852a3b2..5865924 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -41,17 +41,12 @@ struct atmarp_entry {
struct neighbour *neigh; /* neighbour back-pointer */
};
-
#define PRIV(dev) ((struct clip_priv *) netdev_priv(dev))
-
struct clip_priv {
int number; /* for convenience ... */
spinlock_t xoff_lock; /* ensures that pop is atomic (SMP) */
struct net_device *next; /* next CLIP interface */
};
-
-extern struct neigh_table *clip_tbl_hook;
-
#endif
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 44ee92d..9c1d2d6 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -33,6 +33,7 @@
#include <linux/slab.h>
#include <net/route.h> /* for struct rtable and routing */
#include <net/icmp.h> /* icmp_send */
+#include <net/arp.h>
#include <linux/param.h> /* for HZ */
#include <linux/uaccess.h>
#include <asm/byteorder.h> /* for htons etc. */
@@ -280,70 +281,23 @@ static const struct neigh_ops clip_neigh_ops = {
static int clip_constructor(struct neighbour *neigh)
{
struct atmarp_entry *entry = neighbour_priv(neigh);
- struct net_device *dev = neigh->dev;
- struct in_device *in_dev;
- struct neigh_parms *parms;
- pr_debug("(neigh %p, entry %p)\n", neigh, entry);
- neigh->type = inet_addr_type(&init_net, *((__be32 *) neigh->primary_key));
- if (neigh->type != RTN_UNICAST)
+ if (neigh->tbl->family != AF_INET)
return -EINVAL;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev) {
- rcu_read_unlock();
+ if (neigh->type != RTN_UNICAST)
return -EINVAL;
- }
-
- parms = in_dev->arp_parms;
- __neigh_parms_put(neigh->parms);
- neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
+ neigh->nud_state = NUD_NONE;
neigh->ops = &clip_neigh_ops;
- neigh->output = neigh->nud_state & NUD_VALID ?
- neigh->ops->connected_output : neigh->ops->output;
+ neigh->output = neigh->ops->output;
entry->neigh = neigh;
entry->vccs = NULL;
entry->expires = jiffies - 1;
+
return 0;
}
-static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
-{
- return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);
-}
-
-static struct neigh_table clip_tbl = {
- .family = AF_INET,
- .key_len = 4,
- .hash = clip_hash,
- .constructor = clip_constructor,
- .id = "clip_arp_cache",
-
- /* parameters are copied from ARP ... */
- .parms = {
- .tbl = &clip_tbl,
- .base_reachable_time = 30 * HZ,
- .retrans_time = 1 * HZ,
- .gc_staletime = 60 * HZ,
- .reachable_time = 30 * HZ,
- .delay_probe_time = 5 * HZ,
- .queue_len = 3,
- .ucast_probes = 3,
- .mcast_probes = 3,
- .anycast_delay = 1 * HZ,
- .proxy_delay = (8 * HZ) / 10,
- .proxy_qlen = 64,
- .locktime = 1 * HZ,
- },
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
-};
-
/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */
/*
@@ -524,7 +478,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
rt = ip_route_output(&init_net, ip, 0, 1, 0);
if (IS_ERR(rt))
return PTR_ERR(rt);
- neigh = __neigh_lookup(&clip_tbl, &ip, rt->dst.dev, 1);
+ neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1);
ip_rt_put(rt);
if (!neigh)
return -ENOMEM;
@@ -545,7 +499,8 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
}
static const struct net_device_ops clip_netdev_ops = {
- .ndo_start_xmit = clip_start_xmit,
+ .ndo_start_xmit = clip_start_xmit,
+ .ndo_neigh_construct = clip_constructor,
};
static void clip_setup(struct net_device *dev)
@@ -606,10 +561,8 @@ static int clip_device_event(struct notifier_block *this, unsigned long event,
if (!net_eq(dev_net(dev), &init_net))
return NOTIFY_DONE;
- if (event == NETDEV_UNREGISTER) {
- neigh_ifdown(&clip_tbl, dev);
+ if (event == NETDEV_UNREGISTER)
return NOTIFY_DONE;
- }
/* ignore non-CLIP devices */
if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
@@ -883,6 +836,9 @@ static void *clip_seq_sub_iter(struct neigh_seq_state *_state,
{
struct clip_seq_state *state = (struct clip_seq_state *)_state;
+ if (n->dev->type != ARPHRD_ATM)
+ return NULL;
+
return clip_seq_vcc_walk(state, neighbour_priv(n), pos);
}
@@ -890,7 +846,7 @@ static void *clip_seq_start(struct seq_file *seq, loff_t * pos)
{
struct clip_seq_state *state = seq->private;
state->ns.neigh_sub_iter = clip_seq_sub_iter;
- return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY);
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_NEIGH_ONLY);
}
static int clip_seq_show(struct seq_file *seq, void *v)
@@ -936,9 +892,6 @@ static void atm_clip_exit_noproc(void);
static int __init atm_clip_init(void)
{
- neigh_table_init_no_netlink(&clip_tbl);
-
- clip_tbl_hook = &clip_tbl;
register_atm_ioctl(&clip_ioctl_ops);
register_netdevice_notifier(&clip_dev_notifier);
register_inetaddr_notifier(&clip_inet_notifier);
@@ -975,12 +928,6 @@ static void atm_clip_exit_noproc(void)
*/
del_timer_sync(&idle_timer);
- /* Next, purge the table, so that the device
- * unregister loop below does not hang due to
- * device references remaining in the table.
- */
- neigh_ifdown(&clip_tbl, NULL);
-
dev = clip_devs;
while (dev) {
next = PRIV(dev)->next;
@@ -988,11 +935,6 @@ static void atm_clip_exit_noproc(void)
free_netdev(dev);
dev = next;
}
-
- /* Now it is safe to fully shutdown whole table. */
- neigh_table_clear(&clip_tbl);
-
- clip_tbl_hook = NULL;
}
static void __exit atm_clip_exit(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 43f0d15..e652d0d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -112,11 +112,6 @@
#include <net/arp.h>
#include <net/ax25.h>
#include <net/netrom.h>
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-#include <net/atmclip.h>
-struct neigh_table *clip_tbl_hook;
-EXPORT_SYMBOL(clip_tbl_hook);
-#endif
#include <asm/system.h>
#include <linux/uaccess.h>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1730689..a4ca985 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -108,7 +108,6 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
-#include <net/atmclip.h>
#define RT_FL_TOS(oldflp4) \
((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -1013,23 +1012,18 @@ static int slow_chain_length(const struct rtable *head)
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
- struct neigh_table *tbl = &arp_tbl;
static const __be32 inaddr_any = 0;
struct net_device *dev = dst->dev;
const __be32 *pkey = daddr;
struct neighbour *n;
-#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
- if (dev->type == ARPHRD_ATM)
- tbl = clip_tbl_hook;
-#endif
if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
pkey = &inaddr_any;
- n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
+ n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
if (n)
return n;
- return neigh_create(tbl, pkey, dev);
+ return neigh_create(&arp_tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
--
1.7.6
^ permalink raw reply related
* [PATCH 5/7] atm: clip: Convert over to neighbour_priv()
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland; +Cc: linux-rdma, netdev
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/atmclip.h | 2 --
net/atm/clip.c | 28 +++++++++++++++-------------
2 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
index 497ef64..852a3b2 100644
--- a/include/net/atmclip.h
+++ b/include/net/atmclip.h
@@ -15,7 +15,6 @@
#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
-#define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key)
struct sk_buff;
@@ -36,7 +35,6 @@ struct clip_vcc {
struct atmarp_entry {
- __be32 ip; /* IP address */
struct clip_vcc *vccs; /* active VCCs; NULL if resolution is
pending */
unsigned long expires; /* entry expiration time */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index e19a0e7..44ee92d 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -119,7 +119,7 @@ out:
/* The neighbour entry n->lock is held. */
static int neigh_check_cb(struct neighbour *n)
{
- struct atmarp_entry *entry = NEIGH2ENTRY(n);
+ struct atmarp_entry *entry = neighbour_priv(n);
struct clip_vcc *cv;
for (cv = entry->vccs; cv; cv = cv->next) {
@@ -255,8 +255,10 @@ static void clip_pop(struct atm_vcc *vcc, struct sk_buff *skb)
static void clip_neigh_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
+ __be32 *ip = (__be32 *) neigh->primary_key;
+
pr_debug("(neigh %p, skb %p)\n", neigh, skb);
- to_atmarpd(act_need, PRIV(neigh->dev)->number, NEIGH2ENTRY(neigh)->ip);
+ to_atmarpd(act_need, PRIV(neigh->dev)->number, *ip);
}
static void clip_neigh_error(struct neighbour *neigh, struct sk_buff *skb)
@@ -277,13 +279,13 @@ static const struct neigh_ops clip_neigh_ops = {
static int clip_constructor(struct neighbour *neigh)
{
- struct atmarp_entry *entry = NEIGH2ENTRY(neigh);
+ struct atmarp_entry *entry = neighbour_priv(neigh);
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
pr_debug("(neigh %p, entry %p)\n", neigh, entry);
- neigh->type = inet_addr_type(&init_net, entry->ip);
+ neigh->type = inet_addr_type(&init_net, *((__be32 *) neigh->primary_key));
if (neigh->type != RTN_UNICAST)
return -EINVAL;
@@ -391,12 +393,12 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
- entry = NEIGH2ENTRY(n);
+ entry = neighbour_priv(n);
if (!entry->vccs) {
if (time_after(jiffies, entry->expires)) {
/* should be resolved */
entry->expires = jiffies + ATMARP_RETRY_DELAY * HZ;
- to_atmarpd(act_need, PRIV(dev)->number, entry->ip);
+ to_atmarpd(act_need, PRIV(dev)->number, *((__be32 *)n->primary_key));
}
if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS)
skb_queue_tail(&entry->neigh->arp_queue, skb);
@@ -526,7 +528,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
ip_rt_put(rt);
if (!neigh)
return -ENOMEM;
- entry = NEIGH2ENTRY(neigh);
+ entry = neighbour_priv(neigh);
if (entry != clip_vcc->entry) {
if (!clip_vcc->entry)
pr_debug("add\n");
@@ -787,9 +789,10 @@ static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr)
/* This means the neighbour entry has no attached VCC objects. */
#define SEQ_NO_VCC_TOKEN ((void *) 2)
-static void atmarp_info(struct seq_file *seq, struct net_device *dev,
+static void atmarp_info(struct seq_file *seq, struct neighbour *n,
struct atmarp_entry *entry, struct clip_vcc *clip_vcc)
{
+ struct net_device *dev = n->dev;
unsigned long exp;
char buf[17];
int svc, llc, off;
@@ -809,8 +812,7 @@ static void atmarp_info(struct seq_file *seq, struct net_device *dev,
seq_printf(seq, "%-6s%-4s%-4s%5ld ",
dev->name, svc ? "SVC" : "PVC", llc ? "LLC" : "NULL", exp);
- off = scnprintf(buf, sizeof(buf) - 1, "%pI4",
- &entry->ip);
+ off = scnprintf(buf, sizeof(buf) - 1, "%pI4", n->primary_key);
while (off < 16)
buf[off++] = ' ';
buf[off] = '\0';
@@ -881,7 +883,7 @@ static void *clip_seq_sub_iter(struct neigh_seq_state *_state,
{
struct clip_seq_state *state = (struct clip_seq_state *)_state;
- return clip_seq_vcc_walk(state, NEIGH2ENTRY(n), pos);
+ return clip_seq_vcc_walk(state, neighbour_priv(n), pos);
}
static void *clip_seq_start(struct seq_file *seq, loff_t * pos)
@@ -900,10 +902,10 @@ static int clip_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, atm_arp_banner);
} else {
struct clip_seq_state *state = seq->private;
- struct neighbour *n = v;
struct clip_vcc *vcc = state->vcc;
+ struct neighbour *n = v;
- atmarp_info(seq, n->dev, NEIGH2ENTRY(n), vcc);
+ atmarp_info(seq, n, neighbour_priv(n), vcc);
}
return 0;
}
--
1.7.6
^ permalink raw reply related
* [PATCH 4/7] neigh: Do not set tbl->entry_size in ipv4/ipv6 neigh tables.
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland; +Cc: linux-rdma, netdev
Let the core self-size the neigh entry based upon the key length.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/atm/clip.c | 1 -
net/ipv4/arp.c | 1 -
net/ipv6/ndisc.c | 1 -
3 files changed, 0 insertions(+), 3 deletions(-)
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 5dc4f4e..e19a0e7 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -315,7 +315,6 @@ static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
static struct neigh_table clip_tbl = {
.family = AF_INET,
- .entry_size = sizeof(struct neighbour)+sizeof(struct atmarp_entry),
.key_len = 4,
.hash = clip_hash,
.constructor = clip_constructor,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164a..43f0d15 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -164,7 +164,6 @@ static const struct neigh_ops arp_broken_ops = {
struct neigh_table arp_tbl = {
.family = AF_INET,
- .entry_size = sizeof(struct neighbour) + 4,
.key_len = 4,
.hash = arp_hash,
.constructor = arp_constructor,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 9da6e02..2582431 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -126,7 +126,6 @@ static const struct neigh_ops ndisc_direct_ops = {
struct neigh_table nd_tbl = {
.family = AF_INET6,
- .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr),
.key_len = sizeof(struct in6_addr),
.hash = ndisc_hash,
.constructor = ndisc_constructor,
--
1.7.6
^ permalink raw reply related
* [PATCH 2/7] neigh: Get rid of neigh_table->kmem_cachep
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland; +Cc: linux-rdma, netdev
We are going to alloc for device specific private areas for
neighbour entries, and in order to do that we have to move
away from the fixed allocation size enforced by using
neigh_table->kmem_cachep
As a nice side effect we can now use kfree_rcu().
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/neighbour.h | 1 -
net/core/neighbour.c | 18 ++----------------
2 files changed, 2 insertions(+), 17 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 8ff9143..cd113ed 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -172,7 +172,6 @@ struct neigh_table {
atomic_t entries;
rwlock_t lock;
unsigned long last_rand;
- struct kmem_cache *kmem_cachep;
struct neigh_statistics __percpu *stats;
struct neigh_hash_table __rcu *nht;
struct pneigh_entry **phash_buckets;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8fab9b0..493703c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -287,7 +287,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+ n = kzalloc(tbl->entry_size, GFP_ATOMIC);
if (!n)
goto out_entries;
@@ -677,12 +677,6 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
neigh_parms_destroy(parms);
}
-static void neigh_destroy_rcu(struct rcu_head *head)
-{
- struct neighbour *neigh = container_of(head, struct neighbour, rcu);
-
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
-}
/*
* neighbour must already be out of the table;
*
@@ -709,7 +703,7 @@ void neigh_destroy(struct neighbour *neigh)
NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
atomic_dec(&neigh->tbl->entries);
- call_rcu(&neigh->rcu, neigh_destroy_rcu);
+ kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
@@ -1461,11 +1455,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->parms.reachable_time =
neigh_rand_reach_time(tbl->parms.base_reachable_time);
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep =
- kmem_cache_create(tbl->id, tbl->entry_size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
@@ -1550,9 +1539,6 @@ int neigh_table_clear(struct neigh_table *tbl)
free_percpu(tbl->stats);
tbl->stats = NULL;
- kmem_cache_destroy(tbl->kmem_cachep);
- tbl->kmem_cachep = NULL;
-
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
--
1.7.6
^ permalink raw reply related
* [PATCH 0/7] More sane neigh infrastructure
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland; +Cc: linux-rdma, netdev
Roland, this is a first pass at the kind of thing I was
talking about with you last week.
ATM and Infiniband both need to do their own kind of
signaling, either in place of (ATM) or in addition to
(IPoIB) the generic ARP negotiation.
ATM wants to push everything to a userspace atmarpd daemon,
and override all of the usual ARP signalling. It replaces
the neigh_table used by ARP completely in order to accomplish
this.
IPoIB triggers it's signalling by hooking in at transmit time, and
adding a neigh parms destruction hook to free up and release it's
private per-neigh state.
I think both cases can be consolidated into one kind of scheme,
and these patches provide the infrastructure and convert ATM
over as an example.
Devices provide up to three things:
1) netdev->neighpriv_len, length of per-neighbour device private
state, accessible via neighbour_priv(neigh)
2) net_device_ops->ndo_neigh_construct(), invoked right after
neigh_tbl->constructor(), can fail
3) net_device_ops->ndo_neigh_destroy(), invoked right before
we release neigh->parms and kfree_rcu() the neigh object.
It could return errors but I'm not checking for them
currently and I can't think what we could possibly do
in response at this point in the code. Maybe this gets
changed to return "void" eventually.
As a result ATM CLIP no longer overrides the IPV4 ARP table, and
I'm convinced IPoIB could behave similarly, override the
neigh_ops in a device neigh constructor, and avoid all of the
hooks at transmit time and instead trigger the key signalling
at neigh->output and friends.
If IPoIB can get converted to this new stuff, then we can get
rid of the ->ndo_neigh_setup() netdev op which only exists to
facilitate IPoIB hooking in a destructor for it's neigh state.
^ permalink raw reply
* Re: [PATCH net-next] skbuff: clear tx zero-copy flag
From: David Miller @ 2011-07-25 10:02 UTC (permalink / raw)
To: herbert; +Cc: mst, mashirle, netdev, kvm, linux-kernel
In-Reply-To: <20110725095711.GA30831@gondor.apana.org.au>
From: Herbert Xu <herbert@gondor.hengli.com.au>
Date: Mon, 25 Jul 2011 17:57:11 +0800
> However, I think we should add a WARN_ON to the splice skb path
> so that should a packet find its way through a path that we haven't
> thought of then at least we'll know about it.
Good idea.
^ permalink raw reply
* [PATCH 6/7] neigh: Add device constructor/destructor capability.
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland-DgEjT+Ai2ygdnm+yROfE0A
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
If the neigh entry has device private state, it will need
constructor/destructor ops.
Signed-off-by: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
---
include/linux/netdevice.h | 2 ++
net/core/neighbour.c | 15 ++++++++++++++-
2 files changed, 16 insertions(+), 1 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a50f6d6..016bb4e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -943,6 +943,8 @@ struct net_device_ops {
u32 features);
int (*ndo_set_features)(struct net_device *dev,
u32 features);
+ int (*ndo_neigh_construct)(struct neighbour *n);
+ int (*ndo_neigh_destroy)(struct neighbour *n);
};
/*
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96ae4e4..ee5ce7e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -488,6 +488,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -691,6 +699,8 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
*/
void neigh_destroy(struct neighbour *neigh)
{
+ struct net_device *dev = neigh->dev;
+
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
@@ -705,7 +715,10 @@ void neigh_destroy(struct neighbour *neigh)
skb_queue_purge(&neigh->arp_queue);
- dev_put(neigh->dev);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
+
+ dev_put(dev);
neigh_parms_put(neigh->parms);
NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
--
1.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 3/7] neigh: Add infrastructure for allocating device neigh privates.
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland-DgEjT+Ai2ygdnm+yROfE0A
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
netdev->neigh_priv_len records the private area length.
This will trigger for neigh_table objects which set tbl->entry_size
to zero, and the first instances of this will be forthcoming.
Signed-off-by: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
---
drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++
include/linux/netdevice.h | 1 +
net/atm/clip.c | 1 +
net/core/neighbour.c | 14 +++++++++++---
4 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 43f89ba..7b96105 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1216,6 +1216,8 @@ static struct net_device *ipoib_add_port(const char *format,
priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+ priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
+
result = ib_query_pkey(hca, port, 0, &priv->pkey);
if (result) {
printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 34f3abc..a50f6d6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1127,6 +1127,7 @@ struct net_device {
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
unsigned char addr_assign_type; /* hw address assignment type */
unsigned char addr_len; /* hardware address length */
+ unsigned char neigh_priv_len;
unsigned short dev_id; /* for shared network cards */
spinlock_t addr_list_lock;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 4bc8c67..5dc4f4e 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -551,6 +551,7 @@ static void clip_setup(struct net_device *dev)
{
dev->netdev_ops = &clip_netdev_ops;
dev->type = ARPHRD_ATM;
+ dev->neigh_priv_len = sizeof(struct atmarp_entry);
dev->hard_header_len = RFC1483LLC_LEN;
dev->mtu = RFC1626_MTU;
dev->tx_queue_len = 100; /* "normal" queue (packets) */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 493703c..96ae4e4 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -272,7 +272,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -287,7 +287,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+ if (tbl->entry_size)
+ n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+ else {
+ int sz = sizeof(*n) + tbl->key_len;
+
+ sz = ALIGN(sz, NEIGH_PRIV_ALIGN);
+ sz += dev->neigh_priv_len;
+ n = kzalloc(sz, GFP_ATOMIC);
+ }
if (!n)
goto out_entries;
@@ -462,7 +470,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
--
1.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* [PATCH 1/7] neigh: Create mechanism for generic neigh private areas.
From: David Miller @ 2011-07-25 10:01 UTC (permalink / raw)
To: roland-DgEjT+Ai2ygdnm+yROfE0A
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
The implementation private sits right after the primary_key memory.
Signed-off-by: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
---
include/net/neighbour.h | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 4ba8521..8ff9143 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -178,6 +178,13 @@ struct neigh_table {
struct pneigh_entry **phash_buckets;
};
+#define NEIGH_PRIV_ALIGN sizeof(long long)
+
+static inline void *neighbour_priv(const struct neighbour *n)
+{
+ return (char *)n + ALIGN(sizeof(*n) + n->tbl->key_len, NEIGH_PRIV_ALIGN);
+}
+
/* flags for neigh_update() */
#define NEIGH_UPDATE_F_OVERRIDE 0x00000001
#define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002
--
1.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [PATCH net-next] skbuff: clear tx zero-copy flag
From: Herbert Xu @ 2011-07-25 9:57 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: Shirley Ma, davem, netdev, kvm, linux-kernel
In-Reply-To: <20110725094414.GA11776@redhat.com>
On Mon, Jul 25, 2011 at 12:44:14PM +0300, Michael S. Tsirkin wrote:
>
> if yes that seems to always clone an skb, which in turn
> does the copy so we are fine?
Yes you're right, it should be safe.
However, I think we should add a WARN_ON to the splice skb path
so that should a packet find its way through a path that we haven't
thought of then at least we'll know about it.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* Re: [PATCH net-next] skbuff: clear tx zero-copy flag
From: Michael S. Tsirkin @ 2011-07-25 9:44 UTC (permalink / raw)
To: Herbert Xu; +Cc: Shirley Ma, davem, netdev, kvm, linux-kernel
In-Reply-To: <20110725084057.GA30311@gondor.apana.org.au>
On Mon, Jul 25, 2011 at 04:40:57PM +0800, Herbert Xu wrote:
> On Mon, Jul 25, 2011 at 11:07:43AM +0300, Michael S. Tsirkin wrote:
> >
> > However macvtap passes an skb directly to the
> > lower device, so as long as macvtap is the only user
> > of that interface, we are fine I think - there's
> > no way for an skb to get from macvtap to splice
> > read path I think.
> >
> > Right?
>
> Yes, as long as you can guarantee that the skb never loops back
> then you should be fine.
>
> However, does macvtap really bypass everything, including the
> qdisc layer? The qdisc layer is certainly capable of looping
> the skb back with the redirect action.
>
> Cheers,
No, I don't think macvtap bypasses the qdisc.
Is the action in question here?
static int tcf_mirred(struct sk_buff *skb,
const struct tc_action *a,
struct tcf_result *res)
if yes that seems to always clone an skb, which in turn
does the copy so we are fine?
> --
> Email: Herbert Xu <herbert@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* [PATCH] net/smsc911x: add device tree probe support
From: Shawn Guo @ 2011-07-25 9:44 UTC (permalink / raw)
To: netdev
Cc: devicetree-discuss, linux-arm-kernel, patches, Shawn Guo,
Grant Likely, Steve Glendinning, David S. Miller
It adds device tree probe support for smsc911x driver.
Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Steve Glendinning <steve.glendinning@smsc.com>
Cc: David S. Miller <davem@davemloft.net>
---
Documentation/devicetree/bindings/net/smsc.txt | 34 +++++++
drivers/net/smsc911x.c | 123 +++++++++++++++++++-----
2 files changed, 132 insertions(+), 25 deletions(-)
create mode 100644 Documentation/devicetree/bindings/net/smsc.txt
diff --git a/Documentation/devicetree/bindings/net/smsc.txt b/Documentation/devicetree/bindings/net/smsc.txt
new file mode 100644
index 0000000..1920695
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/smsc.txt
@@ -0,0 +1,34 @@
+* Smart Mixed-Signal Connectivity (SMSC) LAN Controller
+
+Required properties:
+- compatible : Should be "smsc,lan<model>""smsc,lan"
+- reg : Address and length of the io space for SMSC LAN
+- smsc-int-gpios : Should specify the GPIO for SMSC LAN interrupt line
+- phy-mode : String, operation mode of the PHY interface.
+ Supported values are: "mii", "gmii", "sgmii", "tbi", "rmii",
+ "rgmii", "rgmii-id", "rgmii-rxid", "rgmii-txid", "rtbi", "smii".
+
+Optional properties:
+- smsc,irq-active-high : Indicates the IRQ polarity is active-low
+- smsc,irq-push-pull : Indicates the IRQ type is push-pull
+- smsc,register-needs-shift : Indicates the register access needs shift
+- smsc,access-in-32bit : Indicates the access to controller is in 32-bit
+ mode
+- smsc,force-internal-phy : Forces SMSC LAN controller to use
+ internal PHY
+- smsc,force-external-phy : Forces SMSC LAN controller to use
+ external PHY
+- smsc,save-mac-address : Indicates that mac address needs to be saved
+ before resetting the controller
+- local-mac-address : 6 bytes, mac address
+
+Examples:
+
+lan9220@f4000000 {
+ compatible = "smsc,lan9220", "smsc,lan";
+ reg = <0xf4000000 0x2000000>;
+ phy-mode = "mii";
+ smsc-int-gpios = <&gpio1 31 0>; /* GPIO2_31 */
+ smsc,irq-push-pull;
+ smsc,access-in-32bit;
+};
diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c
index b9016a3..0097048 100644
--- a/drivers/net/smsc911x.c
+++ b/drivers/net/smsc911x.c
@@ -53,6 +53,10 @@
#include <linux/phy.h>
#include <linux/smsc911x.h>
#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_gpio.h>
+#include <linux/of_net.h>
#include "smsc911x.h"
#define SMSC_CHIPNAME "smsc911x"
@@ -2095,25 +2099,67 @@ static const struct smsc911x_ops shifted_smsc911x_ops = {
.tx_writefifo = smsc911x_tx_writefifo_shift,
};
+#ifdef CONFIG_OF
+static int __devinit smsc911x_probe_config_dt(
+ struct smsc911x_platform_config *config,
+ struct device_node *np)
+{
+ const char *mac;
+
+ if (!np)
+ return -ENODEV;
+
+ config->phy_interface = of_get_phy_mode(np);
+
+ mac = of_get_mac_address(np);
+ if (mac)
+ memcpy(config->mac, mac, ETH_ALEN);
+
+ if (of_get_property(np, "smsc,irq-active-high", NULL))
+ config->irq_polarity = SMSC911X_IRQ_POLARITY_ACTIVE_HIGH;
+
+ if (of_get_property(np, "smsc,irq-push-pull", NULL))
+ config->irq_type = SMSC911X_IRQ_TYPE_PUSH_PULL;
+
+ if (of_get_property(np, "smsc,register-needs-shift", NULL))
+ config->shift = 1;
+
+ if (of_get_property(np, "smsc,access-in-32bit", NULL))
+ config->flags |= SMSC911X_USE_32BIT;
+
+ if (of_get_property(np, "smsc,force-internal-phy", NULL))
+ config->flags |= SMSC911X_FORCE_INTERNAL_PHY;
+
+ if (of_get_property(np, "smsc,force-external-phy", NULL))
+ config->flags |= SMSC911X_FORCE_EXTERNAL_PHY;
+
+ if (of_get_property(np, "smsc,save-mac-address", NULL))
+ config->flags |= SMSC911X_SAVE_MAC_ADDRESS;
+
+ return 0;
+}
+#else
+static inline int smsc911x_probe_config_dt(
+ struct smsc911x_platform_config *config,
+ struct device_node *np)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_OF */
+
static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
{
+ struct device_node *np = pdev->dev.of_node;
struct net_device *dev;
struct smsc911x_data *pdata;
struct smsc911x_platform_config *config = pdev->dev.platform_data;
struct resource *res, *irq_res;
unsigned int intcfg = 0;
- int res_size, irq_flags;
- int retval;
+ int irq_gpio, res_size, irq_flags = 0;
+ int retval = 0;
pr_info("Driver version %s\n", SMSC_DRV_VERSION);
- /* platform data specifies irq & dynamic bus configuration */
- if (!pdev->dev.platform_data) {
- pr_warn("platform_data not provided\n");
- retval = -ENODEV;
- goto out_0;
- }
-
res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
"smsc911x-memory");
if (!res)
@@ -2125,13 +2171,6 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
}
res_size = resource_size(res);
- irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
- if (!irq_res) {
- pr_warn("Could not allocate irq resource\n");
- retval = -ENODEV;
- goto out_0;
- }
-
if (!request_mem_region(res->start, res_size, SMSC_CHIPNAME)) {
retval = -EBUSY;
goto out_0;
@@ -2148,26 +2187,53 @@ static int __devinit smsc911x_drv_probe(struct platform_device *pdev)
pdata = netdev_priv(dev);
- dev->irq = irq_res->start;
- irq_flags = irq_res->flags & IRQF_TRIGGER_MASK;
- pdata->ioaddr = ioremap_nocache(res->start, res_size);
-
- /* copy config parameters across to pdata */
- memcpy(&pdata->config, config, sizeof(pdata->config));
+ if (np) {
+ irq_gpio = of_get_named_gpio(np, "smsc-int-gpios", 0);
+ retval = gpio_request_one(irq_gpio, GPIOF_IN, "smsc-int-gpio");
+ if (!retval)
+ dev->irq = gpio_to_irq(irq_gpio);
+ } else {
+ irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+ if (irq_res) {
+ dev->irq = irq_res->start;
+ irq_flags = irq_res->flags & IRQF_TRIGGER_MASK;
+ } else {
+ retval = -ENODEV;
+ }
+ }
- pdata->dev = dev;
- pdata->msg_enable = ((1 << debug) - 1);
+ if (retval) {
+ SMSC_WARN(pdata, probe, "Error smsc911x irq not found");
+ retval = -EINVAL;
+ goto out_free_netdev_2;
+ }
+ pdata->ioaddr = ioremap_nocache(res->start, res_size);
if (pdata->ioaddr == NULL) {
SMSC_WARN(pdata, probe, "Error smsc911x base address invalid");
retval = -ENOMEM;
goto out_free_netdev_2;
}
+ pdata->dev = dev;
+ pdata->msg_enable = ((1 << debug) - 1);
+
+ retval = smsc911x_probe_config_dt(&pdata->config, np);
+ if (retval && config) {
+ /* copy config parameters across to pdata */
+ memcpy(&pdata->config, config, sizeof(pdata->config));
+ retval = 0;
+ }
+
+ if (retval) {
+ SMSC_WARN(pdata, probe, "Error smsc911x config not found");
+ goto out_unmap_io_3;
+ }
+
/* assume standard, non-shifted, access to HW registers */
pdata->ops = &standard_smsc911x_ops;
/* apply the right access if shifting is needed */
- if (config->shift)
+ if (pdata->config.shift)
pdata->ops = &shifted_smsc911x_ops;
retval = smsc911x_init(dev);
@@ -2314,6 +2380,12 @@ static const struct dev_pm_ops smsc911x_pm_ops = {
#define SMSC911X_PM_OPS NULL
#endif
+static const struct of_device_id smsc_dt_ids[] = {
+ { .compatible = "smsc,lan", },
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, smsc_dt_ids);
+
static struct platform_driver smsc911x_driver = {
.probe = smsc911x_drv_probe,
.remove = __devexit_p(smsc911x_drv_remove),
@@ -2321,6 +2393,7 @@ static struct platform_driver smsc911x_driver = {
.name = SMSC_CHIPNAME,
.owner = THIS_MODULE,
.pm = SMSC911X_PM_OPS,
+ .of_match_table = smsc_dt_ids,
},
};
--
1.7.4.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox