All of lore.kernel.org
 help / color / mirror / Atom feed
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-17 14:40 Florian Westphal
  0 siblings, 0 replies; 6+ messages in thread
From: Florian Westphal @ 2019-10-17 14:40 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 7307 bytes --]

Mat Martineau <mathew.j.martineau(a)linux.intel.com> wrote:
> 
> On Tue, 15 Oct 2019, Paolo Abeni wrote:
> 
> > On Tue, 2019-10-15 at 14:13 +0200, Florian Westphal wrote:
> > > Rationale for the approach done in patch 5 was that MPTCP-Level retransmits
> > > are useless in virtually all cases.
> > > 
> > > The retransmit would only be needed in case we sent data on ssk x
> > > but peer went dead after xmit started.
> > 
> > uhmm... I think that it will suffice if peer goes dead after we
> > create/enqueue the skb (mptcp_sendmsg() completes). Could/should happen
> > quite consistently e.g. when the client switches from wifi to 4G, and
> > the flow is almost unidirectional from the server.

Yes, but how often do you expect clients to bounce back and forth
between two paths?

> > > I wanted to make sure we only retransmit on ssks that have no
> > > inflight packets, under the assumption that the dfrag has been acked
> > > by the next time the retransmit timer kicks in.
> 
> That might make it a good idea to not push more new data in to the ssks once
> MPTCP-level retransmission kicks in? We don't want the subflow buffers
> filled up with new data when the MPTCP socket is trying to retransmit.

What about this?

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -567,6 +567,8 @@ static void update_una(struct mptcp_sock *msk,
 		if (old_snd_una == snd_una) {
 			mptcp_reset_timer((struct sock *)msk);
 			break;
+		} else if (test_bit(MPTCP_RETRANS, &msk->flags)) {
+			sk_stream_write_space((struct sock *)msk);
 		}
 	}
 }
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -210,7 +142,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
 	put_page(dfrag->page);
 }
 
-static void mptcp_clean_una(struct sock *sk)
+static bool mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_data_frag *dtmp, *dfrag;
@@ -223,6 +155,8 @@ static void mptcp_clean_una(struct sock *sk)
 		dfrag_clear(sk, dfrag);
 	}
 	sk_mem_reclaim_partial(sk);
+
+	return list_empty(&msk->rtx_queue);
 }
 
 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -413,6 +347,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	return ret;
 }
 
+/* returns < 0 if a should be preferred to b */
+static int sock_compare(const struct sock *a, const struct sock *b)
+{
+	int cmp = inet_csk(a)->icsk_ca_state - inet_csk(b)->icsk_ca_state;
+
+	if (cmp)
+		return cmp;
+
+	cmp = tcp_packets_in_flight(tcp_sk(a)) - tcp_packets_in_flight(tcp_sk(b));
+	if (cmp)
+		return cmp;
+
+	return 0;
+}
+
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	bool non_backup_present = false;
+	struct sock *best, *backup;
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	backup = NULL;
+	best = NULL;
+
+	/* don't send new data with ongoing MPTCP retransmit */
+	if (unlikely(test_bit(MPTCP_RETRANS, &msk->flags))) {
+		if (!mptcp_clean_una((void *)msk))
+			return NULL;
+		clear_bit(MPTCP_RETRANS, &msk->flags);
+	}

 [ remaining unchanged ]

 static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	int mss_now = 0, size_goal = 0, ret = 0;
@@ -687,6 +695,7 @@ static void mptcp_retransmit_handler(struct sock *sk)
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
 	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+		clear_bit(MPTCP_RETRANS, &msk->flags);
 		mptcp_stop_timer(sk);
 	} else {
 		if (schedule_work(&msk->rtx_work))
@@ -783,9 +792,11 @@ static void mptcp_retransmit(struct work_struct *work)
 		dfrag->data_len -= ret;
 		dfrag->offset += ret;
 	}
-	if (copied)
+	if (copied) {
+		set_bit(MPTCP_RETRANS, &msk->flags);
 		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
 			 size_goal);
+	}
 
 	dfrag->data_seq = orig_write_seq;
 	dfrag->offset = orig_offset;
@@ -1164,6 +1175,12 @@ static bool mptcp_memory_free(const struct sock *sk, int wake)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	if (unlikely(test_bit(MPTCP_RETRANS, &msk->flags))) {
+		/* can't clear una here, sk is const */
+		if (atomic64_read(&msk->snd_una) == READ_ONCE(msk->write_seq))
+			return true;
+	}
+
 	return test_bit(MPTCP_SEND_SPACE, &msk->flags);
 }
 
@@ -1348,10 +1365,15 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 	const struct mptcp_sock *msk;
 	struct sock *sk = sock->sk;
 	struct socket *ssock;
+	bool retrans = false;
 	__poll_t ret = 0;
 
 	msk = mptcp_sk(sk);
 	lock_sock(sk);
+
+	if (test_bit(MPTCP_RETRANS, &msk->flags) && !mptcp_clean_una(sk))
+		retrans = true;
+
 	ssock = __mptcp_fallback_get_ref(msk);
 	if (ssock) {
 		release_sock(sk);
@@ -1364,9 +1386,14 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct socket *tcp_sock;
+		int r;
 
 		tcp_sock = mptcp_subflow_tcp_socket(subflow);
-		ret |= __tcp_poll(tcp_sock->sk);
+		r = __tcp_poll(tcp_sock->sk);
+		if (retrans && (r & (EPOLLOUT|EPOLLWRNORM)))
+			r &= ~(EPOLLOUT|EPOLLWRNORM);
+		ret |= r;
+
 	}
 	release_sock(sk);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index db3007ca50a7..768fad073b07 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -77,6 +77,7 @@
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY	BIT(0)
 #define MPTCP_SEND_SPACE	BIT(1)
+#define MPTCP_RETRANS		BIT(2)
 


------

Main problem is that when MPTCP-level retransmission kicks in, this
blocks writes until snd_una has caught up with mptcp write_seq again.

The use of the MPTCP_RETRANS flag prevents this from occuring
for normal write operations while retrans timer is pending but hasn't
done any actual resends yet.

Might be good enough for now. If not, we will need to track more state,
e.g. the last/highest retransmitted mptcp-sequence number.  It would
allow us to signal a wakeup from mptcp_memory_free() earlier, when the
last 'retransmitted' sequence was acked, rather than the all outstanding
data.

> In the LPC presentation and paper we proposed keeping the scheduler
> extremely simple, with the focus being on the server use case with a limited
> number of subflows (maybe even just two, allowing for a primary and backup).
> The idea for the first patch set was to focus on MPTCP for failover rather
> than aggregating multiple subflows to maximize throughput.

I agree.

> That could mean simply transmitting on the first available non-backup
> subflow, leaving others on the list idle.

Yes, I can alter the patch accordingly.

> As this email thread shows, there are a lot of potential inputs for making a
> scheduling decision and there isn't a one-size-fits all scheduling
> algorithm. The multipath-tcp.org kernel handles this with different
> scheduler modules, and we do have configurable schedulers later in the
> roadmap.

I know.  It wasn't my intention do make a scheduler, I wanted to get
the infra ready to block at mptcp level rather than relying on tcp
blocking on first ssk.

^ permalink raw reply related	[flat|nested] 6+ messages in thread
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-17 13:12 Paolo Abeni
  0 siblings, 0 replies; 6+ messages in thread
From: Paolo Abeni @ 2019-10-17 13:12 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 588 bytes --]

On Wed, 2019-10-16 at 17:26 -0700, Mat Martineau wrote:
> As this email thread shows, there are a lot of potential inputs for making 
> a scheduling decision and there isn't a one-size-fits all scheduling 
> algorithm. The multipath-tcp.org kernel handles this with different 
> scheduler modules, and we do have configurable schedulers later in the 
> roadmap.
> 
> My suggestion is to keep it very, very simple for now!

Agreed.

I really should fight harder my tendency to overdesign ;)

Let's go the this path and ev. we can adjust later, if needed.

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 6+ messages in thread
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-17  0:26 Mat Martineau
  0 siblings, 0 replies; 6+ messages in thread
From: Mat Martineau @ 2019-10-17  0:26 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 6036 bytes --]


On Tue, 15 Oct 2019, Paolo Abeni wrote:

> On Tue, 2019-10-15 at 14:13 +0200, Florian Westphal wrote:
>> Paolo Abeni <pabeni(a)redhat.com> wrote:
>>> On Tue, 2019-10-15 at 01:19 +0200, Florian Westphal wrote:
>>>> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
>>>> index 259a2743db91..e003608af237 100644
>>>> --- a/net/mptcp/protocol.c
>>>> +++ b/net/mptcp/protocol.c
>>>> @@ -781,6 +781,35 @@ static void mptcp_retransmit_timer(struct timer_list *t)
>>>>  	sock_put(sk);
>>>>  }
>>>>
>>>> +/* return first subflow that has no data outstanding at tcp level.
>>>> + *
>>>> + * Never return a backup subflow unless thats the only kind left.
>>>> + */
>>>> +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
>>>> +{
>>>> +	struct mptcp_subflow_context *subflow;
>>>> +	struct sock *backup = NULL;
>>>> +	bool use_backup = true;
>>>> +
>>>> +	sock_owned_by_me((const struct sock *)msk);
>>>> +
>>>> +	mptcp_for_each_subflow(msk, subflow) {
>>>> +		struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
>>>> +
>>>> +		if (!subflow->backup)
>>>> +			use_backup = false;
>>>> +
>>>> +		if (tcp_write_queue_empty(ssk)) {
>>>> +			if (!subflow->backup)
>>>> +				return ssk;
>>>
>>> I think that to be as accurate as possible we could try to check only
>>> vs the write queue of the subflow used to send the current dfrag -
>>> otherwise we could end-up with spurious/unneded retransmissions on some
>>> corner cases.
>>
>> I'm not following, sorry.
>>
>> So I think that what you'd like is something like this:
>>
>>> Additionally we may want to explicitly exclude/penalize such subflow
>>> from the selection here.
>>
>> static struct sock *
>> mptcp_subflow_get_retrans(const struct mptcp_sock *msk, struct sock *dfrag_origsk)
>> {
>> 	[..]
>>
>> 	if (ssk == dfrag_origsk)
>> 		continue; /* never retrans on same subflow */
>>
>> 	if (!subflow->backup)
>> 		use_backup = false;
>>
>> 	if (!best || compare_sk(ssk, best))
>> 		best = ssk;
>> ...
>>
>> Is that right?
>
> Yes, exactly!
>
>> Rationale for the approach done in patch 5 was that MPTCP-Level retransmits
>> are useless in virtually all cases.
>>
>> The retransmit would only be needed in case we sent data on ssk x
>> but peer went dead after xmit started.
>
> uhmm... I think that it will suffice if peer goes dead after we
> create/enqueue the skb (mptcp_sendmsg() completes). Could/should happen
> quite consistently e.g. when the client switches from wifi to 4G, and
> the flow is almost unidirectional from the server.
>
>> I wanted to make sure we only retransmit on ssks that have no
>> inflight packets, under the assumption that the dfrag has been acked
>> by the next time the retransmit timer kicks in.

That might make it a good idea to not push more new data in to the ssks 
once MPTCP-level retransmission kicks in? We don't want the subflow 
buffers filled up with new data when the MPTCP socket is trying to 
retransmit.

>>
>> What we could do is re-use mptcp_subflow_get_send() from patch 6,
>> adding a 'struct sock *ignore' argument, which would be NULL for normal
>> case and dfrag->orig_sk for retransmit case.
>
> I was in the act of suggesting the above on patch 6 :) I think it would
> simplify the code having a single algorithm to pick-up the next
> subflow.

Simplifying sounds good to me! However, I'm not sure it's good to 
completely ignore the dfrag->orig_sk socket, it could be the only subflow 
(or only non-backup) available. Our MPTCP-level reassembly code doesn't 
currently have an out-of-order queue, so it can flush incoming data 
received on a working subflow when the actual network loss was on a 
different subflow.

>
> Apropos: another thing I would eventually consider for patch 6 is [try 
> to] reuse the subflow we used last, if it has a non empty window. If we 
> can't, we do the full lookup. That should allow the peer to reconstruct 
> more easily the MPTCP level data sequence, with possibly less switches 
> from one subflow to another.
>

In the LPC presentation and paper we proposed keeping the scheduler 
extremely simple, with the focus being on the server use case with a 
limited number of subflows (maybe even just two, allowing for a primary 
and backup). The idea for the first patch set was to focus on MPTCP for 
failover rather than aggregating multiple subflows to maximize throughput.

That could mean simply transmitting on the first available non-backup 
subflow, leaving others on the list idle.

>> That would make retransmits more aggressive for active/active
>> scenarios when one subflow is 'stuck' and the other has pending
>> packets (but the subflow can accept more data).
>
> I think we can mitigate/optimize out the unneeded retransmissions some
> way. e.g. no need to retransmit if the relevant ssn increased since
> last check/enqueue time.
>
> In some scenarios retransmitting the data on a different subflow could
> possibly inprove the tput/latency even if the first transmission has
> not been lost - e.g. if the subflow we used first is experiencing
> congestion.
>
>>> Storing the subflow ptr into a new/additional dfrag field at
>>> transmission time should allow the above.
>>
>> Yes, we can store the ssk that was used for orig transmission,
>> question I have is why :-)
>
> I think having relevant subflow handy will help with the above
> heuristics.
>
> I can agree we can do this kind of tuning/optimization/heuristic[/over-
> design???] later, I'm wondering if we should consider some room for
> them now?
>

As this email thread shows, there are a lot of potential inputs for making 
a scheduling decision and there isn't a one-size-fits all scheduling 
algorithm. The multipath-tcp.org kernel handles this with different 
scheduler modules, and we do have configurable schedulers later in the 
roadmap.

My suggestion is to keep it very, very simple for now!

--
Mat Martineau
Intel

^ permalink raw reply	[flat|nested] 6+ messages in thread
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-15 13:20 Paolo Abeni
  0 siblings, 0 replies; 6+ messages in thread
From: Paolo Abeni @ 2019-10-15 13:20 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 4458 bytes --]

On Tue, 2019-10-15 at 14:13 +0200, Florian Westphal wrote:
> Paolo Abeni <pabeni(a)redhat.com> wrote:
> > On Tue, 2019-10-15 at 01:19 +0200, Florian Westphal wrote:
> > > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > > index 259a2743db91..e003608af237 100644
> > > --- a/net/mptcp/protocol.c
> > > +++ b/net/mptcp/protocol.c
> > > @@ -781,6 +781,35 @@ static void mptcp_retransmit_timer(struct timer_list *t)
> > >  	sock_put(sk);
> > >  }
> > >  
> > > +/* return first subflow that has no data outstanding at tcp level.
> > > + *
> > > + * Never return a backup subflow unless thats the only kind left.
> > > + */
> > > +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
> > > +{
> > > +	struct mptcp_subflow_context *subflow;
> > > +	struct sock *backup = NULL;
> > > +	bool use_backup = true;
> > > +
> > > +	sock_owned_by_me((const struct sock *)msk);
> > > +
> > > +	mptcp_for_each_subflow(msk, subflow) {
> > > +		struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
> > > +
> > > +		if (!subflow->backup)
> > > +			use_backup = false;
> > > +
> > > +		if (tcp_write_queue_empty(ssk)) {
> > > +			if (!subflow->backup)
> > > +				return ssk;
> > 
> > I think that to be as accurate as possible we could try to check only
> > vs the write queue of the subflow used to send the current dfrag -
> > otherwise we could end-up with spurious/unneded retransmissions on some
> > corner cases.
> 
> I'm not following, sorry.
> 
> So I think that what you'd like is something like this:
> 
> > Additionally we may want to explicitly exclude/penalize such subflow
> > from the selection here.
> 
> static struct sock *
> mptcp_subflow_get_retrans(const struct mptcp_sock *msk, struct sock *dfrag_origsk)
> {
> 	[..]
> 
> 	if (ssk == dfrag_origsk)
> 		continue; /* never retrans on same subflow */
> 
> 	if (!subflow->backup)
> 		use_backup = false;
> 
> 	if (!best || compare_sk(ssk, best))
> 		best = ssk;
> ...
> 
> Is that right?

Yes, exactly!

> Rationale for the approach done in patch 5 was that MPTCP-Level retransmits
> are useless in virtually all cases.
> 
> The retransmit would only be needed in case we sent data on ssk x
> but peer went dead after xmit started.

uhmm... I think that it will suffice if peer goes dead after we
create/enqueue the skb (mptcp_sendmsg() completes). Could/should happen
quite consistently e.g. when the client switches from wifi to 4G, and
the flow is almost unidirectional from the server.

> I wanted to make sure we only retransmit on ssks that have no
> inflight packets, under the assumption that the dfrag has been acked
> by the next time the retransmit timer kicks in.
> 
> What we could do is re-use mptcp_subflow_get_send() from patch 6,
> adding a 'struct sock *ignore' argument, which would be NULL for normal
> case and dfrag->orig_sk for retransmit case.

I was in the act of suggesting the above on patch 6 :) I think it would
simplify the code having a single algorithm to pick-up the next
subflow.

Apropos: another thing I would eventually consider for patch 6 is [try
to] reuse the subflow we used last, if it has a non empty window. If we
can't, we do the full lookup. That should allow the peer to reconstruct
more easily the MPTCP level data sequence, with possibly less switches
from one subflow to another.

> That would make retransmits more aggressive for active/active
> scenarios when one subflow is 'stuck' and the other has pending
> packets (but the subflow can accept more data).

I think we can mitigate/optimize out the unneeded retransmissions some
way. e.g. no need to retransmit if the relevant ssn increased since
last check/enqueue time.

In some scenarios retransmitting the data on a different subflow could
possibly inprove the tput/latency even if the first transmission has
not been lost - e.g. if the subflow we used first is experiencing
congestion.

> > Storing the subflow ptr into a new/additional dfrag field at
> > transmission time should allow the above.
> 
> Yes, we can store the ssk that was used for orig transmission,
> question I have is why :-)

I think having relevant subflow handy will help with the above
heuristics. 

I can agree we can do this kind of tuning/optimization/heuristic[/over-
design???] later, I'm wondering if we should consider some room for
them now?

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 6+ messages in thread
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-15 12:13 Florian Westphal
  0 siblings, 0 replies; 6+ messages in thread
From: Florian Westphal @ 2019-10-15 12:13 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 2847 bytes --]

Paolo Abeni <pabeni(a)redhat.com> wrote:
> Hi,
> 
> On Tue, 2019-10-15 at 01:19 +0200, Florian Westphal wrote:
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 259a2743db91..e003608af237 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -781,6 +781,35 @@ static void mptcp_retransmit_timer(struct timer_list *t)
> >  	sock_put(sk);
> >  }
> >  
> > +/* return first subflow that has no data outstanding at tcp level.
> > + *
> > + * Never return a backup subflow unless thats the only kind left.
> > + */
> > +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
> > +{
> > +	struct mptcp_subflow_context *subflow;
> > +	struct sock *backup = NULL;
> > +	bool use_backup = true;
> > +
> > +	sock_owned_by_me((const struct sock *)msk);
> > +
> > +	mptcp_for_each_subflow(msk, subflow) {
> > +		struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
> > +
> > +		if (!subflow->backup)
> > +			use_backup = false;
> > +
> > +		if (tcp_write_queue_empty(ssk)) {
> > +			if (!subflow->backup)
> > +				return ssk;
> 
> I think that to be as accurate as possible we could try to check only
> vs the write queue of the subflow used to send the current dfrag -
> otherwise we could end-up with spurious/unneded retransmissions on some
> corner cases.

I'm not following, sorry.

So I think that what you'd like is something like this:

> Additionally we may want to explicitly exclude/penalize such subflow
> from the selection here.

static struct sock *
mptcp_subflow_get_retrans(const struct mptcp_sock *msk, struct sock *dfrag_origsk)
{
	[..]

	if (ssk == dfrag_origsk)
		continue; /* never retrans on same subflow */

	if (!subflow->backup)
		use_backup = false;

	if (!best || compare_sk(ssk, best))
		best = ssk;
...

Is that right?

Rationale for the approach done in patch 5 was that MPTCP-Level retransmits
are useless in virtually all cases.

The retransmit would only be needed in case we sent data on ssk x
but peer went dead after xmit started.

I wanted to make sure we only retransmit on ssks that have no
inflight packets, under the assumption that the dfrag has been acked
by the next time the retransmit timer kicks in.

What we could do is re-use mptcp_subflow_get_send() from patch 6,
adding a 'struct sock *ignore' argument, which would be NULL for normal
case and dfrag->orig_sk for retransmit case.

That would make retransmits more aggressive for active/active
scenarios when one subflow is 'stuck' and the other has pending
packets (but the subflow can accept more data).

> Storing the subflow ptr into a new/additional dfrag field at
> transmission time should allow the above.

Yes, we can store the ssk that was used for orig transmission,
question I have is why :-)

^ permalink raw reply	[flat|nested] 6+ messages in thread
* [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans
@ 2019-10-15 11:22 Paolo Abeni
  0 siblings, 0 replies; 6+ messages in thread
From: Paolo Abeni @ 2019-10-15 11:22 UTC (permalink / raw)
  To: mptcp 

[-- Attachment #1: Type: text/plain, Size: 1465 bytes --]

Hi,

On Tue, 2019-10-15 at 01:19 +0200, Florian Westphal wrote:
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 259a2743db91..e003608af237 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -781,6 +781,35 @@ static void mptcp_retransmit_timer(struct timer_list *t)
>  	sock_put(sk);
>  }
>  
> +/* return first subflow that has no data outstanding at tcp level.
> + *
> + * Never return a backup subflow unless thats the only kind left.
> + */
> +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
> +{
> +	struct mptcp_subflow_context *subflow;
> +	struct sock *backup = NULL;
> +	bool use_backup = true;
> +
> +	sock_owned_by_me((const struct sock *)msk);
> +
> +	mptcp_for_each_subflow(msk, subflow) {
> +		struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
> +
> +		if (!subflow->backup)
> +			use_backup = false;
> +
> +		if (tcp_write_queue_empty(ssk)) {
> +			if (!subflow->backup)
> +				return ssk;

I think that to be as accurate as possible we could try to check only
vs the write queue of the subflow used to send the current dfrag - 
otherwise we could end-up with spurious/unneded retransmissions on some
corner cases.

Additionally we may want to explicitly exclude/penalize such subflow
from the selection here.

Storing the subflow ptr into a new/additional dfrag field at
transmission time should allow the above.

Cheers,

Paolo

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-10-17 14:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-10-17 14:40 [MPTCP] Re: [RFC PATCH 5/6] mptcp: add and use mptcp_subflow_get_retrans Florian Westphal
  -- strict thread matches above, loose matches on Subject: below --
2019-10-17 13:12 Paolo Abeni
2019-10-17  0:26 Mat Martineau
2019-10-15 13:20 Paolo Abeni
2019-10-15 12:13 Florian Westphal
2019-10-15 11:22 Paolo Abeni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.