From: Eric Dumazet <dada1@cosmosbay.com>
To: David Miller <davem@davemloft.net>
Cc: haoki@redhat.com, herbert@gondor.apana.org.au,
	netdev@vger.kernel.org, tyasui@redhat.com, mhiramat@redhat.com,
	satoshi.oshima.fk@hitachi.com, billfink@mindspring.com,
	andi@firstfloor.org, johnpol@2ka.mipt.ru,
	shemminger@linux-foundation.org, yoshfuji@linux-ipv6.org,
	yumiko.sugita.yf@hitachi.com
Subject: Re: [PATCH 1/3] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
Date: Mon, 31 Dec 2007 15:42:42 +0100	[thread overview]
Message-ID: <4778FFE2.9090008@cosmosbay.com> (raw)
In-Reply-To: <20071231.041342.101160102.davem@davemloft.net>

David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Mon, 31 Dec 2007 09:54:32 +0100
> 
>> Maybe I read the patch incorrectly, or we could add some new sysctl so that
>> we do not try to uncharge memory unless a socket's 'forward_alloc' is beyond a
>> given limit (say 2 pages), so that the number of atomic_inc/dec on
>> udp_memory_allocated (or tcp_memory_allocated) is reduced.
> 
> This is what we should be striving for, using forward_alloc
> as much as possible as a "cache" to avoid the atomics on
> the global var as much as possible.

Thank you for this confirmation David, I understand now that TCP doesn't
currently satisfy the contract.

For example, tcp_delack_timer() calls sk_mem_reclaim().
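
For reference, here is (roughly, from my reading of the current tree) what
sk_mem_reclaim() does: as soon as a socket holds one full quantum or more of
forward_alloc, __sk_mem_reclaim() hands the whole amount (rounded down to
whole pages) back to the protocol-wide counter, i.e. tcp_memory_allocated:

/* include/net/sock.h, simplified */
static inline void sk_mem_reclaim(struct sock *sk)
{
	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
		__sk_mem_reclaim(sk);
}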

So on a machine with a lot of mostly idle sockets (but all sockets still doing
some traffic, say one message per minute per socket), we can see:

$ grep TCP /proc/net/sockstat
TCP: inuse 1083667 orphan 8840 tw 6646 alloc 1083809 mem 262305
$ cat /proc/sys/net/ipv4/tcp_mem
2000000 3000000 4000000

so on average about a quarter of a page (262305 pages / 1083667 sockets ≈ 0.24) is 'allocated' per socket :(

On this machine we are constantly changing tcp_memory_allocated, even though we
always stay under the tcp_mem[0] limit.

Maybe we need to introduce some mechanism to let sk_forward_alloc stay between
0 and SK_MEM_QUANTUM (inclusive), for example:

static inline void sk_mem_reclaim_overpage(struct sock *sk)
{
	if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
		__sk_mem_reclaim(sk);
}

and then use sk_mem_reclaim_overpage() instead of sk_mem_reclaim() in
tcp_delack_timer()?
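
To make the intent concrete, the change in tcp_delack_timer() would just be a
one-line substitution at the point where it reclaims today (a sketch only, not
tested):

	/* net/ipv4/tcp_timer.c, tcp_delack_timer() */
-	sk_mem_reclaim(sk);
+	sk_mem_reclaim_overpage(sk);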

Thank you

Output of the small test program (source below); note how tcp_memory_allocated
collapses once the delayed-ack timers have fired:

$ gcc -o prog prog.c ; ./prog
TCP: inuse 1035 orphan 0 tw 271 alloc 1203 mem 16
TCP: inuse 1035 orphan 0 tw 271 alloc 1203 mem 4016
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3015
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3016
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3516
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 14 <OOPS>

$ cat prog.c
#include <sys/socket.h>
#include <string.h>
#include <stdlib.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <asm/ioctls.h>
#include <stdio.h>

int SOCK_COUNT = 1000;
int *sockets_fd_tab;
unsigned int count;
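
/*
 * open_sockets(): create SOCK_COUNT connected local sockets.  For
 * AF_INET/SOCK_STREAM, bind a listener to an ephemeral port, then pair
 * each non-blocking connect() with an accept(); both ends are stored
 * in sockets_fd_tab[].
 */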

static void open_sockets(int domain, int type)
{
         int fdlisten=-1, on = 1;
         socklen_t addrlen;
         struct sockaddr_in host, peer;

         if (domain == AF_INET && type == SOCK_STREAM) {
                 fdlisten = socket(AF_INET, type, 0);
                 setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(int));
                 memset(&host, 0, sizeof(host));
                 host.sin_family = AF_INET;
                 bind(fdlisten, (struct sockaddr *)&host, sizeof(host));
                 addrlen = sizeof(host);
                 getsockname(fdlisten, (struct sockaddr *)&host, &addrlen);
                 listen(fdlisten, 5);
         }
         while (1) {
                 int res, vec[2];
                 if (domain == AF_UNIX) {
                         res = socketpair(AF_UNIX, type, 0, vec);
                         if (res == -1)
                                 break;
                 } else {
                         vec[0] = socket(AF_INET, type, 0);
                         if (vec[0] == -1)
                                 break;
                         ioctl(vec[0], FIONBIO, &on);
                         if (type == SOCK_STREAM) {
                                 connect(vec[0], (struct sockaddr *)&host, sizeof(host));
                                 addrlen = sizeof(peer);
                                 vec[1] = accept(fdlisten, (struct sockaddr *)&peer, &addrlen);
                                 if (vec[1] == -1) {
                                         close(vec[0]);
                                         break;
                                 }
                         } else {
                                 /* datagram case: not used by this test */
                         }
                 }
                 sockets_fd_tab[count++] = vec[0];
                 sockets_fd_tab[count++] = vec[1];
                 if (count >= (unsigned int)SOCK_COUNT)
                         break;
         }
}

const char some_msg[1024] = "One dummy message";
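
/* Queue two 100-byte messages on every open socket. */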

static void fill_sockets()
{
         unsigned int ui;
         for (ui = 0; ui < count; ui++)
                 send(sockets_fd_tab[ui], some_msg, 100, 0);
         for (ui = 0; ui < count; ui++)
                 send(sockets_fd_tab[ui], some_msg, 100, 0);
}
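
/* Drain each socket with a single recv() into a scratch buffer. */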

static void empty_sockets()
{
         unsigned int ui;
         char buffer[4096];

         for (ui = 0; ui < count; ui++)
                 recv(sockets_fd_tab[ui], buffer, sizeof(buffer), 0);
}
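
/* Dump the kernel's TCP memory accounting from /proc/net/sockstat. */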

static void dump_infos()
{
         system("grep TCP /proc/net/sockstat");
}
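
/*
 * main(): open the sockets (-n <count> overrides the default of 1000),
 * then alternate fill/empty cycles and dump /proc/net/sockstat around
 * the sleep(1) calls so the effect of the delayed-ack timer on
 * tcp_memory_allocated is visible.
 */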

int main(int argc, char *argv[])
{
         int c;
         while ((c = getopt(argc, argv, "n:")) != EOF) {
                 if (c == 'n')
                         SOCK_COUNT = atoi(optarg);
         }
         sockets_fd_tab = malloc(SOCK_COUNT * sizeof(int));
         open_sockets(AF_INET, SOCK_STREAM);
         dump_infos();

         fill_sockets();
         dump_infos();
         sleep(1); /* to see effect of delayed acks */
         dump_infos();
         empty_sockets();
         dump_infos();

         fill_sockets();
         dump_infos();
         empty_sockets();
         sleep(1); /* to see effect of delayed acks */
         dump_infos();

         return 0;
}


Thread overview: 14+ messages
2007-12-30  8:57 [PATCH 0/3] UDP memory accounting and limitation (take 12) Hideo AOKI
2007-12-30  9:01 ` [PATCH 1/3] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
2007-12-31  8:19   ` David Miller
2007-12-31  8:54     ` Eric Dumazet
2007-12-31  9:11       ` Herbert Xu
2007-12-31 12:13       ` David Miller
2007-12-31 14:42         ` Eric Dumazet [this message]
2008-01-11  6:00           ` David Miller
2007-12-31 18:58     ` Hideo AOKI
2007-12-30  9:02 ` [PATCH 2/3] [UDP]: memory accounting in IPv4 Hideo AOKI
2007-12-30  9:28   ` Eric Dumazet
2007-12-31 18:43     ` Hideo AOKI
2007-12-31 18:58       ` Eric Dumazet
2007-12-30  9:02 ` [PATCH 3/3] [UDP]: memory accounting in IPv6 Hideo AOKI
