Browse Source

- advanced tcp options support:
- support for deferring tcp accepts until some data is received
(linux & freebsd), default off. See NEWS: tcp_defer_accept.
- support for delaying the final ACK from the 3-way handshake until some
data is sent (the ACK will come with the 1st data segment). Default on
when supported (linux only). See NEWS: tcp_delayed_ack.
- support for limiting the number of retransmitted SYNs (linux only,
see NEWS: tcp_syncnt)
- support for limiting the lifetime of orphaned sockets in FIN_WAIT2
(linux only, see NEWS: tcp_linger2)
- keepalive support, see NEWS: tcp_keepalive (default on), tcp_keepidle,
tcp_keepintvl and tcp_keepcnt

- the FD cache can now be turned off from ser.cfg (see NEWS: tcp_fd_cache)

Andrei Pelinescu-Onciul 18 years ago
parent
commit
20c64cc651
8 changed files with 564 additions and 15 deletions
  1. 40 0
      NEWS
  2. 29 0
      cfg.lex
  3. 84 0
      cfg.y
  4. 43 6
      core_cmd.c
  5. 0 1
      tcp_info.h
  6. 141 8
      tcp_main.c
  7. 111 0
      tcp_options.c
  8. 116 0
      tcp_options.h

+ 40 - 0
NEWS

@@ -8,6 +8,13 @@ $Id$
 2.1.0 changes
 
 modules:
+ - auth      - added extra authentication checks support, to protect
+               against various replay attacks.
+             - params:
+                       - auth_extra_checks - flags specifying which extra
+                          message part/parts will be checked for change before
+                          allowing nonce reuse. See the auth module docs for
+                          more information (modules/auth/README).
  - blst      - new module containing script blacklist manipulations functions
                (the source of a message can be blacklisted, removed from the
                 blacklist or checked for presence in the blacklist).
@@ -95,6 +102,8 @@ modules:
                         - t_set_retr(t1, t2) - changes the retransmissions
                            intervals on the fly, on a per transaction basis.
 core:
+             - tcp improvements (better tcp timers, send fd cache, special
+                options support)
              - dns naptr support (see dns_try_naptr and dns_<proto>_pref)
              - dns srv based load balancing support (see dns_srv_lb)
              - support for locking ser's pages in memory, pre-mapping
@@ -107,6 +116,37 @@ core:
                between the short name and long name in cache as CNAME record
 
 new config variables:
+  tcp_fd_cache = yes | no (default yes) - if enabled FDs used for sending
+     will be cached inside the process calling tcp_send (performance increase
+     for sending over tcp at the cost of slightly slower connection closing and
+     extra FDs kept open)
+  tcp_defer_accept =  yes | no (default no) on freebsd  / number of seconds
+        before timeout on linux (default disabled) - tcp accepts will be 
+        delayed until some data is received (improves performance on proxies
+        with lots of opened tcp connections). See linux tcp(7) TCP_DEFER_ACCEPT
+        or freebsd accf_data(9). For now linux and freebsd only.
+        WARNING: the  linux TCP_DEFER_ACCEPT is buggy (<=2.6.23) and doesn't 
+         work exactly as expected (if no data is received it will retransmit 
+         syn acks for ~ 190 s, irrespective of the set timeout and then it will
+         silently drop the connection without sending a RST or FIN). Try to 
+         use it together with tcp_syncnt (this way the number of retrans.
+          SYNACKs can be limited => the timeout can be controlled in some way).
+  tcp_delayed_ack  = yes | no (default yes when supported) - initial ACK for
+        opened connections will be delayed and sent with the first data
+        segment (see linux tcp(7) TCP_QUICKACK). For now linux only.
+  tcp_syncnt = number of syn retr. (default not set) - number of SYN 
+        retransmissions before aborting a connect attempt (see linux tcp(7)
+        TCP_SYNCNT). Linux only.
+  tcp_linger2 = seconds (not set by default) - lifetime of orphaned sockets
+        in FIN_WAIT2 state (overrides tcp_fin_timeout on, see linux tcp(7) 
+        TCP_LINGER2). Linux only.
+  tcp_keepalive = yes | no (default yes) - enables keepalive for tcp.
+  tcp_keepidle  = seconds (not set by default) - time before starting to send
+         keepalives, if the connection is idle. Linux only.
+  tcp_keepintvl = seconds (not set by default) - time interval between 
+         keepalive probes, when the previous probe failed. Linux only.
+  tcp_keepcnt = number (not set by default) - number of keepalives sent before
+         dropping the connection. Linux only.
   pmtu_discovery = 0 | 1 (default 0) - set DF bit in outbound IP if enabled
   dns_srv_lb = yes | no (default no) - enable dns srv weight based load 
     balancing (see doc/dns.txt)

+ 29 - 0
cfg.lex

@@ -72,6 +72,8 @@
  *  2007-09-10  introduced phone2tel option which allows NOT to consider
  *              user=phone URIs as TEL URIs (jiri)
  *  2007-10-10  added DNS_SEARCH_FMATCH (mma)
+ *  2007-11-28  added TCP_OPT_{FD_CACHE, DEFER_ACCEPT, DELAYED_ACK, SYNCNT,
+ *              LINGER2, KEEPALIVE, KEEPIDLE, KEEPINTVL, KEEPCNT} (andrei)
 */
 
 
@@ -290,6 +292,15 @@ TCP_POLL_METHOD		"tcp_poll_method"
 TCP_MAX_CONNECTIONS	"tcp_max_connections"
 TCP_SOURCE_IPV4		"tcp_source_ipv4"
 TCP_SOURCE_IPV6		"tcp_source_ipv6"
+TCP_OPT_FD_CACHE	"tcp_fd_cache"
+TCP_OPT_DEFER_ACCEPT "tcp_defer_accept"
+TCP_OPT_DELAYED_ACK	"tcp_delayed_ack"
+TCP_OPT_SYNCNT		"tcp_syncnt"
+TCP_OPT_LINGER2		"tcp_linger2"
+TCP_OPT_KEEPALIVE	"tcp_keepalive"
+TCP_OPT_KEEPIDLE	"tcp_keepidle"
+TCP_OPT_KEEPINTVL	"tcp_keepintvl"
+TCP_OPT_KEEPCNT		"tcp_keepcnt"
 DISABLE_TLS		"disable_tls"|"tls_disable"
 ENABLE_TLS		"enable_tls"|"tls_enable"
 TLSLOG			"tlslog"|"tls_log"
@@ -548,6 +559,24 @@ EAT_ABLE	[\ \t\b\r]
 									return TCP_SOURCE_IPV4; }
 <INITIAL>{TCP_SOURCE_IPV6}		{ count(); yylval.strval=yytext;
 									return TCP_SOURCE_IPV6; }
+<INITIAL>{TCP_OPT_FD_CACHE}		{ count(); yylval.strval=yytext;
+									return TCP_OPT_FD_CACHE; }
+<INITIAL>{TCP_OPT_DEFER_ACCEPT}	{ count(); yylval.strval=yytext;
+									return TCP_OPT_DEFER_ACCEPT; }
+<INITIAL>{TCP_OPT_DELAYED_ACK}	{ count(); yylval.strval=yytext;
+									return TCP_OPT_DELAYED_ACK; }
+<INITIAL>{TCP_OPT_SYNCNT}		{ count(); yylval.strval=yytext;
+									return TCP_OPT_SYNCNT; }
+<INITIAL>{TCP_OPT_LINGER2}		{ count(); yylval.strval=yytext;
+									return TCP_OPT_LINGER2; }
+<INITIAL>{TCP_OPT_KEEPALIVE}	{ count(); yylval.strval=yytext;
+									return TCP_OPT_KEEPALIVE; }
+<INITIAL>{TCP_OPT_KEEPIDLE}		{ count(); yylval.strval=yytext;
+									return TCP_OPT_KEEPIDLE; }
+<INITIAL>{TCP_OPT_KEEPINTVL}	{ count(); yylval.strval=yytext;
+									return TCP_OPT_KEEPINTVL; }
+<INITIAL>{TCP_OPT_KEEPCNT}	{ count(); yylval.strval=yytext;
+									return TCP_OPT_KEEPCNT; }
 <INITIAL>{DISABLE_TLS}	{ count(); yylval.strval=yytext; return DISABLE_TLS; }
 <INITIAL>{ENABLE_TLS}	{ count(); yylval.strval=yytext; return ENABLE_TLS; }
 <INITIAL>{TLSLOG}		{ count(); yylval.strval=yytext; return TLS_PORT_NO; }

+ 84 - 0
cfg.y

@@ -85,6 +85,8 @@
  * 2007-09-10  introduced phone2tel option which allows NOT to consider
  *             user=phone URIs as TEL URIs (jiri)
  * 2007-10-10  added DNS_SEARCH_FMATCH (mma)
+ * 2007-11-28  added TCP_OPT_{FD_CACHE, DEFER_ACCEPT, DELAYED_ACK, SYNCNT,
+ *              LINGER2, KEEPALIVE, KEEPIDLE, KEEPINTVL, KEEPCNT} (andrei)
 */
 
 %{
@@ -112,6 +114,7 @@
 #include "select.h"
 #include "flags.h"
 #include "tcp_init.h"
+#include "tcp_options.h"
 
 #include "config.h"
 #ifdef CORE_TLS
@@ -330,6 +333,15 @@ static struct socket_id* mk_listen_id(char*, int, int);
 %token TCP_MAX_CONNECTIONS
 %token TCP_SOURCE_IPV4
 %token TCP_SOURCE_IPV6
+%token TCP_OPT_FD_CACHE
+%token TCP_OPT_DEFER_ACCEPT
+%token TCP_OPT_DELAYED_ACK
+%token TCP_OPT_SYNCNT
+%token TCP_OPT_LINGER2
+%token TCP_OPT_KEEPALIVE
+%token TCP_OPT_KEEPIDLE
+%token TCP_OPT_KEEPINTVL
+%token TCP_OPT_KEEPCNT
 %token DISABLE_TLS
 %token ENABLE_TLS
 %token TLSLOG
@@ -783,6 +795,78 @@ assign_stm:
 		pkg_free($3);
 	}
 	| TCP_SOURCE_IPV6 EQUAL error { yyerror("IPv6 address expected"); }
+	| TCP_OPT_FD_CACHE EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.fd_cache=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_FD_CACHE EQUAL error { yyerror("boolean value expected"); }
+	| TCP_OPT_DEFER_ACCEPT EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.defer_accept=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_DEFER_ACCEPT EQUAL error { yyerror("boolean value expected"); }
+	| TCP_OPT_DELAYED_ACK EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.delayed_ack=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_DELAYED_ACK EQUAL error { yyerror("boolean value expected"); }
+	| TCP_OPT_SYNCNT EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.syncnt=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_SYNCNT EQUAL error { yyerror("number expected"); }
+	| TCP_OPT_LINGER2 EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.linger2=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_LINGER2 EQUAL error { yyerror("number expected"); }
+	| TCP_OPT_KEEPALIVE EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.keepalive=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_KEEPALIVE EQUAL error { yyerror("boolean value expected");}
+	| TCP_OPT_KEEPIDLE EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.keepidle=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_KEEPIDLE EQUAL error { yyerror("number expected"); }
+	| TCP_OPT_KEEPINTVL EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.keepintvl=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_KEEPINTVL EQUAL error { yyerror("number expected"); }
+	| TCP_OPT_KEEPCNT EQUAL NUMBER {
+		#ifdef USE_TCP
+			tcp_options.keepcnt=$3;
+		#else
+			warn("tcp support not compiled in");
+		#endif
+	}
+	| TCP_OPT_KEEPCNT EQUAL error { yyerror("number expected"); }
 	| DISABLE_TLS EQUAL NUMBER {
 		#ifdef USE_TLS
 			tls_disable=$3;

+ 43 - 6
core_cmd.c

@@ -37,6 +37,7 @@
 #include "pt.h"
 #include "ut.h"
 #include "tcp_info.h"
+#include "tcp_options.h"
 #include "core_cmd.h"
 
 #ifdef USE_DNS_CACHE
@@ -470,12 +471,11 @@ all:
 			}
 		}
 		rpc->add(c, "{", &handle);
-		rpc->struct_add(handle, "ddddddd",
+		rpc->struct_add(handle, "dddddd",
 			"pool  ", i,
 			"frags ", (unsigned int)frags,
 			"t. misses", (unsigned int)misses,
 			"mem   ", (unsigned int)mem,
-			"bitmap", (unsigned int)shm_block->pool[i].bitmap,
 			"missed", (unsigned int)shm_block->pool[i].missed,
 			"hits",   (unsigned int)shm_block->pool[i].hits
 		);
@@ -490,7 +490,7 @@ all:
 		main_b_frags+=shm_block->free_hash[r].no;
 	}
 	rpc->add(c, "{", &handle);
-	rpc->struct_add(handle, "dddddddddddddd",
+	rpc->struct_add(handle, "ddddddddddddd",
 		"max_frags      ", (unsigned int)max_frags,
 		"max_frags_pool ", max_frags_pool,
 		"max_frags_hash", max_frags_hash,
@@ -503,8 +503,7 @@ all:
 		"in_pools_frags ", (unsigned int)pool_frags,
 		"main_s_frags   ", (unsigned int)main_s_frags,
 		"main_b_frags   ", (unsigned int)main_b_frags,
-		"main_frags     ", (unsigned int)(main_b_frags+main_s_frags),
-		"main_bitmap    ", (unsigned int)shm_block->bitmap
+		"main_frags     ", (unsigned int)(main_b_frags+main_s_frags)
 	);
 }
 
@@ -546,6 +545,43 @@ static void core_tcpinfo(rpc_t* rpc, void* c)
 #endif
 }
 
+
+
+/* help text for the core.tcp_options rpc method; the list is 0-terminated
+ * and contains no method signatures */
+static const char* core_tcp_options_doc[] = {
+	"Returns active tcp options.",	/* documentation string */
+	0								/* method signature(s) */
+};
+
+/* rpc handler for core.tcp_options.
+ * Replies with one struct holding the current value of every field of the
+ * tcp options (as returned by tcp_options_get()).
+ * Sends a 500 fault instead if tcp is disabled at runtime (tcp_disable) or
+ * if tcp support was not compiled in (USE_TCP not defined).
+ *   rpc - rpc callbacks used to build the reply
+ *   c   - rpc context, passed back unchanged to the callbacks
+ */
+static void core_tcp_options(rpc_t* rpc, void* c)
+{
+#ifdef USE_TCP
+	/* both locals are needed only when a reply struct is built; keeping them
+	 * inside the ifdef avoids an unused variable in builds without tcp */
+	void *handle;
+	struct tcp_cfg_options t;
+
+	if (!tcp_disable){
+		tcp_options_get(&t);
+		rpc->add(c, "{", &handle);
+		/* one "d" (integer) per field, in the order of the name/value pairs */
+		rpc->struct_add(handle, "ddddddddd",
+			"fd_cache",		t.fd_cache,
+			"defer_accept",	t.defer_accept,
+			"delayed_ack",	t.delayed_ack,
+			"syncnt",		t.syncnt,
+			"linger2",		t.linger2,
+			"keepalive",	t.keepalive,
+			"keepidle",		t.keepidle,
+			"keepintvl",	t.keepintvl,
+			"keepcnt",		t.keepcnt
+		);
+	}else{
+		rpc->fault(c, 500, "tcp support disabled");
+	}
+#else
+	rpc->fault(c, 500, "tcp support not compiled");
+#endif
+}
+
+
+
 /*
  * RPC Methods exported by this module
  */
@@ -564,7 +600,8 @@ rpc_export_t core_rpc_methods[] = {
 #if defined(SF_MALLOC) || defined(LL_MALLOC)
 	{"core.sfmalloc",          core_sfmalloc,          core_sfmalloc_doc,   0},
 #endif
-	{"core.tcp_info",          core_tcpinfo,           core_tcpinfo_doc,          0	},
+	{"core.tcp_info",          core_tcpinfo,           core_tcpinfo_doc,    0},
+	{"core.tcp_options",       core_tcp_options,       core_tcp_options_doc,0},
 #ifdef USE_DNS_CACHE
 	{"dns.mem_info",          dns_cache_mem_info,     dns_cache_mem_info_doc,     0	},
 	{"dns.debug",          dns_cache_debug,           dns_cache_debug_doc,        0	},

+ 0 - 1
tcp_info.h

@@ -42,5 +42,4 @@ struct tcp_gen_info{
 
 void tcp_get_info(struct tcp_gen_info* ti);
 
-
 #endif

+ 141 - 8
tcp_main.c

@@ -85,6 +85,8 @@
  *               io_watch_add-ing its fd - it's safer this way (andrei)
  *  2007-11-26  improved tcp timers: switched to local_timer (andrei)
  *  2007-11-27  added send fd cache and reader fd reuse (andrei)
+ *  2007-11-28  added support for TCP_DEFER_ACCEPT, KEEPALIVE, KEEPINTVL,
+ *               KEEPCNT, QUICKACK, SYNCNT, LINGER2 (andrei)
  */
 
 
@@ -142,6 +144,7 @@
 #endif
 
 #include "tcp_info.h"
+#include "tcp_options.h"
 
 #define local_malloc pkg_malloc
 #define local_free   pkg_free
@@ -178,8 +181,6 @@ enum fd_types { F_NONE, F_SOCKINFO /* a tcp_listen fd */,
 				F_TCPCONN, F_TCPCHILD, F_PROC };
 
 
-#define TCP_FD_CACHE
-
 #ifdef TCP_FD_CACHE
 
 #define TCP_FD_CACHE_SIZE 8
@@ -270,6 +271,56 @@ int tcp_set_src_addr(struct ip_addr* ip)
 
 
 
+/* apply the keepalive related tcp_options to socket s.
+ * Returns -1 if SO_KEEPALIVE was requested but could not be enabled (the
+ * remaining options are not attempted in that case) and 0 otherwise.
+ * Failures to set the probe interval, idle time or probe count are only
+ * logged. Options whose HAVE_* macro is not defined are skipped at compile
+ * time; options left at 0 are skipped at runtime.
+ */
+static inline int init_sock_keepalive(int s)
+{
+	int val;
+
+#ifdef HAVE_SO_KEEPALIVE
+	val=1;
+	if (tcp_options.keepalive &&
+			setsockopt(s, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val))<0){
+		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to enable"
+					" SO_KEEPALIVE: %s\n", strerror(errno));
+		return -1;
+	}
+#endif /* HAVE_SO_KEEPALIVE */
+#ifdef HAVE_TCP_KEEPINTVL
+	/* interval between keepalive probes */
+	if ((val=tcp_options.keepintvl)!=0 &&
+			setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val))<0){
+		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
+					" keepalive probes interval: %s\n", strerror(errno));
+	}
+#endif /* HAVE_TCP_KEEPINTVL */
+#ifdef HAVE_TCP_KEEPIDLE
+	/* idle time before the first probe */
+	if ((val=tcp_options.keepidle)!=0 &&
+			setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val))<0){
+		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
+					" keepalive idle interval: %s\n", strerror(errno));
+	}
+#endif /* HAVE_TCP_KEEPIDLE */
+#ifdef HAVE_TCP_KEEPCNT
+	/* number of probes before giving up */
+	if ((val=tcp_options.keepcnt)!=0 &&
+			setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val))<0){
+		LOG(L_WARN, "WARNING: init_sock_keepalive: failed to set"
+					" maximum keepalive count: %s\n", strerror(errno));
+	}
+#endif /* HAVE_TCP_KEEPCNT */
+	return 0;
+}
+
+
+
 /* set all socket/fd options for new sockets (e.g. before connect): 
  *  disable nagle, tos lowdelay, reuseaddr, non-blocking
  *
@@ -303,6 +354,37 @@ static int init_sock_opt(int s)
 		/* continue, not critical */
 	}
 #endif /* !TCP_DONT_REUSEADDR */
+#ifdef HAVE_TCP_SYNCNT
+	if (tcp_options.syncnt){
+		optval=tcp_options.syncnt;
+		if (setsockopt(s, IPPROTO_TCP, TCP_SYNCNT, &optval,
+						sizeof(optval))<0){
+			LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
+						" maximum SYN retr. count: %s\n", strerror(errno));
+		}
+	}
+#endif
+#ifdef HAVE_TCP_LINGER2
+	if (tcp_options.linger2){
+		optval=tcp_options.linger2;
+		if (setsockopt(s, IPPROTO_TCP, TCP_LINGER2, &optval,
+						sizeof(optval))<0){
+			LOG(L_WARN, "WARNING: init_sock_opt: failed to set"
+						" maximum LINGER2 timeout: %s\n", strerror(errno));
+		}
+	}
+#endif
+#ifdef HAVE_TCP_QUICKACK
+	if (tcp_options.delayed_ack){
+		optval=0; /* reset quick ack => delayed ack */
+		if (setsockopt(s, IPPROTO_TCP, TCP_QUICKACK, &optval,
+						sizeof(optval))<0){
+			LOG(L_WARN, "WARNING: init_sock_opt: failed to reset"
+						" TCP_QUICKACK: %s\n", strerror(errno));
+		}
+	}
+#endif /* HAVE_TCP_QUICKACK */
+	init_sock_keepalive(s);
 	
 	/* non-blocking */
 	flags=fcntl(s, F_GETFL);
@@ -1130,7 +1212,8 @@ get_fd:
 			fd=c->fd;
 			do_close_fd=0; /* don't close the fd on exit, it's in use */
 #ifdef TCP_FD_CACHE
-		}else if (likely((fd_cache_e=tcp_fd_cache_get(c))!=0)){
+		}else if (likely(tcp_options.fd_cache && 
+							((fd_cache_e=tcp_fd_cache_get(c))!=0))){
 			fd=fd_cache_e->fd;
 			do_close_fd=0;
 			DBG("tcp_send: found fd in cache ( %d, %p, %d)\n",
@@ -1213,7 +1296,7 @@ send_it:
 	}
 end:
 #ifdef TCP_FD_CACHE
-	if (unlikely(fd_cache_e==0)){
+	if (unlikely((fd_cache_e==0) && tcp_options.fd_cache)){
 		tcp_fd_cache_add(c, fd);
 	}else
 #endif /* TCP_FD_CACHE */
@@ -1229,6 +1312,9 @@ int tcp_init(struct socket_info* sock_info)
 {
 	union sockaddr_union* addr;
 	int optval;
+#ifdef HAVE_TCP_ACCEPT_FILTER
+	struct accept_filter_arg afa;
+#endif /* HAVE_TCP_ACCEPT_FILTER */
 #ifdef DISABLE_NAGLE
 	int flag;
 	struct protoent* pe;
@@ -1291,6 +1377,52 @@ int tcp_init(struct socket_info* sock_info)
 		LOG(L_WARN, "WARNING: tcp_init: setsockopt tos: %s\n", strerror(errno));
 		/* continue since this is not critical */
 	}
+#ifdef HAVE_TCP_DEFER_ACCEPT
+	/* linux only */
+	if (tcp_options.defer_accept){
+		optval=tcp_options.defer_accept;
+		if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_DEFER_ACCEPT,
+					(void*)&optval, sizeof(optval)) ==-1){
+			LOG(L_WARN, "WARNING: tcp_init: setsockopt TCP_DEFER_ACCEPT %s\n",
+						strerror(errno));
+		/* continue since this is not critical */
+		}
+	}
+#endif /* HAVE_TCP_DEFFER_ACCEPT */
+#ifdef HAVE_TCP_SYNCNT
+	if (tcp_options.syncnt){
+		optval=tcp_options.syncnt;
+		if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_SYNCNT, &optval,
+						sizeof(optval))<0){
+			LOG(L_WARN, "WARNING: tcp_init: failed to set"
+						" maximum SYN retr. count: %s\n", strerror(errno));
+		}
+	}
+#endif
+#ifdef HAVE_TCP_ACCEPT_FILTER
+	/* freebsd: install the "dataready" accept filter, so that accepts are
+	 * delayed until some data is received on the new connection.
+	 * NOTE(review): FreeBSD documents that listen() must already have been
+	 * called on the socket before SO_ACCEPTFILTER can be set, while this
+	 * code runs before bind() - confirm and move after listen() if needed. */
+	if (tcp_options.defer_accept){
+		memset(&afa, 0, sizeof(afa));
+		strcpy(afa.af_name, "dataready");
+		/* pass the filter argument itself (afa, declared at the top of
+		 * tcp_init) */
+		if (setsockopt(sock_info->socket, SOL_SOCKET, SO_ACCEPTFILTER,
+					(void*)&afa, sizeof(afa)) ==-1){
+			LOG(L_WARN, "WARNING: tcp_init: setsockopt SO_ACCEPTFILTER %s\n",
+						strerror(errno));
+		/* continue since this is not critical */
+		}
+	}
+#endif /* HAVE_TCP_ACCEPT_FILTER */
+#ifdef HAVE_TCP_LINGER2
+	if (tcp_options.linger2){
+		optval=tcp_options.linger2;
+		if (setsockopt(sock_info->socket, IPPROTO_TCP, TCP_LINGER2, &optval,
+						sizeof(optval))<0){
+			LOG(L_WARN, "WARNING: tcp_init: failed to set"
+						" maximum LINGER2 timeout: %s\n", strerror(errno));
+		}
+	}
+#endif
+	init_sock_keepalive(sock_info->socket);
 	if (bind(sock_info->socket, &addr->s, sockaddru_len(*addr))==-1){
 		LOG(L_ERR, "ERROR: tcp_init: bind(%x, %p, %d) on %s:%d : %s\n",
 				sock_info->socket,  &addr->s, 
@@ -1347,7 +1479,7 @@ static void tcpconn_destroy(struct tcp_connection* tcpconn)
 #endif
 		_tcpconn_free(tcpconn);
 #ifdef TCP_FD_CACHE
-		shutdown(fd, SHUT_RDWR);
+		if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
 #endif /* TCP_FD_CACHE */
 		close(fd);
 		(*tcp_connections_no)--;
@@ -2060,7 +2192,7 @@ static ticks_t tcpconn_main_timeout(ticks_t t, struct timer_ln* tl, void* data)
 #endif /* USE_TLS */
 					_tcpconn_free(c);
 #ifdef TCP_FD_CACHE
-					shutdown(fd, SHUT_RDWR);
+					if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
 #endif /* TCP_FD_CACHE */
 					close(fd);
 				}
@@ -2129,7 +2261,7 @@ static inline void tcpconn_destroy_all()
 				_tcpconn_rm(c);
 				if (fd>0) {
 #ifdef TCP_FD_CACHE
-					shutdown(fd, SHUT_RDWR);
+					if (likely(tcp_options.fd_cache)) shutdown(fd, SHUT_RDWR);
 #endif /* TCP_FD_CACHE */
 					close(fd);
 				}
@@ -2172,7 +2304,7 @@ void tcp_main_loop()
 		goto error;
 	}
 #ifdef TCP_FD_CACHE
-	tcp_fd_cache_init();
+	if (tcp_options.fd_cache) tcp_fd_cache_init();
 #endif /* TCP_FD_CACHE */
 	
 	/* add all the sockets we listen on for connections */
@@ -2347,6 +2479,7 @@ int init_tcp()
 {
 	char* poll_err;
 	
+	tcp_options_check();
 	/* init lock */
 	tcpconn_lock=lock_alloc();
 	if (tcpconn_lock==0){

+ 111 - 0
tcp_options.c

@@ -0,0 +1,111 @@
+/* 
+ * $Id$
+ * 
+ * Copyright (C) 2007 iptelorg GmbH
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * tcp options
+ *
+ * History:
+ * --------
+ *  2007-11-28  created by andrei
+ */
+
+#include "tcp_options.h"
+#include "dprint.h"
+
+
+struct tcp_cfg_options tcp_options;
+
+
+/* set defaults: only the options that are on by default are written here,
+ * every other field keeps the 0 (off / not set) value of the
+ * zero-initialized global tcp_options */
+void init_tcp_options()
+{
+#ifdef HAVE_TCP_QUICKACK
+	tcp_options.delayed_ack=1;
+#endif /* HAVE_TCP_QUICKACK */
+	/* defer_accept is deliberately left off by default, even when
+	 * HAVE_TCP_DEFER_ACCEPT or HAVE_TCP_ACCEPT_FILTER is defined */
+#ifdef HAVE_SO_KEEPALIVE
+	tcp_options.keepalive=1;
+#endif /* HAVE_SO_KEEPALIVE */
+#ifdef TCP_FD_CACHE
+	tcp_options.fd_cache=1;
+#endif /* TCP_FD_CACHE */
+}
+
+
+
+/* warn if tcp_options.<option> is set although support for it was compiled
+ * out. "option" is the bare field name; #option turns it into a string so
+ * that the message reads "tcp_<option> cannot be enabled ...".
+ * Wrapped in do{}while(0) so that it behaves like a single statement. */
+#define W_OPT_NC(option) \
+	do{ \
+		if (tcp_options.option){ \
+			WARN("tcp_options: tcp_" #option \
+					" cannot be enabled (recompile needed)\n"); \
+		} \
+	}while(0)
+
+
+
+/* same as W_OPT_NC, but for options that the OS does not support */
+#define W_OPT_NS(option) \
+	do{ \
+		if (tcp_options.option){ \
+			WARN("tcp_options: tcp_" #option \
+					" cannot be enabled (no OS support)\n"); \
+		} \
+	}while(0)
+
+
+/* checks & warns if some tcp_option cannot be enabled, either because it
+ * was compiled out or because there is no OS support for it.
+ * It also forces keepalive on if any of keepintvl, keepidle or keepcnt
+ * is set. */
+void tcp_options_check()
+{
+#ifndef TCP_FD_CACHE
+	/* fd caching was compiled out (NO_TCP_FD_CACHE): warn about the
+	 * fd_cache option itself */
+	W_OPT_NC(fd_cache);
+#endif
+
+#if ! defined HAVE_TCP_DEFER_ACCEPT && ! defined HAVE_TCP_ACCEPT_FILTER
+	W_OPT_NS(defer_accept);
+#endif
+#ifndef HAVE_TCP_SYNCNT
+	W_OPT_NS(syncnt);
+#endif
+#ifndef HAVE_TCP_LINGER2
+	W_OPT_NS(linger2);
+#endif
+#ifndef HAVE_TCP_KEEPINTVL
+	W_OPT_NS(keepintvl);
+#endif
+#ifndef HAVE_TCP_KEEPIDLE
+	W_OPT_NS(keepidle);
+#endif
+#ifndef HAVE_TCP_KEEPCNT
+	W_OPT_NS(keepcnt);
+#endif
+	/* the keepalive tuning values imply keepalive itself; this must come
+	 * before the keepalive check below, so that a forced keepalive is
+	 * reported too when SO_KEEPALIVE is not available */
+	if (tcp_options.keepintvl || tcp_options.keepidle || tcp_options.keepcnt){
+		tcp_options.keepalive=1; /* force on */
+	}
+#ifndef HAVE_SO_KEEPALIVE
+	W_OPT_NS(keepalive);
+#endif
+#ifndef HAVE_TCP_QUICKACK
+	W_OPT_NS(delayed_ack);
+#endif
+}
+
+
+
+/* copy the current tcp options into *t (plain struct copy, t must point to
+ * a valid struct tcp_cfg_options) */
+void tcp_options_get(struct tcp_cfg_options* t)
+{
+	*t = tcp_options;
+}

+ 116 - 0
tcp_options.h

@@ -0,0 +1,116 @@
+/* 
+ * $Id$
+ * 
+ * Copyright (C) 2007 iptelorg GmbH
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * tcp options
+ *
+ * History:
+ * --------
+ *  2007-11-28  created by andrei
+ */
+
+#ifndef tcp_options_h
+#define tcp_options_h
+
+#ifndef NO_TCP_FD_CACHE
+#define TCP_FD_CACHE /* enable fd caching */
+#endif
+
+
+
+/* defer accept: TCP_DEFER_ACCEPT on linux, accept filters on freebsd;
+ * define NO_TCP_DEFER_ACCEPT to compile the support out */
+#ifndef  NO_TCP_DEFER_ACCEPT
+#ifdef __OS_linux
+#define HAVE_TCP_DEFER_ACCEPT
+#elif defined __OS_freebsd
+#define HAVE_TCP_ACCEPT_FILTER
+#endif /* __OS_ */
+#endif /* NO_TCP_DEFER_ACCEPT */
+
+
+/* syn count */
+#ifndef NO_TCP_SYNCNT
+#ifdef __OS_linux
+#define HAVE_TCP_SYNCNT
+#endif /* __OS_*/
+#endif /* NO_TCP_SYNCNT */
+
+/* tcp linger2 */
+#ifndef NO_TCP_LINGER2
+#ifdef __OS_linux
+#define HAVE_TCP_LINGER2
+#endif /* __OS_ */
+#endif /* NO_TCP_LINGER2 */
+
+/* keepalive */
+#ifndef NO_TCP_KEEPALIVE
+#define HAVE_SO_KEEPALIVE
+#endif /* NO_TCP_KEEPALIVE */
+
+/* keepintvl: linux only; define NO_TCP_KEEPINTVL to compile it out */
+#ifndef NO_TCP_KEEPINTVL
+#ifdef __OS_linux
+#define HAVE_TCP_KEEPINTVL
+#endif /* __OS_ */
+#endif /* NO_TCP_KEEPINTVL */
+
+/* keepidle */
+#ifndef NO_TCP_KEEPIDLE
+#ifdef __OS_linux
+#define HAVE_TCP_KEEPIDLE
+#endif /* __OS_*/
+#endif /* NO_TCP_KEEPIDLE */
+
+
+/* keepcnt */
+#ifndef NO_TCP_KEEPCNT
+#ifdef __OS_linux
+#define HAVE_TCP_KEEPCNT
+#endif /* __OS_ */
+#endif /* NO_TCP_KEEPCNT */
+
+
+/* delayed ack (quick_ack) */
+#ifndef NO_TCP_QUICKACK
+#ifdef __OS_linux
+#define HAVE_TCP_QUICKACK
+#endif /* __OS_ */
+#endif /* NO_TCP_QUICKACK */
+
+
+/* tcp configuration values; set from the config file (see cfg.y) and read
+ * when sockets are set up. For every field 0 means off / not set. */
+struct tcp_cfg_options{
+	/* ser tcp options */
+	int fd_cache; /* on / off: cache the fds used for sending */
+	/* tcp socket options */
+	int defer_accept; /* 0 = off; a non-zero value enables it and, where
+						 HAVE_TCP_DEFER_ACCEPT is defined, is passed as the
+						 TCP_DEFER_ACCEPT option value */
+	int delayed_ack; /* delay ack on connect (resets TCP_QUICKACK) */ 
+	int syncnt;     /* numbers of SYNs retrs. before giving up connecting */
+	int linger2;    /* lifetime of orphaned  FIN_WAIT2 state sockets */
+	int keepalive;  /* on / off (SO_KEEPALIVE) */
+	int keepidle;   /* idle time (s) before tcp starts sending keepalives */
+	int keepintvl;  /* interval between keep alives */
+	int keepcnt;    /* maximum no. of keepalives before giving up */
+};
+
+
+/* the global tcp options (defined in tcp_options.c) */
+extern struct tcp_cfg_options tcp_options;
+
+/* set the defaults for the options that are on by default */
+void init_tcp_options();
+/* warn about options that are set but cannot be enabled in this build */
+void tcp_options_check();
+/* copy the current options into *t */
+void tcp_options_get(struct tcp_cfg_options* t);
+
+#endif /* tcp_options_h */