patch-2.1.90 linux/net/ipv4/tcp_ipv4.c
- Lines: 1168
- Date: Mon Mar 16 10:36:01 1998
- Orig file: v2.1.89/linux/net/ipv4/tcp_ipv4.c
- Orig date: Tue Mar 10 10:03:37 1998
diff -u --recursive --new-file v2.1.89/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.79 1998/01/15 22:40:47 freitag Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.109 1998/03/15 07:24:15 davem Exp $
*
* IPv4 specific functions
*
@@ -60,8 +60,6 @@
#include <linux/inet.h>
-extern int sysctl_tcp_sack;
-extern int sysctl_tcp_tsack;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_syncookies;
@@ -89,16 +87,19 @@
*/
struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
* port number.
*/
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
-/* Ok, let's try this, I give up, we do need a local binding
- * TCP hash as well as the others for fast bind/connect.
- */
-struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
+/* Register cache. */
+struct sock *tcp_regs[TCP_NUM_REGS];
/*
* This array holds the first and last local port number.
@@ -106,6 +107,7 @@
* 32768-61000
*/
int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = (1024 - 1);
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
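The body of tcp_hashfn() is not shown in this hunk. Purely as an illustrative sketch (example_hashfn and its mixing steps are hypothetical, not the kernel's actual function), a demultiplexing hash of this shape folds the connection 4-tuple into a table index; masking with (TCP_HTABLE_SIZE/2)-1 matters because, as the lookup code further down shows, the upper half of tcp_established_hash is reserved for TIME_WAIT sockets:

	static __inline__ int example_hashfn(__u32 laddr, __u16 lport,
					     __u32 faddr, __u16 fport)
	{
		__u32 h = laddr ^ lport ^ faddr ^ fport;

		h ^= h >> 16;		/* fold the high bits down */
		h ^= h >> 8;
		return h & ((TCP_HTABLE_SIZE / 2) - 1);
	}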
@@ -123,155 +125,135 @@
return tcp_hashfn(laddr, lport, faddr, fport);
}
-static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
+/* Invariant, sk->num is non-zero. */
+void tcp_bucket_unlock(struct sock *sk)
{
- struct sock *sk2;
- int retval = 0, sk_reuse = sk->reuse;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
SOCKHASH_LOCK();
- sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
- for(; sk2 != NULL; sk2 = sk2->bind_next) {
- if((sk2->num == snum) && (sk2 != sk)) {
- unsigned char state = sk2->state;
- int sk2_reuse = sk2->reuse;
-
- /* Two sockets can be bound to the same port if they're
- * bound to different interfaces.
- */
-
- if(sk->bound_dev_if != sk2->bound_dev_if)
- continue;
-
- if(!sk2->rcv_saddr || !sk->rcv_saddr) {
- if((!sk2_reuse) ||
- (!sk_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
- } else if(sk2->rcv_saddr == sk->rcv_saddr) {
- if((!sk_reuse) ||
- (!sk2_reuse) ||
- (state == TCP_LISTEN)) {
- retval = 1;
- break;
- }
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+ if(tb->port == snum) {
+ if(tb->owners == NULL &&
+ (tb->flags & TCPB_FLAG_LOCKED)) {
+ tb->flags &= ~TCPB_FLAG_LOCKED;
+ tcp_inc_slow_timer(TCP_SLT_BUCKETGC);
}
+ break;
}
}
SOCKHASH_UNLOCK();
-
- return retval;
}
-static __inline__ int tcp_lport_inuse(int num)
+struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
{
- struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
+ struct tcp_bind_bucket *tb;
- for(; sk != NULL; sk = sk->bind_next) {
- if(sk->num == num)
- return 1;
+ tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
+ if(tb != NULL) {
+ struct tcp_bind_bucket **head =
+ &tcp_bound_hash[tcp_bhashfn(snum)];
+ tb->port = snum;
+ tb->flags = TCPB_FLAG_LOCKED;
+ tb->owners = NULL;
+ if((tb->next = *head) != NULL)
+ tb->next->pprev = &tb->next;
+ *head = tb;
+ tb->pprev = head;
}
- return 0;
+ return tb;
}
-/* Find a "good" local port, this is family independent.
- * There are several strategies working in unison here to
- * get the best possible performance. The current socket
- * load is kept track of, if it is zero there is a strong
- * likely hood that there is a zero length chain we will
- * find with a small amount of searching, else the load is
- * what we shoot for for when the chains all have at least
- * one entry. The base helps us walk the chains in an
- * order such that a good chain is found as quickly as possible. -DaveM
- */
-unsigned short tcp_good_socknum(void)
+static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
{
- static int start = 0;
- static int binding_contour = 0;
- int best = 0;
- int size = 32767; /* a big num. */
- int retval = 0, i, end, bc;
+ struct tcp_bind_bucket *tb;
+ int result = 0;
SOCKHASH_LOCK();
- if (start > sysctl_local_port_range[1] || start < sysctl_local_port_range[0])
- start = sysctl_local_port_range[0];
- i = tcp_bhashfn(start);
- end = i + TCP_BHTABLE_SIZE;
- bc = binding_contour;
- do {
- struct sock *sk = tcp_bound_hash[i&(TCP_BHTABLE_SIZE-1)];
- if(!sk) {
- /* find the smallest value no smaller than start
- * that has this hash value.
- */
- retval = tcp_bhashnext(start-1,i&(TCP_BHTABLE_SIZE-1));
-
- /* Check for decreasing load. */
- if (bc != 0)
- binding_contour = 0;
- goto done;
- } else {
- int j = 0;
- do { sk = sk->bind_next; } while (++j < size && sk);
- if (j < size) {
- best = i&(TCP_BHTABLE_SIZE-1);
- size = j;
- if (bc && size <= bc)
- goto verify;
- }
- }
- } while(++i != end);
- i = best;
+ for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ (tb && (tb->port != snum));
+ tb = tb->next)
+ ;
+ if(tb && tb->owners) {
+ /* Fast path for reuse ports, see include/net/tcp.h for a very
+ * detailed description of why this works, and why it is worth
+ * the effort at all. -DaveM
+ */
+ if((tb->flags & TCPB_FLAG_FASTREUSE) &&
+ (sk->reuse != 0)) {
+ goto go_like_smoke;
+ } else {
+ struct sock *sk2;
+ int sk_reuse = sk->reuse;
- /* Socket load is increasing, adjust our load average. */
- binding_contour = size;
-verify:
- if (size < binding_contour)
- binding_contour = size;
-
- retval = tcp_bhashnext(start-1,i);
-
- best = retval; /* mark the starting point to avoid infinite loops */
- while(tcp_lport_inuse(retval)) {
- retval = tcp_bhashnext(retval,i);
- if (retval > sysctl_local_port_range[1]) /* Upper bound */
- retval = tcp_bhashnext(sysctl_local_port_range[0],i);
- if (retval == best) {
- /* This hash chain is full. No answer. */
- retval = 0;
- break;
+ /* We must walk the whole port owner list in this case. -DaveM */
+ for(sk2 = tb->owners; sk2; sk2 = sk2->bind_next) {
+ if(sk->bound_dev_if == sk2->bound_dev_if) {
+ if(!sk_reuse || !sk2->reuse || sk2->state == TCP_LISTEN) {
+ if(!sk2->rcv_saddr ||
+ !sk->rcv_saddr ||
+ (sk2->rcv_saddr == sk->rcv_saddr))
+ break;
+ }
+ }
+ }
+ if(sk2 != NULL)
+ result = 1;
}
- }
+ }
+ if((result == 0) &&
+ (tb == NULL) &&
+ (tcp_bucket_create(snum) == NULL))
+ result = 1;
+go_like_smoke:
+ SOCKHASH_UNLOCK();
+ return result;
+}
-done:
- start = (retval + 1);
+unsigned short tcp_good_socknum(void)
+{
+ struct tcp_bind_bucket *tb;
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+ int remaining = high - low;
+ int rover;
+
+ SOCKHASH_LOCK();
+ rover = tcp_port_rover;
+ do {
+ rover += 1;
+ if((rover < low) || (rover > high))
+ rover = low;
+ tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ for( ; tb; tb = tb->next) {
+ if(tb->port == rover)
+ goto next;
+ }
+ break;
+ next:
+ } while(--remaining > 0);
+ tcp_port_rover = rover;
+ if((remaining <= 0) || (tcp_bucket_create(rover) == NULL))
+ rover = 0;
SOCKHASH_UNLOCK();
- return retval;
+ return rover;
}
static void tcp_v4_hash(struct sock *sk)
{
- unsigned char state;
-
- SOCKHASH_LOCK();
- state = sk->state;
- if(state != TCP_CLOSE || !sk->dead) {
+ if (sk->state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN)
- skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- else
- skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
-
+ SOCKHASH_LOCK();
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
+ SOCKHASH_UNLOCK();
}
- SOCKHASH_UNLOCK();
}
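The bucket chains built by tcp_bucket_create() above rely on the pprev back-pointer idiom: each node stores the address of whatever pointer points at it, so removal needs no list walk and no head-of-chain special case. A minimal userspace rendering (the struct and function names are hypothetical):

	#include <stddef.h>

	struct bucket {
		struct bucket *next;	/* next node in the chain */
		struct bucket **pprev;	/* address of the pointer aimed at us */
	};

	static void bucket_link(struct bucket *b, struct bucket **head)
	{
		if ((b->next = *head) != NULL)
			b->next->pprev = &b->next;
		*head = b;
		b->pprev = head;
	}

	static void bucket_unlink(struct bucket *b)
	{
		if (b->next)
			b->next->pprev = b->pprev;
		*b->pprev = b->next;	/* works even when b is the head */
		b->pprev = NULL;
	}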
static void tcp_v4_unhash(struct sock *sk)
@@ -282,6 +264,7 @@
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
+ tcp_reg_zap(sk);
tcp_sk_unbindify(sk);
}
SOCKHASH_UNLOCK();
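Returning to the port allocator: the rover scan in tcp_good_socknum() above reduces to the following standalone shape (userspace sketch; in_use() is a hypothetical stand-in for walking the bind-hash chain). Persisting the rover between calls means successive allocations resume where the last search stopped instead of rescanning the busy low end of the range:

	#include <stdbool.h>

	extern bool in_use(int port);	/* assumed: consults the bind hash */

	static int port_rover = 1023;	/* mirrors tcp_port_rover */

	int alloc_port(int low, int high)
	{
		int remaining = high - low;
		int rover = port_rover;

		do {
			rover += 1;
			if (rover < low || rover > high)
				rover = low;
			if (!in_use(rover))
				break;		/* found a free port */
		} while (--remaining > 0);
		port_rover = rover;
		return (remaining > 0) ? rover : 0;	/* 0 when exhausted */
	}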
@@ -293,30 +276,27 @@
SOCKHASH_LOCK();
state = sk->state;
- if(sk->pprev) {
+ if(sk->pprev != NULL) {
if(sk->next)
sk->next->pprev = sk->pprev;
*sk->pprev = sk->next;
sk->pprev = NULL;
- tcp_sk_unbindify(sk);
+ tcp_reg_zap(sk);
}
- if(state != TCP_CLOSE || !sk->dead) {
+ if(state != TCP_CLOSE) {
struct sock **skp;
- if(state == TCP_LISTEN) {
+ if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
- } else {
- int hash= tcp_sk_hashfn(sk);
- if(state == TCP_TIME_WAIT)
- hash += (TCP_HTABLE_SIZE/2);
- skp = &tcp_established_hash[hash];
- }
+ else
+ skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
- tcp_sk_bindify(sk);
+ if(state == TCP_LISTEN)
+ tcp_sk_bindify(sk);
}
SOCKHASH_UNLOCK();
}
@@ -360,37 +340,64 @@
return result;
}
+/* Until this is verified... -DaveM */
+/* #define USE_QUICKSYNS */
+
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ * It is assumed that this code only gets called from within NET_BH.
*/
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
+ u32 saddr, u16 sport,
+ u32 daddr, u16 dport, int dif)
{
unsigned short hnum = ntohs(dport);
struct sock *sk;
- int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ int hash;
+
+#ifdef USE_QUICKSYNS
+	/* Incoming connection short-cut. */
+ if (th && th->syn == 1 && th->ack == 0)
+ goto listener_shortcut;
+#endif
+
+ /* Check TCP register quick cache first. */
+ sk = TCP_RHASH(sport);
+ if(sk &&
+ sk->daddr == saddr && /* remote address */
+ sk->dummy_th.dest == sport && /* remote port */
+ sk->num == hnum && /* local port */
+ sk->rcv_saddr == daddr && /* local address */
+ (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ goto hit;
/* Optimize here for direct hit, only listening connections can
- * have wildcards anyways. It is assumed that this code only
- * gets called from within NET_BH.
+ * have wildcards anyways.
*/
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next)
+ hash = tcp_hashfn(daddr, hnum, saddr, sport);
+ for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
- (!sk->bound_dev_if || sk->bound_dev_if == dif))
+ (!sk->bound_dev_if || sk->bound_dev_if == dif)) {
+ if (sk->state == TCP_ESTABLISHED)
+ TCP_RHASH(sport) = sk;
goto hit; /* You sunk my battleship! */
-
+ }
+ }
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next) {
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
sk->rcv_saddr == daddr && /* local address */
(!sk->bound_dev_if || sk->bound_dev_if == dif))
goto hit;
-
+ }
+#ifdef USE_QUICKSYNS
+listener_shortcut:
+#endif
sk = tcp_v4_lookup_listener(daddr, hnum, dif);
hit:
return sk;
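The TCP_RHASH() check above is a direct-mapped, one-entry-per-slot cache in front of the full hash walk: index by the low bits of the remote port, verify the whole 4-tuple before trusting the hit, and install the winner on a miss. A self-contained sketch of the pattern (all names here are hypothetical; the kernel version additionally filters on bound_dev_if and only caches ESTABLISHED sockets):

	struct conn {				/* stands in for struct sock */
		unsigned int saddr, daddr;
		unsigned short sport, dport;
	};

	#define REG_SLOTS 32			/* plays the role of TCP_NUM_REGS */
	static struct conn *reg_cache[REG_SLOTS];
	#define REG(sport) (reg_cache[(sport) & (REG_SLOTS - 1)])

	extern struct conn *lookup_slow(unsigned int saddr, unsigned short sport,
					unsigned int daddr, unsigned short dport);

	struct conn *lookup(unsigned int saddr, unsigned short sport,
			    unsigned int daddr, unsigned short dport)
	{
		struct conn *c = REG(sport);

		if (c && c->saddr == saddr && c->sport == sport &&
		    c->daddr == daddr && c->dport == dport)
			return c;		/* verified cache hit */
		c = lookup_slow(saddr, sport, daddr, dport);
		if (c)
			REG(sport) = c;		/* install for the next packet */
		return c;
	}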
@@ -402,20 +409,11 @@
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
-#define secondlist(hpnum, sk, fpass) \
-({ struct sock *s1; if(!(sk) && (fpass)--) \
- s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
- else \
- s1 = (sk); \
- s1; \
-})
-
-#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
- secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
-
-#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
- secondlist((hpnum),(sk)->bind_next,(fpass))
-
+/* Cleaned up a little and adapted to new bind bucket scheme.
+ * Oddly, this should increase performance here for
+ * transparent proxy, as tests within the inner loop have
+ * been eliminated. -DaveM
+ */
static struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
unsigned short rnum, unsigned long laddr,
struct device *dev, unsigned short pnum,
@@ -436,51 +434,60 @@
}
/* This code must run only from NET_BH. */
- for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
- s != NULL;
- s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
- if(s->num == hnum || s->num == hpnum) {
- int score = 0;
- if(s->dead && (s->state == TCP_CLOSE))
+ {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ for( ; (tb && tb->port != hnum); tb = tb->next)
+ ;
+ if(tb == NULL)
+ goto next;
+ s = tb->owners;
+ }
+pass2:
+ for(; s; s = s->bind_next) {
+ int score = 0;
+ if(s->rcv_saddr) {
+ if((s->num != hpnum || s->rcv_saddr != paddr) &&
+ (s->num != hnum || s->rcv_saddr != laddr))
continue;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dummy_th.dest) {
- if(s->dummy_th.dest != rnum)
- continue;
- score++;
- }
- if(s->bound_dev_if) {
- if(s->bound_dev_if != dif)
- continue;
- score++;
- }
- if(score == 4 && s->num == hnum) {
- result = s;
- break;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
- }
+ score++;
+ }
+ if(s->daddr) {
+ if(s->daddr != raddr)
+ continue;
+ score++;
+ }
+ if(s->dummy_th.dest) {
+ if(s->dummy_th.dest != rnum)
+ continue;
+ score++;
+ }
+ if(s->bound_dev_if) {
+ if(s->bound_dev_if != dif)
+ continue;
+ score++;
+ }
+ if(score == 4 && s->num == hnum) {
+ result = s;
+ goto gotit;
+ } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
+ result = s;
+ badness = score;
+ }
+ }
+next:
+ if(firstpass--) {
+ struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ for( ; (tb && tb->port != hpnum); tb = tb->next)
+ ;
+ if(tb) {
+ s = tb->owners;
+ goto pass2;
}
}
+gotit:
return result;
}
-
-#undef secondlist
-#undef tcp_v4_proxy_loop_init
-#undef tcp_v4_proxy_loop_next
-
-#endif
+#endif /* CONFIG_IP_TRANSPARENT_PROXY */
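Distilled, the scoring loop above is best-match selection over wildcardable fields: a field that is set must match (else the candidate is skipped) and earns one point, a perfect score on the real port wins outright, and otherwise the highest-scoring candidate seen so far is remembered. A compact sketch with hypothetical types:

	struct cand {
		unsigned int laddr, raddr;	/* 0 means wildcard */
		unsigned short rport;		/* 0 means wildcard */
		int dev;			/* 0 means any device */
		struct cand *next;
	};

	struct cand *best_match(struct cand *list, unsigned int laddr,
				unsigned int raddr, unsigned short rport, int dev)
	{
		struct cand *s, *result = NULL;
		int badness = -1;

		for (s = list; s; s = s->next) {
			int score = 0;

			if (s->laddr) { if (s->laddr != laddr) continue; score++; }
			if (s->raddr) { if (s->raddr != raddr) continue; score++; }
			if (s->rport) { if (s->rport != rport) continue; score++; }
			if (s->dev)   { if (s->dev   != dev)   continue; score++; }
			if (score == 4)
				return s;	/* fully specified match wins */
			if (score > badness) {
				result = s;
				badness = score;
			}
		}
		return result;
	}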
static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
@@ -495,41 +502,35 @@
/*
* Check that a TCP address is unique, don't allow multiple
- * connects to/from the same address
+ * connects to/from the same address. Actually we can optimize
+ * quite a bit, since the socket about to connect is still
+ * in TCP_CLOSE, a tcp_bind_bucket for the local port he will
+ * use will exist, with a NULL owners list. So check for that.
+ * The good_socknum and verify_bind scheme we use makes this
+ * work.
*/
-static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
+static int tcp_unique_address(struct sock *sk)
{
- int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
- struct sock * sk;
+ struct tcp_bind_bucket *tb;
+ unsigned short snum = sk->num;
+ int retval = 1;
- /* Make sure we are allowed to connect here.
- * But freeze the hash while we snoop around.
- */
+ /* Freeze the hash while we snoop around. */
SOCKHASH_LOCK();
- sk = tcp_established_hash[hashent];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
- }
- }
-
- /* Must check TIME_WAIT'ers too. */
- sk = tcp_established_hash[hashent + (TCP_HTABLE_SIZE/2)];
- for (; sk != NULL; sk = sk->next) {
- if(sk->daddr == daddr && /* remote address */
- sk->dummy_th.dest == dnum && /* remote port */
- sk->num == snum && /* local port */
- sk->saddr == saddr) { /* local address */
- retval = 0;
- goto out;
+ tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ for(; tb; tb = tb->next) {
+ if(tb->port == snum && tb->owners != NULL) {
+ /* Almost certainly the re-use port case, search the real hashes
+ * so it actually scales.
+ */
+ sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dummy_th.dest,
+ sk->rcv_saddr, snum, sk->bound_dev_if);
+ if((sk != NULL) && (sk->state != TCP_LISTEN))
+ retval = 0;
+ break;
}
}
-out:
SOCKHASH_UNLOCK();
return retval;
}
@@ -578,8 +579,7 @@
return -ENETUNREACH;
}
- if (!tcp_unique_address(rt->rt_src, sk->num, rt->rt_dst,
- usin->sin_port)) {
+ if (!tcp_unique_address(sk)) {
ip_rt_put(rt);
return -EADDRNOTAVAIL;
}
@@ -587,7 +587,8 @@
lock_sock(sk);
/* Do this early, so there is less state to unwind on failure. */
- buff = sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL);
+ buff = sock_wmalloc(sk, (MAX_SYN_SIZE + sizeof(struct sk_buff)),
+ 0, GFP_KERNEL);
if (buff == NULL) {
release_sock(sk);
ip_rt_put(rt);
@@ -605,15 +606,13 @@
sk->dummy_th.dest = usin->sin_port;
- sk->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
+ tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
sk->dummy_th.source,
usin->sin_port);
-
tp->snd_wnd = 0;
tp->snd_wl1 = 0;
- tp->snd_wl2 = sk->write_seq;
- tp->snd_una = sk->write_seq;
-
+ tp->snd_wl2 = tp->write_seq;
+ tp->snd_una = tp->write_seq;
tp->rcv_nxt = 0;
sk->err = 0;
@@ -635,14 +634,22 @@
/* No failure conditions can result past this point. */
+ /* We'll fix this up when we get a response from the other end.
+ * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+ */
+ tp->tcp_header_len = sizeof(struct tcphdr) +
+ (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
th = (struct tcphdr *) skb_put(buff,sizeof(struct tcphdr));
buff->h.th = th;
memcpy(th,(void *)&(sk->dummy_th), sizeof(*th));
- buff->seq = sk->write_seq++;
+ /* th->doff gets fixed up below if we tack on options. */
+
+ buff->seq = tp->write_seq++;
th->seq = htonl(buff->seq);
- tp->snd_nxt = sk->write_seq;
- buff->end_seq = sk->write_seq;
+ tp->snd_nxt = tp->write_seq;
+ buff->end_seq = tp->write_seq;
th->ack = 0;
th->syn = 1;
@@ -656,11 +663,9 @@
if(sk->mtu < 64)
sk->mtu = 64; /* Sanity limit */
- if (sk->user_mss)
- sk->mss = sk->user_mss;
- else
- sk->mss = (sk->mtu - sizeof(struct iphdr) -
- sizeof(struct tcphdr));
+ sk->mss = (sk->mtu - sizeof(struct iphdr) - tp->tcp_header_len);
+ if(sk->user_mss)
+ sk->mss = min(sk->mss, sk->user_mss);
if (sk->mss < 1) {
printk(KERN_DEBUG "initial sk->mss below 1\n");
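Worked numbers for the new MSS arithmetic: with sk->mtu at 1500 and timestamps enabled, tp->tcp_header_len is sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED = 20 + 12 = 32, so sk->mss starts at 1500 - 20 - 32 = 1448. The code being removed computed 1500 - 20 - 20 = 1460, leaving the 12 timestamp-option bytes unaccounted for at this point.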
@@ -675,9 +680,8 @@
&tp->rcv_wscale);
th->window = htons(tp->rcv_wnd);
- tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_sack,
- sysctl_tcp_timestamps,
- sysctl_tcp_window_scaling,tp->rcv_wscale);
+ tmp = tcp_syn_build_options(buff, sk->mss, sysctl_tcp_timestamps,
+ sysctl_tcp_window_scaling, tp->rcv_wscale);
buff->csum = 0;
th->doff = (sizeof(*th)+ tmp)>>2;
@@ -686,9 +690,10 @@
tcp_set_state(sk,TCP_SYN_SENT);
/* Socket identity change complete, no longer
- * in TCP_CLOSE, so rehash.
+ * in TCP_CLOSE, so enter ourselves into the
+ * hash tables.
*/
- tcp_v4_rehash(sk);
+ tcp_v4_hash(sk);
tp->rto = rt->u.dst.rtt;
@@ -715,6 +720,7 @@
static int tcp_v4_sendmsg(struct sock *sk, struct msghdr *msg, int len)
{
+ struct tcp_opt *tp;
int retval = -EINVAL;
/* Do sanity checking for sendmsg/sendto/send. */
@@ -740,7 +746,10 @@
lock_sock(sk);
retval = tcp_do_sendmsg(sk, msg->msg_iovlen, msg->msg_iov,
msg->msg_flags);
-
+ /* Push out partial tail frames if needed. */
+ tp = &(sk->tp_pinfo.af_tcp);
+ if(tp->send_head && tcp_snd_test(sk, tp->send_head))
+ tcp_write_xmit(sk);
release_sock(sk);
out:
@@ -854,7 +863,7 @@
th = (struct tcphdr*)(dp+(iph->ihl<<2));
sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, skb->dev->ifindex);
- if (sk == NULL) {
+ if (sk == NULL || sk->state == TCP_TIME_WAIT) {
icmp_statistics.IcmpInErrors++;
return;
}
@@ -1011,7 +1020,8 @@
skb1->csum = csum_partial((u8 *) th1, sizeof(*th1), 0);
th1->check = tcp_v4_check(th1, sizeof(*th1), skb1->nh.iph->saddr,
skb1->nh.iph->daddr, skb1->csum);
- /* FIXME: should this carry an options packet? */
+
+ /* Do not place TCP options in a reset. */
ip_queue_xmit(skb1);
tcp_statistics.TcpOutSegs++;
tcp_statistics.TcpOutRsts++;
@@ -1063,6 +1073,14 @@
mss = (skb->dst->pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
if (sk->user_mss)
mss = min(mss, sk->user_mss);
+ if(req->tstamp_ok)
+ mss -= TCPOLEN_TSTAMP_ALIGNED;
+ else
+ req->mss += TCPOLEN_TSTAMP_ALIGNED;
+
+ /* tcp_syn_build_options will do an skb_put() to obtain the TCP
+ * options bytes below.
+ */
skb->h.th = th = (struct tcphdr *) skb_put(skb, sizeof(struct tcphdr));
/* Don't offer more than they did.
@@ -1081,9 +1099,8 @@
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- th->source =
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
+ th->source = req->lcl_port; /* LVE */
#else
th->source = sk->dummy_th.source;
#endif
@@ -1104,16 +1121,7 @@
req->rcv_wscale = rcv_wscale;
}
th->window = htons(req->rcv_wnd);
-
- /* XXX Partial csum of 4 byte quantity is itself! -DaveM
- * Yes, but it's a bit harder to special case now. It's
- * now computed inside the tcp_v4_send_check() to clean up
- * updating the options fields in the mainline send code.
- * If someone thinks this is really bad let me know and
- * I'll try to do it a different way. -- erics
- */
-
- tmp = tcp_syn_build_options(skb, req->mss, req->sack_ok, req->tstamp_ok,
+ tmp = tcp_syn_build_options(skb, req->mss, req->tstamp_ok,
req->wscale_ok,req->rcv_wscale);
skb->csum = 0;
th->doff = (sizeof(*th) + tmp)>>2;
@@ -1232,14 +1240,15 @@
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = skb->seq;
- tp.tstamp_ok = tp.sack_ok = tp.wscale_ok = tp.snd_wscale = 0;
+ tp.tstamp_ok = tp.wscale_ok = tp.snd_wscale = 0;
tp.in_mss = 536;
tcp_parse_options(th,&tp,want_cookie);
- if (tp.saw_tstamp)
- req->ts_recent = tp.rcv_tsval;
req->mss = tp.in_mss;
+ if (tp.saw_tstamp) {
+ req->mss -= TCPOLEN_TSTAMP_ALIGNED;
+ req->ts_recent = tp.rcv_tsval;
+ }
req->tstamp_ok = tp.tstamp_ok;
- req->sack_ok = tp.sack_ok;
req->snd_wscale = tp.snd_wscale;
req->wscale_ok = tp.wscale_ok;
req->rmt_port = th->source;
@@ -1289,6 +1298,113 @@
return 0;
}
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+ struct sock *newsk = sk_alloc(AF_INET, GFP_ATOMIC, 0);
+
+ if(newsk != NULL) {
+ struct tcp_opt *newtp;
+
+ memcpy(newsk, sk, sizeof(*newsk));
+ newsk->sklist_next = NULL;
+ newsk->daddr = req->af.v4_req.rmt_addr;
+ newsk->rcv_saddr = req->af.v4_req.loc_addr;
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->num = ntohs(skb->h.th->dest);
+#endif
+ newsk->state = TCP_SYN_RECV;
+
+ /* Clone the TCP header template */
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ newsk->dummy_th.source = req->lcl_port;
+#endif
+ newsk->dummy_th.dest = req->rmt_port;
+ newsk->dummy_th.ack = 1;
+ newsk->dummy_th.doff = sizeof(struct tcphdr)>>2;
+
+ newsk->sock_readers = 0;
+ atomic_set(&newsk->rmem_alloc, 0);
+ skb_queue_head_init(&newsk->receive_queue);
+ atomic_set(&newsk->wmem_alloc, 0);
+ skb_queue_head_init(&newsk->write_queue);
+ newsk->saddr = req->af.v4_req.loc_addr;
+
+ newsk->done = 0;
+ newsk->proc = 0;
+ newsk->pair = NULL;
+ skb_queue_head_init(&newsk->back_log);
+ skb_queue_head_init(&newsk->error_queue);
+
+ /* Now setup tcp_opt */
+ newtp = &(newsk->tp_pinfo.af_tcp);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = req->rcv_isn + 1;
+ newtp->snd_nxt = req->snt_isn + 1;
+ newtp->snd_una = req->snt_isn + 1;
+ newtp->srtt = 0;
+ newtp->ato = 0;
+ newtp->snd_wl1 = req->rcv_isn;
+ newtp->snd_wl2 = req->snt_isn;
+ newtp->snd_wnd = ntohs(skb->h.th->window);
+ newtp->max_window = newtp->snd_wnd;
+ newtp->pending = 0;
+ newtp->retransmits = 0;
+ newtp->last_ack_sent = req->rcv_isn + 1;
+ newtp->backoff = 0;
+ newtp->mdev = TCP_TIMEOUT_INIT;
+ newtp->snd_cwnd = 1;
+ newtp->rto = TCP_TIMEOUT_INIT;
+ newtp->packets_out = 0;
+ newtp->high_seq = 0;
+ newtp->snd_ssthresh = 0x7fffffff;
+ newtp->snd_cwnd_cnt = 0;
+ newtp->dup_acks = 0;
+ newtp->delayed_acks = 0;
+ init_timer(&newtp->retransmit_timer);
+ newtp->retransmit_timer.function = &tcp_retransmit_timer;
+ newtp->retransmit_timer.data = (unsigned long) newsk;
+ init_timer(&newtp->delack_timer);
+ newtp->delack_timer.function = &tcp_delack_timer;
+ newtp->delack_timer.data = (unsigned long) newsk;
+ skb_queue_head_init(&newtp->out_of_order_queue);
+ newtp->send_head = newtp->retrans_head = NULL;
+ newtp->rcv_wup = req->rcv_isn + 1;
+ newtp->write_seq = req->snt_isn + 1;
+ newtp->copied_seq = req->rcv_isn + 1;
+
+ newtp->saw_tstamp = 0;
+ newtp->in_mss = 536;
+
+ init_timer(&newtp->probe_timer);
+ newtp->probe_timer.function = &tcp_probe_timer;
+ newtp->probe_timer.data = (unsigned long) newsk;
+ newtp->probes_out = 0;
+ newtp->syn_seq = req->rcv_isn;
+ newtp->fin_seq = req->rcv_isn;
+ newtp->urg_data = 0;
+ tcp_synq_init(newtp);
+ newtp->syn_backlog = 0;
+
+ /* Back to base struct sock members. */
+ newsk->err = 0;
+ newsk->ack_backlog = 0;
+ newsk->max_ack_backlog = SOMAXCONN;
+ newsk->priority = 1;
+
+ /* IP layer stuff */
+ newsk->opt = req->af.v4_req.opt;
+ newsk->timeout = 0;
+ init_timer(&newsk->timer);
+ newsk->timer.function = &net_timer;
+ newsk->timer.data = (unsigned long) newsk;
+ newsk->socket = NULL;
+ }
+ return newsk;
+}
+
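A hedged outline of how an address-family-specific accept path is expected to build on this helper (the function name is hypothetical; tcp_v4_syn_recv_sock() below is the concrete IPv4 instance): the generic code clones the parent and initializes all protocol-neutral tcp_opt state, and the caller supplies only addressing, routing, and MSS specifics before hashing the child:

	struct sock *xxx_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				       struct open_request *req,
				       struct dst_entry *dst)
	{
		struct sock *newsk = tcp_create_openreq_child(sk, req, skb);

		if (newsk == NULL)
			return NULL;
		/* ...af-specific: addresses, route lookup, MSS clamping... */
		newsk->prot->hash(newsk);	/* af-specific hash, as below */
		add_to_prot_sklist(newsk);
		return newsk;
	}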
struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct open_request *req,
struct dst_entry *dst)
@@ -1301,98 +1417,14 @@
if (sk->ack_backlog > sk->max_ack_backlog)
goto exit; /* head drop */
#endif
- newsk = sk_alloc(AF_INET, GFP_ATOMIC);
+ newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
goto exit;
#ifdef NEW_LISTEN
sk->ack_backlog++;
#endif
- memcpy(newsk, sk, sizeof(*newsk));
-
- /* Or else we die! -DaveM */
- newsk->sklist_next = NULL;
-
- newsk->opt = req->af.v4_req.opt;
- skb_queue_head_init(&newsk->write_queue);
- skb_queue_head_init(&newsk->receive_queue);
- skb_queue_head_init(&newsk->out_of_order_queue);
- skb_queue_head_init(&newsk->error_queue);
-
- /* Unused */
newtp = &(newsk->tp_pinfo.af_tcp);
- newtp->send_head = NULL;
- newtp->retrans_head = NULL;
-
- newtp->pending = 0;
-
- skb_queue_head_init(&newsk->back_log);
-
- newsk->prot->init(newsk);
-
- newtp->snd_cwnd_cnt = 0;
- newtp->backoff = 0;
- newsk->proc = 0;
- newsk->done = 0;
- newsk->pair = NULL;
- atomic_set(&newsk->wmem_alloc, 0);
- atomic_set(&newsk->rmem_alloc, 0);
- newsk->localroute = sk->localroute;
-
- newsk->err = 0;
- newsk->shutdown = 0;
- newsk->ack_backlog = 0;
-
- newtp->fin_seq = req->rcv_isn;
- newsk->syn_seq = req->rcv_isn;
- newsk->state = TCP_SYN_RECV;
- newsk->timeout = 0;
-
- newsk->write_seq = req->snt_isn;
-
- newtp->snd_wnd = ntohs(skb->h.th->window);
- newtp->max_window = newtp->snd_wnd;
- newtp->snd_wl1 = req->rcv_isn;
- newtp->snd_wl2 = newsk->write_seq;
- newtp->snd_una = newsk->write_seq++;
- newtp->snd_nxt = newsk->write_seq;
-
- newsk->urg_data = 0;
- newtp->packets_out = 0;
- newtp->retransmits = 0;
- newsk->linger=0;
- newsk->destroy = 0;
- init_timer(&newsk->timer);
- newsk->timer.data = (unsigned long) newsk;
- newsk->timer.function = &net_timer;
-
- tcp_init_xmit_timers(newsk);
-
- newsk->dummy_th.source =
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- req->lcl_port; /* LVE */
-#else
- sk->dummy_th.source;
-#endif
- newsk->dummy_th.dest = req->rmt_port;
- newsk->sock_readers=0;
-
- newtp->last_ack_sent = newtp->rcv_nxt = req->rcv_isn + 1;
- newtp->rcv_wup = req->rcv_isn + 1;
- newsk->copied_seq = req->rcv_isn + 1;
-
- newsk->socket = NULL;
-
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
- /*
- * Deal with possibly redirected traffic by setting num to
- * the intended destination port of the received packet.
- */
- newsk->num = ntohs(skb->h.th->dest);
-#endif
- newsk->daddr = req->af.v4_req.rmt_addr;
- newsk->saddr = req->af.v4_req.loc_addr;
- newsk->rcv_saddr = req->af.v4_req.loc_addr;
/* options / mss / route_cache */
if (dst == NULL) {
@@ -1418,7 +1450,6 @@
if (newsk->mtu < 64)
newsk->mtu = 64;
- newtp->sack_ok = req->sack_ok;
newtp->tstamp_ok = req->tstamp_ok;
newtp->window_clamp = req->window_clamp;
newtp->rcv_wnd = req->rcv_wnd;
@@ -1433,8 +1464,8 @@
if (newtp->tstamp_ok) {
newtp->ts_recent = req->ts_recent;
newtp->ts_recent_stamp = jiffies;
- newtp->tcp_header_len = sizeof(struct tcphdr) + 12; /* FIXME: define constant! */
- newsk->dummy_th.doff += 3;
+ newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+ newsk->dummy_th.doff += (TCPOLEN_TSTAMP_ALIGNED >> 2);
} else {
newtp->tcp_header_len = sizeof(struct tcphdr);
}
@@ -1446,13 +1477,13 @@
/* Make sure our mtu is adjusted for headers. */
newsk->mss = min(req->mss, snd_mss) + sizeof(struct tcphdr) - newtp->tcp_header_len;
- tcp_v4_hash(newsk);
+ /* Must use the af_specific ops here for the case of IPv6 mapped. */
+ newsk->prot->hash(newsk);
add_to_prot_sklist(newsk);
return newsk;
exit:
- if (dst)
- dst_release(dst);
+ dst_release(dst);
return NULL;
}
@@ -1623,6 +1654,8 @@
skb->used = 0;
+ if (sk->state == TCP_TIME_WAIT)
+ goto do_time_wait;
if (!sk->sock_readers)
return tcp_v4_do_rcv(sk, skb);
@@ -1636,6 +1669,12 @@
/* Discard frame. */
kfree_skb(skb);
return 0;
+
+do_time_wait:
+ if(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+ skb, th, &(IPCB(skb)->opt), skb->len))
+ goto no_tcp_socket;
+ goto discard_it;
}
int tcp_v4_build_header(struct sock *sk, struct sk_buff *skb)
@@ -1770,33 +1809,21 @@
sizeof(struct sockaddr_in)
};
+/* NOTE: A lot of things are set to zero explicitly by the call to
+ * sk_alloc(), so they need not be done here.
+ */
static int tcp_v4_init_sock(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
- skb_queue_head_init(&sk->out_of_order_queue);
+ skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
- tp->srtt = 0;
tp->rto = TCP_TIMEOUT_INIT; /*TCP_WRITE_TIME*/
tp->mdev = TCP_TIMEOUT_INIT;
-
- tp->ato = 0;
- tp->iat = (HZ/5) << 3;
-
- /* FIXME: tie this to sk->rcvbuf? (May be unnecessary) */
- /* tp->rcv_wnd = 8192; */
- tp->tstamp_ok = 0;
- tp->sack_ok = 0;
- tp->wscale_ok = 0;
tp->in_mss = 536;
- tp->snd_wscale = 0;
- tp->sacks = 0;
- tp->saw_tstamp = 0;
- tp->syn_backlog = 0;
- /*
- * See draft-stevens-tcpca-spec-01 for discussion of the
+ /* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_cwnd = 1;
@@ -1804,9 +1831,7 @@
sk->priority = 1;
sk->state = TCP_CLOSE;
-
sk->max_ack_backlog = SOMAXCONN;
-
sk->mtu = 576;
sk->mss = 536;
@@ -1824,6 +1849,7 @@
static int tcp_v4_destroy_sock(struct sock *sk)
{
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
struct sk_buff *skb;
tcp_clear_xmit_timers(sk);
@@ -1836,8 +1862,16 @@
kfree_skb(skb);
/* Cleans up our, hopefully empty, out_of_order_queue. */
- while((skb = skb_dequeue(&sk->out_of_order_queue)) != NULL)
+ while((skb = skb_dequeue(&tp->out_of_order_queue)) != NULL)
kfree_skb(skb);
+
+	/* Clean up a locked TCP bind bucket; this only happens if a
+	 * port was allocated for a socket that never fully connected,
+	 * in which case num is non-zero and daddr is zero.
+	 */
+ if(sk->daddr == 0 && sk->num != 0)
+ tcp_bucket_unlock(sk);
return 0;
}