patch-2.0.31 linux/net/ipv4/tcp_input.c
Next file: linux/net/ipv4/tcp_output.c
Previous file: linux/net/ipv4/tcp.c
Back to the patch index
Back to the overall index
- Lines: 591
- Date:
Wed Oct 15 14:47:56 1997
- Orig file:
v2.0.30/linux/net/ipv4/tcp_input.c
- Orig date:
Tue Apr 8 08:47:47 1997
diff -u --recursive --new-file v2.0.30/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -33,6 +33,9 @@
* : with SYN flooding attacks.
* David S. Miller : New socket lookup architecture for ISS.
* This code is dedicated to John Dyson.
+ * Elliot Poger : Added support for SO_BINDTODEVICE.
+ * Willy Konynenberg : Transparent proxy adapted to new
+ * socket hash code.
*/
#include <linux/config.h>
@@ -154,13 +157,13 @@
/*
* This code needs to be a bit more clever.
- * Does 2 minute timeouts now. Still just a circular buffer.
+ * Does 300 second timeouts now. Still just a circular buffer.
* At most 32 validations stored. New validations are ignored
* if all 32 validations are currently valid. To do otherwise
* allows a situation in which clearances are forgotten before
* they can be used (provided valid traffic is coming fast enough).
* The buffer should really be as long as the number of valid
- * connections we want to accept in an 2 minute period.
+ * connections we want to accept in a 300 second period.
* 32 is maybe to small. On the other hand, the validation check
* algorithm has to walk the whole table, which is also stupid.
* It would be better to have a combined hash/circular buffer.
@@ -185,7 +188,7 @@
int i;
for (i = 0; i < 32; i++)
if (clearances[i].saddr == saddr
- && clearances[i].tstamp > jiffies-HZ*120)
+ && clearances[i].tstamp > jiffies-HZ*300)
return 1;
return 0;
}
@@ -207,10 +210,12 @@
#ifdef CONFIG_SYN_COOKIES
/*
* MTU values we can represent in fall back mode.
- * FIXME. I sort of picked these out of a hat. I should
- * probably look around for docs on what common values are.
+ * These values are partially borrowed from Jeff Weisberg's SunOS
+ * implementation of SYNCOOKIES. I have added an extra limiting
+ * value of 64 to deal with the case of very small MTU values.
+ * (e.g. long delay packet radio links, 1200 baud modems.)
*/
-static __u32 cookie_mtu[8] = { 64, 128, 256, 296, 512, 576, 1024, 1500 };
+static __u32 cookie_mtu[8] = { 64, 256, 512, 536, 1024, 1440, 1460, 4312 };
#endif
extern void tcp_v4_hash(struct sock *sk);
@@ -222,27 +227,43 @@
* to specify the remote port nor the remote address for the
* connection. So always assume those are both wildcarded
* during the search since they can never be otherwise.
- *
- * XXX Later on, hash on both local port _and_ local address,
- * XXX to handle a huge IP alias'd box. Keep in mind that
- * XXX such a scheme will require us to run through the listener
- * XXX hash twice, once for local addresses bound, and once for
- * XXX the local address wildcarded (because the hash is different).
*/
-static struct sock *tcp_v4_lookup_longway(u32 daddr, unsigned short hnum)
+static struct sock *tcp_v4_lookup_longway(u32 daddr, unsigned short hnum,
+ struct device *dev)
{
struct sock *sk = tcp_listening_hash[tcp_lhashfn(hnum)];
struct sock *result = NULL;
+ int score, hiscore = 0;
for(; sk; sk = sk->next) {
if(sk->num == hnum) {
__u32 rcv_saddr = sk->rcv_saddr;
+ score = 1;
+ /* If this socket is bound to a particular IP address,
+ * does the dest IPaddr of the packet match it?
+ */
if(rcv_saddr) {
- if(rcv_saddr == daddr)
- return sk; /* Best possible match. */
- } else if(!result)
+ if(rcv_saddr != daddr)
+ continue;
+ score++;
+ }
+
+ /* If this socket is bound to a particular interface,
+ * did the packet come in on it? */
+ if (sk->bound_device) {
+ if (dev != sk->bound_device)
+ continue;
+ score++;
+ }
+
+ /* Check the score--max is 3. */
+ if (score == 3)
+ return sk; /* Best possible match. */
+ if (score > hiscore) {
+ hiscore = score;
result = sk;
+ }
}
}
return result;
@@ -252,7 +273,8 @@
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
*/
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport, u32 daddr, u16 dport)
+ u32 saddr, u16 sport, u32 daddr,
+ u16 dport, struct device *dev)
{
unsigned short hnum = ntohs(dport);
struct sock *sk;
@@ -266,83 +288,56 @@
if(sk->daddr == saddr && /* remote address */
sk->dummy_th.dest == sport && /* remote port */
sk->num == hnum && /* local port */
- sk->rcv_saddr == daddr) /* local address */
+ sk->rcv_saddr == daddr && /* local address */
+ ((sk->bound_device==NULL) || (sk->bound_device==dev)) )
goto hit; /* You sunk my battleship! */
- sk = tcp_v4_lookup_longway(daddr, hnum);
+ sk = tcp_v4_lookup_longway(daddr, hnum, dev);
hit:
return sk;
}
-__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport)
+__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
+ struct device *dev)
{
- return __tcp_v4_lookup(0, saddr, sport, daddr, dport);
+ return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dev);
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
-#define secondlist(hpnum, sk, fpass) \
-({ struct sock *s1; if(!(sk) && (fpass)--) \
- s1 = tcp_bound_hash[tcp_bhashfn(hpnum)]; \
- else \
- s1 = (sk); \
- s1; \
-})
-
-#define tcp_v4_proxy_loop_init(hnum, hpnum, sk, fpass) \
- secondlist((hpnum), tcp_bound_hash[tcp_bhashfn(hnum)],(fpass))
-
-#define tcp_v4_proxy_loop_next(hnum, hpnum, sk, fpass) \
- secondlist((hpnum),(sk)->bind_next,(fpass))
-
-struct sock *tcp_v4_proxy_lookup(unsigned short num, unsigned long raddr,
- unsigned short rnum, unsigned long laddr,
- unsigned long paddr, unsigned short pnum)
-{
- struct sock *s, *result = NULL;
- int badness = -1;
- unsigned short hnum = ntohs(num);
- unsigned short hpnum = ntohs(pnum);
- int firstpass = 1;
-
- /* This code must run only from NET_BH. */
- for(s = tcp_v4_proxy_loop_init(hnum, hpnum, s, firstpass);
- s != NULL;
- s = tcp_v4_proxy_loop_next(hnum, hpnum, s, firstpass)) {
- if(s->num == hnum || s->num == hpnum) {
- int score = 0;
- if(s->dead && (s->state == TCP_CLOSE))
- continue;
- if(s->rcv_saddr) {
- if((s->num != hpnum || s->rcv_saddr != paddr) &&
- (s->num != hnum || s->rcv_saddr != laddr))
- continue;
- score++;
- }
- if(s->daddr) {
- if(s->daddr != raddr)
- continue;
- score++;
- }
- if(s->dummy_th.dest) {
- if(s->dummy_th.dest != rnum)
- continue;
- score++;
- }
- if(score == 3 && s->num == hnum) {
- result = s;
- break;
- } else if(score > badness && (s->num == hpnum || s->rcv_saddr)) {
- result = s;
- badness = score;
- }
- }
+/* I am not entirely sure this is fully equivalent to the old lookup code, but it does
+ * look reasonable. WFK
+ */
+struct sock *tcp_v4_proxy_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 rport,
+ struct device *dev)
+{
+ unsigned short hnum = ntohs(dport);
+ unsigned short hrnum = ntohs(rport);
+ struct sock *sk;
+
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways. It is assumed that this code only
+ * gets called from within NET_BH.
+ */
+ sk = tcp_established_hash[tcp_hashfn(daddr, hnum, saddr, sport)];
+ for(; sk; sk = sk->next)
+ if(sk->daddr == saddr && /* remote address */
+ sk->dummy_th.dest == sport && /* remote port */
+ sk->num == hnum && /* local port */
+ sk->rcv_saddr == daddr && /* local address */
+ ((sk->bound_device==NULL) || (sk->bound_device==dev)) )
+ goto hit; /* You sunk my battleship! */
+ /* If we don't match on a bound socket, try to find one explicitly listening
+ * on the remote address (a proxy bind).
+ */
+ sk = tcp_v4_lookup_longway(daddr, hnum, dev);
+ /* If that didn't yield an exact match, look for a socket listening on the
+ * redirect port.
+ */
+ if (!sk || sk->rcv_saddr != daddr) {
+ sk = tcp_v4_lookup_longway(paddr, hrnum, dev);
}
- return result;
+hit:
+ return sk;
}
-
-#undef secondlist
-#undef tcp_v4_proxy_loop_init
-#undef tcp_v4_proxy_loop_next
-
#endif
/*
@@ -473,7 +468,7 @@
switch(opcode)
{
case TCPOPT_EOL:
- return;
+ goto ende;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
ptr--; /* the opsize=*ptr++ above was a mistake */
@@ -481,7 +476,7 @@
default:
if(opsize<=2) /* Avoid silly options looping forever */
- return;
+ goto ende;
switch(opcode)
{
case TCPOPT_MSS:
@@ -497,7 +492,7 @@
length-=opsize;
}
}
- if (th->syn)
+ende: if (th->syn)
{
if (! mss_seen)
sk->mtu=min(sk->mtu, 536); /* default MSS if none sent */
@@ -588,9 +583,10 @@
if (!tcp_clearance(saddr)) {
#endif
/* Only let this warning get printed once a minute. */
- if (jiffies - warning_time > HZ) {
+ if (jiffies - warning_time > HZ*60) {
warning_time = jiffies;
- printk(KERN_INFO "Warning: possible SYN flooding. Sending cookies.\n");
+ printk(KERN_INFO "Warning: possible SYN flood from %d.%d.%d.%d on %d.%d.%d.%d:%d. Sending cookies.\n",
+ NIPQUAD(saddr), NIPQUAD(daddr), ntohs(th->dest));
}
#ifdef CONFIG_RST_COOKIES
tcp_send_synack_probe(daddr, saddr, th, &tcp_prot,
@@ -598,6 +594,12 @@
#endif
#ifdef CONFIG_SYN_COOKIES
send_cookie = 1;
+#else
+ /* If we only have RST cookies we should
+ * not drop through to the rest of the response code.
+ */
+ kfree_skb(skb, FREE_READ);
+ return;
#endif
#ifdef CONFIG_RST_COOKIES
} else if (sk->ack_backlog >= 2*sk->max_ack_backlog) {
@@ -634,6 +636,8 @@
/* Or else we die! -DaveM */
newsk->sklist_next = NULL;
+ /* and die again -- erics */
+ newsk->pprev = NULL;
newsk->opt = NULL;
newsk->ip_route_cache = NULL;
@@ -667,7 +671,7 @@
newsk->rtt = 0;
newsk->rto = TCP_TIMEOUT_INIT;
newsk->mdev = TCP_TIMEOUT_INIT;
- newsk->max_window = 0;
+ newsk->max_window = 32; /* It cannot be left at zero. -DaveM */
/*
* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -736,8 +740,18 @@
newsk->daddr = saddr;
newsk->saddr = daddr;
newsk->rcv_saddr = daddr;
- tcp_v4_hash(newsk);
- add_to_prot_sklist(newsk);
+#ifdef CONFIG_SYN_COOKIES
+ /* Don't actually stuff the socket into the protocol lists
+ * if we are going to just destroy it anyway. We don't want any
+ * funnies happening if the next packet arrives before we get
+ * a chance to clean this one up.
+ */
+ if (!send_cookie)
+#endif
+ {
+ tcp_v4_hash(newsk);
+ add_to_prot_sklist(newsk);
+ }
newsk->acked_seq = skb->seq + 1;
newsk->copied_seq = skb->seq + 1;
@@ -758,7 +772,8 @@
* Note use of sk->user_mss, since user has no direct access to newsk
*/
- rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
+ rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0,
+ sk->bound_device);
newsk->ip_route_cache = rt;
if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
@@ -779,6 +794,14 @@
newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
+ /* Must check it here, just to be absolutely safe. If we end up
+ * with a newsk->{max_window,mtu} of zero, we can thus end up with
+ * a newsk->mss of zero, which causes us to bomb out in
+ * tcp_do_sendmsg. -DaveM
+ */
+ if(newsk->mtu < 32)
+ newsk->mtu = 32;
+
#ifdef CONFIG_SKIP
/*
@@ -918,7 +941,7 @@
newsk->rtt = 0;
newsk->rto = TCP_TIMEOUT_INIT;
newsk->mdev = TCP_TIMEOUT_INIT;
- newsk->max_window = 0;
+ newsk->max_window = 32; /* It cannot be left at zero. -DaveM */
/*
* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -1001,7 +1024,8 @@
newsk->ip_ttl=sk->ip_ttl;
newsk->ip_tos=skb->ip_hdr->tos;
- rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
+ rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0,
+ sk->bound_device);
newsk->ip_route_cache = rt;
if (rt!=NULL && (rt->rt_flags&RTF_WINDOW))
@@ -1263,8 +1287,12 @@
/* We need to be a bit careful to preserve the
* count of packets that are out in the system here.
*/
- sk->ssthresh = max(sk->cong_window >> 1, 2);
+ sk->ssthresh = max(
+ min(sk->cong_window,
+ (sk->window_seq-sk->rcv_ack_seq)/max(sk->mss,1))
+ >> 1, 2);
sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
+ sk->cong_count = 0;
tmp = sk->packets_out;
tcp_do_retransmit(sk,0);
sk->packets_out = tmp;
@@ -1284,6 +1312,7 @@
if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
/* Don't allow congestion window to drop to zero. */
sk->cong_window = max(sk->ssthresh, 1);
+ sk->cong_count = 0;
}
sk->window_seq = window_seq;
sk->rcv_ack_seq = ack;
@@ -1360,6 +1389,7 @@
*/
for (;;) {
+ int was_locked;
struct sk_buff * skb = sk->send_head;
if (!skb)
break;
@@ -1431,10 +1461,22 @@
* We may need to remove this from the dev send list.
*/
cli();
- if (skb->next)
+ was_locked = skb_device_locked(skb);
+
+ if (was_locked) {
+ /* In this case, we are relying on the fact that kfree_skb
+ * will just set the free flag to be 3, and increment
+ * a counter. It will not actually free anything, and
+ * will not take much time
+ */
+ kfree_skb(skb, FREE_WRITE);
+ } else {
skb_unlink(skb);
+ }
sti();
- kfree_skb(skb, FREE_WRITE); /* write. */
+
+ if (!was_locked)
+ kfree_skb(skb, FREE_WRITE); /* write. */
if (!sk->dead)
sk->write_space(sk);
}
@@ -1534,19 +1576,6 @@
}
/*
- * We have nothing queued but space to send. Send any partial
- * packets immediately (end of Nagle rule application).
- */
-
- if (sk->packets_out == 0
- && sk->partial != NULL
- && skb_queue_empty(&sk->write_queue)
- && sk->send_head == NULL)
- {
- tcp_send_partial(sk);
- }
-
- /*
* In the LAST_ACK case, the other end FIN'd us. We then FIN'd them, and
* we are now waiting for an acknowledge to our FIN. The other end is
* already in TIME_WAIT.
@@ -1618,6 +1647,14 @@
if (sk->state==TCP_SYN_RECV)
{
tcp_set_state(sk, TCP_ESTABLISHED);
+
+ /* Must check for peer advertising zero sized window
+ * or else we get a sk->{mtu,mss} of zero and thus bomb out
+ * in tcp_do_sendmsg. -DaveM
+ */
+ if(sk->max_window == 0)
+ sk->max_window = 32;
+
tcp_options(sk,th);
#if 0
@@ -1628,11 +1665,7 @@
sk->copied_seq = sk->acked_seq;
if(!sk->dead)
sk->state_change(sk);
- if(sk->max_window==0)
- {
- sk->max_window=32; /* Sanity check */
- sk->mss=min(sk->max_window,sk->mtu);
- }
+
/* Reset the RTT estimator to the initial
* state rather than testing to avoid
* updating it on the ACK to the SYN packet.
@@ -1650,11 +1683,20 @@
* If we are retransmitting, and we acked a packet on the retransmit
* queue, and there is still something in the retransmit queue,
* then we can output some retransmission packets.
+ *
+ * Note that we need to be a bit careful here about getting the
+ * correct TIME_WRITE timer set. If we just got an ack of a
+ * packet we were retransmitting, we will retransmit the next
+ * packet in the retransmit queue below, and the timeout
+ * should now start from the time we retransmitted that packet.
+ * The resetting of the TIME_WRITE timer above will have set it
+ * relative to the prior transmission time, which would be wrong.
*/
if (sk->send_head != NULL && (flag&2) && sk->retransmits)
{
tcp_do_retransmit(sk, 1);
+ tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
}
return 1;
@@ -2196,7 +2238,8 @@
struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
struct sock *sk;
- sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest);
+ sk = tcp_v4_lookup(iph->saddr, th->source, iph->daddr, th->dest,
+ skb->dev);
if (!sk)
return 0;
/* 0 means accept all LOCAL addresses here, not all the world... */
@@ -2262,7 +2305,12 @@
#ifdef CONFIG_SYN_COOKIES
retry_search:
#endif
- sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest);
+#ifdef CONFIG_IP_TRANSPARENT_PROXY
+ if (skb->redirport)
+ sk = tcp_v4_proxy_lookup(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport, dev);
+ else
+#endif
+ sk = __tcp_v4_lookup(th, saddr, th->source, daddr, th->dest, dev);
if (!sk)
goto no_tcp_socket;
skb->sk = sk;
@@ -2478,6 +2526,13 @@
*/
tcp_ack(sk,th,skb->ack_seq,len);
+ /* We must check here (before tcp_options) whether
+ * peer advertised a zero sized window on us, else
+ * we end up with a zero sk->{mtu,mss} and thus bomb
+ * out in tcp_do_sendmsg. -DaveM
+ */
+ if(sk->max_window == 0)
+ sk->max_window = 32;
/*
* Ok.. it's good. Set up sequence numbers and
@@ -2501,11 +2556,7 @@
sk->state_change(sk);
sock_wake_async(sk->socket, 0);
}
- if(sk->max_window==0)
- {
- sk->max_window = 32;
- sk->mss = min(sk->max_window, sk->mtu);
- }
+
/* Reset the RTT estimator to the initial
* state rather than testing to avoid
* updating it on the ACK to the SYN packet.
@@ -2571,11 +2622,17 @@
tcp_set_state(sk, TCP_CLOSE);
sk->shutdown = SHUTDOWN_MASK;
#ifdef CONFIG_IP_TRANSPARENT_PROXY
- sk = tcp_v4_proxy_lookup(th->dest, saddr, th->source, daddr,
- dev->pa_addr, skb->redirport);
-#else
- sk = NULL;
+ /* What to do here?
+ * For the non-proxy case, this code is effectively almost a no-op,
+ * due to the sk = NULL. Is that intentional? If so, why shouldn't we
+ * do the same for the proxy case and get rid of some useless code?
+ */
+ if (skb->redirport)
+ sk = tcp_v4_proxy_lookup(saddr, th->source, daddr, th->dest,
+ dev->pa_addr, skb->redirport, dev);
+ else
#endif
+ sk = NULL;
/* this is not really correct: we should check sk->users */
if (sk && sk->state==TCP_LISTEN)
{
@@ -2671,6 +2728,19 @@
if(tcp_data(skb,sk, saddr, len))
kfree_skb(skb, FREE_READ);
+
+ /*
+ * If we had a partial packet being held up due to
+ * application of Nagle's rule we are now free to send it.
+ */
+ if (th->ack
+ && sk->packets_out == 0
+ && sk->partial != NULL
+ && skb_queue_empty(&sk->write_queue)
+ && sk->send_head == NULL)
+ {
+ tcp_send_partial(sk);
+ }
/*
* If our receive queue has grown past its limits,
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov