patch-2.3.4 linux/net/ipv4/tcp_ipv4.c
- Lines: 584
- Date: Mon May 31 22:07:43 1999
- Orig file: v2.3.3/linux/net/ipv4/tcp_ipv4.c
- Orig date: Fri May 14 18:55:32 1999
diff -u --recursive --new-file v2.3.3/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_ipv4.c,v 1.176 1999/05/12 11:24:46 davem Exp $
+ * Version: $Id: tcp_ipv4.c,v 1.178 1999/05/30 01:16:27 davem Exp $
*
* IPv4 specific functions
*
@@ -90,12 +90,14 @@
* First half of the table is for sockets not in TIME_WAIT, second half
* is for TIME_WAIT sockets only.
*/
-struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+struct sock **tcp_ehash;
+int tcp_ehash_size;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
-struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+struct tcp_bind_bucket **tcp_bhash;
+int tcp_bhash_size;
/* All sockets in TCP_LISTEN state will be in here. This is the only table
* where wildcard'd TCP sockets can exist. Hash function here is just local
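The change above replaces the compile-time tables tcp_established_hash[TCP_HTABLE_SIZE] and tcp_bound_hash[TCP_BHTABLE_SIZE] with pointers plus runtime sizes (tcp_ehash/tcp_ehash_size, tcp_bhash/tcp_bhash_size), presumably so the tables can be allocated and sized at boot rather than fixed at build time. The masking in the hash functions below still requires the sizes to be powers of two. As an illustration only (plain user-space C, not the kernel's allocator; alloc_hash and the bucket count are invented), a minimal sketch of sizing such a table:

#include <stdio.h>
#include <stdlib.h>

struct sock;                          /* opaque for this sketch */

static struct sock **ehash;           /* stands in for tcp_ehash      */
static int ehash_size;                /* stands in for tcp_ehash_size */

/* Round the requested bucket count down to a power of two and allocate
 * a table of NULL chain heads.  Power-of-two sizing is what lets the
 * lookup code use "hash & (size - 1)" instead of a modulo.
 */
static int alloc_hash(int want)
{
    int size = 1;

    while ((size << 1) <= want)
        size <<= 1;

    ehash = calloc(size, sizeof(*ehash));
    if (ehash == NULL)
        return -1;
    ehash_size = size;
    return 0;
}

int main(void)
{
    if (alloc_hash(50000))            /* e.g. scaled from available RAM */
        return 1;
    printf("%d buckets: %d established + %d TIME_WAIT\n",
           ehash_size, ehash_size / 2, ehash_size / 2);
    return 0;
}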
@@ -117,7 +119,7 @@
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
- return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
+ return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
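With the table logically split into an established half and a TIME_WAIT half, tcp_hashfn() now masks the XOR-folded 4-tuple with (tcp_ehash_size >> 1) - 1, so the result always lands in the first half; the TIME_WAIT twin of a bucket sits at hash + (tcp_ehash_size >> 1), as the lookup hunk further down shows. A tiny stand-alone demonstration of the arithmetic (the 512-bucket size is hypothetical):

#include <stdint.h>
#include <stdio.h>

/* XOR-fold the connection 4-tuple, then mask into the first half of a
 * table whose total size (established + TIME_WAIT halves) is a power
 * of two.
 */
static int conn_hash(uint32_t laddr, uint16_t lport,
                     uint32_t faddr, uint16_t fport, int ehash_size)
{
    return ((laddr ^ lport) ^ (faddr ^ fport)) & ((ehash_size >> 1) - 1);
}

int main(void)
{
    int size = 512;                               /* hypothetical */
    int h = conn_hash(0x0a000001, 80, 0xc0a80001, 33333, size);

    printf("established bucket %d, TIME_WAIT bucket %d\n",
           h, h + (size >> 1));
    return 0;
}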
@@ -136,8 +138,8 @@
struct tcp_bind_bucket *tb;
unsigned short snum = sk->num;
- SOCKHASH_LOCK();
- for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+ SOCKHASH_LOCK_WRITE();
+ for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
if(tb->port == snum) {
if(tb->owners == NULL &&
(tb->flags & TCPB_FLAG_LOCKED)) {
@@ -148,9 +150,10 @@
break;
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
+/* The sockhash lock must be held as a writer here. */
struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
{
struct tcp_bind_bucket *tb;
@@ -158,7 +161,7 @@
tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
if(tb != NULL) {
struct tcp_bind_bucket **head =
- &tcp_bound_hash[tcp_bhashfn(snum)];
+ &tcp_bhash[tcp_bhashfn(snum)];
tb->port = snum;
tb->flags = TCPB_FLAG_LOCKED;
tb->owners = NULL;
@@ -176,13 +179,18 @@
*/
static __inline__ int tcp_bucket_check(unsigned short snum)
{
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ struct tcp_bind_bucket *tb;
+ int ret = 0;
+
+ SOCKHASH_LOCK_WRITE();
+ tb = tcp_bhash[tcp_bhashfn(snum)];
for( ; (tb && (tb->port != snum)); tb = tb->next)
;
if(tb == NULL && tcp_bucket_create(snum) == NULL)
- return 1;
- else
- return 0;
+ ret = 1;
+ SOCKHASH_UNLOCK_WRITE();
+
+ return ret;
}
#endif
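tcp_bucket_check() used to walk tcp_bound_hash unlocked and return early; it now takes the sockhash lock as a writer, because it may call tcp_bucket_create() (which, per the new comment, requires the write lock), and it funnels every path through a single unlock. A hedged user-space sketch of that check-or-create-under-write-lock shape, using pthread rwlocks (struct bucket, bhash and bucket_check are invented names):

#include <pthread.h>
#include <stdlib.h>

struct bucket {
    unsigned short port;
    struct bucket *next;
};

static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct bucket *bhash[64];              /* toy bound-port hash */

static int bhashfn(unsigned short port)
{
    return port & 63;
}

/* Return 0 if a bucket for 'port' already exists or could be created,
 * 1 on allocation failure.  The lock is taken as a writer because the
 * miss path inserts, and every exit goes through the single unlock.
 */
static int bucket_check(unsigned short port)
{
    struct bucket *b;
    int ret = 0;

    pthread_rwlock_wrlock(&hash_lock);
    for (b = bhash[bhashfn(port)]; b && b->port != port; b = b->next)
        ;
    if (b == NULL) {
        b = malloc(sizeof(*b));
        if (b == NULL) {
            ret = 1;
        } else {
            b->port = port;
            b->next = bhash[bhashfn(port)];
            bhash[bhashfn(port)] = b;
        }
    }
    pthread_rwlock_unlock(&hash_lock);
    return ret;
}

int main(void)
{
    return bucket_check(8080);
}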
@@ -191,8 +199,8 @@
struct tcp_bind_bucket *tb;
int result = 0;
- SOCKHASH_LOCK();
- for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ SOCKHASH_LOCK_WRITE();
+ for(tb = tcp_bhash[tcp_bhashfn(snum)];
(tb && (tb->port != snum));
tb = tb->next)
;
@@ -256,7 +264,7 @@
}
}
go_like_smoke:
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
return result;
}
@@ -268,13 +276,13 @@
int remaining = (high - low) + 1;
int rover;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
rover = tcp_port_rover;
do {
rover += 1;
if((rover < low) || (rover > high))
rover = low;
- tb = tcp_bound_hash[tcp_bhashfn(rover)];
+ tb = tcp_bhash[tcp_bhashfn(rover)];
for( ; tb; tb = tb->next) {
if(tb->port == rover)
goto next;
@@ -288,7 +296,7 @@
rover = 0;
if (tb != NULL)
tb->flags |= TCPB_FLAG_GOODSOCKNUM;
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
return rover;
}
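The local-port rover above tries at most (high - low + 1) candidate ports, wrapping back to low, and now runs entirely under the write lock since the chosen bucket is flagged before the lock is dropped. The rover loop in isolation, as an illustrative sketch (port_in_use is a stand-in for walking tcp_bhash):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for walking tcp_bhash[tcp_bhashfn(port)]. */
static bool port_in_use(int port)
{
    return port % 7 == 0;            /* pretend every 7th port is taken */
}

/* Try at most (high - low + 1) candidates starting just after *rover,
 * wrapping back to low; return 0 if every port in the range is in use.
 */
static int pick_local_port(int low, int high, int *rover)
{
    int remaining = high - low + 1;

    do {
        *rover += 1;
        if (*rover < low || *rover > high)
            *rover = low;
        if (!port_in_use(*rover))
            return *rover;
    } while (--remaining > 0);

    return 0;
}

int main(void)
{
    int rover = 1023;

    printf("picked local port %d\n", pick_local_port(1024, 4999, &rover));
    return 0;
}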
@@ -298,20 +306,20 @@
if (sk->state != TCP_CLOSE) {
struct sock **skp;
- SOCKHASH_LOCK();
- skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+ SOCKHASH_LOCK_WRITE();
+ skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
*skp = sk;
sk->pprev = skp;
tcp_sk_bindify(sk);
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
}
static void tcp_v4_unhash(struct sock *sk)
{
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
if(sk->pprev) {
if(sk->next)
sk->next->pprev = sk->pprev;
@@ -320,14 +328,14 @@
tcp_reg_zap(sk);
tcp_sk_unbindify(sk);
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
static void tcp_v4_rehash(struct sock *sk)
{
unsigned char state;
- SOCKHASH_LOCK();
+ SOCKHASH_LOCK_WRITE();
state = sk->state;
if(sk->pprev != NULL) {
if(sk->next)
@@ -342,7 +350,7 @@
if(state == TCP_LISTEN)
skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
else
- skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+ skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
if((sk->next = *skp) != NULL)
(*skp)->pprev = &sk->next;
@@ -351,7 +359,7 @@
if(state == TCP_LISTEN)
tcp_sk_bindify(sk);
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_WRITE();
}
/* Don't inline this cruft. Here are some nice properties to
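tcp_v4_hash(), tcp_v4_unhash() and tcp_v4_rehash() all take the sockhash lock as writers now, since they edit the chains. The chains use the next/pprev idiom visible above: pprev holds the address of whatever pointer currently points at the socket, so unlinking needs no head-versus-middle special case. A generic sketch of that idiom (struct node is invented for illustration):

#include <stdio.h>

struct node {
    int val;
    struct node *next;
    struct node **pprev;   /* address of the pointer that points at us */
};

static void node_add(struct node **head, struct node *n)
{
    if ((n->next = *head) != NULL)
        (*head)->pprev = &n->next;
    *head = n;
    n->pprev = head;
}

static void node_del(struct node *n)
{
    if (n->pprev) {
        if (n->next)
            n->next->pprev = n->pprev;
        *n->pprev = n->next;
        n->pprev = NULL;
    }
}

int main(void)
{
    struct node *head = NULL;
    struct node a = { 1, NULL, NULL }, b = { 2, NULL, NULL };

    node_add(&head, &a);
    node_add(&head, &b);       /* list is now b -> a */
    node_del(&b);              /* no special case for deleting the head */
    printf("head now holds %d\n", head->val);   /* prints 1 */
    return 0;
}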
@@ -395,10 +403,10 @@
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- * It is assumed that this code only gets called from within NET_BH.
+ *
+ * The sockhash lock must be held as a reader here.
*/
-static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
- u32 saddr, u16 sport,
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
u32 daddr, u16 dport, int dif)
{
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
@@ -416,7 +424,7 @@
* have wildcards anyways.
*/
hash = tcp_hashfn(daddr, hnum, saddr, sport);
- for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
+ for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
if (sk->state == TCP_ESTABLISHED)
TCP_RHASH(sport) = sk;
@@ -424,7 +432,7 @@
}
}
/* Must check for a TIME_WAIT'er before going to listener hash. */
- for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+ for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit;
sk = tcp_v4_lookup_listener(daddr, hnum, dif);
@@ -434,7 +442,13 @@
__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
{
- return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
+ struct sock *sk;
+
+ SOCKHASH_LOCK_READ();
+ sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
+ SOCKHASH_UNLOCK_READ();
+
+ return sk;
}
#ifdef CONFIG_IP_TRANSPARENT_PROXY
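The exported tcp_v4_lookup() becomes a thin wrapper that takes the sockhash lock for reading around __tcp_v4_lookup(); callers already inside a read-locked region (tcp_v4_verify_bind() and tcp_v4_rcv() below) keep calling the double-underscore helper directly. The same wrapper convention in user-space pthread terms (names invented):

#include <pthread.h>
#include <stddef.h>

struct sock;                  /* opaque for this sketch */

static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Caller must already hold hash_lock (as reader or writer). */
static struct sock *__lookup(unsigned int key)
{
    (void)key;
    return NULL;              /* chain walk elided in this sketch */
}

/* Locked wrapper for callers that do not hold the lock themselves. */
static struct sock *lookup(unsigned int key)
{
    struct sock *sk;

    pthread_rwlock_rdlock(&hash_lock);
    sk = __lookup(key);
    pthread_rwlock_unlock(&hash_lock);
    return sk;
}

int main(void)
{
    return lookup(1) != NULL;
}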
@@ -462,9 +476,12 @@
paddr = idev->ifa_list->ifa_local;
}
- /* This code must run only from NET_BH. */
+ /* We must obtain the sockhash lock here, we are always
+ * in BH context.
+ */
+ SOCKHASH_LOCK_READ_BH();
{
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+ struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
for( ; (tb && tb->port != hnum); tb = tb->next)
;
if(tb == NULL)
@@ -505,7 +522,7 @@
}
next:
if(firstpass--) {
- struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+ struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
for( ; (tb && tb->port != hpnum); tb = tb->next)
;
if(tb) {
@@ -514,6 +531,7 @@
}
}
gotit:
+ SOCKHASH_UNLOCK_READ_BH();
return result;
}
#endif /* CONFIG_IP_TRANSPARENT_PROXY */
@@ -540,21 +558,23 @@
int retval = 1;
/* Freeze the hash while we snoop around. */
- SOCKHASH_LOCK();
- tb = tcp_bound_hash[tcp_bhashfn(snum)];
+ SOCKHASH_LOCK_READ();
+ tb = tcp_bhash[tcp_bhashfn(snum)];
for(; tb; tb = tb->next) {
if(tb->port == snum && tb->owners != NULL) {
/* Almost certainly the re-use port case, search the real hashes
* so it actually scales.
*/
- sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
+ sk = __tcp_v4_lookup(sk->daddr, sk->dport,
sk->rcv_saddr, snum, sk->bound_dev_if);
+ SOCKHASH_UNLOCK_READ();
+
if((sk != NULL) && (sk->state != TCP_LISTEN))
retval = 0;
- break;
+ return retval;
}
}
- SOCKHASH_UNLOCK();
+ SOCKHASH_UNLOCK_READ();
return retval;
}
@@ -727,16 +747,17 @@
{
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- if (atomic_read(&sk->sock_readers))
- return;
-
- /* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
+ /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
* send out by Linux are always <576bytes so they should go through
* unfragmented).
*/
if (sk->state == TCP_LISTEN)
return;
+ bh_lock_sock(sk);
+ if(sk->lock.users != 0)
+ goto out;
+
/* We don't check in the destentry if pmtu discovery is forbidden
* on this route. We just assume that no packet_to_big packets
* are send back when pmtu discovery is not active.
@@ -744,7 +765,8 @@
* route, but I think that's acceptable.
*/
if (sk->dst_cache == NULL)
- return;
+ goto out;
+
ip_rt_update_pmtu(sk->dst_cache, mtu);
if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
tp->pmtu_cookie > sk->dst_cache->pmtu) {
@@ -757,6 +779,8 @@
*/
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
+out:
+ bh_unlock_sock(sk);
}
/*
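Where the old code simply returned when sk->sock_readers was non-zero, the PMTU handler now takes the per-socket BH lock and, if sk->lock.users shows a process currently owns the socket, skips the update through the out: label instead of racing with it. The general lock / check-owner / defer / single-unlock shape, as a hedged user-space sketch (owned_by_user stands in for sk->lock.users):

#include <pthread.h>
#include <stdio.h>

struct fake_sock {
    pthread_mutex_t bh_lock;   /* stands in for bh_lock_sock()/bh_unlock_sock() */
    int owned_by_user;         /* stands in for sk->lock.users                  */
    int pmtu;
};

/* "Softirq" side: update the path MTU unless a process currently owns
 * the socket, in which case skip the work rather than race with it.
 */
static void pmtu_event(struct fake_sock *sk, int new_mtu)
{
    pthread_mutex_lock(&sk->bh_lock);
    if (sk->owned_by_user)
        goto out;              /* don't touch state under the owner */
    if (new_mtu < sk->pmtu)
        sk->pmtu = new_mtu;
out:
    pthread_mutex_unlock(&sk->bh_lock);
}

int main(void)
{
    struct fake_sock s = { PTHREAD_MUTEX_INITIALIZER, 0, 1500 };

    pmtu_event(&s, 1400);
    printf("pmtu now %d\n", s.pmtu);   /* 1400 */
    return 0;
}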
@@ -849,17 +873,6 @@
switch (sk->state) {
struct open_request *req, *prev;
case TCP_LISTEN:
- /* Prevent race conditions with accept() -
- * ICMP is unreliable.
- */
- if (atomic_read(&sk->sock_readers)) {
- net_statistics.LockDroppedIcmps++;
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- return;
- }
-
/* The final ACK of the handshake should be already
* handled in the new socket context, not here.
* Strictly speaking - an ICMP error for the final
@@ -869,12 +882,24 @@
if (!no_flags && !th->syn && !th->ack)
return;
+ /* Prevent race conditions with accept() -
+ * ICMP is unreliable.
+ */
+ bh_lock_sock(sk);
+ if (sk->lock.users != 0) {
+ net_statistics.LockDroppedIcmps++;
+ /* If too many ICMPs get dropped on busy
+ * servers this needs to be solved differently.
+ */
+ goto out_unlock;
+ }
+
req = tcp_v4_search_req(tp, iph, th, &prev);
if (!req)
- return;
+ goto out_unlock;
if (seq != req->snt_isn) {
net_statistics.OutOfWindowIcmps++;
- return;
+ goto out_unlock;
}
if (req->sk) {
/*
@@ -884,6 +909,7 @@
* but only with the next operation on the socket after
* accept.
*/
+ bh_unlock_sock(sk);
sk = req->sk;
} else {
/*
@@ -896,6 +922,8 @@
tcp_synq_unlink(tp, req, prev);
req->class->destructor(req);
tcp_openreq_free(req);
+ out_unlock:
+ bh_unlock_sock(sk);
return;
}
break;
@@ -1025,9 +1053,10 @@
{
struct iphdr *iph = skb->nh.iph;
struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
- struct sock *sk;
+ struct sock *sk = NULL;
int i;
+ SOCKHASH_LOCK_READ();
for (i=0; i<TCP_LHTABLE_SIZE; i++) {
for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
struct open_request *dummy;
@@ -1035,10 +1064,12 @@
th, &dummy) &&
(!sk->bound_dev_if ||
sk->bound_dev_if == skb->dev->ifindex))
- return sk;
+ goto out;
}
}
- return NULL;
+out:
+ SOCKHASH_UNLOCK_READ();
+ return sk;
}
/*
@@ -1319,7 +1350,8 @@
/* Clone the TCP header template */
newsk->dport = req->rmt_port;
- atomic_set(&newsk->sock_readers, 0);
+ sock_lock_init(newsk);
+
atomic_set(&newsk->rmem_alloc, 0);
skb_queue_head_init(&newsk->receive_queue);
atomic_set(&newsk->wmem_alloc, 0);
@@ -1328,9 +1360,9 @@
newsk->done = 0;
newsk->proc = 0;
- newsk->pair = NULL;
- skb_queue_head_init(&newsk->back_log);
+ newsk->backlog.head = newsk->backlog.tail = NULL;
skb_queue_head_init(&newsk->error_queue);
+ newsk->write_space = tcp_write_space;
#ifdef CONFIG_FILTER
if ((filter = newsk->filter) != NULL)
sk_filter_charge(newsk, filter);
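The new socket is set up for the new locking scheme here: sock_lock_init() replaces the sock_readers counter, and the old back_log sk_buff queue gives way to the backlog.head/backlog.tail pair that sk_add_backlog() appends to. A minimal head/tail backlog in the same spirit (a simplified illustration, not the kernel's structures):

#include <stdio.h>

struct pkt {
    int id;
    struct pkt *next;
};

struct backlog {
    struct pkt *head;
    struct pkt *tail;
};

/* O(1) append; the queue is drained later, when the socket's owner
 * releases it.
 */
static void backlog_add(struct backlog *b, struct pkt *p)
{
    p->next = NULL;
    if (b->tail)
        b->tail->next = p;
    else
        b->head = p;
    b->tail = p;
}

static void backlog_drain(struct backlog *b, void (*rcv)(struct pkt *))
{
    struct pkt *p = b->head;

    b->head = b->tail = NULL;
    while (p) {
        struct pkt *next = p->next;

        rcv(p);
        p = next;
    }
}

static void show(struct pkt *p)
{
    printf("processing pkt %d\n", p->id);
}

int main(void)
{
    struct backlog b = { NULL, NULL };
    struct pkt p1 = { 1, NULL }, p2 = { 2, NULL };

    backlog_add(&b, &p1);
    backlog_add(&b, &p2);
    backlog_drain(&b, show);
    return 0;
}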
@@ -1552,7 +1584,8 @@
}
/* Check for SYN|ACK */
- if (flg & __constant_htonl(0x00120000)) {
+ flg &= __constant_htonl(0x00120000);
+ if (flg) {
struct open_request *req, *dummy;
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
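__constant_htonl(0x00120000) looks opaque until you recall that flg here is presumably the 32-bit TCP header word carrying the flag bits: in network byte order the flags byte occupies bits 16-23 of that word, so ACK (0x10) and SYN (0x02) together are 0x00120000. The rewrite masks flg down to those two bits before testing, rather than testing the raw word. A tiny user-space demonstration of the arithmetic:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* Bytes 12-15 of a TCP header for a SYN-ACK: data offset 5,
     * flags byte 0x12 (ACK|SYN), window 0.
     */
    unsigned char hdr_word[4] = { 0x50, 0x12, 0x00, 0x00 };
    uint32_t flg;

    memcpy(&flg, hdr_word, sizeof(flg));   /* load the word as-is */
    flg &= htonl(0x00120000);              /* keep only SYN and ACK */

    printf("masked word 0x%08x: %s\n", (unsigned)ntohl(flg),
           flg ? "SYN and/or ACK set" : "neither set");
    return 0;
}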
@@ -1570,8 +1603,17 @@
return sk;
}
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ int need_unlock = 0;
#ifdef CONFIG_FILTER
struct sk_filter *filter = sk->filter;
if (filter && sk_filter(skb, filter))
@@ -1591,7 +1633,6 @@
return 0;
}
-
if (sk->state == TCP_LISTEN) {
struct sock *nsk;
@@ -1604,17 +1645,22 @@
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
- if (atomic_read(&nsk->sock_readers)) {
- skb_orphan(skb);
- __skb_queue_tail(&nsk->back_log, skb);
- return 0;
+ if (nsk != sk) {
+ bh_lock_sock(nsk);
+ if (nsk->lock.users != 0) {
+ skb_orphan(skb);
+ sk_add_backlog(nsk, skb);
+ bh_unlock_sock(nsk);
+ return 0;
+ }
+ need_unlock = 1;
+ sk = nsk;
}
- sk = nsk;
}
if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
goto reset;
- return 0;
+ goto out_maybe_unlock;
reset:
tcp_v4_send_reset(skb);
@@ -1625,6 +1671,9 @@
* might be destroyed here. This current version compiles correctly,
* but you have been warned.
*/
+out_maybe_unlock:
+ if(need_unlock)
+ bh_unlock_sock(sk);
return 0;
}
@@ -1636,6 +1685,7 @@
{
struct tcphdr *th;
struct sock *sk;
+ int ret;
if (skb->pkt_type!=PACKET_HOST)
goto discard_it;
@@ -1681,8 +1731,10 @@
IPCB(skb)->redirport, skb->dev->ifindex);
else {
#endif
- sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
+ SOCKHASH_LOCK_READ_BH();
+ sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
+ SOCKHASH_UNLOCK_READ_BH();
#ifdef CONFIG_IP_TRANSPARENT_PROXY
if (!sk)
sk = tcp_v4_search_proxy_openreq(skb);
@@ -1702,11 +1754,16 @@
if (sk->state == TCP_TIME_WAIT)
goto do_time_wait;
- if (!atomic_read(&sk->sock_readers))
- return tcp_v4_do_rcv(sk, skb);
- __skb_queue_tail(&sk->back_log, skb);
- return 0;
+ bh_lock_sock(sk);
+ ret = 0;
+ if (!sk->lock.users)
+ ret = tcp_v4_do_rcv(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
+
+ return ret;
no_tcp_socket:
tcp_v4_send_reset(skb);
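This hunk is the heart of the new receive-path locking: instead of testing sock_readers and blindly appending to back_log, tcp_v4_rcv() takes the BH lock, processes the segment at once when no process owns the socket, and otherwise parks it on the backlog for the owner to replay when it releases the socket. A compact user-space model of that dispatch (names invented, echoing the backlog sketch earlier):

#include <pthread.h>
#include <stdio.h>

struct pkt {
    int id;
    struct pkt *next;
};

struct fake_sock {
    pthread_mutex_t bh_lock;   /* bh_lock_sock()/bh_unlock_sock() */
    int users;                 /* sk->lock.users                  */
    struct pkt *backlog_head, *backlog_tail;
};

static int do_rcv(struct fake_sock *sk, struct pkt *p)
{
    (void)sk;
    printf("pkt %d processed immediately\n", p->id);
    return 0;
}

/* Softirq-side dispatch: process now if no process owns the socket,
 * otherwise park the packet on the backlog for later replay.
 */
static int rcv(struct fake_sock *sk, struct pkt *p)
{
    int ret = 0;

    pthread_mutex_lock(&sk->bh_lock);
    if (sk->users == 0) {
        ret = do_rcv(sk, p);
    } else {
        p->next = NULL;
        if (sk->backlog_tail)
            sk->backlog_tail->next = p;
        else
            sk->backlog_head = p;
        sk->backlog_tail = p;
        printf("socket busy, pkt %d queued on backlog\n", p->id);
    }
    pthread_mutex_unlock(&sk->bh_lock);
    return ret;
}

int main(void)
{
    struct fake_sock s = { PTHREAD_MUTEX_INITIALIZER, 0, NULL, NULL };
    struct pkt p1 = { 1, NULL }, p2 = { 2, NULL };

    rcv(&s, &p1);      /* processed on the spot */
    s.users = 1;       /* pretend a process now owns the socket */
    rcv(&s, &p2);      /* goes to the backlog */
    return 0;
}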
@@ -1954,6 +2011,11 @@
if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
panic("Failed to create the TCP control socket.\n");
tcp_socket->sk->allocation=GFP_ATOMIC;
- tcp_socket->sk->num = 256; /* Don't receive any data */
tcp_socket->sk->ip_ttl = MAXTTL;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ tcp_socket->sk->prot->unhash(tcp_socket->sk);
}