patch-2.3.4 linux/net/ipv4/tcp_ipv4.c

diff -u --recursive --new-file v2.3.3/linux/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_ipv4.c,v 1.176 1999/05/12 11:24:46 davem Exp $
+ * Version:	$Id: tcp_ipv4.c,v 1.178 1999/05/30 01:16:27 davem Exp $
  *
  *		IPv4 specific functions
  *
@@ -90,12 +90,14 @@
  * First half of the table is for sockets not in TIME_WAIT, second half
  * is for TIME_WAIT sockets only.
  */
-struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+struct sock **tcp_ehash;
+int tcp_ehash_size;
 
 /* Ok, let's try this, I give up, we do need a local binding
  * TCP hash as well as the others for fast bind/connect.
  */
-struct tcp_bind_bucket *tcp_bound_hash[TCP_BHTABLE_SIZE];
+struct tcp_bind_bucket **tcp_bhash;
+int tcp_bhash_size;
 
 /* All sockets in TCP_LISTEN state will be in here.  This is the only table
  * where wildcard'd TCP sockets can exist.  Hash function here is just local
@@ -117,7 +119,7 @@
 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
 				 __u32 faddr, __u16 fport)
 {
-	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((TCP_HTABLE_SIZE/2) - 1);
+	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size >> 1) - 1);
 }
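
A note on the new mask: ((tcp_ehash_size >> 1) - 1) only picks out a chain in the first (non-TIME_WAIT) half of the table if tcp_ehash_size is a power of two. A minimal user-space sketch of the arithmetic, with a made-up size standing in for the value chosen at boot elsewhere in this patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel global; the real size is picked at boot.
 * It must be a power of two for the mask below to be an all-ones
 * bit pattern. */
static int tcp_ehash_size = 512;

static int tcp_hashfn(uint32_t laddr, uint16_t lport,
		      uint32_t faddr, uint16_t fport)
{
	assert((tcp_ehash_size & (tcp_ehash_size - 1)) == 0);
	return ((laddr ^ lport) ^ (faddr ^ fport)) &
	       ((tcp_ehash_size >> 1) - 1);
}

int main(void)
{
	int h = tcp_hashfn(0x0a000001, 80, 0x0a000002, 1025);

	/* The matching TIME_WAIT chain sits in the second half. */
	printf("live chain %d, TIME_WAIT chain %d\n",
	       h, h + (tcp_ehash_size >> 1));
	return 0;
}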
 
 static __inline__ int tcp_sk_hashfn(struct sock *sk)
@@ -136,8 +138,8 @@
 	struct tcp_bind_bucket *tb;
 	unsigned short snum = sk->num;
 
-	SOCKHASH_LOCK();
-	for(tb = tcp_bound_hash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
+	SOCKHASH_LOCK_WRITE();
+	for(tb = tcp_bhash[tcp_bhashfn(snum)]; tb; tb = tb->next) {
 		if(tb->port == snum) {
 			if(tb->owners == NULL &&
 			   (tb->flags & TCPB_FLAG_LOCKED)) {
@@ -148,9 +150,10 @@
 			break;
 		}
 	}
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_WRITE();
 }
 
+/* The sockhash lock must be held as a writer here. */
 struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
 {
 	struct tcp_bind_bucket *tb;
@@ -158,7 +161,7 @@
 	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
 	if(tb != NULL) {
 		struct tcp_bind_bucket **head =
-			&tcp_bound_hash[tcp_bhashfn(snum)];
+			&tcp_bhash[tcp_bhashfn(snum)];
 		tb->port = snum;
 		tb->flags = TCPB_FLAG_LOCKED;
 		tb->owners = NULL;
@@ -176,13 +179,18 @@
  */
 static __inline__ int tcp_bucket_check(unsigned short snum)
 {
-	struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(snum)];
+	struct tcp_bind_bucket *tb;
+	int ret = 0;
+
+	SOCKHASH_LOCK_WRITE();
+	tb = tcp_bhash[tcp_bhashfn(snum)];
 	for( ; (tb && (tb->port != snum)); tb = tb->next)
 		;
 	if(tb == NULL && tcp_bucket_create(snum) == NULL)
-		return 1;
-	else
-		return 0;
+		ret = 1;
+	SOCKHASH_UNLOCK_WRITE();
+
+	return ret;
 }
 #endif
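
tcp_bucket_check() now holds the hash lock as a writer across both the chain walk and the possible tcp_bucket_create(), so no other binder can slip a duplicate bucket in between the lookup and the insert. A user-space sketch of that check-then-create pattern, with a pthreads rwlock standing in for the SOCKHASH lock and a single chain standing in for the bhash:

#include <pthread.h>
#include <stdlib.h>

struct bucket { unsigned short port; struct bucket *next; };

static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct bucket *bhash_chain;	/* one chain, for brevity */

/* Caller must hold hash_lock as a writer, like tcp_bucket_create(). */
static struct bucket *bucket_create(unsigned short port)
{
	struct bucket *b = malloc(sizeof(*b));

	if (b) {
		b->port = port;
		b->next = bhash_chain;
		bhash_chain = b;
	}
	return b;
}

/* Nonzero on failure, matching the patched tcp_bucket_check(). */
static int bucket_check(unsigned short port)
{
	struct bucket *b;
	int ret = 0;

	pthread_rwlock_wrlock(&hash_lock);
	for (b = bhash_chain; b && b->port != port; b = b->next)
		;
	if (b == NULL && bucket_create(port) == NULL)
		ret = 1;
	pthread_rwlock_unlock(&hash_lock);

	return ret;
}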
 
@@ -191,8 +199,8 @@
 	struct tcp_bind_bucket *tb;
 	int result = 0;
 
-	SOCKHASH_LOCK();
-	for(tb = tcp_bound_hash[tcp_bhashfn(snum)];
+	SOCKHASH_LOCK_WRITE();
+	for(tb = tcp_bhash[tcp_bhashfn(snum)];
 	    (tb && (tb->port != snum));
 	    tb = tb->next)
 		;
@@ -256,7 +264,7 @@
 		}
 	}
 go_like_smoke:
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_WRITE();
 	return result;
 }
 
@@ -268,13 +276,13 @@
 	int remaining = (high - low) + 1;
 	int rover;
 
-	SOCKHASH_LOCK();
+	SOCKHASH_LOCK_WRITE();
 	rover = tcp_port_rover;
 	do {
 		rover += 1;
 		if((rover < low) || (rover > high))
 			rover = low;
-		tb = tcp_bound_hash[tcp_bhashfn(rover)];
+		tb = tcp_bhash[tcp_bhashfn(rover)];
 		for( ; tb; tb = tb->next) {
 			if(tb->port == rover)
 				goto next;
@@ -288,7 +296,7 @@
 		rover = 0;
 	if (tb != NULL)
 		tb->flags |= TCPB_FLAG_GOODSOCKNUM;
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_WRITE();
 
 	return rover;
 }
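
The loop above is the ephemeral-port allocator: a global rover cycles through the usable range under the writer lock, skipping any port that already owns a bind bucket, so successive connects spread across the range. A simplified model, where port_in_use() is a hypothetical stand-in for the tcp_bhash chain walk:

/* Hypothetical predicate for "some bind bucket owns this port". */
static int port_in_use(int port)
{
	return port == 1024;	/* pretend one port is taken */
}

static int port_rover;

static int get_local_port(int low, int high)
{
	int remaining = (high - low) + 1;
	int rover = port_rover;

	do {
		rover += 1;
		if ((rover < low) || (rover > high))
			rover = low;
		if (!port_in_use(rover))
			break;		/* found a free port */
	} while (--remaining > 0);

	port_rover = rover;		/* next caller resumes here */
	return (remaining > 0) ? rover : 0;	/* 0: range exhausted */
}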
@@ -298,20 +306,20 @@
 	if (sk->state != TCP_CLOSE) {
 		struct sock **skp;
 
-		SOCKHASH_LOCK();
-		skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+		SOCKHASH_LOCK_WRITE();
+		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
 		if((sk->next = *skp) != NULL)
 			(*skp)->pprev = &sk->next;
 		*skp = sk;
 		sk->pprev = skp;
 		tcp_sk_bindify(sk);
-		SOCKHASH_UNLOCK();
+		SOCKHASH_UNLOCK_WRITE();
 	}
 }
 
 static void tcp_v4_unhash(struct sock *sk)
 {
-	SOCKHASH_LOCK();
+	SOCKHASH_LOCK_WRITE();
 	if(sk->pprev) {
 		if(sk->next)
 			sk->next->pprev = sk->pprev;
@@ -320,14 +328,14 @@
 		tcp_reg_zap(sk);
 		tcp_sk_unbindify(sk);
 	}
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_WRITE();
 }
 
 static void tcp_v4_rehash(struct sock *sk)
 {
 	unsigned char state;
 
-	SOCKHASH_LOCK();
+	SOCKHASH_LOCK_WRITE();
 	state = sk->state;
 	if(sk->pprev != NULL) {
 		if(sk->next)
@@ -342,7 +350,7 @@
 		if(state == TCP_LISTEN)
 			skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
 		else
-			skp = &tcp_established_hash[(sk->hashent = tcp_sk_hashfn(sk))];
+			skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];
 
 		if((sk->next = *skp) != NULL)
 			(*skp)->pprev = &sk->next;
@@ -351,7 +359,7 @@
 		if(state == TCP_LISTEN)
 			tcp_sk_bindify(sk);
 	}
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_WRITE();
 }
 
 /* Don't inline this cruft.  Here are some nice properties to
@@ -395,10 +403,10 @@
 
 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
- * It is assumed that this code only gets called from within NET_BH.
+ *
+ * The sockhash lock must be held as a reader here.
  */
-static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
-					   u32 saddr, u16 sport,
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
 					   u32 daddr, u16 dport, int dif)
 {
 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
@@ -416,7 +424,7 @@
 	 * have wildcards anyways.
 	 */
 	hash = tcp_hashfn(daddr, hnum, saddr, sport);
-	for(sk = tcp_established_hash[hash]; sk; sk = sk->next) {
+	for(sk = tcp_ehash[hash]; sk; sk = sk->next) {
 		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
 			if (sk->state == TCP_ESTABLISHED)
 				TCP_RHASH(sport) = sk;
@@ -424,7 +432,7 @@
 		}
 	}
 	/* Must check for a TIME_WAIT'er before going to listener hash. */
-	for(sk = tcp_established_hash[hash+(TCP_HTABLE_SIZE/2)]; sk; sk = sk->next)
+	for(sk = tcp_ehash[hash+(tcp_ehash_size >> 1)]; sk; sk = sk->next)
 		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
 			goto hit;
 	sk = tcp_v4_lookup_listener(daddr, hnum, dif);
@@ -434,7 +442,13 @@
 
 __inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
 {
-	return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
+	struct sock *sk;
+
+	SOCKHASH_LOCK_READ();
+	sk = __tcp_v4_lookup(saddr, sport, daddr, dport, dif);
+	SOCKHASH_UNLOCK_READ();
+
+	return sk;
 }
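
__tcp_v4_lookup() has lost its unused tcphdr argument and gained a locking contract: the caller must already hold the sockhash lock as a reader. tcp_v4_lookup() above is the wrapper for callers that do not, while BH-context callers such as tcp_v4_rcv() further down wrap the same inner function in the _BH lock variants. A user-space sketch of the split, with a pthreads rwlock as the stand-in lock:

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

struct sock {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	struct sock *next;
};

static pthread_rwlock_t hash_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct sock *ehash_chain;	/* one chain, for brevity */

/* Double-underscore convention: caller holds hash_lock as a reader. */
static struct sock *__lookup(uint32_t saddr, uint16_t sport,
			     uint32_t daddr, uint16_t dport)
{
	struct sock *sk;

	for (sk = ehash_chain; sk; sk = sk->next)
		if (sk->saddr == saddr && sk->sport == sport &&
		    sk->daddr == daddr && sk->dport == dport)
			return sk;
	return NULL;
}

/* Public wrapper for process context, like the patched tcp_v4_lookup(). */
static struct sock *lookup(uint32_t saddr, uint16_t sport,
			   uint32_t daddr, uint16_t dport)
{
	struct sock *sk;

	pthread_rwlock_rdlock(&hash_lock);
	sk = __lookup(saddr, sport, daddr, dport);
	pthread_rwlock_unlock(&hash_lock);

	return sk;
}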
 
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
@@ -462,9 +476,12 @@
 			paddr = idev->ifa_list->ifa_local;
 	}
 
-	/* This code must run only from NET_BH. */
+	/* We must obtain the sockhash lock here; we are always
+	 * in BH context.
+	 */
+	SOCKHASH_LOCK_READ_BH();
 	{
-		struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hnum)];
+		struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];
 		for( ; (tb && tb->port != hnum); tb = tb->next)
 			;
 		if(tb == NULL)
@@ -505,7 +522,7 @@
 	}
 next:
 	if(firstpass--) {
-		struct tcp_bind_bucket *tb = tcp_bound_hash[tcp_bhashfn(hpnum)];
+		struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hpnum)];
 		for( ; (tb && tb->port != hpnum); tb = tb->next)
 			;
 		if(tb) {
@@ -514,6 +531,7 @@
 		}
 	}
 gotit:
+	SOCKHASH_UNLOCK_READ_BH();
 	return result;
 }
 #endif /* CONFIG_IP_TRANSPARENT_PROXY */
@@ -540,21 +558,23 @@
 	int retval = 1;
 
 	/* Freeze the hash while we snoop around. */
-	SOCKHASH_LOCK();
-	tb = tcp_bound_hash[tcp_bhashfn(snum)];
+	SOCKHASH_LOCK_READ();
+	tb = tcp_bhash[tcp_bhashfn(snum)];
 	for(; tb; tb = tb->next) {
 		if(tb->port == snum && tb->owners != NULL) {
 			/* Almost certainly the re-use port case, search the real hashes
 			 * so it actually scales.
 			 */
-			sk = __tcp_v4_lookup(NULL, sk->daddr, sk->dport,
+			sk = __tcp_v4_lookup(sk->daddr, sk->dport,
 					     sk->rcv_saddr, snum, sk->bound_dev_if);
+			SOCKHASH_UNLOCK_READ();
+
 			if((sk != NULL) && (sk->state != TCP_LISTEN))
 				retval = 0;
-			break;
+			return retval;
 		}
 	}
-	SOCKHASH_UNLOCK();
+	SOCKHASH_UNLOCK_READ();
 	return retval;
 }
 
@@ -727,16 +747,17 @@
 {
 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	if (atomic_read(&sk->sock_readers))
-		return;
-
-	/* Don't interested in TCP_LISTEN and open_requests (SYN-ACKs
+	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 	 * send out by Linux are always <576bytes so they should go through
 	 * unfragmented).
 	 */
 	if (sk->state == TCP_LISTEN)
 		return; 
 
+	bh_lock_sock(sk);
+	if(sk->lock.users != 0)
+		goto out;
+
 	/* We don't check in the destentry if pmtu discovery is forbidden
 	 * on this route. We just assume that no packet_to_big packets
 	 * are send back when pmtu discovery is not active.
@@ -744,7 +765,8 @@
 	 * route, but I think that's acceptable.
 	 */
 	if (sk->dst_cache == NULL)
-		return;
+		goto out;
+
 	ip_rt_update_pmtu(sk->dst_cache, mtu);
 	if (sk->ip_pmtudisc != IP_PMTUDISC_DONT &&
 	    tp->pmtu_cookie > sk->dst_cache->pmtu) {
@@ -757,6 +779,8 @@
 		 */
 		tcp_simple_retransmit(sk);
 	} /* else let the usual retransmit timer handle it */
+out:
+	bh_unlock_sock(sk);
 }
 
 /*
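
The PMTU hunks above show the other half of the new locking scheme, the per-socket lock: bh_lock_sock() takes the socket spinlock, and a nonzero sk->lock.users means a process currently owns the socket, so BH code backs off rather than touching shared state (here the PMTU event is simply dropped, as the ICMP path below does with net_statistics.LockDroppedIcmps). A compact model of the check, using a mutex as a stand-in for the BH spinlock:

#include <pthread.h>

/* Model of the 2.3.x socket lock: lock.users != 0 means a process
 * context owns the socket.  Initialization is left to the caller. */
struct sock {
	struct {
		pthread_mutex_t slock;
		int users;
	} lock;
	int pmtu;
};

static void handle_pmtu_event(struct sock *sk, int mtu)
{
	pthread_mutex_lock(&sk->lock.slock);	/* bh_lock_sock(sk) */
	if (sk->lock.users != 0)
		goto out;	/* a process owns it: drop the event */

	sk->pmtu = mtu;		/* safe: BH is the exclusive owner here */
out:
	pthread_mutex_unlock(&sk->lock.slock);	/* bh_unlock_sock(sk) */
}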
@@ -849,17 +873,6 @@
 	switch (sk->state) {
 		struct open_request *req, *prev;
 	case TCP_LISTEN:
-		/* Prevent race conditions with accept() - 
-		 * ICMP is unreliable. 
-		 */
-		if (atomic_read(&sk->sock_readers)) {
-			net_statistics.LockDroppedIcmps++;
-			 /* If too many ICMPs get dropped on busy
-			  * servers this needs to be solved differently.
-			  */
-			return;
-		}
-
 		/* The final ACK of the handshake should be already 
 		 * handled in the new socket context, not here.
 		 * Strictly speaking - an ICMP error for the final
@@ -869,12 +882,24 @@
 		if (!no_flags && !th->syn && !th->ack)
 			return;
 
+		/* Prevent race conditions with accept() - 
+		 * ICMP is unreliable. 
+		 */
+		bh_lock_sock(sk);
+		if (sk->lock.users != 0) {
+			net_statistics.LockDroppedIcmps++;
+			 /* If too many ICMPs get dropped on busy
+			  * servers this needs to be solved differently.
+			  */
+			goto out_unlock;
+		}
+
 		req = tcp_v4_search_req(tp, iph, th, &prev); 
 		if (!req)
-			return;
+			goto out_unlock;
 		if (seq != req->snt_isn) {
 			net_statistics.OutOfWindowIcmps++;
-			return;
+			goto out_unlock;
 		}
 		if (req->sk) {	
 			/* 
@@ -884,6 +909,7 @@
 			 * but only with the next operation on the socket after
 			 * accept. 
 			 */
+			bh_unlock_sock(sk);
 			sk = req->sk;
 		} else {
 			/* 
@@ -896,6 +922,8 @@
 			tcp_synq_unlink(tp, req, prev);
 			req->class->destructor(req);
 			tcp_openreq_free(req);
+	out_unlock:
+			bh_unlock_sock(sk);
 			return; 
 		}
 		break;
@@ -1025,9 +1053,10 @@
 {
 	struct iphdr *iph = skb->nh.iph;
 	struct tcphdr *th = (struct tcphdr *)(skb->nh.raw + iph->ihl*4);
-	struct sock *sk;
+	struct sock *sk = NULL;
 	int i;
 
+	SOCKHASH_LOCK_READ();
 	for (i=0; i<TCP_LHTABLE_SIZE; i++) {
 		for(sk = tcp_listening_hash[i]; sk; sk = sk->next) {
 			struct open_request *dummy;
@@ -1035,10 +1064,12 @@
 					      th, &dummy) &&
 			    (!sk->bound_dev_if ||
 			     sk->bound_dev_if == skb->dev->ifindex))
-				return sk;
+				goto out;
 		}
 	}
-	return NULL;
+out:
+	SOCKHASH_UNLOCK_READ();
+	return sk;
 }
 
 /*
@@ -1319,7 +1350,8 @@
 		/* Clone the TCP header template */
 		newsk->dport = req->rmt_port;
 
-		atomic_set(&newsk->sock_readers, 0);
+		sock_lock_init(newsk);
+
 		atomic_set(&newsk->rmem_alloc, 0);
 		skb_queue_head_init(&newsk->receive_queue);
 		atomic_set(&newsk->wmem_alloc, 0);
@@ -1328,9 +1360,9 @@
 
 		newsk->done = 0;
 		newsk->proc = 0;
-		newsk->pair = NULL;
-		skb_queue_head_init(&newsk->back_log);
+		newsk->backlog.head = newsk->backlog.tail = NULL;
 		skb_queue_head_init(&newsk->error_queue);
+		newsk->write_space = tcp_write_space;
 #ifdef CONFIG_FILTER
 		if ((filter = newsk->filter) != NULL)
 			sk_filter_charge(newsk, filter);
@@ -1552,7 +1584,8 @@
 	}
 
 	/* Check for SYN|ACK */
-	if (flg & __constant_htonl(0x00120000)) {
+	flg &= __constant_htonl(0x00120000);
+	if (flg) {
 		struct open_request *req, *dummy; 
 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
@@ -1570,8 +1603,17 @@
 	return sk; 
 }
 
+/* The socket must have its spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	int need_unlock = 0;
 #ifdef CONFIG_FILTER
 	struct sk_filter *filter = sk->filter;
 	if (filter && sk_filter(skb, filter))
@@ -1591,7 +1633,6 @@
 		return 0; 
 	} 
 
-
 	if (sk->state == TCP_LISTEN) { 
 		struct sock *nsk;
 		
@@ -1604,17 +1645,22 @@
 		 * otherwise we just shortcircuit this and continue with
 		 * the new socket..
 		 */
-		if (atomic_read(&nsk->sock_readers)) {
-			skb_orphan(skb);
-			__skb_queue_tail(&nsk->back_log, skb);
-			return 0;
+		if (nsk != sk) {
+			bh_lock_sock(nsk);
+			if (nsk->lock.users != 0) {
+				skb_orphan(skb);
+				sk_add_backlog(nsk, skb);
+				bh_unlock_sock(nsk);
+				return 0;
+			}
+			need_unlock = 1;
+			sk = nsk;
 		}
-		sk = nsk;
 	}
 	
 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
 		goto reset;
-	return 0;
+	goto out_maybe_unlock;
 
 reset:
 	tcp_v4_send_reset(skb);
@@ -1625,6 +1671,9 @@
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
+out_maybe_unlock:
+	if(need_unlock)
+		bh_unlock_sock(sk);
 	return 0;
 }
 
@@ -1636,6 +1685,7 @@
 {
 	struct tcphdr *th;
 	struct sock *sk;
+	int ret;
 
 	if (skb->pkt_type!=PACKET_HOST)
 		goto discard_it;
@@ -1681,8 +1731,10 @@
 					 IPCB(skb)->redirport, skb->dev->ifindex);
 	else {
 #endif
-		sk = __tcp_v4_lookup(th, skb->nh.iph->saddr, th->source,
+		SOCKHASH_LOCK_READ_BH();
+		sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
 				     skb->nh.iph->daddr, th->dest, skb->dev->ifindex);
+		SOCKHASH_UNLOCK_READ_BH();
 #ifdef CONFIG_IP_TRANSPARENT_PROXY
 		if (!sk)
 			sk = tcp_v4_search_proxy_openreq(skb);
@@ -1702,11 +1754,16 @@
 
 	if (sk->state == TCP_TIME_WAIT)
 		goto do_time_wait;
-	if (!atomic_read(&sk->sock_readers))
-		return tcp_v4_do_rcv(sk, skb);
 
-	__skb_queue_tail(&sk->back_log, skb);
-	return 0;
+	bh_lock_sock(sk);
+	ret = 0;
+	if (!sk->lock.users)
+		ret = tcp_v4_do_rcv(sk, skb);
+	else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
+
+	return ret;
 
 no_tcp_socket:
 	tcp_v4_send_reset(skb);
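
This tail of tcp_v4_rcv() is the heart of the conversion away from the old sock_readers counter and back_log queue: under the socket spinlock, a packet is processed immediately when no process owns the socket, and queued with sk_add_backlog() otherwise; the owning process replays the backlog when it releases the socket. tcp_v4_do_rcv() above applies the same test to a child socket handed off by a listener, nesting the child's lock inside the listener's (the double-lock case its comment warns about). A user-space model of the dispatch:

#include <pthread.h>
#include <stddef.h>

struct skb { struct skb *next; };

struct sock {
	pthread_mutex_t slock;	/* stand-in for the BH spinlock */
	int users;		/* nonzero: a process owns the socket */
	struct skb *backlog_head, *backlog_tail;
};

/* Stub for the real protocol work done by tcp_v4_do_rcv(). */
static int do_rcv(struct sock *sk, struct skb *skb)
{
	(void)sk; (void)skb;
	return 0;
}

/* sk_add_backlog() stand-in: chain the packet for later replay by
 * whoever owns the socket when it finally releases it. */
static void add_backlog(struct sock *sk, struct skb *skb)
{
	skb->next = NULL;
	if (sk->backlog_tail)
		sk->backlog_tail->next = skb;
	else
		sk->backlog_head = skb;
	sk->backlog_tail = skb;
}

/* BH input path, mirroring the patched tail of tcp_v4_rcv(). */
static int rcv(struct sock *sk, struct skb *skb)
{
	int ret = 0;

	pthread_mutex_lock(&sk->slock);		/* bh_lock_sock(sk) */
	if (!sk->users)
		ret = do_rcv(sk, skb);		/* nobody owns it: run now */
	else
		add_backlog(sk, skb);		/* defer to the lock owner */
	pthread_mutex_unlock(&sk->slock);	/* bh_unlock_sock(sk) */

	return ret;
}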
@@ -1954,6 +2011,11 @@
 	if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
 		panic("Failed to create the TCP control socket.\n");
 	tcp_socket->sk->allocation=GFP_ATOMIC;
-	tcp_socket->sk->num = 256;		/* Don't receive any data */
 	tcp_socket->sk->ip_ttl = MAXTTL;
+
+	/* Unhash it so that IP input processing does not even
+	 * see it; we do not wish this socket to see incoming
+	 * packets.
+	 */
+	tcp_socket->sk->prot->unhash(tcp_socket->sk);
 }
