patch-2.3.38 linux/net/ipv4/route.c
- Lines: 524
- Date: Thu Jan 6 16:18:30 2000
- Orig file: v2.3.37/linux/net/ipv4/route.c
- Orig date: Thu Jan 6 12:57:48 2000
diff -u --recursive --new-file v2.3.37/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.75 1999/12/23 01:41:44 davem Exp $
+ * Version: $Id: route.c,v 1.77 2000/01/06 00:41:59 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -103,8 +103,7 @@
int ip_rt_min_delay = 2*HZ;
int ip_rt_max_delay = 10*HZ;
-int ip_rt_gc_thresh = RT_HASH_DIVISOR;
-int ip_rt_max_size = RT_HASH_DIVISOR*16;
+int ip_rt_max_size;
int ip_rt_gc_timeout = RT_GC_TIMEOUT;
int ip_rt_gc_interval = 60*HZ;
int ip_rt_gc_min_interval = 5*HZ;
@@ -122,12 +121,8 @@
#define RTprint(a...) printk(KERN_DEBUG a)
-static void rt_run_flush(unsigned long dummy);
-
-static struct timer_list rt_flush_timer =
- { NULL, NULL, 0, 0L, rt_run_flush };
-static struct timer_list rt_periodic_timer =
- { NULL, NULL, 0, 0L, NULL };
+static struct timer_list rt_flush_timer;
+static struct timer_list rt_periodic_timer;
/*
* Interface to generic destination cache.
@@ -146,7 +141,7 @@
{
AF_INET,
__constant_htons(ETH_P_IP),
- RT_HASH_DIVISOR,
+ 0,
rt_garbage_collect,
ipv4_dst_check,
@@ -183,7 +178,7 @@
/* The locking scheme is rather straight forward:
*
- * 1) A BH protected rwlock protects the central route hash.
+ * 1) A BH protected rwlocks protect buckets of the central route hash.
* 2) Only writers remove entries, and they hold the lock
* as they look at rtable reference counts.
* 3) Only readers acquire references to rtable entries,
@@ -191,17 +186,23 @@
* lock held.
*/
-static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
-static rwlock_t rt_hash_lock = RW_LOCK_UNLOCKED;
+struct rt_hash_bucket {
+ struct rtable *chain;
+ rwlock_t lock;
+} __attribute__((__aligned__(8)));
+
+static struct rt_hash_bucket *rt_hash_table;
+static unsigned rt_hash_mask;
+static int rt_hash_log;
static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
- hash = hash^saddr^tos;
- hash = hash^(hash>>16);
- return (hash^(hash>>8)) & 0xFF;
+ hash ^= saddr^tos;
+ hash ^= (hash>>16);
+ return (hash^(hash>>8)) & rt_hash_mask;
}
#ifndef CONFIG_PROC_FS
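
[Aside, not part of the patch itself: the hunks above replace the single rt_hash_lock with one rwlock per bucket and let rt_hash_code() mask against the runtime-sized rt_hash_mask instead of the old fixed 0xFF. The fold-and-mask hashing can be exercised in a stand-alone C sketch like the one below; the 1024-bucket table size and the sample addresses are made-up values, not anything taken from the kernel.]

	/* Stand-alone sketch of the fold-and-mask hash above.
	 * The 1024-bucket size and the sample addresses are arbitrary. */
	#include <stdio.h>
	#include <stdint.h>

	static unsigned rt_hash_mask = 1024 - 1;	/* assumes a 1024-bucket table */

	static unsigned rt_hash_code(uint32_t daddr, uint32_t saddr, uint8_t tos)
	{
		/* swap the nibbles of daddr, then fold in saddr and tos */
		unsigned hash = ((daddr & 0xF0F0F0F0) >> 4) | ((daddr & 0x0F0F0F0F) << 4);
		hash ^= saddr ^ tos;
		hash ^= (hash >> 16);
		return (hash ^ (hash >> 8)) & rt_hash_mask;
	}

	int main(void)
	{
		uint32_t daddr = 0xc0a80001;	/* 192.168.0.1; byte order ignored for the demo */
		uint32_t saddr = 0x0a000001;	/* 10.0.0.1 */
		printf("bucket = %u of %u\n", rt_hash_code(daddr, saddr, 0), rt_hash_mask + 1);
		return 0;
	}
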
@@ -222,11 +223,9 @@
len = 128;
}
-
- read_lock_bh(&rt_hash_lock);
-
- for (i = 0; i<RT_HASH_DIVISOR; i++) {
- for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
+ for (i = rt_hash_mask; i>=0; i--) {
+ read_lock_bh(&rt_hash_table[i].lock);
+ for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
/*
* Spin through entries until we are ready
*/
@@ -253,14 +252,15 @@
r->rt_spec_dst);
sprintf(buffer+len,"%-127s\n",temp);
len += 128;
- if (pos >= offset+length)
+ if (pos >= offset+length) {
+ read_unlock_bh(&rt_hash_table[i].lock);
goto done;
+ }
}
+ read_unlock_bh(&rt_hash_table[i].lock);
}
done:
- read_unlock_bh(&rt_hash_lock);
-
*start = buffer+len-(pos-offset);
len = pos-offset;
if (len>length)
@@ -315,21 +315,23 @@
/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
- int i;
+ int i, t;
static int rover;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
- for (i=0; i<RT_HASH_DIVISOR/5; i++) {
+ i = rover;
+
+ for (t=(ip_rt_gc_interval<<rt_hash_log); t>=0; t -= ip_rt_gc_timeout) {
unsigned tmo = ip_rt_gc_timeout;
- rover = (rover + 1) & (RT_HASH_DIVISOR-1);
- rthp = &rt_hash_table[rover];
+ i = (i + 1) & rt_hash_mask;
+ rthp = &rt_hash_table[i].chain;
- write_lock(&rt_hash_lock);
+ write_lock(&rt_hash_table[i].lock);
while ((rth = *rthp) != NULL) {
if (rth->u.dst.expires) {
- /* Entrie is expired even if it is in use */
+ /* Entry is expired even if it is in use */
if ((long)(now - rth->u.dst.expires) <= 0) {
tmo >>= 1;
rthp = &rth->u.rt_next;
@@ -347,14 +349,14 @@
*rthp = rth->u.rt_next;
rt_free(rth);
}
- write_unlock(&rt_hash_lock);
+ write_unlock(&rt_hash_table[i].lock);
/* Fallback loop breaker. */
if ((jiffies - now) > 0)
break;
}
- rt_periodic_timer.expires = now + ip_rt_gc_interval;
- add_timer(&rt_periodic_timer);
+ rover = i;
+ mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}
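
[Aside on the new rt_check_expire() loop: each run gets a budget of ip_rt_gc_interval<<rt_hash_log and spends ip_rt_gc_timeout of it per bucket, so the rover should cover the whole table roughly once every ip_rt_gc_timeout, independent of the table size. The back-of-the-envelope check below uses assumed values for HZ, RT_GC_TIMEOUT and the bucket count, since they are not defined in this file.]

	/* Rough arithmetic for the per-run bucket budget in rt_check_expire().
	 * HZ, RT_GC_TIMEOUT and the table size below are assumed values. */
	#include <stdio.h>

	int main(void)
	{
		const long HZ = 100;				/* assumption */
		const long ip_rt_gc_interval = 60 * HZ;
		const long ip_rt_gc_timeout  = 300 * HZ;	/* assumed RT_GC_TIMEOUT */
		const long buckets = 4096;			/* assumed rt_hash_mask + 1 */

		long budget  = ip_rt_gc_interval * buckets;	/* ip_rt_gc_interval << rt_hash_log */
		long per_run = budget / ip_rt_gc_timeout;	/* buckets visited per timer run */

		printf("%ld of %ld buckets per run -> full sweep in ~%ld runs (~%ld s)\n",
		       per_run, buckets, buckets / per_run,
		       (buckets / per_run) * ip_rt_gc_interval / HZ);
		return 0;
	}
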
/* This can run from both BH and non-BH contexts, the latter
@@ -367,11 +369,12 @@
rt_deadline = 0;
- for (i=0; i<RT_HASH_DIVISOR; i++) {
- write_lock_bh(&rt_hash_lock);
- rth = rt_hash_table[i];
- rt_hash_table[i] = NULL;
- write_unlock_bh(&rt_hash_lock);
+ for (i=rt_hash_mask; i>=0; i--) {
+ write_lock_bh(&rt_hash_table[i].lock);
+ rth = rt_hash_table[i].chain;
+ if (rth)
+ rt_hash_table[i].chain = NULL;
+ write_unlock_bh(&rt_hash_table[i].lock);
for (; rth; rth=next) {
next = rth->u.rt_next;
@@ -418,8 +421,7 @@
if (rt_deadline == 0)
rt_deadline = now + ip_rt_max_delay;
- rt_flush_timer.expires = now + delay;
- add_timer(&rt_flush_timer);
+ mod_timer(&rt_flush_timer, now+delay);
spin_unlock_bh(&rt_flush_lock);
}
@@ -455,20 +457,20 @@
return 0;
/* Calculate number of entries, which we want to expire now. */
- goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
+ goal = atomic_read(&ipv4_dst_ops.entries) - (ip_rt_gc_elasticity<<rt_hash_log);
if (goal <= 0) {
if (equilibrium < ipv4_dst_ops.gc_thresh)
equilibrium = ipv4_dst_ops.gc_thresh;
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
if (goal > 0) {
- equilibrium += min(goal/2, RT_HASH_DIVISOR);
+ equilibrium += min(goal/2, rt_hash_mask+1);
goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
}
} else {
/* We are in dangerous area. Try to reduce cache really
* aggressively.
*/
- goal = max(goal/2, RT_HASH_DIVISOR);
+ goal = max(goal/2, rt_hash_mask+1);
equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
}
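
[For illustration only: the reworked soft limit in rt_garbage_collect() now scales with the runtime table size, with (ip_rt_gc_elasticity<<rt_hash_log) playing the role of the old RT_HASH_DIVISOR*ip_rt_gc_elasticity. A stand-alone sketch of that computation, with invented numbers for the table size, elasticity and cache population:]

	/* Sketch of the "dangerous area" goal computation in rt_garbage_collect(),
	 * with invented numbers: 2048 buckets (rt_hash_log = 11), elasticity 8. */
	#include <stdio.h>

	int main(void)
	{
		int rt_hash_log = 11, rt_hash_mask = (1 << 11) - 1;
		int ip_rt_gc_elasticity = 8;	/* assumed value */
		int entries = 20000;		/* pretend current cache size */

		int goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
		if (goal > 0) {
			/* over the soft limit: shrink aggressively */
			int aim = goal / 2 > rt_hash_mask + 1 ? goal / 2 : rt_hash_mask + 1;
			printf("over by %d entries, aiming to free %d\n", goal, aim);
		} else {
			printf("under the soft limit by %d entries\n", -goal);
		}
		return 0;
	}
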
@@ -483,15 +485,12 @@
do {
int i, k;
- /* The write lock is held during the entire hash
- * traversal to ensure consistent state of the rover.
- */
- write_lock_bh(&rt_hash_lock);
- for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
+ for (i=rt_hash_mask, k=rover; i>=0; i--) {
unsigned tmo = expire;
- k = (k + 1) & (RT_HASH_DIVISOR-1);
- rthp = &rt_hash_table[k];
+ k = (k + 1) & rt_hash_mask;
+ rthp = &rt_hash_table[k].chain;
+ write_lock_bh(&rt_hash_table[k].lock);
while ((rth = *rthp) != NULL) {
if (!rt_may_expire(rth, tmo, expire)) {
tmo >>= 1;
@@ -502,11 +501,11 @@
rt_free(rth);
goal--;
}
+ write_unlock_bh(&rt_hash_table[k].lock);
if (goal <= 0)
break;
}
rover = k;
- write_unlock_bh(&rt_hash_lock);
if (goal <= 0)
goto work_done;
@@ -556,20 +555,20 @@
int attempts = !in_interrupt();
restart:
- rthp = &rt_hash_table[hash];
+ rthp = &rt_hash_table[hash].chain;
- write_lock_bh(&rt_hash_lock);
+ write_lock_bh(&rt_hash_table[hash].lock);
while ((rth = *rthp) != NULL) {
if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
/* Put it first */
*rthp = rth->u.rt_next;
- rth->u.rt_next = rt_hash_table[hash];
- rt_hash_table[hash] = rth;
+ rth->u.rt_next = rt_hash_table[hash].chain;
+ rt_hash_table[hash].chain = rth;
rth->u.dst.__use++;
dst_hold(&rth->u.dst);
rth->u.dst.lastuse = now;
- write_unlock_bh(&rt_hash_lock);
+ write_unlock_bh(&rt_hash_table[hash].lock);
rt_drop(rt);
*rp = rth;
@@ -584,7 +583,7 @@
*/
if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
if (!arp_bind_neighbour(&rt->u.dst)) {
- write_unlock_bh(&rt_hash_lock);
+ write_unlock_bh(&rt_hash_table[hash].lock);
/* Neighbour tables are full and nothing
can be released. Try to shrink route cache,
@@ -613,7 +612,7 @@
}
}
- rt->u.rt_next = rt_hash_table[hash];
+ rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
if (rt->u.rt_next) {
struct rtable * trt;
@@ -623,8 +622,8 @@
printk("\n");
}
#endif
- rt_hash_table[hash] = rt;
- write_unlock_bh(&rt_hash_lock);
+ rt_hash_table[hash].chain = rt;
+ write_unlock_bh(&rt_hash_table[hash].lock);
*rp = rt;
return 0;
}
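
[Aside: the rt_intern_hash() change above keeps the existing move-to-front behaviour on a cache hit, just against the per-bucket chain head. The same unlink-and-relink pattern, done through a pointer-to-pointer walk, is shown in the stand-alone sketch below; the entry type and names are invented for the example.]

	/* Userspace sketch of the move-to-front done in rt_intern_hash() when an
	 * equivalent entry is already chained: unlink via a pointer-to-pointer
	 * walk, then relink at the head of the bucket. */
	#include <stdio.h>

	struct entry {
		int key;
		struct entry *next;
	};

	static struct entry *move_to_front(struct entry **chain, int key)
	{
		struct entry **ep, *e;

		for (ep = chain; (e = *ep) != NULL; ep = &e->next) {
			if (e->key == key) {
				*ep = e->next;		/* unlink */
				e->next = *chain;	/* relink at head */
				*chain = e;
				return e;
			}
		}
		return NULL;
	}

	int main(void)
	{
		struct entry c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
		struct entry *chain = &a, *e;

		move_to_front(&chain, 2);
		for (e = chain; e; e = e->next)
			printf("%d ", e->key);	/* prints: 2 1 3 */
		printf("\n");
		return 0;
	}
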
@@ -692,16 +691,16 @@
{
struct rtable **rthp;
- write_lock_bh(&rt_hash_lock);
+ write_lock_bh(&rt_hash_table[hash].lock);
ip_rt_put(rt);
- for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
+ for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
rt_free(rt);
break;
}
}
- write_unlock_bh(&rt_hash_lock);
+ write_unlock_bh(&rt_hash_table[hash].lock);
}
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -736,9 +735,9 @@
for (k=0; k<2; k++) {
unsigned hash = rt_hash_code(daddr, skeys[i]^(ikeys[k]<<5), tos);
- rthp=&rt_hash_table[hash];
+ rthp=&rt_hash_table[hash].chain;
- read_lock(&rt_hash_lock);
+ read_lock(&rt_hash_table[hash].lock);
while ( (rth = *rthp) != NULL) {
struct rtable *rt;
@@ -759,7 +758,7 @@
break;
dst_clone(&rth->u.dst);
- read_unlock(&rt_hash_lock);
+ read_unlock(&rt_hash_table[hash].lock);
rt = dst_alloc(&ipv4_dst_ops);
if (rt == NULL) {
@@ -806,7 +805,7 @@
ip_rt_put(rt);
goto do_next;
}
- read_unlock(&rt_hash_lock);
+ read_unlock(&rt_hash_table[hash].lock);
do_next:
;
}
@@ -974,8 +973,8 @@
for (i=0; i<2; i++) {
unsigned hash = rt_hash_code(daddr, skeys[i], tos);
- read_lock(&rt_hash_lock);
- for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
+ read_lock(&rt_hash_table[hash].lock);
+ for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == skeys[i] &&
rth->rt_dst == daddr &&
@@ -1008,7 +1007,7 @@
}
}
}
- read_unlock(&rt_hash_lock);
+ read_unlock(&rt_hash_table[hash].lock);
}
return est_mtu ? : new_mtu;
}
@@ -1550,8 +1549,8 @@
tos &= IPTOS_TOS_MASK;
hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
- read_lock_bh(&rt_hash_lock);
- for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ read_lock(&rt_hash_table[hash].lock);
+ for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == saddr &&
rth->key.iif == iif &&
@@ -1565,12 +1564,12 @@
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
- read_unlock_bh(&rt_hash_lock);
+ read_unlock(&rt_hash_table[hash].lock);
skb->dst = (struct dst_entry*)rth;
return 0;
}
}
- read_unlock_bh(&rt_hash_lock);
+ read_unlock(&rt_hash_table[hash].lock);
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
@@ -1885,8 +1884,8 @@
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- read_lock_bh(&rt_hash_lock);
- for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ read_lock_bh(&rt_hash_table[hash].lock);
+ for (rth=rt_hash_table[hash].chain; rth; rth=rth->u.rt_next) {
if (rth->key.dst == daddr &&
rth->key.src == saddr &&
rth->key.iif == 0 &&
@@ -1897,12 +1896,12 @@
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
- read_unlock_bh(&rt_hash_lock);
+ read_unlock_bh(&rt_hash_table[hash].lock);
*rp = rth;
return 0;
}
}
- read_unlock_bh(&rt_hash_lock);
+ read_unlock_bh(&rt_hash_table[hash].lock);
return ip_route_output_slow(rp, daddr, saddr, tos, oif);
}
@@ -2043,7 +2042,9 @@
return -ENODEV;
skb->protocol = __constant_htons(ETH_P_IP);
skb->dev = dev;
+ local_bh_disable();
err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ local_bh_enable();
rt = (struct rtable*)skb->dst;
if (!err && rt->u.dst.error)
err = -rt->u.dst.error;
@@ -2085,24 +2086,24 @@
s_h = cb->args[0];
s_idx = idx = cb->args[1];
- for (h=0; h < RT_HASH_DIVISOR; h++) {
+ for (h=0; h <= rt_hash_mask; h++) {
if (h < s_h) continue;
if (h > s_h)
s_idx = 0;
- read_lock_bh(&rt_hash_lock);
- for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
+ read_lock_bh(&rt_hash_table[h].lock);
+ for (rt = rt_hash_table[h].chain, idx = 0; rt; rt = rt->u.rt_next, idx++) {
if (idx < s_idx)
continue;
skb->dst = dst_clone(&rt->u.dst);
if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
dst_release(xchg(&skb->dst, NULL));
- read_unlock_bh(&rt_hash_lock);
+ read_unlock_bh(&rt_hash_table[h].lock);
goto done;
}
dst_release(xchg(&skb->dst, NULL));
}
- read_unlock_bh(&rt_hash_lock);
+ read_unlock_bh(&rt_hash_table[h].lock);
}
done:
@@ -2231,17 +2232,56 @@
#endif
#endif
-
void __init ip_rt_init(void)
{
+ int i, order, goal;
+
ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
sizeof(struct rtable),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
-
+
+ if (!ipv4_dst_ops.kmem_cachep)
+ panic("IP: failed to allocate ip_dst_cache\n");
+
+ goal = num_physpages >> (26 - PAGE_SHIFT);
+
+ for (order = 0; (1UL << order) < goal; order++)
+ /* NOTHING */;
+
+ do {
+ rt_hash_mask = (1UL << order) * PAGE_SIZE /
+ sizeof(struct rt_hash_bucket);
+ while (rt_hash_mask & (rt_hash_mask-1))
+ rt_hash_mask--;
+ rt_hash_table = (struct rt_hash_bucket *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (rt_hash_table == NULL && --order > 0);
+
+ if (!rt_hash_table)
+ panic("Failed to allocate IP route cache hash table\n");
+
+ printk("IP: routing cache hash table of %u buckets, %dKbytes\n",
+ rt_hash_mask, (rt_hash_mask*sizeof(struct rt_hash_bucket))/1024);
+
+ for (rt_hash_log=0; (1<<rt_hash_log) != rt_hash_mask; rt_hash_log++)
+ /* NOTHING */;
+
+ rt_hash_mask--;
+ for (i = 0; i <= rt_hash_mask; i++) {
+ rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
+ rt_hash_table[i].chain = NULL;
+ }
+
+ ipv4_dst_ops.gc_thresh = (rt_hash_mask+1);
+ ip_rt_max_size = (rt_hash_mask+1)*16;
+
devinet_init();
ip_fib_init();
+
+ rt_flush_timer.function = rt_run_flush;
rt_periodic_timer.function = rt_check_expire;
+
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
*/
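
[Aside: the new ip_rt_init() sizes the hash table from physical memory, aiming for roughly one page of buckets per 64MB of RAM, allocating a power-of-two number of pages and then rounding the bucket count down to a power of two before deriving rt_hash_log and rt_hash_mask. The stand-alone sketch below re-runs that arithmetic; the 128MB memory size, 4K page size and 16-byte bucket size are assumptions for illustration.]

	/* Userspace re-run of the sizing arithmetic in ip_rt_init().  The memory
	 * size, page size and bucket size below are assumptions. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size = 4096, page_shift = 12;
		unsigned long num_physpages = (128UL << 20) / page_size;	/* assume 128 MB */
		unsigned long bucket_size = 16;		/* assumed sizeof(struct rt_hash_bucket) */
		unsigned long goal, order, buckets, log;

		/* aim for about one page of buckets per 64 MB of memory */
		goal = num_physpages >> (26 - page_shift);
		for (order = 0; (1UL << order) < goal; order++)
			/* NOTHING */;

		/* round the bucket count down to a power of two */
		buckets = (1UL << order) * page_size / bucket_size;
		while (buckets & (buckets - 1))
			buckets--;
		for (log = 0; (1UL << log) != buckets; log++)
			/* NOTHING */;

		printf("order %lu (%lu pages), %lu buckets, rt_hash_log %lu, mask 0x%lx\n",
		       order, 1UL << order, buckets, log, buckets - 1);
		printf("gc_thresh %lu, ip_rt_max_size %lu\n", buckets, buckets * 16);
		return 0;
	}
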