patch-2.1.15 linux/net/ipv4/route.c
Next file: linux/net/ipv4/sysctl_net_ipv4.c
Previous file: linux/net/ipv4/raw.c
Back to the patch index
Back to the overall index
- Lines: 2760
- Date:
Thu Dec 12 16:54:24 1996
- Orig file:
v2.1.14/linux/net/ipv4/route.c
- Orig date:
Fri Nov 15 23:49:11 1996
diff -u --recursive --new-file v2.1.14/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -11,6 +11,7 @@
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Fixes:
* Alan Cox : Verify area fixes.
@@ -42,6 +43,8 @@
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
* Pavel Krauz : Limited broadcast fixed
+ * Alexey Kuznetsov : End of old history. Split into fib.c and
+ * route.c and rewritten from scratch.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -65,761 +68,60 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
+#include <net/arp.h>
#include <net/tcp.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/icmp.h>
-#include <net/netlink.h>
-#ifdef CONFIG_KERNELD
-#include <linux/kerneld.h>
-#endif
+#include <linux/net_alias.h>
+
+static void rt_run_flush(unsigned long);
+
+static struct timer_list rt_flush_timer =
+ { NULL, NULL, RT_FLUSH_DELAY, 0L, rt_run_flush };
/*
- * Forwarding Information Base definitions.
+ * Interface to generic destination cache.
*/
-struct fib_node
-{
- struct fib_node *fib_next;
- __u32 fib_dst;
- unsigned long fib_use;
- struct fib_info *fib_info;
- short fib_metric;
- unsigned char fib_tos;
-};
+static void ipv4_dst_destroy(struct dst_entry * dst);
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst);
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst);
-/*
- * This structure contains data shared by many of routes.
- */
-struct fib_info
+struct dst_ops ipv4_dst_ops =
{
- struct fib_info *fib_next;
- struct fib_info *fib_prev;
- __u32 fib_gateway;
- struct device *fib_dev;
- int fib_refcnt;
- unsigned long fib_window;
- unsigned short fib_flags;
- unsigned short fib_mtu;
- unsigned short fib_irtt;
-};
-
-struct fib_zone
-{
- struct fib_zone *fz_next;
- struct fib_node **fz_hash_table;
- struct fib_node *fz_list;
- int fz_nent;
- int fz_logmask;
- __u32 fz_mask;
-};
-
-static struct fib_zone *fib_zones[33];
-static struct fib_zone *fib_zone_list;
-static struct fib_node *fib_loopback = NULL;
-static struct fib_info *fib_info_list;
-
-/*
- * Backlogging.
- */
-
-#define RT_BH_REDIRECT 0
-#define RT_BH_GARBAGE_COLLECT 1
-#define RT_BH_FREE 2
-
-struct rt_req
-{
- struct rt_req * rtr_next;
- struct device *dev;
- __u32 dst;
- __u32 gw;
- unsigned char tos;
+ AF_INET,
+ ipv4_dst_check,
+ ipv4_dst_reroute,
+ ipv4_dst_destroy
};
-int ip_rt_lock;
-unsigned ip_rt_bh_mask;
-static struct rt_req *rt_backlog;
/*
* Route cache.
*/
-struct rtable *ip_rt_hash_table[RT_HASH_DIVISOR];
-static int rt_cache_size;
-static struct rtable *rt_free_queue;
-struct wait_queue *rt_wait;
-
-static void rt_kick_backlog(void);
-static void rt_cache_add(unsigned hash, struct rtable * rth);
-static void rt_cache_flush(void);
-static void rt_garbage_collect_1(void);
-
-/*
- * Evaluate mask length.
- */
-
-static __inline__ int rt_logmask(__u32 mask)
-{
- if (!(mask = ntohl(mask)))
- return 32;
- return ffz(~mask);
-}
-
-/*
- * Create mask from length.
- */
-
-static __inline__ __u32 rt_mask(int logmask)
-{
- if (logmask >= 32)
- return 0;
- return htonl(~((1<<logmask)-1));
-}
-
-static __inline__ unsigned fz_hash_code(__u32 dst, int logmask)
-{
- return ip_rt_hash_code(ntohl(dst)>>logmask);
-}
-
-/*
- * Free FIB node.
- */
-
-static void fib_free_node(struct fib_node * f)
-{
- struct fib_info * fi = f->fib_info;
- if (!--fi->fib_refcnt)
- {
-#if RT_CACHE_DEBUG >= 2
- printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
- if (fi->fib_next)
- fi->fib_next->fib_prev = fi->fib_prev;
- if (fi->fib_prev)
- fi->fib_prev->fib_next = fi->fib_next;
- if (fi == fib_info_list)
- fib_info_list = fi->fib_next;
- }
- kfree_s(f, sizeof(struct fib_node));
-}
-
-/*
- * Find gateway route by address.
- */
-
-static struct fib_node * fib_lookup_gateway(__u32 dst)
-{
- struct fib_zone * fz;
- struct fib_node * f;
-
- for (fz = fib_zone_list; fz; fz = fz->fz_next)
- {
- if (fz->fz_hash_table)
- f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
- else
- f = fz->fz_list;
-
- for ( ; f; f = f->fib_next)
- {
- if ((dst ^ f->fib_dst) & fz->fz_mask)
- continue;
- if (f->fib_info->fib_flags & RTF_GATEWAY)
- return NULL;
- return f;
- }
- }
- return NULL;
-}
-
-/*
- * Find local route by address.
- * FIXME: I use "longest match" principle. If destination
- * has some non-local route, I'll not search shorter matches.
- * It's possible, I'm wrong, but I wanted to prevent following
- * situation:
- * route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx
- * route add 193.233.7.0 netmask 255.255.255.0 eth1
- * (Two ethernets connected by serial line, one is small and other is large)
- * Host 193.233.7.129 is locally unreachable,
- * but old (<=1.3.37) code will send packets destined for it to eth1.
- *
- */
-
-static struct fib_node * fib_lookup_local(__u32 dst)
-{
- struct fib_zone * fz;
- struct fib_node * f;
-
- for (fz = fib_zone_list; fz; fz = fz->fz_next)
- {
- int longest_match_found = 0;
-
- if (fz->fz_hash_table)
- f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
- else
- f = fz->fz_list;
-
- for ( ; f; f = f->fib_next)
- {
- if ((dst ^ f->fib_dst) & fz->fz_mask)
- continue;
- if (!(f->fib_info->fib_flags & RTF_GATEWAY))
- return f;
- longest_match_found = 1;
- }
- if (longest_match_found)
- return NULL;
- }
- return NULL;
-}
-
-/*
- * Main lookup routine.
- * IMPORTANT NOTE: this algorithm has small difference from <=1.3.37 visible
- * by user. It doesn't route non-CIDR broadcasts by default.
- *
- * F.e.
- * ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255
- * is valid, but if you really are not able (not allowed, do not want) to
- * use CIDR compliant broadcast 193.233.7.127, you should add host route:
- * route add -host 193.233.7.255 eth0
- */
-
-static struct fib_node * fib_lookup(__u32 dst)
-{
- struct fib_zone * fz;
- struct fib_node * f;
-
- for (fz = fib_zone_list; fz; fz = fz->fz_next)
- {
- if (fz->fz_hash_table)
- f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
- else
- f = fz->fz_list;
-
- for ( ; f; f = f->fib_next)
- {
- if ((dst ^ f->fib_dst) & fz->fz_mask)
- continue;
- return f;
- }
- }
- return NULL;
-}
-
-static __inline__ struct device * get_gw_dev(__u32 gw)
-{
- struct fib_node * f;
- f = fib_lookup_gateway(gw);
- if (f)
- return f->fib_info->fib_dev;
- return NULL;
-}
-
-/*
- * Check if a mask is acceptable.
- */
-
-static inline int bad_mask(__u32 mask, __u32 addr)
-{
- if (addr & (mask = ~mask))
- return 1;
- mask = ntohl(mask);
- if (mask & (mask+1))
- return 1;
- return 0;
-}
-
-
-static int fib_del_list(struct fib_node **fp, __u32 dst,
- struct device * dev, __u32 gtw, short flags, short metric, __u32 mask)
-{
- struct fib_node *f;
- int found=0;
-
- while((f = *fp) != NULL)
- {
- struct fib_info * fi = f->fib_info;
-
- /*
- * Make sure the destination and netmask match.
- * metric, gateway and device are also checked
- * if they were specified.
- */
- if (f->fib_dst != dst ||
- (gtw && fi->fib_gateway != gtw) ||
- (metric >= 0 && f->fib_metric != metric) ||
- (dev && fi->fib_dev != dev) )
- {
- fp = &f->fib_next;
- continue;
- }
- cli();
- *fp = f->fib_next;
- if (fib_loopback == f)
- fib_loopback = NULL;
- sti();
- ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name);
- fib_free_node(f);
- found++;
- }
- return found;
-}
-
-static __inline__ int fib_del_1(__u32 dst, __u32 mask,
- struct device * dev, __u32 gtw, short flags, short metric)
-{
- struct fib_node **fp;
- struct fib_zone *fz;
- int found=0;
-
- if (!mask)
- {
- for (fz=fib_zone_list; fz; fz = fz->fz_next)
- {
- int tmp;
- if (fz->fz_hash_table)
- fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
- else
- fp = &fz->fz_list;
-
- tmp = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
- fz->fz_nent -= tmp;
- found += tmp;
- }
- }
- else
- {
- if ((fz = fib_zones[rt_logmask(mask)]) != NULL)
- {
- if (fz->fz_hash_table)
- fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
- else
- fp = &fz->fz_list;
-
- found = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
- fz->fz_nent -= found;
- }
- }
-
- if (found)
- {
- rt_cache_flush();
- return 0;
- }
- return -ESRCH;
-}
-
-
-static struct fib_info * fib_create_info(__u32 gw, struct device * dev,
- unsigned short flags, unsigned short mss,
- unsigned long window, unsigned short irtt)
-{
- struct fib_info * fi;
-
- if (!(flags & RTF_MSS))
- {
- mss = dev->mtu;
-#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
- /*
- * If MTU was not specified, use default.
- * If you want to increase MTU for some net (local subnet)
- * use "route add .... mss xxx".
- *
- * The MTU isn't currently always used and computed as it
- * should be as far as I can tell. [Still verifying this is right]
- */
- if ((flags & RTF_GATEWAY) && mss > 576)
- mss = 576;
-#endif
- }
- if (!(flags & RTF_WINDOW))
- window = 0;
- if (!(flags & RTF_IRTT))
- irtt = 0;
-
- for (fi=fib_info_list; fi; fi = fi->fib_next)
- {
- if (fi->fib_gateway != gw ||
- fi->fib_dev != dev ||
- fi->fib_flags != flags ||
- fi->fib_mtu != mss ||
- fi->fib_window != window ||
- fi->fib_irtt != irtt)
- continue;
- fi->fib_refcnt++;
-#if RT_CACHE_DEBUG >= 2
- printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
- return fi;
- }
- fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL);
- if (!fi)
- return NULL;
- memset(fi, 0, sizeof(struct fib_info));
- fi->fib_flags = flags;
- fi->fib_dev = dev;
- fi->fib_gateway = gw;
- fi->fib_mtu = mss;
- fi->fib_window = window;
- fi->fib_refcnt++;
- fi->fib_next = fib_info_list;
- fi->fib_prev = NULL;
- fi->fib_irtt = irtt;
- if (fib_info_list)
- fib_info_list->fib_prev = fi;
- fib_info_list = fi;
-#if RT_CACHE_DEBUG >= 2
- printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name);
-#endif
- return fi;
-}
-
-
-static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask,
- __u32 gw, struct device *dev, unsigned short mss,
- unsigned long window, unsigned short irtt, short metric)
-{
- struct fib_node *f, *f1;
- struct fib_node **fp;
- struct fib_node **dup_fp = NULL;
- struct fib_zone * fz;
- struct fib_info * fi;
- int logmask;
-
- /*
- * Allocate an entry and fill it in.
- */
-
- f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
- if (f == NULL)
- return;
-
- memset(f, 0, sizeof(struct fib_node));
- f->fib_dst = dst;
- f->fib_metric = metric;
- f->fib_tos = 0;
-
- if ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL)
- {
- kfree_s(f, sizeof(struct fib_node));
- return;
- }
- f->fib_info = fi;
-
- logmask = rt_logmask(mask);
- fz = fib_zones[logmask];
-
-
- if (!fz)
- {
- int i;
- fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL);
- if (!fz)
- {
- fib_free_node(f);
- return;
- }
- memset(fz, 0, sizeof(struct fib_zone));
- fz->fz_logmask = logmask;
- fz->fz_mask = mask;
- for (i=logmask-1; i>=0; i--)
- if (fib_zones[i])
- break;
- cli();
- if (i<0)
- {
- fz->fz_next = fib_zone_list;
- fib_zone_list = fz;
- }
- else
- {
- fz->fz_next = fib_zones[i]->fz_next;
- fib_zones[i]->fz_next = fz;
- }
- fib_zones[logmask] = fz;
- sti();
- }
-
- /*
- * If zone overgrows RTZ_HASHING_LIMIT, create hash table.
- */
-
- if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32)
- {
- struct fib_node ** ht;
-#if RT_CACHE_DEBUG >= 2
- printk("fib_add_1: hashing for zone %d started\n", logmask);
-#endif
- ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct rtable*), GFP_KERNEL);
-
- if (ht)
- {
- memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*));
- cli();
- f1 = fz->fz_list;
- while (f1)
- {
- struct fib_node * next;
- unsigned hash = fz_hash_code(f1->fib_dst, logmask);
- next = f1->fib_next;
- f1->fib_next = ht[hash];
- ht[hash] = f1;
- f1 = next;
- }
- fz->fz_list = NULL;
- fz->fz_hash_table = ht;
- sti();
- }
- }
-
- if (fz->fz_hash_table)
- fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)];
- else
- fp = &fz->fz_list;
-
- /*
- * Scan list to find the first route with the same destination
- */
- while ((f1 = *fp) != NULL)
- {
- if (f1->fib_dst == dst)
- break;
- fp = &f1->fib_next;
- }
-
- /*
- * Find route with the same destination and less (or equal) metric.
- */
- while ((f1 = *fp) != NULL && f1->fib_dst == dst)
- {
- if (f1->fib_metric >= metric)
- break;
- /*
- * Record route with the same destination and gateway,
- * but less metric. We'll delete it
- * after instantiation of new route.
- */
- if (f1->fib_info->fib_gateway == gw &&
- (gw || f1->fib_info->fib_dev == dev))
- dup_fp = fp;
- fp = &f1->fib_next;
- }
-
- /*
- * Is it already present?
- */
-
- if (f1 && f1->fib_metric == metric && f1->fib_info == fi)
- {
- fib_free_node(f);
- return;
- }
-
- /*
- * Insert new entry to the list.
- */
-
- cli();
- f->fib_next = f1;
- *fp = f;
- if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK))
- fib_loopback = f;
- sti();
- fz->fz_nent++;
- ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name);
-
- /*
- * Delete route with the same destination and gateway.
- * Note that we should have at most one such route.
- */
- if (dup_fp)
- fp = dup_fp;
- else
- fp = &f->fib_next;
-
- while ((f1 = *fp) != NULL && f1->fib_dst == dst)
- {
- if (f1->fib_info->fib_gateway == gw &&
- (gw || f1->fib_info->fib_dev == dev))
- {
- cli();
- *fp = f1->fib_next;
- if (fib_loopback == f1)
- fib_loopback = NULL;
- sti();
- ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name);
- fib_free_node(f1);
- fz->fz_nent--;
- break;
- }
- fp = &f1->fib_next;
- }
- rt_cache_flush();
- return;
-}
-
-static int rt_flush_list(struct fib_node ** fp, struct device *dev)
-{
- int found = 0;
- struct fib_node *f;
+static atomic_t rt_cache_size;
+static struct rtable *rt_hash_table[RT_HASH_DIVISOR];
- while ((f = *fp) != NULL) {
-/*
- * "Magic" device route is allowed to point to loopback,
- * discard it too.
- */
- if (f->fib_info->fib_dev != dev &&
- (f->fib_info->fib_dev != &loopback_dev || f->fib_dst != dev->pa_addr)) {
- fp = &f->fib_next;
- continue;
- }
- cli();
- *fp = f->fib_next;
- if (fib_loopback == f)
- fib_loopback = NULL;
- sti();
- fib_free_node(f);
- found++;
- }
- return found;
-}
+static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth, u16 protocol);
-static __inline__ void fib_flush_1(struct device *dev)
+static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
- struct fib_zone *fz;
- int found = 0;
-
- for (fz = fib_zone_list; fz; fz = fz->fz_next)
- {
- if (fz->fz_hash_table)
- {
- int i;
- int tmp = 0;
- for (i=0; i<RTZ_HASH_DIVISOR; i++)
- tmp += rt_flush_list(&fz->fz_hash_table[i], dev);
- fz->fz_nent -= tmp;
- found += tmp;
- }
- else
- {
- int tmp;
- tmp = rt_flush_list(&fz->fz_list, dev);
- fz->fz_nent -= tmp;
- found += tmp;
- }
- }
-
- if (found)
- rt_cache_flush();
+ unsigned hash = ((daddr&0xF0F0F0F0)>>4)|((daddr&0x0F0F0F0F)<<4);
+ hash = hash^saddr^tos;
+ hash = hash^(hash>>16);
+ return (hash^(hash>>8)) & 0xFF;
}
+#ifdef CONFIG_PROC_FS
-/*
- * Called from the PROCfs module. This outputs /proc/net/route.
- *
- * We preserve the old format but pad the buffers out. This means that
- * we can spin over the other entries as we read them. Remember the
- * gated BGP4 code could need to read 60,000+ routes on occasion (that's
- * about 7Mb of data). To do that ok we will need to also cache the
- * last route we got to (reads will generally be following on from
- * one another without gaps).
- */
-
-int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
-{
- struct fib_zone *fz;
- struct fib_node *f;
- int len=0;
- off_t pos=0;
- char temp[129];
- int i;
-
- pos = 128;
-
- if (offset<128)
- {
- sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
- len = 128;
- }
-
- while (ip_rt_lock)
- sleep_on(&rt_wait);
- ip_rt_fast_lock();
-
- for (fz=fib_zone_list; fz; fz = fz->fz_next)
- {
- int maxslot;
- struct fib_node ** fp;
-
- if (fz->fz_nent == 0)
- continue;
-
- if (pos + 128*fz->fz_nent <= offset)
- {
- pos += 128*fz->fz_nent;
- len = 0;
- continue;
- }
-
- if (fz->fz_hash_table)
- {
- maxslot = RTZ_HASH_DIVISOR;
- fp = fz->fz_hash_table;
- }
- else
- {
- maxslot = 1;
- fp = &fz->fz_list;
- }
-
- for (i=0; i < maxslot; i++, fp++)
- {
-
- for (f = *fp; f; f = f->fib_next)
- {
- struct fib_info * fi;
- /*
- * Spin through entries until we are ready
- */
- pos += 128;
-
- if (pos <= offset)
- {
- len=0;
- continue;
- }
-
- fi = f->fib_info;
- sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
- fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway,
- fi->fib_flags, 0, f->fib_use, f->fib_metric,
- (unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt);
- sprintf(buffer+len,"%-127s\n",temp);
-
- len += 128;
- if (pos >= offset+length)
- goto done;
- }
- }
- }
-
-done:
- ip_rt_unlock();
- wake_up(&rt_wait);
-
- *start = buffer+len-(pos-offset);
- len = pos - offset;
- if (len>length)
- len = length;
- return len;
-}
-
-int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+static int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
{
int len=0;
off_t pos=0;
@@ -829,36 +131,39 @@
pos = 128;
- if (offset<128)
- {
- sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP");
+ if (offset<128) {
+ sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHH\tARP");
len = 128;
}
- while (ip_rt_lock)
- sleep_on(&rt_wait);
- ip_rt_fast_lock();
-
- for (i = 0; i<RT_HASH_DIVISOR; i++)
- {
- for (r = ip_rt_hash_table[i]; r; r = r->rt_next)
- {
+ start_bh_atomic();
+
+ for (i = 0; i<RT_HASH_DIVISOR; i++) {
+ for (r = rt_hash_table[i]; r; r = r->u.rt_next) {
/*
* Spin through entries until we are ready
*/
pos += 128;
- if (pos <= offset)
- {
+ if (pos <= offset) {
len = 0;
continue;
}
- sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%u\t%d\t%08lX\t%d\t%lu\t%u\t%d\t%1d",
- r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
- r->rt_flags, r->rt_refcnt, r->rt_use, 0,
- (unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt, r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0);
+ sprintf(temp, "%s\t%08lX\t%08lX\t%X\t%d\t%u\t%d\t%08lX\t%d\t%u\t%u\t%02x\t%d\t%1d\t%08x\t%02x",
+ r->u.dst.dev ? r->u.dst.dev->name : "*",
+ (unsigned long)r->rt_dst,
+ (unsigned long)r->rt_gateway,
+ r->rt_flags, r->u.dst.refcnt,
+ r->u.dst.use, 0,
+ (unsigned long)r->rt_src, (int)r->u.dst.pmtu,
+ r->u.dst.window,
+ (int)r->u.dst.rtt, r->key.tos,
+ r->u.dst.hh ? r->u.dst.hh->hh_refcnt : -1,
+ r->u.dst.hh ? r->u.dst.hh->hh_uptodate : 0,
+ r->rt_spec_dst,
+ i);
sprintf(buffer+len,"%-127s\n",temp);
len += 128;
if (pos >= offset+length)
@@ -867,8 +172,7 @@
}
done:
- ip_rt_unlock();
- wake_up(&rt_wait);
+ end_bh_atomic();
*start = buffer+len-(pos-offset);
len = pos-offset;
@@ -876,218 +180,123 @@
len = length;
return len;
}
-
-
-static void rt_free(struct rtable * rt)
-{
- unsigned long flags;
-
- save_flags(flags);
- cli();
- if (!rt->rt_refcnt)
- {
- struct hh_cache * hh = rt->rt_hh;
- rt->rt_hh = NULL;
- restore_flags(flags);
- if (hh && atomic_dec_and_test(&hh->hh_refcnt))
- kfree_s(hh, sizeof(struct hh_cache));
- kfree_s(rt, sizeof(struct rt_table));
- return;
- }
- rt->rt_next = rt_free_queue;
- rt->rt_flags &= ~RTF_UP;
- rt_free_queue = rt;
- ip_rt_bh_mask |= RT_BH_FREE;
-#if RT_CACHE_DEBUG >= 2
- printk("rt_free: %08x\n", rt->rt_dst);
#endif
- restore_flags(flags);
+
+static void __inline__ rt_free(struct rtable *rt)
+{
+ dst_free(&rt->u.dst);
}
-/*
- * RT "bottom half" handlers. Called with masked interrupts.
- */
-static __inline__ void rt_kick_free_queue(void)
+void ip_rt_check_expire()
{
- struct rtable *rt, **rtp;
-
- rtp = &rt_free_queue;
-
- while ((rt = *rtp) != NULL)
- {
- if (!rt->rt_refcnt)
- {
- struct hh_cache * hh = rt->rt_hh;
-#if RT_CACHE_DEBUG >= 2
- __u32 daddr = rt->rt_dst;
-#endif
- *rtp = rt->rt_next;
- rt->rt_hh = NULL;
- sti();
- if (hh && atomic_dec_and_test(&hh->hh_refcnt))
- kfree_s(hh, sizeof(struct hh_cache));
- kfree_s(rt, sizeof(struct rt_table));
-#if RT_CACHE_DEBUG >= 2
- printk("rt_kick_free_queue: %08x is free\n", daddr);
-#endif
- cli();
- continue;
- }
- rtp = &rt->rt_next;
- }
-}
+ int i;
+ static int rover;
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
-void ip_rt_run_bh()
-{
- unsigned long flags;
- save_flags(flags);
- cli();
- if (ip_rt_bh_mask && !ip_rt_lock)
- {
- if (ip_rt_bh_mask & RT_BH_REDIRECT)
- rt_kick_backlog();
+ start_bh_atomic();
- if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT)
- {
- ip_rt_fast_lock();
- ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT;
- sti();
- rt_garbage_collect_1();
- cli();
- ip_rt_fast_unlock();
- }
+ for (i=0; i<RT_HASH_DIVISOR/5; i++) {
+ rover = (rover + 1) & (RT_HASH_DIVISOR-1);
+ rthp = &rt_hash_table[rover];
- if (ip_rt_bh_mask & RT_BH_FREE)
- rt_kick_free_queue();
- }
- restore_flags(flags);
-}
+ while ((rth = *rthp) != NULL) {
+ struct rtable * rth_next = rth->u.rt_next;
+ /*
+ * Cleanup aged off entries.
+ */
-void ip_rt_check_expire()
-{
- ip_rt_fast_lock();
- if (ip_rt_lock == 1)
- {
- int i;
- struct rtable *rth, **rthp;
- unsigned long flags;
- unsigned long now = jiffies;
-
- save_flags(flags);
- for (i=0; i<RT_HASH_DIVISOR; i++)
- {
- rthp = &ip_rt_hash_table[i];
-
- while ((rth = *rthp) != NULL)
- {
- struct rtable * rth_next = rth->rt_next;
-
- /*
- * Cleanup aged off entries.
- */
-
- cli();
- if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
- {
- *rthp = rth_next;
- sti();
- rt_cache_size--;
+ if (!rth->u.dst.refcnt && now - rth->u.dst.lastuse > RT_CACHE_TIMEOUT) {
+ *rthp = rth_next;
+ atomic_dec(&rt_cache_size);
#if RT_CACHE_DEBUG >= 2
- printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst);
+ printk("rt_check_expire clean %02x@%08x\n", rover, rth->rt_dst);
#endif
- rt_free(rth);
- continue;
- }
- sti();
+ rt_free(rth);
+ continue;
+ }
- if (!rth_next)
- break;
+ if (!rth_next)
+ break;
+
+ /*
+ * Pseudo-LRU ordering.
+ * Ideally this should also learn to move
+ * rarely used but permanently resident entries
+ * (e.g. rdisc, igmp etc.) to the end of the list.
+ */
- /*
- * LRU ordering.
- */
-
- if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOLD < rth_next->rt_lastuse ||
- (rth->rt_lastuse < rth_next->rt_lastuse &&
- rth->rt_use < rth_next->rt_use))
- {
+ if ( rth_next->u.dst.lastuse - rth->u.dst.lastuse > RT_CACHE_BUBBLE_THRESHOLD ||
+ (rth->u.dst.lastuse - rth_next->u.dst.lastuse < 0 &&
+ rth->u.dst.use < rth_next->u.dst.use)) {
#if RT_CACHE_DEBUG >= 2
- printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst);
+ printk("rt_check_expire bubbled %02x@%08x<->%08x\n", rover, rth->rt_dst, rth_next->rt_dst);
#endif
- cli();
- *rthp = rth_next;
- rth->rt_next = rth_next->rt_next;
- rth_next->rt_next = rth;
- sti();
- rthp = &rth_next->rt_next;
- continue;
- }
- rthp = &rth->rt_next;
+ *rthp = rth_next;
+ rth->u.rt_next = rth_next->u.rt_next;
+ rth_next->u.rt_next = rth;
+ sti();
+ rthp = &rth_next->u.rt_next;
+ continue;
}
+ rthp = &rth->u.rt_next;
}
- restore_flags(flags);
- rt_kick_free_queue();
}
- ip_rt_unlock();
-}
-static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev)
+ end_bh_atomic();
+}
+
+
+void rt_cache_flush(int how)
{
- struct rtable *rt;
- unsigned long hash = ip_rt_hash_code(dst);
-
- if (gw == dev->pa_addr)
- return;
- if (dev != get_gw_dev(gw))
+ start_bh_atomic();
+ if (rt_flush_timer.expires) {
+ if (jiffies - rt_flush_timer.expires > 0 ||
+ rt_flush_timer.expires - jiffies > RT_FLUSH_DELAY/2)
+ how = 1;
+ }
+ if (how) {
+ if (rt_flush_timer.expires)
+ del_timer(&rt_flush_timer);
+ rt_flush_timer.expires = 0;
+ end_bh_atomic();
+ rt_run_flush(0);
return;
- rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
- if (rt == NULL)
+ }
+ if (rt_flush_timer.expires) {
+ end_bh_atomic();
return;
- memset(rt, 0, sizeof(struct rtable));
- rt->rt_flags = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP;
- rt->rt_dst = dst;
- rt->rt_dev = dev;
- rt->rt_gateway = gw;
- rt->rt_src = dev->pa_addr;
- rt->rt_mtu = dev->mtu;
-#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
- if (dev->mtu > 576)
- rt->rt_mtu = 576;
-#endif
- rt->rt_lastuse = jiffies;
- rt->rt_refcnt = 1;
- rt_cache_add(hash, rt);
- ip_rt_put(rt);
- return;
+ }
+ del_timer(&rt_flush_timer);
+ rt_flush_timer.expires = jiffies + RT_FLUSH_DELAY;
+ add_timer(&rt_flush_timer);
+ end_bh_atomic();
}
-
-static void rt_cache_flush(void)
+
+void rt_run_flush(unsigned long dummy)
{
int i;
struct rtable * rth, * next;
- for (i=0; i<RT_HASH_DIVISOR; i++)
- {
+ for (i=0; i<RT_HASH_DIVISOR; i++) {
int nr=0;
cli();
- if (!(rth = ip_rt_hash_table[i]))
- {
+ if (!(rth = rt_hash_table[i])) {
sti();
continue;
}
- ip_rt_hash_table[i] = NULL;
+ rt_hash_table[i] = NULL;
sti();
- for (; rth; rth=next)
- {
- next = rth->rt_next;
- rt_cache_size--;
+ for (; rth; rth=next) {
+ next = rth->u.rt_next;
+ atomic_dec(&rt_cache_size);
nr++;
- rth->rt_next = NULL;
+ rth->u.rt_next = NULL;
rt_free(rth);
}
#if RT_CACHE_DEBUG >= 2
@@ -1095,631 +304,1090 @@
printk("rt_cache_flush: %d@%02x\n", nr, i);
#endif
}
-#if RT_CACHE_DEBUG >= 1
- if (rt_cache_size)
- {
- printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size);
- rt_cache_size = 0;
+}
+
+static void rt_garbage_collect(void)
+{
+ int i;
+ static unsigned expire = RT_CACHE_TIMEOUT>>1;
+ static unsigned long last_gc;
+ struct rtable *rth, **rthp;
+ unsigned long now;
+
+ start_bh_atomic();
+ now = jiffies;
+
+ /*
+ * Garbage collection is pretty expensive,
+ * so do not run it too frequently.
+ */
+ if (now - last_gc < 1*HZ) {
+ expire >>= 1;
+ end_bh_atomic();
+ return;
+ }
+
+ expire++;
+
+ for (i=0; i<RT_HASH_DIVISOR; i++) {
+ if (!rt_hash_table[i])
+ continue;
+ for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
+ if (rth->u.dst.refcnt || now - rth->u.dst.lastuse > expire)
+ continue;
+ atomic_dec(&rt_cache_size);
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = NULL;
+ rt_free(rth);
+ break;
+ }
+ }
+
+ last_gc = now;
+ if (rt_cache_size < RT_CACHE_MAX_SIZE)
+ expire = RT_CACHE_TIMEOUT>>1;
+ else
+ expire >>= 1;
+ end_bh_atomic();
+}
+
+static int rt_ll_bind(struct rtable *rt)
+{
+ struct dst_entry *neigh;
+ struct hh_cache *hh = NULL;
+
+ if (rt->u.dst.dev && rt->u.dst.dev->hard_header_cache) {
+ neigh = rt->u.dst.neighbour;
+ if (!neigh)
+ neigh = arp_find_neighbour(&rt->u.dst, 1);
+
+ if (neigh) {
+ rt->u.dst.neighbour = neigh;
+ for (hh=neigh->hh; hh; hh = hh->hh_next)
+ if (hh->hh_type == ETH_P_IP)
+ break;
+ }
+
+ if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
+#if RT_CACHE_DEBUG >= 2
+ extern atomic_t hh_count;
+ atomic_inc(&hh_count);
+#endif
+ memset(hh, 0, sizeof(struct hh_cache));
+ hh->hh_type = ETH_P_IP;
+ hh->hh_refcnt = 0;
+ hh->hh_next = NULL;
+ if (rt->u.dst.dev->hard_header_cache(&rt->u.dst, neigh, hh)) {
+ kfree(hh);
+#if RT_CACHE_DEBUG >= 2
+ atomic_dec(&hh_count);
+#endif
+ hh = NULL;
+ } else if (neigh) {
+ atomic_inc(&hh->hh_refcnt);
+ hh->hh_next = neigh->hh;
+ neigh->hh = hh;
+ }
+ }
+ if (hh) {
+ atomic_inc(&hh->hh_refcnt);
+ rt->u.dst.hh = hh;
+ return hh->hh_uptodate;
+ }
+ }
+ return 0;
+}
+
+
+static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt, u16 protocol)
+{
+ struct rtable *rth, **rthp;
+ unsigned long now = jiffies;
+
+ rt->u.dst.priority = rt_tos2priority(rt->key.tos);
+
+ start_bh_atomic();
+
+ rthp = &rt_hash_table[hash];
+
+ while ((rth = *rthp) != NULL) {
+ if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
+ /* Put it first */
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = rt_hash_table[hash];
+ rt_hash_table[hash] = rth;
+
+ atomic_inc(&rth->u.dst.refcnt);
+ atomic_inc(&rth->u.dst.use);
+ rth->u.dst.lastuse = now;
+ end_bh_atomic();
+
+ ip_rt_put(rt);
+ rt_free(rt);
+ return rth;
+ }
+
+ rthp = &rth->u.rt_next;
+ }
+
+ if (rt_cache_size >= RT_CACHE_MAX_SIZE)
+ rt_garbage_collect();
+
+ rt->u.rt_next = rt_hash_table[hash];
+#if RT_CACHE_DEBUG >= 2
+ if (rt->u.rt_next) {
+ struct rtable * trt;
+ printk("rt_cache @%02x: %08x", hash, rt->rt_dst);
+ for (trt=rt->u.rt_next; trt; trt=trt->u.rt_next)
+ printk(" . %08x", trt->rt_dst);
+ printk("\n");
}
#endif
+ rt_hash_table[hash] = rt;
+ atomic_inc(&rt_cache_size);
+
+ if (protocol == ETH_P_IP)
+ rt_ll_bind(rt);
+
+ end_bh_atomic();
+ return rt;
}
-static void rt_garbage_collect_1(void)
+void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
+ u32 saddr, u8 tos, struct device *dev) /*
+ * Process a redirect asking us to reach daddr through new_gw instead
+ * of old_gw.
+ *
+ * old_gw: gateway currently in use (compared with rt_gateway of the
+ *         cached entry)
+ * daddr:  destination the redirect applies to
+ * new_gw: proposed gateway
+ * saddr:  source address of the redirected path
+ * tos:    type of service; masked with IPTOS_TOS_MASK before use
+ * dev:    device the cached entry must be using
+ *
+ * The redirect is rejected (and logged when ipv4_config.log_martians is
+ * set) when:
+ *   - new_gw equals old_gw, accept_redirects is off, or new_gw is
+ *     multicast, bad-class or zero-net;
+ *   - rfc1620_redirects is off and new_gw is outside dev's subnet;
+ *   - rfc1620_redirects is off, secure_redirects is on and
+ *     ip_fib_chk_default_gw(new_gw, dev) returns non-zero;
+ *   - fib_lookup_info() finds nothing for new_gw, or finds a local,
+ *     broadcast or NAT route.
+ *
+ * Otherwise the cache is searched twice, keyed by source saddr and by
+ * source 0. A matching entry that still goes through old_gw on dev and
+ * is not RTF_REJECT is unlinked and replaced by a copy flagged
+ * RTF_DYNAMIC|RTF_MODIFIED whose gateway is new_gw.
+ *
+ * Returns nothing; a dst_alloc() failure ends processing silently.
+ */
 {
 int i;
- unsigned expire = RT_CACHE_TIMEOUT>>1;
- struct rtable * rth, **rthp;
- unsigned long now = jiffies;
-
- for (;;)
- {
- for (i=0; i<RT_HASH_DIVISOR; i++)
- {
- if (!ip_rt_hash_table[i])
+ int off_link = 0; /* set to 1 when new_gw is not on dev's own subnet */
+ struct fib_info *fi;
+ struct rtable *rth, **rthp;
+ u32 skeys[2] = { saddr, 0, }; /* the two source keys to try: exact source, then 0 */
+ struct device *pdev = net_alias_main_dev(dev);
+
+ tos &= IPTOS_TOS_MASK;
+
+ if (new_gw == old_gw || !ipv4_config.accept_redirects
+ || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+ goto reject_redirect;
+
+ if ((new_gw^dev->pa_addr)&dev->pa_mask)
+ off_link = 1;
+
+ if (!ipv4_config.rfc1620_redirects) {
+ if (off_link)
+ goto reject_redirect;
+ if (ipv4_config.secure_redirects && ip_fib_chk_default_gw(new_gw, dev))
+ goto reject_redirect;
+ }
+
+ fi = fib_lookup_info(new_gw, 0, 0, &loopback_dev, NULL);
+ if (fi == NULL || fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_NAT))
+ goto reject_redirect;
+
+ for (i=0; i<2; i++) {
+ unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+ rthp=&rt_hash_table[hash];
+
+ while ( (rth = *rthp) != NULL) {
+ struct rtable *rt;
+
+ if (rth->key.dst != daddr ||
+ rth->key.src != skeys[i] ||
+ rth->key.tos != tos ||
+ rth->key.dst_dev != NULL ||
+ rth->key.src_dev != NULL) {
+ rthp = &rth->u.rt_next; /* key mismatch: advance to the next entry in the chain */
 continue;
- for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next)
- {
- if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now)
- continue;
- rt_cache_size--;
- cli();
- *rthp=rth->rt_next;
- rth->rt_next = NULL;
- sti();
- rt_free(rth);
+ }
+
+ if (rth->rt_dst != daddr ||
+ rth->rt_src != saddr ||
+ rth->rt_flags&RTF_REJECT ||
+ rth->rt_gateway != old_gw ||
+ rth->u.dst.dev != dev) /* key matched but the entry is not the redirected path: stop scanning this chain */
 break;
+
+ rt = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (rt == NULL)
+ return;
+
+ /*
+ * Copy all the information.
+ */
+ rt->u.dst.refcnt = 1;
+ rt->u.dst.dev = dev;
+ rt->u.dst.input = rth->u.dst.input;
+ rt->u.dst.output = rth->u.dst.output;
+ rt->u.dst.pmtu = dev->mtu;
+ rt->u.dst.rtt = TCP_TIMEOUT_INIT;
+ rt->u.dst.window = 0;
+ rt->u.dst.use = 1;
+ rt->u.dst.lastuse = jiffies;
+
+ rt->rt_flags = rth->rt_flags|RTF_DYNAMIC|RTF_MODIFIED;
+ rt->rt_flags &= ~RTF_GATEWAY;
+ if (new_gw != daddr) /* RTF_GATEWAY only when the new next hop is not the destination itself */
+ rt->rt_flags |= RTF_GATEWAY;
+
+ rt->rt_src = rth->rt_src;
+ rt->rt_dst = rth->rt_dst;
+ rt->rt_src_dev = rth->rt_src_dev;
+ rt->rt_spec_dst = rth->rt_spec_dst;
+ rt->key = rth->key;
+
+ /* But gateway is different ... */
+ rt->rt_gateway = new_gw;
+
+ if (off_link) { /* off-link gateway: use the FIB's device when it shares dev's main device */
+ if (fi->fib_dev != dev &&
+ net_alias_main_dev(fi->fib_dev) == pdev)
+ rt->u.dst.dev = fi->fib_dev;
 }
+
+ if (ipv4_config.rfc1620_redirects && !rt_ll_bind(rt)) { /* rt_ll_bind() returned 0: drop the copy and keep the old entry */
+ ip_rt_put(rt);
+ rt_free(rt);
+ break;
+ }
+
+ *rthp = rth->u.rt_next; /* unlink the old entry. NOTE(review): rt_cache_size is not adjusted here although rt_intern_hash() increments it - confirm rt_free() accounts for this */
+ rt_free(rth);
+ rt = rt_intern_hash(hash, rt, ETH_P_IP);
+ ip_rt_put(rt);
+ break;
 }
- if (rt_cache_size < RT_CACHE_SIZE_MAX)
- return;
- expire >>= 1;
 }
+ return;
+
+reject_redirect:
+ if (ipv4_config.log_martians)
+ printk(KERN_INFO "Redirect from %lX/%s to %lX ignored." /* NOTE(review): the two adjacent literals join with no space between "ignored." and "Path" */
+ "Path = %lX -> %lX, tos %02x\n",
+ ntohl(old_gw), dev->name, ntohl(new_gw),
+ ntohl(saddr), ntohl(daddr), tos);
 }
-static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr)
+
+void ip_rt_advice(struct rtable **rp, int advice) /*
+ * Act on advice about the cached route *rp.
+ *
+ * Any non-zero advice value is ignored. For advice == 0, and only when
+ * *rp is non-NULL and carries RTF_DYNAMIC or RTF_MODIFIED, the caller's
+ * pointer is cleared, its reference is dropped with ip_rt_put() and
+ * rt_cache_flush(0) is called. The check and the update run between
+ * start_bh_atomic() and end_bh_atomic().
+ */
 {
- unsigned long flags;
- struct rt_req * tail;
+ struct rtable *rt;
- save_flags(flags);
- cli();
- tail = *q;
- if (!tail)
- rtr->rtr_next = rtr;
- else
- {
- rtr->rtr_next = tail->rtr_next;
- tail->rtr_next = rtr;
+ if (advice)
+ return;
+
+ start_bh_atomic();
+ if ((rt = *rp) != NULL && (rt->rt_flags&(RTF_DYNAMIC|RTF_MODIFIED))) {
+#if RT_CACHE_DEBUG >= 1
+ printk(KERN_DEBUG "ip_rt_advice: redirect to %08x/%02x dropped\n", rt->rt_dst, rt->key.tos);
+#endif
+ *rp = NULL; /* the caller must not keep using this route */
+ ip_rt_put(rt);
+ rt_cache_flush(0);
 }
- *q = rtr;
- restore_flags(flags);
+ end_bh_atomic();
 return;
 }
/*
- * Caller should mask interrupts.
+ * Algorithm:
+ * 1. The first RT_REDIRECT_NUMBER redirects are sent
+ * with exponential backoff, then we stop sending them at all,
+ * assuming that the host ignores our redirects.
+ * 2. If we did not see any packets requiring redirects
+ * during RT_REDIRECT_SILENCE, we assume that the host
+ * forgot the redirected route and start sending redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
*/
-static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q)
+/*
+ * Send a rate-limited ICMP host redirect for skb.
+ *
+ * skb->dst must point to the struct rtable the packet was routed with;
+ * its last_error/errors fields hold the per-route limiter state:
+ *   - errors is reset once more than RT_REDIRECT_SILENCE jiffies have
+ *     passed since last_error;
+ *   - once errors reaches RT_REDIRECT_NUMBER nothing is sent and only
+ *     last_error is refreshed;
+ *   - otherwise a redirect is sent when more than
+ *     RT_REDIRECT_LOAD << errors jiffies have passed since last_error.
+ *
+ * Every redirect that is sent is counted. The counter used to be
+ * incremented only when ipv4_config.log_martians was set, so with
+ * logging disabled the backoff never grew and the cutoff was never
+ * reached.
+ */
+void ip_rt_send_redirect(struct sk_buff *skb)
 {
- struct rt_req * rtr;
+	struct rtable *rt = (struct rtable*)skb->dst;
- if (*q)
- {
- rtr = (*q)->rtr_next;
- (*q)->rtr_next = rtr->rtr_next;
- if (rtr->rtr_next == rtr)
- *q = NULL;
- rtr->rtr_next = NULL;
- return rtr;
+	/* No redirected packets during RT_REDIRECT_SILENCE;
+	 * reset the algorithm.
+	 */
+	if (jiffies - rt->last_error > RT_REDIRECT_SILENCE)
+		rt->errors = 0;
+
+	/* Too many ignored redirects; do not send anything,
+	 * set last_error to the last seen redirected packet.
+	 */
+	if (rt->errors >= RT_REDIRECT_NUMBER) {
+		rt->last_error = jiffies;
+		return;
+	}
+
+	/* Check for load limit; set last_error to the latest sent
+	 * redirect.
+	 */
+	if (jiffies - rt->last_error > (RT_REDIRECT_LOAD<<rt->errors)) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		rt->last_error = jiffies;
+		/* Count the redirect unconditionally; only the log line
+		 * depends on log_martians.
+		 */
+		++rt->errors;
+		if (ipv4_config.log_martians && rt->errors == RT_REDIRECT_NUMBER)
+			printk(KERN_WARNING "host %08x/%s ignores redirects.\n", rt->rt_src, rt->rt_src_dev->name);
 }
- return NULL;
 }
-/*
- Called with masked interrupts
- */
+/*
+ * Input handler that answers a packet with an ICMP destination
+ * unreachable message chosen from the error stored in its route
+ * (skb->dst), then frees the packet.
+ *
+ * ENETUNREACH maps to ICMP_NET_UNREACH and EACCES to ICMP_PKT_FILTERED;
+ * any other error (EINVAL included) drops the packet without a reply.
+ * A reply is sent only when more than RT_ERROR_LOAD jiffies have passed
+ * since the route's last_error stamp. Always returns 0.
+ */
+static int ip_error(struct sk_buff *skb)
+{
+	struct rtable *route = (struct rtable*)skb->dst;
+	int icmp_code;
+
+	if (route->u.dst.error == ENETUNREACH) {
+		icmp_code = ICMP_NET_UNREACH;
+	} else if (route->u.dst.error == EACCES) {
+		icmp_code = ICMP_PKT_FILTERED;
+	} else {
+		/* EINVAL and everything else: drop silently. */
+		kfree_skb(skb, FREE_READ);
+		return 0;
+	}
+
+	if (jiffies - route->last_error > RT_ERROR_LOAD) {
+		icmp_send(skb, ICMP_DEST_UNREACH, icmp_code, 0);
+		route->last_error = jiffies;
+	}
+
+	kfree_skb(skb, FREE_READ);
+	return 0;
+}
+
-static void rt_kick_backlog()
+static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 {
- if (!ip_rt_lock)
- {
- struct rt_req * rtr;
+	/*
+	 * Return the largest plateau value strictly below old_mtu,
+	 * or 68 when old_mtu is not above any of them.
+	 */
+	static const unsigned short plateau[] = {
+		32000, 17914, 8166, 4352, 2002, 1492, 576, 296,
+		/*
+		 * These two are not from the RFC but
+		 * are needed for AMPRnet AX.25 paths.
+		 */
+		216, 128
+	};
+	unsigned int idx;
+
+	/* The table is in descending order, so the first hit is the largest. */
+	for (idx = 0; idx < sizeof(plateau)/sizeof(plateau[0]); idx++) {
+		if (old_mtu > plateau[idx])
+			return plateau[idx];
+	}
+	return 68;
+}
- ip_rt_fast_lock();
- while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL)
- {
- sti();
- rt_redirect_1(rtr->dst, rtr->gw, rtr->dev);
- kfree_s(rtr, sizeof(struct rt_req));
- cli();
+unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+{ /*
+ * Lower the path MTU stored in matching routing cache entries.
+ *
+ * iph:     IP header supplying the source, destination, TOS and total
+ *          length used for the lookup
+ * new_mtu: MTU value to apply; when it is below 68 or not smaller than
+ *          the header's total length it is replaced by guess_mtu()
+ *
+ * Entries are searched under source keys iph->saddr and 0. Only entries
+ * with no source device in the key and without RTF_NOPMTUDISC are
+ * touched, and pmtu is only ever lowered.
+ *
+ * Returns the MTU last written to an entry, or 0 when nothing was
+ * changed or ipv4_config.no_pmtu_disc is set.
+ */
+ int i;
+ unsigned short old_mtu = ntohs(iph->tot_len); /* total length from the header, host byte order */
+ struct rtable *rth;
+ u32 skeys[2] = { iph->saddr, 0, };
+ u32 daddr = iph->daddr;
+ u8 tos = iph->tos & IPTOS_TOS_MASK;
+ unsigned short est_mtu = 0;
+
+ if (ipv4_config.no_pmtu_disc)
+ return 0;
+
+ for (i=0; i<2; i++) {
+ unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+ for (rth = rt_hash_table[hash]; rth; rth = rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == skeys[i] &&
+ rth->rt_dst == daddr &&
+ rth->rt_src == iph->saddr &&
+ rth->key.tos == tos &&
+ !rth->key.src_dev &&
+ !(rth->rt_flags&RTF_NOPMTUDISC)) {
+ unsigned short mtu = new_mtu;
+
+ if (new_mtu < 68 || new_mtu >= old_mtu) { /* reported MTU is unusable: estimate one instead */
+
+ /* BSD 4.2 compatibility hack :-( */
+ if (mtu == 0 && old_mtu >= rth->u.dst.pmtu &&
+ old_mtu >= 68 + (iph->ihl<<2))
+ old_mtu -= iph->ihl<<2; /* NOTE(review): old_mtu is function-scoped, so this adjustment carries over to later matching entries - confirm intended */
+
+ mtu = guess_mtu(old_mtu);
+ }
+ if (mtu < rth->u.dst.pmtu) {
+ rth->u.dst.pmtu = mtu;
+ est_mtu = mtu;
+ }
+ }
 }
+ }
+ return est_mtu;
+}
- ip_rt_bh_mask &= ~RT_BH_REDIRECT;
- ip_rt_fast_unlock();
+static void ipv4_dst_destroy(struct dst_entry * dst)
+{ /*
+ * Detach the hh_cache entry referenced by this route and drop the
+ * route's reference on it. The entry is freed with kfree() when
+ * atomic_dec_and_test() reports that hh_refcnt reached zero. With
+ * RT_CACHE_DEBUG >= 2 the global hh_count is decremented as well.
+ */
+ struct rtable * rt = (struct rtable*)dst;
+ struct hh_cache * hh = rt->u.dst.hh;
+ rt->u.dst.hh = NULL; /* clear the pointer before the reference is dropped */
+ if (hh && atomic_dec_and_test(&hh->hh_refcnt)) {
+#if RT_CACHE_DEBUG >= 2
+ extern atomic_t hh_count;
+ atomic_dec(&hh_count);
+#endif
+ kfree(hh);
 }
 }
-/*
- * rt_{del|add|flush} called only from USER process. Waiting is OK.
- */
+static struct dst_entry * ipv4_dst_check(struct dst_entry * dst)
+{ /* Placeholder body: dst is not examined and NULL is always returned. */
+ return NULL;
+}
-static int rt_del(__u32 dst, __u32 mask,
- struct device * dev, __u32 gtw, short rt_flags, short metric)
+static struct dst_entry * ipv4_dst_reroute(struct dst_entry * dst)
 {
- int retval;
+ return NULL; /* placeholder body: dst is not examined and NULL is always returned */
+}
+
+/*
+ * Tell whether multicast address mc_addr is accepted on dev.
+ *
+ * Returns 1 for the all-hosts group (checked before dev is touched) or
+ * when mc_addr appears on dev->ip_mc_list, and 0 otherwise.
+ */
+int
+ip_check_mc(struct device *dev, u32 mc_addr)
+{
+	struct ip_mc_list *entry;
+
+	if (mc_addr == htonl(INADDR_ALLHOSTS_GROUP))
+		return 1;
+
+	entry = dev->ip_mc_list;
+	while (entry != NULL) {
+		if (entry->multiaddr == mc_addr)
+			return 1;
+		entry = entry->next;
+	}
+	return 0;
+}
- while (ip_rt_lock)
- sleep_on(&rt_wait);
- ip_rt_fast_lock();
- retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric);
- ip_rt_unlock();
- wake_up(&rt_wait);
- return retval;
-}
-
-static void rt_add(short flags, __u32 dst, __u32 mask,
- __u32 gw, struct device *dev, unsigned short mss,
- unsigned long window, unsigned short irtt, short metric)
-{
- while (ip_rt_lock)
- sleep_on(&rt_wait);
- ip_rt_fast_lock();
- fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric);
- ip_rt_unlock();
- wake_up(&rt_wait);
-}
-
-void ip_rt_flush(struct device *dev)
-{
- while (ip_rt_lock)
- sleep_on(&rt_wait);
- ip_rt_fast_lock();
- fib_flush_1(dev);
- ip_rt_unlock();
- wake_up(&rt_wait);
+/*
+ * Output handler for routes that must never be used for transmission
+ * (ip_route_input_slow() installs it as the default output of the
+ * entries it builds). Logs the offending packet, frees it and
+ * returns 0.
+ */
+static int ip_rt_bug(struct sk_buff *skb)
+{
+	/* Log before freeing: the message reads skb->nh.iph and skb->dev,
+	 * which must not be touched once the buffer has been released.
+	 */
+	printk(KERN_DEBUG "ip_rt_bug: %08x -> %08x, %s\n", skb->nh.iph->saddr,
+		skb->nh.iph->daddr, skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb, FREE_WRITE);
+	return 0;
 }
/*
- Called by ICMP module.
+ * This function is called ONLY FROM NET BH. No locking!
+ *
+ * NOTE. We drop all packets that have local source
+ * addresses, because every properly looped-back packet
+ * must already have the correct destination attached by the output routine.
+ *
+ * This approach solves two big problems:
+ * 1. Non-simplex devices (if they exist 8)) are handled properly.
+ * 2. IP spoofing attempts are filtered with a 100% guarantee.
*/
-void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev)
-{
- struct rt_req * rtr;
- struct rtable * rt;
+int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct device *pdev)
+{ /*
+ * Cache-miss path for routing a received packet: build a new
+ * struct rtable for (daddr, saddr, tos, pdev), intern it with
+ * rt_intern_hash() and store the result in skb->dst.
+ *
+ * Returns 0 when a cache entry was installed (this includes the
+ * RTF_REJECT entries built on the no_route path), -EINVAL when the
+ * packet is refused without caching anything, and -ENOBUFS when
+ * dst_alloc() fails.
+ *
+ * Side effect: skb->dev may be rewritten to the device that the route
+ * back to the source points at.
+ */
+ struct device * dev = pdev;
+ struct fib_info *fi = NULL;
+ struct fib_info *src_fi = NULL;
+ unsigned flags = 0;
+ struct device *devout;
+ struct rtable * rth;
+ unsigned hash;
+ struct fib_result res;
+ u32 src_key = saddr; /* original addresses, kept for the cache key; saddr/daddr may be rewritten below */
+ u32 dst_key = daddr;
+ int err = -EINVAL;
+ int log = 0;
- rt = ip_rt_route(dst, 0);
- if (!rt)
- return;
+ hash = rt_hash_code(daddr, saddr^(unsigned long)pdev, tos); /* the receiving device is folded into the hash */
- if (rt->rt_gateway != src ||
- rt->rt_dev != dev ||
- ((gw^dev->pa_addr)&dev->pa_mask) ||
- ip_chk_addr(gw))
- {
- ip_rt_put(rt);
- return;
- }
- ip_rt_put(rt);
+ /* Check for martians... */
- ip_rt_fast_lock();
- if (ip_rt_lock == 1)
- {
- rt_redirect_1(dst, gw, dev);
- ip_rt_unlock();
- return;
- }
+ if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+ goto martian_source;
+ if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
+ goto mc_input;
- rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC);
- if (rtr)
- {
- rtr->dst = dst;
- rtr->gw = gw;
- rtr->dev = dev;
- rt_req_enqueue(&rt_backlog, rtr);
- ip_rt_bh_mask |= RT_BH_REDIRECT;
- }
- ip_rt_unlock();
-}
+ /* Accept zero source addresses only for limited broadcast/multicast
+ * destinations; it is unclear whether this should be changed.
+ */
+ if (ZERONET(saddr))
+ goto martian_source;
+ if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+ goto martian_destination;
+ /*
+ * Device is not yet initialized, accept all addresses as ours.
+ */
+ if (ZERONET(dev->pa_addr))
+ goto promisc_ip;
-static __inline__ void rt_garbage_collect(void)
-{
- if (ip_rt_lock == 1)
- {
- rt_garbage_collect_1();
- return;
+ /*
+ * Now we are able to route the packet.
+ */
+ if ((err = fib_lookup(&res, daddr, saddr, tos, pdev, NULL)) < 0) {
+ if (!IS_ROUTER)
+ return -EINVAL;
+ goto no_route;
 }
- ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;
-}
-static void rt_cache_add(unsigned hash, struct rtable * rth)
-{
- unsigned long flags;
- struct rtable **rthp;
- __u32 daddr = rth->rt_dst;
- unsigned long now = jiffies;
+ fi = res.f->fib_info;
+ flags = fi->fib_flags;
+ devout = fi->fib_dev;
-#if RT_CACHE_DEBUG >= 2
- if (ip_rt_lock != 1)
- {
- printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock);
- return;
+ if (flags&RTF_NAT) { /* NAT route: rewrite daddr from the route's gateway field and look the result up again */
+ daddr = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway;
+ fi = fib_lookup_info(daddr, saddr, tos, pdev, NULL);
+ if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST))
+ return -EINVAL;
+ devout = fi->fib_dev;
+ flags = fi->fib_flags|RTCF_NAT|RTF_NAT;
 }
-#endif
- save_flags(flags);
+ switch (res.fr->cl_action) {
+ case RTP_NAT:
+ /* Packet is from translated source; remember it */
+ saddr = (saddr&~res.fr->cl_srcmask)|res.fr->cl_srcmap;
+ flags |= RTCF_NAT;
+ break;
+ case RTP_MASQUERADE:
+ /* Packet is from masqueraded source; remember it */
+ flags |= RTCF_MASQ;
+ break;
+ default: /* NOTE(review): label directly before the closing brace with no statement; some compilers reject this - confirm */
+ }
+ log = res.fr->cl_flags&RTRF_LOG;
- if (rth->rt_dev->header_cache_bind)
- {
- struct rtable * rtg = rth;
-
- if (rth->rt_gateway != daddr)
- {
- ip_rt_fast_unlock();
- rtg = ip_rt_route(rth->rt_gateway, 0);
- ip_rt_fast_lock();
- }
+ if (!(flags & RTF_LOCAL)) {
+ if (!IS_ROUTER || flags&RTF_NOFORWARD)
+ return -EINVAL;
+ } else {
+ fi = NULL; /* local delivery: no FIB metrics are used, output device is loopback */
+ devout = &loopback_dev;
+ if (flags&RTF_BROADCAST)
+ goto mc_input;
+ }
- if (rtg)
- {
- if (rtg == rth)
- rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst);
- else
- {
- if (rtg->rt_hh)
- atomic_inc(&rtg->rt_hh->hh_refcnt);
- rth->rt_hh = rtg->rt_hh;
- ip_rt_put(rtg);
- }
- }
+#ifndef CONFIG_IP_LOCAL_RT_POLICY
+ if (flags&RTF_LOCAL)
+ src_fi = fib_lookup_info(src_key, 0, tos, &loopback_dev, NULL);
+ else
+#endif
+ if (fib_lookup(&res, src_key, daddr, tos, net_alias_main_dev(devout), NULL) == 0) { /* reverse lookup: route back to the packet's source */
+ src_fi = res.f->fib_info;
+ /* Destination is on masqueraded network:
+ * if it is real incoming frame, ip_forward will drop it.
+ */
+ if (res.fr->cl_flags&RTRF_VALVE)
+ flags |= RTCF_VALVE;
 }
- if (rt_cache_size >= RT_CACHE_SIZE_MAX)
- rt_garbage_collect();
+ if (src_fi) {
+ if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
+ goto martian_source;
+
+ if (!(src_fi->fib_flags&RTF_GATEWAY))
+ flags |= RTCF_DIRECTSRC;
+
+ if (net_alias_main_dev(src_fi->fib_dev) == pdev)
+ skb->dev = dev = src_fi->fib_dev;
+ else {
+ /* Route to packet source goes via
+ different interface; rfc1812 proposes
+ to drop them.
+ It is dangerous on not-stub/transit networks
+ because of path asymmetry.
+ */
+ if (ipv4_config.rfc1812_filter >= 2)
+ goto martian_source;
- cli();
- rth->rt_next = ip_rt_hash_table[hash];
-#if RT_CACHE_DEBUG >= 2
- if (rth->rt_next)
- {
- struct rtable * trth;
- printk("rt_cache @%02x: %08x", hash, daddr);
- for (trth=rth->rt_next; trth; trth=trth->rt_next)
- printk(" . %08x", trth->rt_dst);
- printk("\n");
+ /* Weaker form of rfc1812 filtering.
+ If source is on directly connected network,
+ it can mean either local network configuration error
+ (the most probable case) or real IP spoofing attempt.
+ */
+ if (ipv4_config.rfc1812_filter >= 1 && !(flags&RTCF_DIRECTSRC))
+ goto martian_source;
+ }
+ } else if (ipv4_config.rfc1812_filter >= 1)
+ goto martian_source;
+
+make_route: /* common tail: flags, devout, dev and fi are set up by the path that got here */
+ if (skb->protocol != __constant_htons(ETH_P_IP)) {
+ /* ARP request. Do not make route for invalid destination or
+ * if it is redirected.
+ */
+ if (flags&(RTF_REJECT|RTF_BROADCAST|RTF_MULTICAST) ||
+ skb->pkt_type == PACKET_OTHERHOST ||
+ (devout == dev && !(flags&(RTF_LOCAL|RTCF_NAT))))
+ return -EINVAL;
 }
-#endif
- ip_rt_hash_table[hash] = rth;
- rthp = &rth->rt_next;
- sti();
- rt_cache_size++;
- /*
- * Cleanup duplicate (and aged off) entries.
- */
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
- while ((rth = *rthp) != NULL)
- {
+ rth->u.dst.output= ip_rt_bug; /* default output handler; replaced below for forwarded and multicast-routed entries */
- cli();
- if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
- || rth->rt_dst == daddr)
- {
- *rthp = rth->rt_next;
- rt_cache_size--;
- sti();
-#if RT_CACHE_DEBUG >= 2
- printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst);
+ rth->u.dst.use = 1;
+ rth->key.dst = dst_key;
+ rth->rt_dst = dst_key;
+ rth->rt_dst_map = daddr;
+ rth->key.tos = tos;
+ rth->key.src = src_key;
+ rth->rt_src = src_key;
+ rth->rt_src_map = saddr;
+ rth->rt_src_dev = dev;
+ rth->key.src_dev= pdev;
+ rth->u.dst.dev = devout;
+ rth->key.dst_dev= NULL;
+ rth->rt_gateway = daddr;
+ rth->rt_spec_dst= daddr;
+
+ if (!(flags&RTF_REJECT)) {
+ if (flags&RTF_LOCAL)
+ rth->u.dst.input= ip_local_deliver;
+ if (!(flags&(RTF_NOFORWARD|RTF_BROADCAST))) {
+ if (flags&RTF_MULTICAST) {
+#ifdef CONFIG_IP_MROUTE
+ if (!LOCAL_MCAST(daddr) && ipv4_config.multicast_route) {
+ rth->u.dst.input = ip_mr_input;
+ rth->u.dst.output = ip_output;
+ }
 #endif
- rt_free(rth);
- continue;
+ } else if (!(flags&RTF_LOCAL)) {
+ rth->u.dst.input = ip_forward;
+ rth->u.dst.output = ip_output;
+ }
 }
- sti();
- rthp = &rth->rt_next;
- }
- restore_flags(flags);
-}
-
-/*
- RT should be already locked.
-
- We could improve this by keeping a chain of say 32 struct rtable's
- last freed for fast recycling.
-
- */
+ } else if (IS_ROUTER && !(flags&(RTF_MULTICAST|RTF_BROADCAST))) {
+ rth->u.dst.input= ip_error;
+ rth->u.dst.error= -err; /* err holds the negative fib_lookup() result, or the initial -EINVAL */
+ }
+
+ if ((flags&(RTF_BROADCAST|RTF_MULTICAST)) || !(flags&RTF_LOCAL))
+ rth->rt_spec_dst= dev->pa_addr;
+
+ if (fi) {
+ rth->u.dst.pmtu = fi->fib_mtu;
+ rth->u.dst.window=fi->fib_window;
+ rth->u.dst.rtt = fi->fib_irtt;
+ if (flags & RTF_GATEWAY)
+ rth->rt_gateway = fi->fib_gateway;
+ } else {
+ rth->u.dst.pmtu = devout->mtu;
+ rth->u.dst.window=0;
+ rth->u.dst.rtt = TCP_TIMEOUT_INIT;
+ }
+
+ if (!(flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTCF_NAT)) &&
+ flags&RTCF_DIRECTSRC &&
+ (devout == dev || (ipv4_config.rfc1620_redirects &&
+ net_alias_main_dev(devout) == pdev)))
+ flags |= RTCF_DOREDIRECT;
-struct rtable * ip_rt_slow_route (__u32 daddr, int local)
-{
- unsigned hash = ip_rt_hash_code(daddr)^local;
- struct rtable * rth;
- struct fib_node * f;
- struct fib_info * fi;
- __u32 saddr;
+ rth->rt_flags = flags;
-#if RT_CACHE_DEBUG >= 2
- printk("rt_cache miss @%08x\n", daddr);
-#endif
+ if (log)
+ printk(KERN_INFO "installing route %08lX -> %08lX\n", ntohl(rth->rt_src), ntohl(rth->rt_dst));
- rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
- if (!rth)
- {
- ip_rt_unlock();
- return NULL;
+ if (flags&(RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST|RTF_REJECT)) {
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, 0); /* protocol 0: rt_intern_hash() skips rt_ll_bind() */
+ return 0;
 }
+ skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth, __constant_ntohs(skb->protocol));
+ return 0;
- if (local)
- f = fib_lookup_local(daddr);
- else
- f = fib_lookup (daddr);
+mc_input: /* destination is multicast, limited broadcast, or a broadcast route */
+ if (skb->protocol != __constant_htons(ETH_P_IP))
+ return -EINVAL;
- if (f)
- {
- fi = f->fib_info;
- f->fib_use++;
+ if (ZERONET(saddr)) {
+ if (!ipv4_config.bootp_agent)
+ goto martian_source;
+ flags |= RTF_NOFORWARD|RTF_LOCAL;
+ } else {
+ src_fi = fib_lookup_info(saddr, 0, tos, &loopback_dev, NULL);
+ if (!src_fi)
+ goto martian_source;
+
+ if (src_fi->fib_flags&(RTF_LOCAL|RTF_BROADCAST|RTF_MULTICAST|RTF_NAT))
+ goto martian_source;
+
+ if (!(src_fi->fib_flags&RTF_GATEWAY))
+ flags |= RTCF_DIRECTSRC;
+
+ if (!MULTICAST(daddr) || !ipv4_config.multicast_route ||
+ LOCAL_MCAST(daddr)) {
+ if (net_alias_main_dev(src_fi->fib_dev) == pdev) {
+ skb->dev = dev = src_fi->fib_dev;
+ } else {
+ /* Fascist not-unicast filtering 8) */
+ goto martian_source;
+ }
+ }
 }
- if (!f || (fi->fib_flags & RTF_REJECT))
- {
-#ifdef CONFIG_KERNELD
- char wanted_route[20];
-#endif
-#if RT_CACHE_DEBUG >= 2
- printk("rt_route failed @%08x\n", daddr);
-#endif
- ip_rt_unlock();
- kfree_s(rth, sizeof(struct rtable));
-#ifdef CONFIG_KERNELD
- daddr=ntohl(daddr);
- sprintf(wanted_route, "%d.%d.%d.%d",
- (int)(daddr >> 24) & 0xff, (int)(daddr >> 16) & 0xff,
- (int)(daddr >> 8) & 0xff, (int)daddr & 0xff);
- kerneld_route(wanted_route); /* Dynamic route request */
-#endif
- return NULL;
- }
-
- saddr = fi->fib_dev->pa_addr;
-
- if (daddr == fi->fib_dev->pa_addr)
- {
- f->fib_use--;
- if ((f = fib_loopback) != NULL)
- {
- f->fib_use++;
- fi = f->fib_info;
- }
+ if (!MULTICAST(daddr)) { /* broadcast: deliver locally only, never forward */
+ flags |= RTF_LOCAL|RTF_BROADCAST|RTF_NOFORWARD;
+ devout = dev;
+ goto make_route;
 }
-
- if (!f)
- {
- ip_rt_unlock();
- kfree_s(rth, sizeof(struct rtable));
- return NULL;
+
+ flags |= RTF_MULTICAST|RTF_LOCAL;
+
+ if (ip_check_mc(dev, daddr) == 0) { /* group not accepted on dev: not for local delivery */
+ flags &= ~RTF_LOCAL;
+
+ if (!ipv4_config.multicast_route || !(dev->flags&IFF_ALLMULTI))
+ goto no_route;
 }
+ devout = dev;
+ goto make_route;
- rth->rt_dst = daddr;
- rth->rt_src = saddr;
- rth->rt_lastuse = jiffies;
- rth->rt_refcnt = 1;
- rth->rt_use = 1;
- rth->rt_next = NULL;
- rth->rt_hh = NULL;
- rth->rt_gateway = fi->fib_gateway;
- rth->rt_dev = fi->fib_dev;
- rth->rt_mtu = fi->fib_mtu;
- rth->rt_window = fi->fib_window;
- rth->rt_irtt = fi->fib_irtt;
- rth->rt_tos = f->fib_tos;
- rth->rt_flags = fi->fib_flags | RTF_HOST;
- if (local)
- rth->rt_flags |= RTF_LOCAL;
+promisc_ip: /* receiving device has a zero-net address: accept the packet as local */
+ flags |= RTF_LOCAL|RTF_NOFORWARD;
+ if (MULTICAST(daddr))
+ flags |= RTF_MULTICAST;
+ else
+ flags |= RTF_BROADCAST;
+ devout = dev;
+ goto make_route;
+
+no_route: /* no usable route: cache an RTF_REJECT entry */
+ flags |= RTF_REJECT;
+ devout = dev;
+ goto make_route;
- if (!(rth->rt_flags & RTF_GATEWAY))
- rth->rt_gateway = rth->rt_dst;
 /*
- * Multicast or limited broadcast is never gatewayed.
+ * Do not cache martian addresses: they should be logged (RFC1812)
 */
- if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
- rth->rt_gateway = rth->rt_dst;
+martian_destination:
+ if (ipv4_config.log_martians)
+ printk(KERN_WARNING "martian destination %08x from %08x, dev %s\n", daddr, saddr, dev->name);
+ return -EINVAL;
- if (ip_rt_lock == 1)
- rt_cache_add(hash, rth);
- else
- {
- rt_free(rth);
-#if RT_CACHE_DEBUG >= 1
- printk(KERN_DEBUG "rt_cache: route to %08x was born dead\n", daddr);
-#endif
+martian_source:
+ if (ipv4_config.log_martians) {
+ /*
+ * RFC1812 recommendation: if the source is martian,
+ * the only hint is the MAC header.
+ */
+ printk(KERN_WARNING "martian source %08x for %08x, dev %s\n", saddr, daddr, dev->name);
+ if (dev->hard_header_len) { /* dump hard_header_len bytes starting at skb->mac.raw */
+ int i;
+ unsigned char *p = skb->mac.raw;
+ printk(KERN_WARNING "ll header:");
+ for (i=0; i<dev->hard_header_len; i++, p++)
+ printk(" %02x", *p);
+ printk("\n");
+ }
 }
-
- ip_rt_unlock();
- return rth;
-}
-
-void ip_rt_put(struct rtable * rt)
-{
- if (rt)
- atomic_dec(&rt->rt_refcnt);
+ return -EINVAL;
 }
-struct rtable * ip_rt_route(__u32 daddr, int local)
+int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
+ u8 tos, struct device *dev) /*
+ * Attach a routing cache entry to a received packet.
+ *
+ * Returns 0 at once when skb->dst is already set. Otherwise tos is
+ * masked with IPTOS_TOS_MASK, skb->dev is set to dev and the chain
+ * selected by rt_hash_code(daddr, saddr ^ dev, tos) is searched for an
+ * entry whose key matches daddr, saddr, dev and tos and has no
+ * destination device.
+ *
+ * On a hit the entry's lastuse is refreshed, its use and refcnt
+ * counters are incremented, skb->dst is set to it, skb->dev is set to
+ * the entry's rt_src_dev and 0 is returned. On a miss the result of
+ * ip_route_input_slow() is returned.
+ *
+ * With RT_CACHE_DEBUG >= 1, packets received on a loopback device are
+ * refused with -EINVAL and packets received on an alias device are
+ * logged.
+ */
 {
 struct rtable * rth;
+ unsigned hash;
- ip_rt_fast_lock();
+ if (skb->dst)
+ return 0;
- for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next)
- {
- if (rth->rt_dst == daddr)
- {
- rth->rt_lastuse = jiffies;
- atomic_inc(&rth->rt_use);
- atomic_inc(&rth->rt_refcnt);
- ip_rt_unlock();
- return rth;
+#if RT_CACHE_DEBUG >= 1
+ if (dev->flags & IFF_LOOPBACK) {
+ printk(KERN_DEBUG "ip_route_input: bug: packet is looped back\n");
+ return -EINVAL;
+ }
+ if (net_alias_main_dev(dev) != dev)
+ printk(KERN_DEBUG "ip_route_input: bug: packet is received on alias %s\n", dev->name);
+#endif
+
+ tos &= IPTOS_TOS_MASK;
+ hash = rt_hash_code(daddr, saddr^(unsigned long)dev, tos); /* same hash expression as ip_route_input_slow() */
+ skb->dev = dev;
+
+ for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == saddr &&
+ rth->key.src_dev == dev &&
+ rth->key.dst_dev == NULL &&
+ rth->key.tos == tos) {
+ rth->u.dst.lastuse = jiffies;
+ atomic_inc(&rth->u.dst.use);
+ atomic_inc(&rth->u.dst.refcnt);
+ skb->dst = (struct dst_entry*)rth;
+ skb->dev = rth->rt_src_dev; /* use the source device recorded in the cache entry */
+ return 0;
 }
 }
- return ip_rt_slow_route (daddr, local);
+ return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 }
+
/*
- * Process a route add request from the user, or from a kernel
- * task.
+ * Major route resolver routine.
*/
-
-int ip_rt_new(struct rtentry *r)
-{
- int err;
- char * devname;
- struct device * dev = NULL;
- unsigned long flags;
- __u32 daddr, mask, gw;
- short metric;
- /*
- * If a device is specified find it.
- */
-
- if ((devname = r->rt_dev) != NULL)
- {
- err = getname(devname, &devname);
- if (err)
- return err;
- dev = dev_get(devname);
- putname(devname);
- if (!dev)
- return -ENODEV;
- }
-
- /*
- * If the device isn't INET, don't allow it
- */
+int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u8 tos,
+ struct device *dev_out)
+{
+ u32 src_key = saddr;
+ u32 dst_key = daddr;
+ u32 dst_map;
+ struct device *dst_dev_key = dev_out;
+ unsigned flags = 0;
+ struct fib_info *fi = NULL;
+ struct rtable *rth;
+#ifdef CONFIG_IP_LOCAL_RT_POLICY
+ struct fib_result res;
+#endif
+ unsigned hash;
- if (r->rt_dst.sa_family != AF_INET)
- return -EAFNOSUPPORT;
+ tos &= IPTOS_TOS_MASK|1;
- /*
- * Make local copies of the important bits
- * We decrement the metric by one for BSD compatibility.
- */
-
- flags = r->rt_flags;
- daddr = (__u32) ((struct sockaddr_in *) &r->rt_dst)->sin_addr.s_addr;
- mask = (__u32) ((struct sockaddr_in *) &r->rt_genmask)->sin_addr.s_addr;
- gw = (__u32) ((struct sockaddr_in *) &r->rt_gateway)->sin_addr.s_addr;
- metric = r->rt_metric > 0 ? r->rt_metric - 1 : 0;
+ if (saddr) {
+ if (MULTICAST(saddr) || BADCLASS(saddr) || ZERONET(saddr) ||
+ __ip_chk_addr(saddr) != IS_MYADDR)
+ return -EINVAL;
+ if (dev_out == NULL && (MULTICAST(daddr) || daddr == 0xFFFFFFFF))
+ dev_out = ip_dev_find(saddr, NULL);
+ }
+ if (!daddr)
+ daddr = saddr;
- /*
- * BSD emulation: Permits route add someroute gw one-of-my-addresses
- * to indicate which iface. Not as clean as the nice Linux dev technique
- * but people keep using it... (and gated likes it ;))
- */
-
- if (!dev && (flags & RTF_GATEWAY))
- {
- struct device *dev2;
- for (dev2 = dev_base ; dev2 != NULL ; dev2 = dev2->next)
- {
- if ((dev2->flags & IFF_UP) && dev2->pa_addr == gw)
- {
- flags &= ~RTF_GATEWAY;
- dev = dev2;
- break;
- }
- }
+ if (dev_out) {
+ if (!saddr) {
+ saddr = dev_out->pa_addr;
+ if (!daddr)
+ daddr = saddr;
+ }
+ dst_map = daddr;
+ if (MULTICAST(daddr) || daddr == 0xFFFFFFFF)
+ goto make_route;
+ }
+
+ if (!daddr)
+ daddr = htonl(INADDR_LOOPBACK);
+
+#ifdef CONFIG_IP_LOCAL_RT_POLICY
+ if (fib_lookup(&res, daddr, saddr, tos, &loopback_dev, dev_out))
+ return -ENETUNREACH;
+ fi = res.f->fib_info;
+ dst_map = daddr;
+
+ if (fi->fib_flags&RTF_NAT) {
+ dst_map = htonl((ntohl(daddr)&((1<<res.fm)-1)))|fi->fib_gateway;
+ fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out);
+ if (!fi || fi->fib_flags&(RTF_NAT|RTF_LOCAL|RTF_MULTICAST|RTF_BROADCAST))
+ return -EINVAL;
+ flags = RTCF_NAT;
}
- if (flags & RTF_HOST)
- mask = 0xffffffff;
- else if (mask && r->rt_genmask.sa_family != AF_INET)
- return -EAFNOSUPPORT;
-
- if (flags & RTF_GATEWAY)
- {
- if (r->rt_gateway.sa_family != AF_INET)
- return -EAFNOSUPPORT;
+ if (!saddr) {
+ saddr = fi->fib_dev->pa_addr;
/*
- * Don't try to add a gateway we can't reach..
- * Tunnel devices are exempt from this rule.
+ * "Stabilization" of route.
+ * This step is necessary, if locally originated packets
+ * are subjected to source routing, else we could get
+ * route flapping.
*/
-
- if (!dev)
- dev = get_gw_dev(gw);
- else if (dev != get_gw_dev(gw) && dev->type != ARPHRD_TUNNEL)
- return -EINVAL;
- if (!dev)
- return -ENETUNREACH;
- }
- else
- {
- gw = 0;
- if (!dev)
- dev = ip_dev_bynet(daddr, mask);
- if (!dev)
+ fi = fib_lookup_info(dst_map, saddr, tos, &loopback_dev, dev_out);
+ if (!fi)
return -ENETUNREACH;
- if (!mask)
- {
- if (((daddr ^ dev->pa_addr) & dev->pa_mask) == 0)
- mask = dev->pa_mask;
- }
}
+#else
+ fi = fib_lookup_info(daddr, 0, tos, &loopback_dev, dev_out);
+ if (!fi)
+ return -ENETUNREACH;
+
+ if (fi->fib_flags&RTF_NAT)
+ return -EINVAL;
-#ifndef CONFIG_IP_CLASSLESS
- if (!mask)
- mask = ip_get_mask(daddr);
+ dst_map = daddr;
+ if (!saddr)
+ saddr = fi->fib_dev->pa_addr;
#endif
-
- if (bad_mask(mask, daddr))
+
+ flags |= fi->fib_flags;
+ dev_out = fi->fib_dev;
+
+ if (RT_LOCALADDR(flags)) {
+ dev_out = &loopback_dev;
+ fi = NULL;
+ }
+
+ if (dst_dev_key && dev_out != dst_dev_key)
return -EINVAL;
- /*
- * Add the route
- */
+make_route:
+ if (LOOPBACK(saddr) && !(dev_out->flags&IFF_LOOPBACK)) {
+ printk(KERN_DEBUG "this guy talks to %08x from loopback\n", daddr);
+ return -EINVAL;
+ }
+
+ if (daddr == 0xFFFFFFFF)
+ flags |= RTF_BROADCAST;
+ else if (MULTICAST(daddr))
+ flags |= RTF_MULTICAST;
+ else if (BADCLASS(daddr) || ZERONET(daddr))
+ return -EINVAL;
+
+ if (flags&RTF_BROADCAST && (dev_out->flags&IFF_LOOPBACK ||
+ !(dev_out->flags&IFF_BROADCAST)))
+ flags &= ~RTF_LOCAL;
+ else if (flags&RTF_MULTICAST) {
+ if (ip_check_mc(dev_out, daddr))
+ flags |= RTF_LOCAL;
+ }
+
+ rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
+ if (!rth)
+ return -ENOBUFS;
+
+ rth->u.dst.use = 1;
+ rth->key.dst = dst_key;
+ rth->key.tos = tos;
+ rth->key.src = src_key;
+ rth->key.src_dev= NULL;
+ rth->key.dst_dev= dst_dev_key;
+ rth->rt_dst = daddr;
+ rth->rt_dst_map = dst_map;
+ rth->rt_src = saddr;
+ rth->rt_src_map = saddr;
+ rth->rt_src_dev = dev_out;
+ rth->u.dst.dev = dev_out;
+ rth->rt_gateway = dst_map;
+ rth->rt_spec_dst= dev_out->pa_addr;
+
+ rth->u.dst.output=ip_output;
+
+ if (flags&RTF_LOCAL) {
+ rth->u.dst.input = ip_local_deliver;
+ rth->rt_spec_dst = daddr;
+ }
+ if (flags&(RTF_BROADCAST|RTF_MULTICAST)) {
+ rth->rt_spec_dst = dev_out->pa_addr;
+ flags &= ~RTF_GATEWAY;
+ if (flags&RTF_LOCAL)
+ rth->u.dst.output = ip_mc_output;
+ if (flags&RTF_MULTICAST) {
+ if (dev_out->flags&IFF_ALLMULTI)
+ rth->u.dst.output = ip_mc_output;
+#ifdef CONFIG_IP_MROUTE
+ if (ipv4_config.multicast_route && !LOCAL_MCAST(daddr))
+ rth->u.dst.input = ip_mr_input;
+#endif
+ }
+ }
- rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
+ if (fi) {
+ if (flags&RTF_GATEWAY)
+ rth->rt_gateway = fi->fib_gateway;
+ rth->u.dst.pmtu = fi->fib_mtu;
+ rth->u.dst.window=fi->fib_window;
+ rth->u.dst.rtt = fi->fib_irtt;
+ } else {
+ rth->u.dst.pmtu = dev_out->mtu;
+ rth->u.dst.window=0;
+ rth->u.dst.rtt = TCP_TIMEOUT_INIT;
+ }
+ rth->rt_flags = flags;
+ hash = rt_hash_code(dst_key, src_key, tos);
+ if (dst_dev_key)
+ hash ^= dev_hash_name(dst_dev_key->name);
+ *rp = rt_intern_hash(hash, rth, ETH_P_IP);
return 0;
}
+int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, struct device *dev_out) /* Fast path: search the route hash table for an entry keyed on (daddr, saddr, tos, dev_out); on a hit store it in *rp and return 0, otherwise return whatever ip_route_output_slow() returns. */
+{
+ unsigned hash;
+ struct rtable *rth;
-/*
- * Remove a route, as requested by the user.
- */
+ hash = rt_hash_code(daddr, saddr, tos);
+ if (dev_out)
+ hash ^= dev_out->hash; /* mix the requested device into the bucket index. NOTE(review): ip_route_output_slow() XORs dev_hash_name(name) instead - confirm both give the same value */
-int ip_rt_kill(struct rtentry *r)
-{
- struct sockaddr_in *trg;
- struct sockaddr_in *msk;
- struct sockaddr_in *gtw;
- char *devname;
- int err;
- struct device * dev = NULL;
-
- trg = (struct sockaddr_in *) &r->rt_dst;
- msk = (struct sockaddr_in *) &r->rt_genmask;
- gtw = (struct sockaddr_in *) &r->rt_gateway;
- if ((devname = r->rt_dev) != NULL)
- {
- err = getname(devname, &devname);
- if (err)
- return err;
- dev = dev_get(devname);
- putname(devname);
- if (!dev)
- return -ENODEV;
+ start_bh_atomic(); /* paired with end_bh_atomic() on both the hit exit and the miss exit */
+ for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == saddr &&
+ rth->key.src_dev == NULL && /* the key must carry no source device */
+ rth->key.dst_dev == dev_out && /* pointer comparison; also matches NULL == NULL when no device was requested */
+ rth->key.tos == tos) {
+ rth->u.dst.lastuse = jiffies; /* hit: stamp the time of use, then bump the counters */
+ atomic_inc(&rth->u.dst.use);
+ atomic_inc(&rth->u.dst.refcnt); /* presumably accounts for the pointer handed back through *rp - confirm who drops it */
+ end_bh_atomic();
+ *rp = rth;
+ return 0;
+ }
 }
- /*
- * metric can become negative here if it wasn't filled in
- * but that's a fortunate accident; we really use that in rt_del.
- */
- err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev,
- (__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
- return err;
+ end_bh_atomic();
+
+ return ip_route_output_slow(rp, daddr, saddr, tos, dev_out); /* miss: hand the same arguments to the slow path */
 }
-/*
- * Handle IP routing ioctl calls. These are used to manipulate the routing tables
- */
-
-int ip_rt_ioctl(unsigned int cmd, void *arg)
+int ip_route_output_dev(struct rtable **rp, u32 daddr, u32 saddr, u8 tos, char *devname) /* Like ip_route_output(), but the output device is given by name: a hit stores the entry in *rp and returns 0; a miss resolves devname with dev_get() and returns -ENODEV or the result of ip_route_output_slow(). */
 {
- int err;
- struct rtentry rt;
+ unsigned hash;
+ struct rtable *rth;
+ struct device *dev_out;
+
+ hash = rt_hash_code(daddr, saddr, tos)^dev_hash_mc_name(devname); /* NOTE(review): ip_route_output_slow() XORs dev_hash_name(name) instead - confirm both give the same bucket */
- switch(cmd)
- {
- case SIOCADDRT: /* Add a route */
- case SIOCDELRT: /* Delete a route */
- if (!suser())
- return -EPERM;
- err = copy_from_user(&rt, arg, sizeof(struct rtentry));
- if (err)
- return -EFAULT;
- return (cmd == SIOCDELRT) ? ip_rt_kill(&rt) : ip_rt_new(&rt);
+ start_bh_atomic(); /* paired with end_bh_atomic() on both the hit exit and the miss exit */
+ for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) {
+ if (rth->key.dst == daddr &&
+ rth->key.src == saddr &&
+ rth->key.src_dev == NULL && /* the key must carry no source device */
+ rth->key.tos == tos &&
+ rth->key.dst_dev && /* entries without a destination device never match a named lookup */
+ strcmp(rth->key.dst_dev->name, devname)==0) { /* device matched by name, not by pointer */
+ rth->u.dst.lastuse = jiffies; /* hit: stamp the time of use, then bump the counters */
+ atomic_inc(&rth->u.dst.use);
+ atomic_inc(&rth->u.dst.refcnt); /* presumably accounts for the pointer handed back through *rp - confirm who drops it */
+ end_bh_atomic();
+ *rp = rth;
+ return 0;
+ }
 }
+ end_bh_atomic();
- return -EINVAL;
+ dev_out = dev_get(devname); /* miss: the slow path takes a device pointer, so resolve the name first */
+ if (!dev_out)
+ return -ENODEV;
+ return ip_route_output_slow(rp, daddr, saddr, tos, dev_out);
 }
-void ip_rt_advice(struct rtable **rp, int advice)
+void ip_rt_multicast_event(struct device *dev) /* Multicast-event hook: dev is not used; the only action is the rt_cache_flush() call below. */
 {
- /* Thanks! */
- return;
+ rt_cache_flush(0); /* NOTE(review): the meaning of the 0 argument is defined by rt_cache_flush(), not visible here - presumably "no delay", confirm */
 }
-void ip_rt_update(int event, struct device *dev)
+void ip_rt_init() /* Calls ip_fib_init() and, when CONFIG_PROC_FS is defined, registers the "rt_cache" entry through proc_net_register(). NOTE(review): the empty parameter list is K&R style; (void) would make this a prototype. */
 {
-/*
- * This causes too much grief to do now.
- */
-#ifdef COMING_IN_2_1
- if (event == NETDEV_UP)
- rt_add(RTF_HOST|RTF_UP, dev->pa_addr, ~0, 0, dev, 0, 0, 0, 0);
- else if (event == NETDEV_DOWN)
- rt_del(dev->pa_addr, ~0, dev, 0, RTF_HOST|RTF_UP, 0);
-#endif
+ ip_fib_init();
+
+#ifdef CONFIG_PROC_FS
+ proc_net_register(&(struct proc_dir_entry) { /* NOTE(review): address of a block-scope compound literal; if proc_net_register() keeps the pointer this relies on the compiler giving the literal static storage - confirm */
+ PROC_NET_RTCACHE, 8, "rt_cache", /* 8 == strlen("rt_cache") */
+ S_IFREG | S_IRUGO, 1, 0, 0, /* regular file, read permission for user, group and other */
+ 0, &proc_net_inode_operations,
+ rt_cache_get_info
+ });
+#endif /* CONFIG_PROC_FS */
 }
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov