patch-2.2.4 linux/net/ipv4/route.c
- Lines: 860
- Date: Tue Mar 23 13:20:34 1999
- Orig file: v2.2.3/linux/net/ipv4/route.c
- Orig date: Tue Jan 19 11:32:53 1999
diff -u --recursive --new-file v2.2.3/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -5,7 +5,7 @@
*
* ROUTE - implementation of the IP router.
*
- * Version: $Id: route.c,v 1.61 1999/01/12 14:34:43 davem Exp $
+ * Version: $Id: route.c,v 1.64 1999/03/23 21:21:13 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -49,6 +49,9 @@
* Andi Kleen : Load-limit warning messages.
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
* Vitaly E. Lavrov : Race condition in ip_route_input_slow.
+ * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
+ * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
+ * Marc Boucher : routing by fwmark
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -108,6 +111,7 @@
int ip_rt_error_cost = HZ;
int ip_rt_error_burst = 5*HZ;
int ip_rt_gc_elasticity = 8;
+int ip_rt_mtu_expires = 10*60*HZ;
static unsigned long rt_deadline = 0;
@@ -165,13 +169,14 @@
TC_PRIO_FILLER
};
+
/*
* Route cache.
*/
struct rtable *rt_hash_table[RT_HASH_DIVISOR];
-static struct rtable * rt_intern_hash(unsigned hash, struct rtable * rth);
+static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
@@ -249,6 +254,12 @@
dst_free(&rt->u.dst);
}
+static __inline__ void rt_drop(struct rtable *rt)
+{
+ ip_rt_put(rt);
+ dst_free(&rt->u.dst);
+}
+
static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
@@ -257,6 +268,27 @@
&& rth->key.iif && rth->u.rt_next);
}
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+ return ((rth->rt_flags&(RTCF_REDIRECTED|RTCF_NOTIFY))
+ || rth->u.dst.expires);
+}
+
+static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
+{
+ int age;
+
+ if (atomic_read(&rth->u.dst.use))
+ return 0;
+
+ age = jiffies - rth->u.dst.lastuse;
+ if (age <= tmo1 && !rt_fast_clean(rth))
+ return 0;
+ if (age <= tmo2 && rt_valuable(rth))
+ return 0;
+ return 1;
+}
+
static void rt_check_expire(unsigned long dummy)
{
int i;
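
The new pair rt_valuable()/rt_may_expire() factors the expiry test out of the scan loops: an unused entry may go after the short timeout tmo1 (or sooner, if rt_fast_clean() marks it a cheap broadcast/multicast clone), while redirected/notify entries and entries carrying a hard expiry stamp must also outlive the longer tmo2. A standalone sketch of that logic, using a simplified struct instead of struct rtable (field names are illustrative, not the kernel's):

#include <stdio.h>

struct entry {
	int in_use;	/* corresponds to atomic_read(&dst.use)     */
	long lastuse;	/* jiffies at last use                      */
	int fast_clean;	/* broadcast/multicast on a forwarding path */
	int valuable;	/* redirected/notify flag or pending expiry */
};

/* Unused "cheap" entries must only survive tmo1; "valuable" ones
 * also survive until the longer tmo2. */
static int may_expire(const struct entry *e, long now, long tmo1, long tmo2)
{
	long age;

	if (e->in_use)
		return 0;
	age = now - e->lastuse;
	if (age <= tmo1 && !e->fast_clean)
		return 0;
	if (age <= tmo2 && e->valuable)
		return 0;
	return 1;
}

int main(void)
{
	struct entry e = { 0, 100, 0, 1 };

	printf("%d\n", may_expire(&e, 150, 60, 200));	/* 0: younger than tmo1 */
	printf("%d\n", may_expire(&e, 400, 60, 200));	/* 1: past both timeouts */
	return 0;
}
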
@@ -271,22 +303,27 @@
rthp = &rt_hash_table[rover];
while ((rth = *rthp) != NULL) {
- /*
- * Cleanup aged off entries.
- */
-
- if (!atomic_read(&rth->u.dst.use) &&
- (now - rth->u.dst.lastuse > tmo
- || rt_fast_clean(rth))) {
- *rthp = rth->u.rt_next;
- rt_free(rth);
+ if (rth->u.dst.expires) {
+				/* Entry is expired even if it is in use */
+ if ((long)(now - rth->u.dst.expires) < tmo) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
continue;
}
- tmo >>= 1;
- rthp = &rth->u.rt_next;
+ /*
+ * Cleanup aged off entries.
+ */
+ *rthp = rth->u.rt_next;
+ rt_free(rth);
}
+ /* Fallback loop breaker. */
if ((jiffies - now) > 0)
break;
}
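
The restructured rt_check_expire() walk keeps the old halving trick: every entry that survives halves the timeout applied to the entries behind it in the same hash chain, so long chains are pruned progressively harder. The pattern, reduced to a plain singly linked list (no locking, purely illustrative):

#include <stdio.h>

struct node { long age; struct node *next; };

struct node *scan_chain(struct node *head, long tmo)
{
	struct node **np = &head, *n;

	while ((n = *np) != NULL) {
		if (n->age <= tmo) {	/* keep: halve budget for the rest */
			tmo >>= 1;
			np = &n->next;
			continue;
		}
		*np = n->next;		/* unlink aged-off entry */
	}
	return head;
}

int main(void)
{
	struct node c = { 90, NULL }, b = { 30, &c }, a = { 10, &b };
	struct node *h = scan_chain(&a, 64);

	for (; h; h = h->next)
		printf("kept age %ld\n", h->age);	/* 10 and 30; 90 dropped */
	return 0;
}
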
@@ -301,16 +338,21 @@
rt_deadline = 0;
+ net_serialize_enter();
for (i=0; i<RT_HASH_DIVISOR; i++) {
if ((rth = xchg(&rt_hash_table[i], NULL)) == NULL)
continue;
+ net_serialize_leave();
for (; rth; rth=next) {
next = rth->u.rt_next;
rth->u.rt_next = NULL;
rt_free(rth);
}
+
+ net_serialize_enter();
}
+ net_serialize_leave();
}
void rt_cache_flush(int delay)
@@ -354,60 +396,137 @@
end_bh_atomic();
}
+/*
+   Short description of GC goals.
+
+   We want an algorithm which keeps the routing cache at an
+   equilibrium point, where the number of aged-off entries stays
+   approximately equal to the number of newly generated ones.
+
+   The current expiration strength is the variable "expire".
+   We adjust it dynamically: when the network is idle, "expire" is
+   large enough to keep plenty of warm entries; when load increases,
+   it shrinks to limit the cache size.
+ */
+
static int rt_garbage_collect(void)
{
- int i;
- static unsigned expire = RT_GC_TIMEOUT>>1;
+ static unsigned expire = RT_GC_TIMEOUT;
static unsigned long last_gc;
+ static int rover;
+ static int equilibrium;
struct rtable *rth, **rthp;
unsigned long now = jiffies;
-
- start_bh_atomic();
+ int goal;
/*
* Garbage collection is pretty expensive,
- * do not make it too frequently, but just increase expire strength.
+	 * do not run it too frequently.
*/
- if (now - last_gc < ip_rt_gc_min_interval)
- goto out;
+ if (now - last_gc < ip_rt_gc_min_interval &&
+ atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
- expire++;
+ /* Calculate number of entries, which we want to expire now. */
+ goal = atomic_read(&ipv4_dst_ops.entries) - RT_HASH_DIVISOR*ip_rt_gc_elasticity;
+ if (goal <= 0) {
+ if (equilibrium < ipv4_dst_ops.gc_thresh)
+ equilibrium = ipv4_dst_ops.gc_thresh;
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ if (goal > 0) {
+ equilibrium += min(goal/2, RT_HASH_DIVISOR);
+ goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+ }
+ } else {
+ /* We are in dangerous area. Try to reduce cache really
+		/* We are in a dangerous area. Try to reduce the cache really
+ */
+ goal = max(goal/2, RT_HASH_DIVISOR);
+ equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+ }
- for (i=0; i<RT_HASH_DIVISOR; i++) {
- unsigned tmo;
- if (!rt_hash_table[i])
- continue;
- tmo = expire;
- for (rthp=&rt_hash_table[i]; (rth=*rthp); rthp=&rth->u.rt_next) {
- if (atomic_read(&rth->u.dst.use) ||
- (now - rth->u.dst.lastuse < tmo && !rt_fast_clean(rth))) {
- tmo >>= 1;
- continue;
+ if (now - last_gc >= ip_rt_gc_min_interval)
+ last_gc = now;
+
+ if (goal <= 0) {
+ equilibrium += goal;
+ goto work_done;
+ }
+
+ do {
+ int i, k;
+
+ start_bh_atomic();
+ for (i=0, k=rover; i<RT_HASH_DIVISOR; i++) {
+ unsigned tmo = expire;
+
+ k = (k + 1) & (RT_HASH_DIVISOR-1);
+ rthp = &rt_hash_table[k];
+ while ((rth = *rthp) != NULL) {
+ if (!rt_may_expire(rth, tmo, expire)) {
+ tmo >>= 1;
+ rthp = &rth->u.rt_next;
+ continue;
+ }
+ *rthp = rth->u.rt_next;
+ rth->u.rt_next = NULL;
+ rt_free(rth);
+ goal--;
}
- *rthp = rth->u.rt_next;
- rth->u.rt_next = NULL;
- rt_free(rth);
- break;
+ if (goal <= 0)
+ break;
}
- if ((jiffies-now)>0)
+ rover = k;
+ end_bh_atomic();
+
+ if (goal <= 0)
+ goto work_done;
+
+	/* The goal was not achieved. We stop the process if:
+
+	   - "expire" has been reduced to zero (otherwise it is halved);
+	   - the table is not full;
+	   - we are called from interrupt context.
+	   The jiffies check is just a fallback/debug loop breaker;
+	   we will not spin here for a long time in any case.
+	 */
+
+ if (expire == 0)
break;
- }
- last_gc = now;
- if (atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
- expire = ip_rt_gc_timeout>>1;
+ expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
-out:
- expire -= expire>>ip_rt_gc_elasticity;
- end_bh_atomic();
- return (atomic_read(&ipv4_dst_ops.entries) > ip_rt_max_size);
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ } while (!in_interrupt() && jiffies - now < 1);
+
+ if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+ return 0;
+ if (net_ratelimit())
+ printk("dst cache overflow\n");
+ return 1;
+
+work_done:
+ expire += ip_rt_gc_min_interval;
+ if (expire > ip_rt_gc_timeout ||
+ atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+ expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+ printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+ return 0;
}
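
The heart of the rewrite is the goal computation: below RT_HASH_DIVISOR*ip_rt_gc_elasticity entries the collector only drifts toward a moving equilibrium; above it, the cache is cut back hard. A compilable sketch of just that arithmetic (all names and constants are stand-ins for the kernel's):

#include <stdio.h>

#define HASH_DIVISOR 256	/* stands in for RT_HASH_DIVISOR */

static int equilibrium;		/* moving target, kept across calls */

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

/* How many entries should GC try to evict right now? */
static int gc_goal(int entries, int gc_thresh, int elasticity)
{
	int goal = entries - HASH_DIVISOR * elasticity;

	if (goal <= 0) {
		/* Safe zone: drift toward the running equilibrium. */
		if (equilibrium < gc_thresh)
			equilibrium = gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_i(goal / 2, HASH_DIVISOR);
			goal = entries - equilibrium;
		}
	} else {
		/* Dangerous zone: shrink the cache aggressively. */
		goal = max_i(goal / 2, HASH_DIVISOR);
		equilibrium = entries - goal;
	}
	return goal;
}

int main(void)
{
	printf("%d\n", gc_goal(1500, 512, 8));	/* 732: gentle trim */
	printf("%d\n", gc_goal(9000, 512, 8));	/* 3476: aggressive */
	return 0;
}
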
-static struct rtable *rt_intern_hash(unsigned hash, struct rtable * rt)
+static int rt_intern_hash(unsigned hash, struct rtable * rt, struct rtable ** rp)
{
struct rtable *rth, **rthp;
unsigned long now = jiffies;
+ int attempts = !in_interrupt();
+restart:
start_bh_atomic();
rthp = &rt_hash_table[hash];
@@ -424,9 +543,9 @@
rth->u.dst.lastuse = now;
end_bh_atomic();
- ip_rt_put(rt);
- rt_free(rt);
- return rth;
+ rt_drop(rt);
+ *rp = rth;
+ return 0;
}
rthp = &rth->u.rt_next;
@@ -435,8 +554,28 @@
/* Try to bind route to arp only if it is output
route or unicast forwarding path.
*/
- if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0)
- arp_bind_neighbour(&rt->u.dst);
+ if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
+ if (!arp_bind_neighbour(&rt->u.dst)) {
+ end_bh_atomic();
+
+ /* Neighbour tables are full and nothing
+			   can be released. Try to shrink the route cache;
+			   most likely it holds some neighbour records.
+ */
+ if (attempts-- > 0) {
+ int saved_elasticity = ip_rt_gc_elasticity;
+ ip_rt_gc_elasticity = 1;
+ rt_garbage_collect();
+ ip_rt_gc_elasticity = saved_elasticity;
+ goto restart;
+ }
+
+ rt_drop(rt);
+ if (net_ratelimit())
+ printk("neighbour table overflow\n");
+ return -ENOBUFS;
+ }
+ }
rt->u.rt_next = rt_hash_table[hash];
#if RT_CACHE_DEBUG >= 2
@@ -449,9 +588,9 @@
}
#endif
rt_hash_table[hash] = rt;
-
end_bh_atomic();
- return rt;
+ *rp = rt;
+ return 0;
}
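
When arp_bind_neighbour() fails because the neighbour tables are full, rt_intern_hash() now forces one garbage-collection pass with ip_rt_gc_elasticity temporarily set to 1 and retries once, but only from process context. The control flow, abstracted into a runnable toy (bind_neighbour() and collect_garbage() are placeholders, not kernel functions):

#include <errno.h>
#include <stdio.h>

/* Toy stand-ins for arp_bind_neighbour() and rt_garbage_collect(). */
static int table_slots = 0;
static int bind_neighbour(void) { return table_slots-- > 0; }
static void collect_garbage(void) { table_slots = 1; }

/* One forced GC pass (process context only), then a single retry
 * before giving up with -ENOBUFS, mirroring the new rt_intern_hash(). */
static int intern_entry(int in_interrupt)
{
	int attempts = !in_interrupt;

	for (;;) {
		if (bind_neighbour())
			return 0;
		if (attempts-- <= 0)
			return -ENOBUFS;
		collect_garbage();	/* squeeze the route cache hard */
	}
}

int main(void)
{
	printf("%d\n", intern_entry(0));	/* 0: GC freed a slot */
	printf("%d\n", intern_entry(1));	/* -ENOBUFS on the IRQ path */
	return 0;
}
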
void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -537,17 +676,15 @@
!(rt->u.dst.neighbour->nud_state&NUD_VALID)) {
if (rt->u.dst.neighbour)
neigh_event_send(rt->u.dst.neighbour, NULL);
- ip_rt_put(rt);
ip_rt_put(rth);
- rt_free(rt);
+ rt_drop(rt);
break;
}
*rthp = rth->u.rt_next;
- rt = rt_intern_hash(hash, rt);
- ip_rt_put(rt);
- ip_rt_put(rth);
- rt_free(rth);
+ if (!rt_intern_hash(hash, rt, &rt))
+ ip_rt_put(rt);
+ rt_drop(rth);
break;
}
}
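
Note the changed contract: rt_intern_hash() no longer returns a route unconditionally but an error code, passing the resulting entry back through a pointer, so callers such as ip_rt_redirect() can propagate -ENOBUFS instead of touching a dropped entry. Schematically (intern() is a stand-in, not the kernel function):

#include <errno.h>
#include <stdio.h>

struct rtable_like { int id; };	/* stand-in for struct rtable */

/* Succeeds and hands back the candidate (or a cache hit), or fails
 * having already dropped the candidate internally. */
static int intern(struct rtable_like *rt, struct rtable_like **rp, int fail)
{
	if (fail)
		return -ENOBUFS;	/* rt already dropped inside */
	*rp = rt;
	return 0;
}

int main(void)
{
	struct rtable_like cand = { 42 }, *res = NULL;

	/* The error is explicit; *rp is touched only on success. */
	if (intern(&cand, &res, 0) == 0)
		printf("interned route %d\n", res->id);
	if (intern(&cand, &res, 1) != 0)
		printf("insertion failed, error propagated\n");
	return 0;
}
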
@@ -573,14 +710,14 @@
ip_rt_put(rt);
return NULL;
}
- if (rt->rt_flags&RTCF_REDIRECTED) {
+ if ((rt->rt_flags&RTCF_REDIRECTED) || rt->u.dst.expires) {
unsigned hash = rt_hash_code(rt->key.dst, rt->key.src^(rt->key.oif<<5), rt->key.tos);
struct rtable **rthp;
#if RT_CACHE_DEBUG >= 1
printk(KERN_DEBUG "ip_rt_advice: redirect to %d.%d.%d.%d/%02x dropped\n", NIPQUAD(rt->rt_dst), rt->key.tos);
#endif
- ip_rt_put(rt);
start_bh_atomic();
+ ip_rt_put(rt);
for (rthp = &rt_hash_table[hash]; *rthp; rthp = &(*rthp)->u.rt_next) {
if (*rthp == rt) {
*rthp = rt->u.rt_next;
@@ -614,6 +751,10 @@
void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = (struct rtable*)skb->dst;
+ struct in_device *in_dev = (struct in_device*)rt->u.dst.dev->ip_ptr;
+
+ if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev))
+ return;
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
@@ -637,7 +778,7 @@
rt->u.dst.rate_last = jiffies;
++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (skb->dev->ip_ptr && IN_DEV_LOG_MARTIANS((struct in_device*)skb->dev->ip_ptr) &&
+ if (IN_DEV_LOG_MARTIANS(in_dev) &&
rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit())
printk(KERN_WARNING "host %08x/if%d ignores redirects for %08x to %08x.\n",
rt->rt_src, rt->rt_iif, rt->rt_dst, rt->rt_gateway);
@@ -737,6 +878,7 @@
if (mtu < rth->u.dst.pmtu) {
dst_confirm(&rth->u.dst);
rth->u.dst.pmtu = mtu;
+ dst_set_expires(&rth->u.dst, ip_rt_mtu_expires);
}
est_mtu = mtu;
}
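
A PMTU value learned from ICMP now ages out after ip_rt_mtu_expires (10 minutes by default) instead of sticking forever. dst_set_expires() keeps the earliest pending stamp; a sketch of the expected semantics on a plain struct (the kernel's own helper is defined elsewhere, not in this file):

#include <stdio.h>

struct dst_like { unsigned long expires; };

static void set_expires(struct dst_like *d, unsigned long now,
			unsigned long timeout)
{
	unsigned long when = now + timeout;

	if (d->expires == 0 || (long)(when - d->expires) < 0)
		d->expires = when;	/* only ever move expiry earlier */
}

int main(void)
{
	struct dst_like d = { 0 };

	set_expires(&d, 1000, 600);	/* learned PMTU: expire at 1600 */
	set_expires(&d, 1100, 600);	/* later, longer stamp ignored */
	printf("%lu\n", d.expires);	/* 1600 */
	return 0;
}
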
@@ -760,7 +902,13 @@
static void ipv4_link_failure(struct sk_buff *skb)
{
+ struct rtable *rt;
+
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+ rt = (struct rtable *) skb->dst;
+ if (rt)
+ dst_set_expires(&rt->u.dst, 0);
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -794,7 +942,17 @@
memcpy(addr, &src, 4);
}
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res)
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+ if (!(rt->u.dst.tclassid&0xFFFF))
+ rt->u.dst.tclassid |= tag&0xFFFF;
+ if (!(rt->u.dst.tclassid&0xFFFF0000))
+ rt->u.dst.tclassid |= tag&0xFFFF0000;
+}
+#endif
+
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
struct fib_info *fi = res->fi;
@@ -824,9 +982,11 @@
rt->u.dst.window= 0;
rt->u.dst.rtt = TCP_TIMEOUT_INIT;
}
-#if defined(CONFIG_NET_CLS_ROUTE) && defined(CONFIG_IP_MULTIPLE_TABLES)
- if (rt->u.dst.tclassid == 0)
- rt->u.dst.tclassid = fib_rules_tclass(res);
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ set_class_tag(rt, fib_rules_tclass(res));
+#endif
+ set_class_tag(rt, itag);
#endif
rt->rt_type = res->type;
}
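
set_class_tag() fills each 16-bit half of the 32-bit class id only if that half is still empty, so the more specific tag applied first (itag from source validation) is never overwritten by a broader one from the fib rules; the low and high halves correspond to the two routing realms. For example:

#include <stdio.h>
typedef unsigned int u32;

static void set_class_tag(u32 *tclassid, u32 tag)
{
	if (!(*tclassid & 0xFFFF))
		*tclassid |= tag & 0xFFFF;	/* low half, if unset  */
	if (!(*tclassid & 0xFFFF0000))
		*tclassid |= tag & 0xFFFF0000;	/* high half, if unset */
}

int main(void)
{
	u32 id = 0;

	set_class_tag(&id, 0x00020001);	/* first tag claims both halves */
	set_class_tag(&id, 0x00040003);	/* ignored: both already set */
	printf("%08x\n", id);		/* 00020001 */
	return 0;
}
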
@@ -839,6 +999,7 @@
struct rtable *rth;
u32 spec_dst;
struct in_device *in_dev = dev->ip_ptr;
+ u32 itag = 0;
/* Primary sanity checks. */
@@ -850,7 +1011,7 @@
if (!LOCAL_MCAST(daddr))
return -EINVAL;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst) < 0)
+ } else if (fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag) < 0)
return -EINVAL;
rth = dst_alloc(sizeof(struct rtable), &ipv4_dst_ops);
@@ -863,12 +1024,18 @@
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
rth->rt_dst_map = daddr;
rth->rt_src_map = saddr;
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
@@ -888,8 +1055,7 @@
#endif
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}
/*
@@ -910,6 +1076,7 @@
struct in_device *in_dev = dev->ip_ptr;
struct in_device *out_dev;
unsigned flags = 0;
+ u32 itag = 0;
struct rtable * rth;
unsigned hash;
u32 spec_dst;
@@ -925,6 +1092,9 @@
key.dst = daddr;
key.src = saddr;
key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ key.fwmark = skb->fwmark;
+#endif
key.iif = dev->ifindex;
key.oif = 0;
key.scope = RT_SCOPE_UNIVERSE;
@@ -983,9 +1153,14 @@
goto brd_input;
if (res.type == RTN_LOCAL) {
- spec_dst = daddr;
- if (inet_addr_type(saddr) != RTN_UNICAST)
+ int result;
+ result = fib_validate_source(saddr, daddr, tos, loopback_dev.ifindex,
+ dev, &spec_dst, &itag);
+ if (result < 0)
goto martian_source;
+ if (result)
+ flags |= RTCF_DIRECTSRC;
+ spec_dst = daddr;
goto local_input;
}
@@ -1005,14 +1180,14 @@
return -EINVAL;
}
- err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst);
+ err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(res), dev, &spec_dst, &itag);
if (err < 0)
goto martian_source;
if (err)
flags |= RTCF_DIRECTSRC;
- if (out_dev == in_dev && err && !(flags&RTCF_NAT) &&
+ if (out_dev == in_dev && err && !(flags&(RTCF_NAT|RTCF_MASQ)) &&
(IN_DEV_SHARED_MEDIA(out_dev)
|| inet_addr_onlink(out_dev, saddr, FIB_RES_GW(res))))
flags |= RTCF_DOREDIRECT;
@@ -1033,6 +1208,9 @@
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
@@ -1051,7 +1229,7 @@
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
- rt_set_nexthop(rth, &res);
+ rt_set_nexthop(rth, &res, itag);
rth->rt_flags = flags;
@@ -1066,8 +1244,7 @@
}
#endif
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
brd_input:
if (skb->protocol != __constant_htons(ETH_P_IP))
@@ -1076,7 +1253,7 @@
if (ZERONET(saddr)) {
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
} else {
- err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst);
+ err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, &itag);
if (err < 0)
goto martian_source;
if (err)
@@ -1096,12 +1273,18 @@
rth->key.dst = daddr;
rth->rt_dst = daddr;
rth->key.tos = tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark = skb->fwmark;
+#endif
rth->key.src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_NAT
rth->rt_dst_map = key.dst;
rth->rt_src_map = key.src;
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ rth->u.dst.tclassid = itag;
+#endif
rth->rt_iif =
rth->key.iif = dev->ifindex;
rth->u.dst.dev = &loopback_dev;
@@ -1116,8 +1299,7 @@
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
- skb->dst = (struct dst_entry*)rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
no_route:
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -1170,6 +1352,9 @@
rth->key.src == saddr &&
rth->key.iif == iif &&
rth->key.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->key.fwmark == skb->fwmark &&
+#endif
rth->key.tos == tos) {
rth->u.dst.lastuse = jiffies;
atomic_inc(&rth->u.dst.use);
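
With CONFIG_IP_ROUTE_FWMARK enabled the cache key carries the firewall mark, so input lookups must compare it as well; two packets of the same flow with different marks deliberately miss each other's cache entries. A sketch of the comparison (struct and fields abridged from struct rt_key):

#include <stdio.h>
typedef unsigned int u32;

struct key_like {		/* illustrative subset of struct rt_key */
	u32 dst, src;
	int iif, oif;
	u32 fwmark;
	unsigned char tos;
};

static int key_match(const struct key_like *a, const struct key_like *b)
{
	return a->dst == b->dst && a->src == b->src &&
	       a->iif == b->iif && a->oif == b->oif &&
	       a->fwmark == b->fwmark &&	/* new: firewall mark */
	       a->tos == b->tos;
}

int main(void)
{
	struct key_like k1 = { 1, 2, 3, 0, 7, 0 };
	struct key_like k2 = k1;

	k2.fwmark = 8;	/* same flow, different mark: cache miss */
	printf("%d %d\n", key_match(&k1, &k1), key_match(&k1, &k2));
	return 0;
}
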
@@ -1344,43 +1529,33 @@
if (res.type == RTN_NAT)
return -EINVAL;
-
- if (!key.src) {
- key.src = FIB_RES_PREFSRC(res);
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- /*
- * "Stabilization" of route.
- * This step is necessary, if locally originated packets
- * are subjected to policy routing, otherwise we could get
- * route flapping.
- */
- if (fib_lookup(&key, &res))
- return -ENETUNREACH;
-#endif
+ if (res.type == RTN_LOCAL) {
+ if (!key.src)
+ key.src = key.dst;
+ dev_out = &loopback_dev;
+ key.oif = dev_out->ifindex;
+ res.fi = NULL;
+ flags |= RTCF_LOCAL;
+ goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && key.oif == 0)
fib_select_multipath(&key, &res);
+ else
#endif
+ if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0)
+ fib_select_default(&key, &res);
- dev_out = FIB_RES_DEV(res);
-
- if (res.type == RTN_LOCAL) {
- dev_out = &loopback_dev;
- key.oif = dev_out->ifindex;
- res.fi = NULL;
- flags |= RTCF_LOCAL;
- }
+ if (!key.src)
+ key.src = FIB_RES_PREFSRC(res);
+ dev_out = FIB_RES_DEV(res);
key.oif = dev_out->ifindex;
make_route:
- if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK)) {
- printk(KERN_DEBUG "this guy talks to %08x from loopback\n", key.dst);
+ if (LOOPBACK(key.src) && !(dev_out->flags&IFF_LOOPBACK))
return -EINVAL;
- }
if (key.dst == 0xFFFFFFFF)
res.type = RTN_BROADCAST;
@@ -1449,13 +1624,12 @@
#endif
}
- rt_set_nexthop(rth, &res);
+ rt_set_nexthop(rth, &res, 0);
rth->rt_flags = flags;
hash = rt_hash_code(daddr, saddr^(oif<<5), tos);
- *rp = rt_intern_hash(hash, rth);
- return 0;
+ return rt_intern_hash(hash, rth, rp);
}
int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif)
@@ -1507,7 +1681,7 @@
nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
r = NLMSG_DATA(nlh);
- nlh->nlmsg_flags = nowait ? NLM_F_MULTI : 0;
+ nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
@@ -1517,6 +1691,8 @@
r->rtm_scope = RT_SCOPE_UNIVERSE;
r->rtm_protocol = RTPROT_UNSPEC;
r->rtm_flags = (rt->rt_flags&~0xFFFF) | RTM_F_CLONED;
+ if (rt->rt_flags & RTCF_NOTIFY)
+ r->rtm_flags |= RTM_F_NOTIFY;
RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
if (rt->key.src) {
r->rtm_src_len = 32;
@@ -1524,6 +1700,10 @@
}
if (rt->u.dst.dev)
RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+ if (rt->u.dst.tclassid)
+ RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
if (rt->key.iif)
RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
else if (rt->rt_src != rt->key.src)
@@ -1546,7 +1726,10 @@
ci.rta_lastuse = jiffies - rt->u.dst.lastuse;
ci.rta_used = atomic_read(&rt->u.dst.refcnt);
ci.rta_clntref = atomic_read(&rt->u.dst.use);
- ci.rta_expires = 0;
+ if (rt->u.dst.expires)
+ ci.rta_expires = rt->u.dst.expires - jiffies;
+ else
+ ci.rta_expires = 0;
ci.rta_error = rt->u.dst.error;
#ifdef CONFIG_IP_MROUTE
eptr = (struct rtattr*)skb->tail;
@@ -1625,7 +1808,7 @@
end_bh_atomic();
rt = (struct rtable*)skb->dst;
if (!err && rt->u.dst.error)
- err = rt->u.dst.error;
+ err = -rt->u.dst.error;
} else {
int oif = 0;
if (rta[RTA_OIF-1])
@@ -1667,7 +1850,7 @@
for (h=0; h < RT_HASH_DIVISOR; h++) {
if (h < s_h) continue;
if (h > s_h)
- memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(int));
+ memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
start_bh_atomic();
for (rt = rt_hash_table[h], idx = 0; rt; rt = rt->u.rt_next, idx++) {
if (idx < s_idx)
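
The memset change is a 64-bit fix: the elements of cb->args may be wider than int (long on LP64 hosts), so subtracting sizeof(int) from sizeof(cb->args) makes the length too large by sizeof(long)-sizeof(int) and the memset runs past the end of the array. sizeof(cb->args[0]) is always right:

#include <stdio.h>
#include <string.h>

int main(void)
{
	long args[6];

	/* Correct: clears exactly args[1]..args[5], whatever the
	 * element width.  With sizeof(int) instead, the length is
	 * 4 bytes too large on LP64, writing past the array. */
	memset(&args[1], 0, sizeof(args) - sizeof(args[0]));
	printf("sizeof(int)=%zu sizeof(long)=%zu\n",
	       sizeof(int), sizeof(long));
	return 0;
}
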
@@ -1717,7 +1900,7 @@
ctl_table ipv4_route_table[] = {
{NET_IPV4_ROUTE_FLUSH, "flush",
- &flush_delay, sizeof(int), 0644, NULL,
+ &flush_delay, sizeof(int), 0200, NULL,
&ipv4_sysctl_rtcache_flush},
{NET_IPV4_ROUTE_MIN_DELAY, "min_delay",
&ip_rt_min_delay, sizeof(int), 0644, NULL,
@@ -1758,12 +1941,45 @@
{NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity",
&ip_rt_gc_elasticity, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires",
+ &ip_rt_mtu_expires, sizeof(int), 0644, NULL,
+ &proc_dointvec_jiffies},
{0}
};
#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct ip_rt_acct[256];
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+ int length, int *eof, void *data)
+{
+ *start=buffer;
+
+ if (offset + length > sizeof(ip_rt_acct)) {
+ length = sizeof(ip_rt_acct) - offset;
+ *eof = 1;
+ }
+ if (length > 0) {
+ start_bh_atomic();
+ memcpy(buffer, ((u8*)&ip_rt_acct)+offset, length);
+ end_bh_atomic();
+ return length;
+ }
+ return 0;
+}
+#endif
+#endif
+
+
__initfunc(void ip_rt_init(void))
{
+#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_NET_CLS_ROUTE
+ struct proc_dir_entry *ent;
+#endif
+#endif
devinet_init();
ip_fib_init();
rt_periodic_timer.function = rt_check_expire;
@@ -1781,5 +1997,9 @@
0, &proc_net_inode_operations,
rt_cache_get_info
});
+#ifdef CONFIG_NET_CLS_ROUTE
+ ent = create_proc_entry("net/rt_acct", 0, 0);
+ ent->read_proc = ip_rt_acct_read;
+#endif
#endif
}
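
Finally, CONFIG_NET_CLS_ROUTE exports the per-realm traffic counters as a binary table through the new /proc/net/rt_acct. A small userspace reader, assuming struct ip_rt_acct is four u32 counters per realm (verify against include/net/route.h of your tree):

#include <stdio.h>

struct rt_acct {	/* assumed layout of struct ip_rt_acct */
	unsigned int o_bytes, o_packets, i_bytes, i_packets;
};

int main(void)
{
	struct rt_acct acct[256];
	FILE *f = fopen("/proc/net/rt_acct", "r");
	int i;

	if (!f) {
		perror("open /proc/net/rt_acct");
		return 1;
	}
	if (fread(acct, sizeof(acct[0]), 256, f) != 256) {
		fprintf(stderr, "short read from rt_acct\n");
		fclose(f);
		return 1;
	}
	fclose(f);
	for (i = 0; i < 256; i++)
		if (acct[i].i_packets || acct[i].o_packets)
			printf("realm %3d: in %u pkts/%u bytes, "
			       "out %u pkts/%u bytes\n", i,
			       acct[i].i_packets, acct[i].i_bytes,
			       acct[i].o_packets, acct[i].o_bytes);
	return 0;
}
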