summaryrefslogtreecommitdiffstats
path: root/main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch
diff options
context:
space:
mode:
Diffstat (limited to 'main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch')
-rw-r--r--main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch178
1 files changed, 178 insertions, 0 deletions
diff --git a/main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch b/main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch
new file mode 100644
index 000000000..2310927e8
--- /dev/null
+++ b/main/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch
@@ -0,0 +1,178 @@
+From patchwork Thu May 23 13:15:46 2013
+Content-Type: text/plain; charset="utf-8"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 8bit
+Subject: [RFC] net/ipv4: Use next hop exceptions also for input routes
+Date: Thu, 23 May 2013 03:15:46 -0000
+From: =?utf-8?q?Timo_Ter=C3=A4s?= <timo.teras@iki.fi>
+X-Patchwork-Id: 245949
+Message-Id: <1369314946-12692-1-git-send-email-timo.teras@iki.fi>
+To: netdev@vger.kernel.org
+Cc: =?UTF-8?q?Timo=20Ter=C3=A4s?= <timo.teras@iki.fi>
+
+Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
+assmued that "locally destined, and routed packets, never trigger
+PMTU events or redirects that will be processed by us".
+
+However, it seems that tunnel devices do trigger PMTU events in certain
+cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
+skb_dst(skb)->ops->update_pmtu to propage mtu information from the
+outer flows. These can cause the inner flow mtu to be decreased. If
+next hop exceptions are not consulted for pmtu, IP fragmentation will
+not be done properly for these routes.
+
+It also seems that we really need to have the PMTU information always
+for netfilter TCPMSS' clamp-to-pmtu feature to work properly.
+
+So for the time being, cache separate copies of input routes for
+each next hop exception.
+
+Signed-off-by: Timo Teräs <timo.teras@iki.fi>
+
+---
+I had ideas to make optimizations where pmtu information would not
+be needed. This includes:
+- Target devices with IFF_XMIT_DST_RELEASE set (practically all devices
+ except tunnels). If skb_dst() is early freed the target device cannot
+ generate PMTU events
+- Add flag for input route generation if pmtu info is needed for
+ fragmentation. Basically a flag saying if DF bit was set in ip_hdr.
+
+However, TCPMSS clamp-to-pmtu prevents both optimizations.
+
+I'm not yet all familiar with the recent changes in routing caching,
+so there might be caveats that I missed. Basic testing shows this fixes
+the fragmentation issues I'm seeing, and I have not yet found any ill
+side effects either.
+
+ include/net/ip_fib.h | 3 ++-
+ net/ipv4/fib_semantics.c | 3 ++-
+ net/ipv4/route.c | 41 +++++++++++++++++++++++++++++++----------
+ 3 files changed, 35 insertions(+), 12 deletions(-)
+
+diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
+index e49db91..20529a6 100644
+--- a/include/net/ip_fib.h
++++ b/include/net/ip_fib.h
+@@ -55,7 +55,8 @@ struct fib_nh_exception {
+ u32 fnhe_pmtu;
+ __be32 fnhe_gw;
+ unsigned long fnhe_expires;
+- struct rtable __rcu *fnhe_rth;
++ struct rtable __rcu *fnhe_rth_input;
++ struct rtable __rcu *fnhe_rth_output;
+ unsigned long fnhe_stamp;
+ };
+
+diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
+index 8f6cb7a..d5dbca5 100644
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh)
+
+ next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+- rt_fibinfo_free(&fnhe->fnhe_rth);
++ rt_fibinfo_free(&fnhe->fnhe_rth_input);
++ rt_fibinfo_free(&fnhe->fnhe_rth_output);
+
+ kfree(fnhe);
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 550781a..073df96 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -576,9 +576,14 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+ if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ oldest = fnhe;
+ }
+- orig = rcu_dereference(oldest->fnhe_rth);
++ orig = rcu_dereference(oldest->fnhe_rth_input);
+ if (orig) {
+- RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
++ RCU_INIT_POINTER(oldest->fnhe_rth_input, NULL);
++ rt_free(orig);
++ }
++ orig = rcu_dereference(oldest->fnhe_rth_output);
++ if (orig) {
++ RCU_INIT_POINTER(oldest->fnhe_rth_output, NULL);
+ rt_free(orig);
+ }
+ return oldest;
+@@ -1209,7 +1214,15 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ spin_lock_bh(&fnhe_lock);
+
+ if (daddr == fnhe->fnhe_daddr) {
+- struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
++ struct rtable __rcu **porig;
++ struct rtable *orig;
++
++ if (rt_is_input_route(rt))
++ porig = &fnhe->fnhe_rth_input;
++ else
++ porig = &fnhe->fnhe_rth_output;
++
++ orig = rcu_dereference(*porig);
+ if (orig && rt_is_expired(orig)) {
+ fnhe->fnhe_gw = 0;
+ fnhe->fnhe_pmtu = 0;
+@@ -1231,12 +1244,14 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+ } else if (!rt->rt_gateway)
+ rt->rt_gateway = daddr;
+
+- rcu_assign_pointer(fnhe->fnhe_rth, rt);
+- if (orig)
+- rt_free(orig);
++ if (!(rt->dst.flags & DST_NOCACHE)) {
++ rcu_assign_pointer(*porig, rt);
++ if (orig)
++ rt_free(orig);
++ ret = true;
++ }
+
+ fnhe->fnhe_stamp = jiffies;
+- ret = true;
+ }
+ spin_unlock_bh(&fnhe_lock);
+
+@@ -1468,6 +1483,7 @@ static int __mkroute_input(struct sk_buff *skb,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
+ {
++ struct fib_nh_exception *fnhe;
+ struct rtable *rth;
+ int err;
+ struct in_device *out_dev;
+@@ -1514,8 +1530,13 @@ static int __mkroute_input(struct sk_buff *skb,
+ }
+ }
+
++ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ if (do_cache) {
+- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
++ if (fnhe != NULL)
++ rth = rcu_dereference(fnhe->fnhe_rth_input);
++ else
++ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
++
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ goto out;
+@@ -1543,7 +1564,7 @@ static int __mkroute_input(struct sk_buff *skb,
+ rth->dst.input = ip_forward;
+ rth->dst.output = ip_output;
+
+- rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
++ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ skb_dst_set(skb, &rth->dst);
+ out:
+ err = 0;
+@@ -1858,7 +1879,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
+
+ fnhe = find_exception(nh, fl4->daddr);
+ if (fnhe)
+- prth = &fnhe->fnhe_rth;
++ prth = &fnhe->fnhe_rth_output;
+ else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&