summaryrefslogtreecommitdiffstats
path: root/testing/linux-virt-grsec/RFC-net-ipv4-Use-next-hop-exceptions-also-for-input-routes.patch
blob: 2310927e815a4516fbda15f5b19d46fe7970b8a6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
From patchwork Thu May 23 13:15:46 2013
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Subject: [RFC] net/ipv4: Use next hop exceptions also for input routes
Date: Thu, 23 May 2013 03:15:46 -0000
From: =?utf-8?q?Timo_Ter=C3=A4s?= <timo.teras@iki.fi>
X-Patchwork-Id: 245949
Message-Id: <1369314946-12692-1-git-send-email-timo.teras@iki.fi>
To: netdev@vger.kernel.org
Cc: =?UTF-8?q?Timo=20Ter=C3=A4s?= <timo.teras@iki.fi>

Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
assmued that "locally destined, and routed packets, never trigger
PMTU events or redirects that will be processed by us".

However, it seems that tunnel devices do trigger PMTU events in certain
cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
skb_dst(skb)->ops->update_pmtu to propage mtu information from the
outer flows. These can cause the inner flow mtu to be decreased. If
next hop exceptions are not consulted for pmtu, IP fragmentation will
not be done properly for these routes.

It also seems that we really need to have the PMTU information always
for netfilter TCPMSS' clamp-to-pmtu feature to work properly.

So for the time being, cache separate copies of input routes for
each next hop exception.

Signed-off-by: Timo Teräs <timo.teras@iki.fi>

---
I had ideas to make optimizations where pmtu information would not
be needed. This includes:
- Target devices with IFF_XMIT_DST_RELEASE set (practically all devices
  except tunnels). If skb_dst() is early freed the target device cannot
  generate PMTU events
- Add flag for input route generation if pmtu info is needed for
  fragmentation. Basically a flag saying if DF bit was set in ip_hdr.

However, TCPMSS clamp-to-pmtu prevents both optimizations.

I'm not yet all familiar with the recent changes in routing caching,
so there might be caveats that I missed. Basic testing shows this fixes
the fragmentation issues I'm seeing, and I have not yet found any ill
side effects either.

 include/net/ip_fib.h     |  3 ++-
 net/ipv4/fib_semantics.c |  3 ++-
 net/ipv4/route.c         | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index e49db91..20529a6 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -55,7 +55,8 @@ struct fib_nh_exception {
 	u32				fnhe_pmtu;
 	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
-	struct rtable __rcu		*fnhe_rth;
+	struct rtable __rcu		*fnhe_rth_input;
+	struct rtable __rcu		*fnhe_rth_output;
 	unsigned long			fnhe_stamp;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8f6cb7a..d5dbca5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh)
 			
 			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
 
-			rt_fibinfo_free(&fnhe->fnhe_rth);
+			rt_fibinfo_free(&fnhe->fnhe_rth_input);
+			rt_fibinfo_free(&fnhe->fnhe_rth_output);
 
 			kfree(fnhe);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 550781a..073df96 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -576,9 +576,14 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 			oldest = fnhe;
 	}
-	orig = rcu_dereference(oldest->fnhe_rth);
+	orig = rcu_dereference(oldest->fnhe_rth_input);
 	if (orig) {
-		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
+		RCU_INIT_POINTER(oldest->fnhe_rth_input, NULL);
+		rt_free(orig);
+	}
+	orig = rcu_dereference(oldest->fnhe_rth_output);
+	if (orig) {
+		RCU_INIT_POINTER(oldest->fnhe_rth_output, NULL);
 		rt_free(orig);
 	}
 	return oldest;
@@ -1209,7 +1214,15 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 	spin_lock_bh(&fnhe_lock);
 
 	if (daddr == fnhe->fnhe_daddr) {
-		struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
+		struct rtable __rcu **porig;
+		struct rtable *orig;
+
+		if (rt_is_input_route(rt))
+			porig = &fnhe->fnhe_rth_input;
+		else
+			porig = &fnhe->fnhe_rth_output;
+
+		orig = rcu_dereference(*porig);
 		if (orig && rt_is_expired(orig)) {
 			fnhe->fnhe_gw = 0;
 			fnhe->fnhe_pmtu = 0;
@@ -1231,12 +1244,14 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 		} else if (!rt->rt_gateway)
 			rt->rt_gateway = daddr;
 
-		rcu_assign_pointer(fnhe->fnhe_rth, rt);
-		if (orig)
-			rt_free(orig);
+		if (!(rt->dst.flags & DST_NOCACHE)) {
+			rcu_assign_pointer(*porig, rt);
+			if (orig)
+				rt_free(orig);
+			ret = true;
+		}
 
 		fnhe->fnhe_stamp = jiffies;
-		ret = true;
 	}
 	spin_unlock_bh(&fnhe_lock);
 
@@ -1468,6 +1483,7 @@ static int __mkroute_input(struct sk_buff *skb,
 			   struct in_device *in_dev,
 			   __be32 daddr, __be32 saddr, u32 tos)
 {
+	struct fib_nh_exception *fnhe;
 	struct rtable *rth;
 	int err;
 	struct in_device *out_dev;
@@ -1514,8 +1530,13 @@ static int __mkroute_input(struct sk_buff *skb,
 		}
 	}
 
+	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
 	if (do_cache) {
-		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+		if (fnhe != NULL)
+			rth = rcu_dereference(fnhe->fnhe_rth_input);
+		else
+			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
 		if (rt_cache_valid(rth)) {
 			skb_dst_set_noref(skb, &rth->dst);
 			goto out;
@@ -1543,7 +1564,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
-	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -1858,7 +1879,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 		fnhe = find_exception(nh, fl4->daddr);
 		if (fnhe)
-			prth = &fnhe->fnhe_rth;
+			prth = &fnhe->fnhe_rth_output;
 		else {
 			if (unlikely(fl4->flowi4_flags &
 				     FLOWI_FLAG_KNOWN_NH &&