summaryrefslogtreecommitdiffstats
path: root/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch
diff options
context:
space:
mode:
Diffstat (limited to 'main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch')
-rw-r--r--main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch1068
1 files changed, 1068 insertions, 0 deletions
diff --git a/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch b/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch
new file mode 100644
index 000000000..0d066c84d
--- /dev/null
+++ b/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch
@@ -0,0 +1,1068 @@
+From f89d21648e6dc06db2aeabc8926c270894c41446 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Timo=20Ter=C3=A4s?= <timo.teras@iki.fi>
+Date: Wed, 7 Apr 2010 00:30:05 +0000
+Subject: [PATCH 15/18] xfrm: cache bundles instead of policies for outgoing flows
+
+__xfrm_lookup() is called for each packet transmitted out of
+system. The xfrm_find_bundle() does a linear search which can
+kill system performance depending on how many bundles are
+required per policy.
+
+This modifies __xfrm_lookup() to store bundles directly in
+the flow cache. If we did not get a hit, we just create a new
+bundle instead of doing slow search. This means that we can now
+get multiple xfrm_dst's for same flow (on per-cpu basis).
+
+Signed-off-by: Timo Teras <timo.teras@iki.fi>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+(backported from commit 80c802f3073e84c956846e921e8a0b02dfa3755f)
+---
+ include/net/xfrm.h | 10 +-
+ net/ipv4/xfrm4_policy.c | 22 --
+ net/ipv6/xfrm6_policy.c | 31 --
+ net/xfrm/xfrm_policy.c | 710 +++++++++++++++++++++++++----------------------
+ 4 files changed, 383 insertions(+), 390 deletions(-)
+
+diff --git a/include/net/xfrm.h b/include/net/xfrm.h
+index 6023a48..d51ef61 100644
+--- a/include/net/xfrm.h
++++ b/include/net/xfrm.h
+@@ -266,7 +266,6 @@ struct xfrm_policy_afinfo {
+ xfrm_address_t *saddr,
+ xfrm_address_t *daddr);
+ int (*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr);
+- struct dst_entry *(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy);
+ void (*decode_session)(struct sk_buff *skb,
+ struct flowi *fl,
+ int reverse);
+@@ -485,12 +484,12 @@ struct xfrm_policy
+ struct timer_list timer;
+
+ struct flow_cache_object flo;
++ atomic_t genid;
+ u32 priority;
+ u32 index;
+ struct xfrm_selector selector;
+ struct xfrm_lifetime_cfg lft;
+ struct xfrm_lifetime_cur curlft;
+- struct dst_entry *bundles;
+ struct xfrm_policy_walk_entry walk;
+ u8 type;
+ u8 action;
+@@ -883,11 +882,15 @@ struct xfrm_dst
+ struct rt6_info rt6;
+ } u;
+ struct dst_entry *route;
++ struct flow_cache_object flo;
++ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
++ int num_pols, num_xfrms;
+ #ifdef CONFIG_XFRM_SUB_POLICY
+ struct flowi *origin;
+ struct xfrm_selector *partner;
+ #endif
+- u32 genid;
++ u32 xfrm_genid;
++ u32 policy_genid;
+ u32 route_mtu_cached;
+ u32 child_mtu_cached;
+ u32 route_cookie;
+@@ -897,6 +900,7 @@ struct xfrm_dst
+ #ifdef CONFIG_XFRM
+ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
+ {
++ xfrm_pols_put(xdst->pols, xdst->num_pols);
+ dst_release(xdst->route);
+ if (likely(xdst->u.dst.xfrm))
+ xfrm_state_put(xdst->u.dst.xfrm);
+diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
+index 7009886..651a3e7 100644
+--- a/net/ipv4/xfrm4_policy.c
++++ b/net/ipv4/xfrm4_policy.c
+@@ -60,27 +60,6 @@ static int xfrm4_get_saddr(struct net *net,
+ return 0;
+ }
+
+-static struct dst_entry *
+-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
+-{
+- struct dst_entry *dst;
+-
+- read_lock_bh(&policy->lock);
+- for (dst = policy->bundles; dst; dst = dst->next) {
+- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+- if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/
+- xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
+- xdst->u.rt.fl.fl4_src == fl->fl4_src &&
+- xdst->u.rt.fl.fl4_tos == fl->fl4_tos &&
+- xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) {
+- dst_clone(dst);
+- break;
+- }
+- }
+- read_unlock_bh(&policy->lock);
+- return dst;
+-}
+-
+ static int xfrm4_get_tos(struct flowi *fl)
+ {
+ return fl->fl4_tos;
+@@ -258,7 +237,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+ .dst_ops = &xfrm4_dst_ops,
+ .dst_lookup = xfrm4_dst_lookup,
+ .get_saddr = xfrm4_get_saddr,
+- .find_bundle = __xfrm4_find_bundle,
+ .decode_session = _decode_session4,
+ .get_tos = xfrm4_get_tos,
+ .init_path = xfrm4_init_path,
+diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
+index 3f89ab7..fb2a5b7 100644
+--- a/net/ipv6/xfrm6_policy.c
++++ b/net/ipv6/xfrm6_policy.c
+@@ -68,36 +68,6 @@ static int xfrm6_get_saddr(struct net *net,
+ return 0;
+ }
+
+-static struct dst_entry *
+-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
+-{
+- struct dst_entry *dst;
+-
+- /* Still not clear if we should set fl->fl6_{src,dst}... */
+- read_lock_bh(&policy->lock);
+- for (dst = policy->bundles; dst; dst = dst->next) {
+- struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
+- struct in6_addr fl_dst_prefix, fl_src_prefix;
+-
+- ipv6_addr_prefix(&fl_dst_prefix,
+- &fl->fl6_dst,
+- xdst->u.rt6.rt6i_dst.plen);
+- ipv6_addr_prefix(&fl_src_prefix,
+- &fl->fl6_src,
+- xdst->u.rt6.rt6i_src.plen);
+- if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) &&
+- ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) &&
+- xfrm_bundle_ok(policy, xdst, fl, AF_INET6,
+- (xdst->u.rt6.rt6i_dst.plen != 128 ||
+- xdst->u.rt6.rt6i_src.plen != 128))) {
+- dst_clone(dst);
+- break;
+- }
+- }
+- read_unlock_bh(&policy->lock);
+- return dst;
+-}
+-
+ static int xfrm6_get_tos(struct flowi *fl)
+ {
+ return 0;
+@@ -290,7 +260,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
+ .dst_ops = &xfrm6_dst_ops,
+ .dst_lookup = xfrm6_dst_lookup,
+ .get_saddr = xfrm6_get_saddr,
+- .find_bundle = __xfrm6_find_bundle,
+ .decode_session = _decode_session6,
+ .get_tos = xfrm6_get_tos,
+ .init_path = xfrm6_init_path,
+diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
+index d1eb2b5..0379d82 100644
+--- a/net/xfrm/xfrm_policy.c
++++ b/net/xfrm/xfrm_policy.c
+@@ -37,6 +37,8 @@
+ DEFINE_MUTEX(xfrm_cfg_mutex);
+ EXPORT_SYMBOL(xfrm_cfg_mutex);
+
++static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock);
++static struct dst_entry *xfrm_policy_sk_bundles;
+ static DEFINE_RWLOCK(xfrm_policy_lock);
+
+ static DEFINE_RWLOCK(xfrm_policy_afinfo_lock);
+@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock);
+ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
+ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
+ static void xfrm_init_pmtu(struct dst_entry *dst);
++static int stale_bundle(struct dst_entry *dst);
+
+ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
+ int dir);
+@@ -277,8 +280,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
+ {
+ BUG_ON(!policy->walk.dead);
+
+- BUG_ON(policy->bundles);
+-
+ if (del_timer(&policy->timer))
+ BUG();
+
+@@ -289,12 +290,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
+
+ static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
+ {
+- struct dst_entry *dst;
+-
+- while ((dst = policy->bundles) != NULL) {
+- policy->bundles = dst->next;
+- dst_free(dst);
+- }
++ atomic_inc(&policy->genid);
+
+ if (del_timer(&policy->timer))
+ atomic_dec(&policy->refcnt);
+@@ -572,7 +568,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+ struct xfrm_policy *delpol;
+ struct hlist_head *chain;
+ struct hlist_node *entry, *newpos;
+- struct dst_entry *gc_list;
+
+ write_lock_bh(&xfrm_policy_lock);
+ chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
+@@ -620,34 +615,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
+ else if (xfrm_bydst_should_resize(net, dir, NULL))
+ schedule_work(&net->xfrm.policy_hash_work);
+
+- read_lock_bh(&xfrm_policy_lock);
+- gc_list = NULL;
+- entry = &policy->bydst;
+- hlist_for_each_entry_continue(policy, entry, bydst) {
+- struct dst_entry *dst;
+-
+- write_lock(&policy->lock);
+- dst = policy->bundles;
+- if (dst) {
+- struct dst_entry *tail = dst;
+- while (tail->next)
+- tail = tail->next;
+- tail->next = gc_list;
+- gc_list = dst;
+-
+- policy->bundles = NULL;
+- }
+- write_unlock(&policy->lock);
+- }
+- read_unlock_bh(&xfrm_policy_lock);
+-
+- while (gc_list) {
+- struct dst_entry *dst = gc_list;
+-
+- gc_list = dst->next;
+- dst_free(dst);
+- }
+-
+ return 0;
+ }
+ EXPORT_SYMBOL(xfrm_policy_insert);
+@@ -990,6 +957,19 @@ fail:
+ return ret;
+ }
+
++static struct xfrm_policy *
++__xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir)
++{
++#ifdef CONFIG_XFRM_SUB_POLICY
++ struct xfrm_policy *pol;
++
++ pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
++ if (pol != NULL)
++ return pol;
++#endif
++ return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
++}
++
+ static struct flow_cache_object *
+ xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+ u8 dir, struct flow_cache_object *old_obj, void *ctx)
+@@ -999,21 +979,10 @@ xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
+ if (old_obj)
+ xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
+
+-#ifdef CONFIG_XFRM_SUB_POLICY
+- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+- if (IS_ERR(pol))
++ pol = __xfrm_policy_lookup(net, fl, family, dir);
++ if (pol == NULL || IS_ERR(pol))
+ return ERR_CAST(pol);
+- if (pol)
+- goto found;
+-#endif
+- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+- if (IS_ERR(pol))
+- return ERR_CAST(pol);
+- if (pol)
+- goto found;
+- return NULL;
+
+-found:
+ /* Resolver returns two references:
+ * one for cache and one for caller of flow_cache_lookup() */
+ xfrm_pol_hold(pol);
+@@ -1299,18 +1268,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl,
+ * still valid.
+ */
+
+-static struct dst_entry *
+-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family)
+-{
+- struct dst_entry *x;
+- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+- if (unlikely(afinfo == NULL))
+- return ERR_PTR(-EINVAL);
+- x = afinfo->find_bundle(fl, policy);
+- xfrm_policy_put_afinfo(afinfo);
+- return x;
+-}
+-
+ static inline int xfrm_get_tos(struct flowi *fl, int family)
+ {
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+@@ -1326,6 +1283,54 @@ static inline int xfrm_get_tos(struct flowi *fl, int family)
+ return tos;
+ }
+
++static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
++{
++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
++ struct dst_entry *dst = &xdst->u.dst;
++
++ if (xdst->route == NULL) {
++ /* Dummy bundle - if it has xfrms we were not
++ * able to build bundle as template resolution failed.
++ * It means we need to try again resolving. */
++ if (xdst->num_xfrms > 0)
++ return NULL;
++ } else {
++ /* Real bundle */
++ if (stale_bundle(dst))
++ return NULL;
++ }
++
++ dst_hold(dst);
++ return flo;
++}
++
++static int xfrm_bundle_flo_check(struct flow_cache_object *flo)
++{
++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
++ struct dst_entry *dst = &xdst->u.dst;
++
++ if (!xdst->route)
++ return 0;
++ if (stale_bundle(dst))
++ return 0;
++
++ return 1;
++}
++
++static void xfrm_bundle_flo_delete(struct flow_cache_object *flo)
++{
++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo);
++ struct dst_entry *dst = &xdst->u.dst;
++
++ dst_free(dst);
++}
++
++static const struct flow_cache_ops xfrm_bundle_fc_ops = {
++ .get = xfrm_bundle_flo_get,
++ .check = xfrm_bundle_flo_check,
++ .delete = xfrm_bundle_flo_delete,
++};
++
+ static inline struct xfrm_dst *xfrm_alloc_dst(int family)
+ {
+ struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
+@@ -1338,6 +1343,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(int family)
+
+ xfrm_policy_put_afinfo(afinfo);
+
++ xdst->flo.ops = &xfrm_bundle_fc_ops;
++
+ return xdst;
+ }
+
+@@ -1375,6 +1382,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ return err;
+ }
+
++
+ /* Allocate chain of dst_entry's, attach known xfrm's, calculate
+ * all the metrics... Shortly, bundle a bundle.
+ */
+@@ -1437,7 +1445,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
+ dst_hold(dst);
+
+ dst1->xfrm = xfrm[i];
+- xdst->genid = xfrm[i]->genid;
++ xdst->xfrm_genid = xfrm[i]->genid;
+
+ dst1->obsolete = -1;
+ dst1->flags |= DST_HOST;
+@@ -1530,7 +1538,186 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl)
+ #endif
+ }
+
+-static int stale_bundle(struct dst_entry *dst);
++static int xfrm_expand_policies(struct flowi *fl, u16 family,
++ struct xfrm_policy **pols,
++ int *num_pols, int *num_xfrms)
++{
++ int i;
++
++ if (*num_pols == 0 || !pols[0]) {
++ *num_pols = 0;
++ *num_xfrms = 0;
++ return 0;
++ }
++ if (IS_ERR(pols[0]))
++ return PTR_ERR(pols[0]);
++
++ *num_xfrms = pols[0]->xfrm_nr;
++
++#ifdef CONFIG_XFRM_SUB_POLICY
++ if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW &&
++ pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
++ pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
++ XFRM_POLICY_TYPE_MAIN,
++ fl, family,
++ XFRM_POLICY_OUT);
++ if (pols[1]) {
++ if (IS_ERR(pols[1])) {
++ xfrm_pols_put(pols, *num_pols);
++ return PTR_ERR(pols[1]);
++ }
++ (*num_pols) ++;
++ (*num_xfrms) += pols[1]->xfrm_nr;
++ }
++ }
++#endif
++ for (i = 0; i < *num_pols; i++) {
++ if (pols[i]->action != XFRM_POLICY_ALLOW) {
++ *num_xfrms = -1;
++ break;
++ }
++ }
++
++ return 0;
++
++}
++
++static struct xfrm_dst *
++xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
++ struct flowi *fl, u16 family,
++ struct dst_entry *dst_orig)
++{
++ struct net *net = xp_net(pols[0]);
++ struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
++ struct dst_entry *dst;
++ struct xfrm_dst *xdst;
++ int err;
++
++ /* Try to instantiate a bundle */
++ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
++ if (err < 0) {
++ if (err != -EAGAIN)
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
++ return ERR_PTR(err);
++ }
++
++ dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
++ if (IS_ERR(dst)) {
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
++ return ERR_CAST(dst);
++ }
++
++ xdst = (struct xfrm_dst *)dst;
++ xdst->num_xfrms = err;
++ if (num_pols > 1)
++ err = xfrm_dst_update_parent(dst, &pols[1]->selector);
++ else
++ err = xfrm_dst_update_origin(dst, fl);
++ if (unlikely(err)) {
++ dst_free(dst);
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
++ return ERR_PTR(err);
++ }
++
++ xdst->num_pols = num_pols;
++ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
++ xdst->policy_genid = atomic_read(&pols[0]->genid);
++
++ return xdst;
++}
++
++static struct flow_cache_object *
++xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir,
++ struct flow_cache_object *oldflo, void *ctx)
++{
++ struct dst_entry *dst_orig = (struct dst_entry *)ctx;
++ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
++ struct xfrm_dst *xdst, *new_xdst;
++ int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
++
++ /* Check if the policies from old bundle are usable */
++ xdst = NULL;
++ if (oldflo) {
++ xdst = container_of(oldflo, struct xfrm_dst, flo);
++ num_pols = xdst->num_pols;
++ num_xfrms = xdst->num_xfrms;
++ pol_dead = 0;
++ for (i = 0; i < num_pols; i++) {
++ pols[i] = xdst->pols[i];
++ pol_dead |= pols[i]->walk.dead;
++ }
++ if (pol_dead) {
++ dst_free(&xdst->u.dst);
++ xdst = NULL;
++ num_pols = 0;
++ num_xfrms = 0;
++ oldflo = NULL;
++ }
++ }
++
++ /* Resolve policies to use if we couldn't get them from
++ * previous cache entry */
++ if (xdst == NULL) {
++ num_pols = 1;
++ pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
++ err = xfrm_expand_policies(fl, family, pols,
++ &num_pols, &num_xfrms);
++ if (err < 0)
++ goto inc_error;
++ if (num_pols == 0)
++ return NULL;
++ if (num_xfrms <= 0)
++ goto make_dummy_bundle;
++ }
++
++ new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig);
++ if (IS_ERR(new_xdst)) {
++ err = PTR_ERR(new_xdst);
++ if (err != -EAGAIN)
++ goto error;
++ if (oldflo == NULL)
++ goto make_dummy_bundle;
++ dst_hold(&xdst->u.dst);
++ return oldflo;
++ }
++
++ /* Kill the previous bundle */
++ if (xdst) {
++ /* The policies were stolen for newly generated bundle */
++ xdst->num_pols = 0;
++ dst_free(&xdst->u.dst);
++ }
++
++ /* Flow cache does not have reference, it dst_free()'s,
++ * but we do need to return one reference for original caller */
++ dst_hold(&new_xdst->u.dst);
++ return &new_xdst->flo;
++
++make_dummy_bundle:
++ /* We found policies, but there's no bundles to instantiate:
++ * either because the policy blocks, has no transformations or
++ * we could not build template (no xfrm_states).*/
++ xdst = xfrm_alloc_dst(family);
++ if (IS_ERR(xdst)) {
++ xfrm_pols_put(pols, num_pols);
++ return ERR_CAST(xdst);
++ }
++ xdst->num_pols = num_pols;
++ xdst->num_xfrms = num_xfrms;
++ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols);
++
++ dst_hold(&xdst->u.dst);
++ return &xdst->flo;
++
++inc_error:
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
++error:
++ if (xdst != NULL)
++ dst_free(&xdst->u.dst);
++ else
++ xfrm_pols_put(pols, num_pols);
++ return ERR_PTR(err);
++}
+
+ /* Main function: finds/creates a bundle for given flow.
+ *
+@@ -1540,248 +1727,152 @@ static int stale_bundle(struct dst_entry *dst);
+ int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl,
+ struct sock *sk, int flags)
+ {
+- struct xfrm_policy *policy;
+ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
+- int npols;
+- int pol_dead;
+- int xfrm_nr;
+- int pi;
+- struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
+- struct dst_entry *dst, *dst_orig = *dst_p;
+- int nx = 0;
+- int err;
+- u32 genid;
+- u16 family;
++ struct flow_cache_object *flo;
++ struct xfrm_dst *xdst;
++ struct dst_entry *dst, *dst_orig = *dst_p, *route;
++ u16 family = dst_orig->ops->family;
+ u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
++ int i, err, num_pols, num_xfrms, drop_pols = 0;
+
+ restart:
+- genid = atomic_read(&flow_cache_genid);
+- policy = NULL;
+- for (pi = 0; pi < ARRAY_SIZE(pols); pi++)
+- pols[pi] = NULL;
+- npols = 0;
+- pol_dead = 0;
+- xfrm_nr = 0;
++ dst = NULL;
++ xdst = NULL;
++ route = NULL;
+
+ if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
+- policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
+- err = PTR_ERR(policy);
+- if (IS_ERR(policy)) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
++ num_pols = 1;
++ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
++ err = xfrm_expand_policies(fl, family, pols,
++ &num_pols, &num_xfrms);
++ if (err < 0)
+ goto dropdst;
++
++ if (num_pols) {
++ if (num_xfrms <= 0) {
++ drop_pols = num_pols;
++ goto no_transform;
++ }
++
++ xdst = xfrm_resolve_and_create_bundle(
++ pols, num_pols, fl,
++ family, dst_orig);
++ if (IS_ERR(xdst)) {
++ xfrm_pols_put(pols, num_pols);
++ err = PTR_ERR(xdst);
++ goto dropdst;
++ }
++
++ spin_lock_bh(&xfrm_policy_sk_bundle_lock);
++ xdst->u.dst.next = xfrm_policy_sk_bundles;
++ xfrm_policy_sk_bundles = &xdst->u.dst;
++ spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
++
++ route = xdst->route;
+ }
+ }
+
+- if (!policy) {
+- struct flow_cache_object *flo;
+-
++ if (xdst == NULL) {
+ /* To accelerate a bit... */
+ if ((dst_orig->flags & DST_NOXFRM) ||
+ !net->xfrm.policy_count[XFRM_POLICY_OUT])
+ goto nopol;
+
+- flo = flow_cache_lookup(net, fl, dst_orig->ops->family,
+- dir, xfrm_policy_lookup, NULL);
+- err = PTR_ERR(flo);
++ flo = flow_cache_lookup(net, fl, family, dir,
++ xfrm_bundle_lookup, dst_orig);
++ if (flo == NULL)
++ goto nopol;
+ if (IS_ERR(flo)) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
++ err = PTR_ERR(flo);
+ goto dropdst;
+ }
+- if (flo)
+- policy = container_of(flo, struct xfrm_policy, flo);
+- else
+- policy = NULL;
++ xdst = container_of(flo, struct xfrm_dst, flo);
++
++ num_pols = xdst->num_pols;
++ num_xfrms = xdst->num_xfrms;
++ memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols);
++ route = xdst->route;
++ }
++
++ dst = &xdst->u.dst;
++ if (route == NULL && num_xfrms > 0) {
++ /* The only case when xfrm_bundle_lookup() returns a
++ * bundle with null route, is when the template could
++ * not be resolved. It means policies are there, but
++ * bundle could not be created, since we don't yet
++ * have the xfrm_state's. We need to wait for KM to
++ * negotiate new SA's or bail out with error.*/
++ if (net->xfrm.sysctl_larval_drop) {
++ /* EREMOTE tells the caller to generate
++ * a one-shot blackhole route. */
++ dst_release(dst);
++ xfrm_pols_put(pols, num_pols);
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
++ return -EREMOTE;
++ }
++ if (flags & XFRM_LOOKUP_WAIT) {
++ DECLARE_WAITQUEUE(wait, current);
++
++ add_wait_queue(&net->xfrm.km_waitq, &wait);
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule();
++ set_current_state(TASK_RUNNING);
++ remove_wait_queue(&net->xfrm.km_waitq, &wait);
++
++ if (!signal_pending(current)) {
++ dst_release(dst);
++ goto restart;
++ }
++
++ err = -ERESTART;
++ } else
++ err = -EAGAIN;
++
++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
++ goto error;
+ }
+
+- if (!policy)
++no_transform:
++ if (num_pols == 0)
+ goto nopol;
+
+- family = dst_orig->ops->family;
+- pols[0] = policy;
+- npols ++;
+- xfrm_nr += pols[0]->xfrm_nr;
+-
+- err = -ENOENT;
+- if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP))
++ if ((flags & XFRM_LOOKUP_ICMP) &&
++ !(pols[0]->flags & XFRM_POLICY_ICMP)) {
++ err = -ENOENT;
+ goto error;
++ }
+
+- policy->curlft.use_time = get_seconds();
++ for (i = 0; i < num_pols; i++)
++ pols[i]->curlft.use_time = get_seconds();
+
+- switch (policy->action) {
+- default:
+- case XFRM_POLICY_BLOCK:
++ if (num_xfrms < 0) {
+ /* Prohibit the flow */
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
+ err = -EPERM;
+ goto error;
+-
+- case XFRM_POLICY_ALLOW:
+-#ifndef CONFIG_XFRM_SUB_POLICY
+- if (policy->xfrm_nr == 0) {
+- /* Flow passes not transformed. */
+- xfrm_pol_put(policy);
+- return 0;
+- }
+-#endif
+-
+- /* Try to find matching bundle.
+- *
+- * LATER: help from flow cache. It is optional, this
+- * is required only for output policy.
+- */
+- dst = xfrm_find_bundle(fl, policy, family);
+- if (IS_ERR(dst)) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+- err = PTR_ERR(dst);
+- goto error;
+- }
+-
+- if (dst)
+- break;
+-
+-#ifdef CONFIG_XFRM_SUB_POLICY
+- if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
+- pols[1] = xfrm_policy_lookup_bytype(net,
+- XFRM_POLICY_TYPE_MAIN,
+- fl, family,
+- XFRM_POLICY_OUT);
+- if (pols[1]) {
+- if (IS_ERR(pols[1])) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+- err = PTR_ERR(pols[1]);
+- goto error;
+- }
+- if (pols[1]->action == XFRM_POLICY_BLOCK) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK);
+- err = -EPERM;
+- goto error;
+- }
+- npols ++;
+- xfrm_nr += pols[1]->xfrm_nr;
+- }
+- }
+-
+- /*
+- * Because neither flowi nor bundle information knows about
+- * transformation template size. On more than one policy usage
+- * we can realize whether all of them is bypass or not after
+- * they are searched. See above not-transformed bypass
+- * is surrounded by non-sub policy configuration, too.
+- */
+- if (xfrm_nr == 0) {
+- /* Flow passes not transformed. */
+- xfrm_pols_put(pols, npols);
+- return 0;
+- }
+-
+-#endif
+- nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
+-
+- if (unlikely(nx<0)) {
+- err = nx;
+- if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) {
+- /* EREMOTE tells the caller to generate
+- * a one-shot blackhole route.
+- */
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+- xfrm_pol_put(policy);
+- return -EREMOTE;
+- }
+- if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) {
+- DECLARE_WAITQUEUE(wait, current);
+-
+- add_wait_queue(&net->xfrm.km_waitq, &wait);
+- set_current_state(TASK_INTERRUPTIBLE);
+- schedule();
+- set_current_state(TASK_RUNNING);
+- remove_wait_queue(&net->xfrm.km_waitq, &wait);
+-
+- nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family);
+-
+- if (nx == -EAGAIN && signal_pending(current)) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+- err = -ERESTART;
+- goto error;
+- }
+- if (nx == -EAGAIN ||
+- genid != atomic_read(&flow_cache_genid)) {
+- xfrm_pols_put(pols, npols);
+- goto restart;
+- }
+- err = nx;
+- }
+- if (err < 0) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES);
+- goto error;
+- }
+- }
+- if (nx == 0) {
+- /* Flow passes not transformed. */
+- xfrm_pols_put(pols, npols);
+- return 0;
+- }
+-
+- dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig);
+- err = PTR_ERR(dst);
+- if (IS_ERR(dst)) {
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+- goto error;
+- }
+-
+- for (pi = 0; pi < npols; pi++)
+- pol_dead |= pols[pi]->walk.dead;
+-
+- write_lock_bh(&policy->lock);
+- if (unlikely(pol_dead || stale_bundle(dst))) {
+- /* Wow! While we worked on resolving, this
+- * policy has gone. Retry. It is not paranoia,
+- * we just cannot enlist new bundle to dead object.
+- * We can't enlist stable bundles either.
+- */
+- write_unlock_bh(&policy->lock);
+- dst_free(dst);
+-
+- if (pol_dead)
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD);
+- else
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+- err = -EHOSTUNREACH;
+- goto error;
+- }
+-
+- if (npols > 1)
+- err = xfrm_dst_update_parent(dst, &pols[1]->selector);
+- else
+- err = xfrm_dst_update_origin(dst, fl);
+- if (unlikely(err)) {
+- write_unlock_bh(&policy->lock);
+- dst_free(dst);
+- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
+- goto error;
+- }
+-
+- dst->next = policy->bundles;
+- policy->bundles = dst;
+- dst_hold(dst);
+- write_unlock_bh(&policy->lock);
++ } else if (num_xfrms > 0) {
++ /* Flow transformed */
++ *dst_p = dst;
++ dst_release(dst_orig);
++ } else {
++ /* Flow passes untransformed */
++ dst_release(dst);
+ }
+- *dst_p = dst;
+- dst_release(dst_orig);
+- xfrm_pols_put(pols, npols);
++ok:
++ xfrm_pols_put(pols, drop_pols);
+ return 0;
+
++nopol:
++ if (!(flags & XFRM_LOOKUP_ICMP))
++ goto ok;
++ err = -ENOENT;
+ error:
+- xfrm_pols_put(pols, npols);
++ dst_release(dst);
+ dropdst:
+ dst_release(dst_orig);
+ *dst_p = NULL;
++ xfrm_pols_put(pols, drop_pols);
+ return err;
+-
+-nopol:
+- err = -ENOENT;
+- if (flags & XFRM_LOOKUP_ICMP)
+- goto dropdst;
+- return 0;
+ }
+ EXPORT_SYMBOL(__xfrm_lookup);
+
+@@ -2134,71 +2225,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)
+ return dst;
+ }
+
+-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p)
+-{
+- struct dst_entry *dst, **dstp;
+-
+- write_lock(&pol->lock);
+- dstp = &pol->bundles;
+- while ((dst=*dstp) != NULL) {
+- if (func(dst)) {
+- *dstp = dst->next;
+- dst->next = *gc_list_p;
+- *gc_list_p = dst;
+- } else {
+- dstp = &dst->next;
+- }
+- }
+- write_unlock(&pol->lock);
+-}
+-
+-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *))
++static void __xfrm_garbage_collect(struct net *net)
+ {
+- struct dst_entry *gc_list = NULL;
+- int dir;
++ struct dst_entry *head, *next;
+
+- read_lock_bh(&xfrm_policy_lock);
+- for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) {
+- struct xfrm_policy *pol;
+- struct hlist_node *entry;
+- struct hlist_head *table;
+- int i;
++ flow_cache_flush();
+
+- hlist_for_each_entry(pol, entry,
+- &net->xfrm.policy_inexact[dir], bydst)
+- prune_one_bundle(pol, func, &gc_list);
++ spin_lock_bh(&xfrm_policy_sk_bundle_lock);
++ head = xfrm_policy_sk_bundles;
++ xfrm_policy_sk_bundles = NULL;
++ spin_unlock_bh(&xfrm_policy_sk_bundle_lock);
+
+- table = net->xfrm.policy_bydst[dir].table;
+- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) {
+- hlist_for_each_entry(pol, entry, table + i, bydst)
+- prune_one_bundle(pol, func, &gc_list);
+- }
+- }
+- read_unlock_bh(&xfrm_policy_lock);
+-
+- while (gc_list) {
+- struct dst_entry *dst = gc_list;
+- gc_list = dst->next;
+- dst_free(dst);
++ while (head) {
++ next = head->next;
++ dst_free(head);
++ head = next;
+ }
+ }
+
+-static int unused_bundle(struct dst_entry *dst)
+-{
+- return !atomic_read(&dst->__refcnt);
+-}
+-
+-static void __xfrm_garbage_collect(struct net *net)
+-{
+- xfrm_prune_bundles(net, unused_bundle);
+-}
+-
+-static int xfrm_flush_bundles(struct net *net)
+-{
+- xfrm_prune_bundles(net, stale_bundle);
+- return 0;
+-}
+-
+ static void xfrm_init_pmtu(struct dst_entry *dst)
+ {
+ do {
+@@ -2256,7 +2300,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first,
+ return 0;
+ if (dst->xfrm->km.state != XFRM_STATE_VALID)
+ return 0;
+- if (xdst->genid != dst->xfrm->genid)
++ if (xdst->xfrm_genid != dst->xfrm->genid)
++ return 0;
++ if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid))
+ return 0;
+
+ if (strict && fl &&
+@@ -2383,7 +2429,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
+
+ switch (event) {
+ case NETDEV_DOWN:
+- xfrm_flush_bundles(dev_net(dev));
++ __xfrm_garbage_collect(dev_net(dev));
+ }
+ return NOTIFY_DONE;
+ }
+@@ -2714,7 +2760,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
+ struct xfrm_migrate *m, int num_migrate)
+ {
+ struct xfrm_migrate *mp;
+- struct dst_entry *dst;
+ int i, j, n = 0;
+
+ write_lock_bh(&pol->lock);
+@@ -2739,10 +2784,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
+ sizeof(pol->xfrm_vec[i].saddr));
+ pol->xfrm_vec[i].encap_family = mp->new_family;
+ /* flush bundles */
+- while ((dst = pol->bundles) != NULL) {
+- pol->bundles = dst->next;
+- dst_free(dst);
+- }
++ atomic_inc(&pol->genid);
+ }
+ }
+
+--
+1.7.0.2
+