From 3519d7c86a6e87584d25f3292b53d3ce865a659e Mon Sep 17 00:00:00 2001 From: Natanael Copa Date: Mon, 15 Mar 2010 15:31:37 +0000 Subject: [PATCH] xfrm: flow cache2 --- include/net/flow.h | 39 ++++- include/net/netns/xfrm.h | 4 + include/net/xfrm.h | 1 + net/core/flow.c | 342 ++++++++++++++++++-------------------- net/ipv6/inet6_connection_sock.c | 6 +- net/xfrm/xfrm_policy.c | 271 +++++++++++++++++++++--------- 6 files changed, 394 insertions(+), 269 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 809970b..814a9d2 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -8,6 +8,9 @@ #define _NET_FLOW_H #include +#include +#include +#include #include struct flowi { @@ -86,13 +89,37 @@ struct flowi { struct net; struct sock; -typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family, - u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, - u8 dir, flow_resolve_t resolver); -extern void flow_cache_flush(void); -extern atomic_t flow_cache_genid; +struct flow_cache_percpu; +struct flow_cache_entry; + +struct flow_cache { + u32 hash_shift; + u32 order; + struct flow_cache_percpu * percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; + struct kmem_cache * flow_cachep; +}; + +struct flow_cache_entry { + struct flow_cache_entry *next; + struct flowi key; + u16 family; + u8 dir; +}; + +extern struct flow_cache_entry *flow_cache_lookup( + struct flow_cache *cache, struct flowi *key, + u16 family, u8 dir); +extern void flow_cache_entry_put(struct flow_cache_entry *fce); + +void flow_cache_flush(struct flow_cache *fc, + void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce)); +extern int flow_cache_init(struct flow_cache *cache, size_t entry_size); +extern void flow_cache_fini(struct flow_cache *cache); static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2) { diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 1ba9127..4bb72c4 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -41,6 +41,10 @@ struct netns_xfrm { struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX * 2]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; + atomic_t policy_genid; + struct hlist_head policy_gc_list; + struct work_struct policy_gc_work; + struct flow_cache flow_cache; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 223e90a..5cd4e29 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -487,6 +487,7 @@ struct xfrm_policy struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; struct dst_entry *bundles; + atomic_t bundles_genid; struct xfrm_policy_walk_entry walk; u8 type; u8 action; diff --git a/net/core/flow.c b/net/core/flow.c index 5b27992..e3782c2 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -25,114 +25,85 @@ #include #include -struct flow_cache_entry { - struct flow_cache_entry *next; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - void *object; - atomic_t *object_ref; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); - -static u32 flow_hash_shift; -#define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables); - -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -static struct kmem_cache *flow_cachep __read_mostly; -static int flow_lwm, flow_hwm; - -struct flow_percpu_info { - int hash_rnd_recalc; - u32 hash_rnd; - 
int count; +struct flow_cache_percpu { + struct flow_cache_entry ** hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; }; -static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info); - -#define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -#define flow_hash_rnd(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd) -#define flow_count(cpu) \ - (per_cpu(flow_hash_info, cpu).count) - -static struct timer_list flow_hash_rnd_timer; - -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; + void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce); + struct flow_cache * cache; + atomic_t cpuleft; + struct completion completion; }; -static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets); -#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) { + struct flow_cache *fc = (struct flow_cache *) arg; int i; for_each_possible_cpu(i) - flow_hash_rnd_recalc(i) = 1; + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); } -static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) -{ - if (fle->object) - atomic_dec(fle->object_ref); - kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; -} - -static void __flow_cache_shrink(int cpu, int shrink_to) +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) { struct flow_cache_entry *fle, **flp; int i; - for (i = 0; i < flow_hash_size; i++) { + for (i = 0; i < flow_cache_hash_size(fc); i++) { int k = 0; - flp = &flow_table(cpu)[i]; + flp = &fcp->hash_table[i]; while ((fle = *flp) != NULL && k < shrink_to) { k++; flp = &fle->next; } while ((fle = *flp) != NULL) { *flp = fle->next; - flow_entry_kill(cpu, fle); + + kmem_cache_free(fc->flow_cachep, fle); + fcp->hash_count--; } } } -static void flow_cache_shrink(int cpu) +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - int shrink_to = flow_lwm / flow_hash_size; + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - __flow_cache_shrink(cpu, shrink_to); + __flow_cache_shrink(fc, fcp, shrink_to); } -static void flow_new_hash_rnd(int cpu) +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); - flow_hash_rnd_recalc(cpu) = 0; - - __flow_cache_shrink(cpu, 0); + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); } -static u32 flow_hash_code(struct flowi *key, int cpu) +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flowi *key) { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & - (flow_hash_size - 1)); + return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1)); } #if (BITS_PER_LONG == 64) @@ -165,128 +136,100 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) +struct 
flow_cache_entry *flow_cache_lookup(struct flow_cache *fc, + struct flowi *key, + u16 family, u8 dir) { struct flow_cache_entry *fle, **head; + struct flow_cache_percpu *fcp; unsigned int hash; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. --RR */ - if (!flow_table(cpu)) + if (!fcp->hash_table) goto nocache; - if (flow_hash_rnd_recalc(cpu)) - flow_new_hash_rnd(cpu); - hash = flow_hash_code(key, cpu); + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + + hash = flow_hash_code(fc, fcp, key); - head = &flow_table(cpu)[hash]; + head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && flow_key_compare(key, &fle->key) == 0) { - if (fle->genid == atomic_read(&flow_cache_genid)) { - void *ret = fle->object; - - if (ret) - atomic_inc(fle->object_ref); - local_bh_enable(); - - return ret; - } - break; - } - } - - if (!fle) { - if (flow_count(cpu) > flow_hwm) - flow_cache_shrink(cpu); - - fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); - if (fle) { - fle->next = *head; - *head = fle; - fle->family = family; - fle->dir = dir; - memcpy(&fle->key, key, sizeof(*key)); - fle->object = NULL; - flow_count(cpu)++; + return fle; } } -nocache: - { - int err; - void *obj; - atomic_t *obj_ref; - - err = resolver(net, key, family, dir, &obj, &obj_ref); + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); - if (fle && !err) { - fle->genid = atomic_read(&flow_cache_genid); + fle = kmem_cache_zalloc(fc->flow_cachep, GFP_ATOMIC); + if (!fle) + goto nocache; - if (fle->object) - atomic_dec(fle->object_ref); + fle->next = *head; + *head = fle; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fcp->hash_count++; + return fle; - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); - } - local_bh_enable(); +nocache: + local_bh_enable(); + return NULL; +} - if (err) - obj = ERR_PTR(err); - return obj; - } +void flow_cache_entry_put(struct flow_cache_entry *fce) +{ + local_bh_enable(); } static void flow_cache_flush_tasklet(unsigned long data) { - struct flow_flush_info *info = (void *)data; + struct flow_flush_info *info = (void *) data; + struct flow_cache *fc = (void *) info->cache; + struct flow_cache_percpu *fcp; int i; - int cpu; - cpu = smp_processor_id(); - for (i = 0; i < flow_hash_size; i++) { - struct flow_cache_entry *fle; + if (info->flush == NULL) + goto done; - fle = flow_table(cpu)[i]; - for (; fle; fle = fle->next) { - unsigned genid = atomic_read(&flow_cache_genid); - - if (!fle->object || fle->genid == genid) - continue; + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + for (i = 0; i < flow_cache_hash_size(fc); i++) { + struct flow_cache_entry *fle; - fle->object = NULL; - atomic_dec(fle->object_ref); - } + fle = fcp->hash_table[i]; + for (; fle; fle = fle->next) + info->flush(fc, fle); } +done: if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } -static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; - int cpu; struct tasklet_struct *tasklet; + int cpu; cpu = smp_processor_id(); - - tasklet = flow_flush_tasklet(cpu); - tasklet->data = (unsigned long)info; + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet->data = 
(unsigned long) data; tasklet_schedule(tasklet); } -void flow_cache_flush(void) +void flow_cache_flush(struct flow_cache *fc, + void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce)) { struct flow_flush_info info; static DEFINE_MUTEX(flow_flush_sem); @@ -294,6 +237,8 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. */ get_online_cpus(); mutex_lock(&flow_flush_sem); + info.cache = fc; + info.flush = flush; atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -307,62 +252,99 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(int cpu) +static void __init flow_cache_cpu_prepare(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + fcp->hash_table = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); +} + +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + flow_cache_cpu_prepare(fc, fcp); + if (!fcp->hash_table) + return NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + if (fcp->hash_table) { + __flow_cache_shrink(fc, fcp, 0); + free_pages((unsigned long) fcp->hash_table, fc->order); + fcp->hash_table = NULL; + } + break; + } + return NOTIFY_OK; +} + +int flow_cache_init(struct flow_cache *fc, size_t entry_size) { - struct tasklet_struct *tasklet; unsigned long order; + int i, r; + + BUG_ON(entry_size < sizeof(struct flow_cache_entry)); + fc->flow_cachep = kmem_cache_create("flow_cache", + entry_size, + 0, SLAB_PANIC, + NULL); + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + fc->percpu = alloc_percpu(struct flow_cache_percpu); for (order = 0; (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_hash_size); + (sizeof(struct flow_cache_entry *) * flow_cache_hash_size(fc)); order++) /* NOTHING */; + fc->order = order; - flow_table(cpu) = (struct flow_cache_entry **) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!flow_table(cpu)) - panic("NET: failed to allocate flow cache order %lu\n", order); + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); - flow_hash_rnd_recalc(cpu) = 1; - flow_count(cpu) = 0; + for_each_online_cpu(i) { + r = flow_cache_cpu(&fc->hotcpu_notifier, + CPU_UP_PREPARE, (void*) i); + if (r != NOTIFY_OK) + panic("NET: failed to allocate flow cache order %lu\n", order); + } - tasklet = flow_flush_tasklet(cpu); - tasklet_init(tasklet, flow_cache_flush_tasklet, 0); -} + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); -static int flow_cache_cpu(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - __flow_cache_shrink((unsigned long)hcpu, 0); - return NOTIFY_OK; + return 0; } -static int __init flow_cache_init(void) +void flow_cache_fini(struct flow_cache *fc) { int i; - 
flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, - NULL); - flow_hash_shift = 10; - flow_lwm = 2 * flow_hash_size; - flow_hwm = 4 * flow_hash_size; - - setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + del_timer(&fc->rnd_timer); + unregister_hotcpu_notifier(&fc->hotcpu_notifier); for_each_possible_cpu(i) - flow_cache_cpu_prepare(i); + flow_cache_cpu(&fc->hotcpu_notifier, CPU_DEAD, (void*) i); - hotcpu_notifier(flow_cache_cpu, 0); - return 0; + free_percpu(fc->percpu); + kmem_cache_destroy(fc->flow_cachep); } -module_init(flow_cache_init); - -EXPORT_SYMBOL(flow_cache_genid); EXPORT_SYMBOL(flow_cache_lookup); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index cc4797d..399853e 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -151,8 +151,9 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_XFRM { + struct net *net = sock_net(sk); struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); + rt->rt6i_flow_cache_genid = atomic_read(&net->xfrm.policy_genid); } #endif } @@ -166,8 +167,9 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) #ifdef CONFIG_XFRM if (dst) { + struct net *net = sock_net(sk); struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { + if (rt->rt6i_flow_cache_genid != atomic_read(&net->xfrm.policy_genid)) { sk->sk_dst_cache = NULL; dst_release(dst); dst = NULL; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cb81ca3..82b01c3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -44,7 +44,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; static struct kmem_cache *xfrm_dst_cache __read_mostly; -static HLIST_HEAD(xfrm_policy_gc_list); static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); @@ -53,6 +52,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static int stale_bundle(struct dst_entry *dst); static inline int __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl) @@ -216,6 +216,35 @@ expired: xfrm_pol_put(xp); } +struct xfrm_flow_cache_entry { + struct flow_cache_entry fce; + struct xfrm_policy *policy; + struct xfrm_dst *dst; + u32 policy_genid, bundles_genid; +}; +#define XFRM_CACHE_NO_POLICY ((struct xfrm_policy *) -1) + +void xfrm_flow_cache_entry_validate(struct flow_cache *fc, + struct flow_cache_entry *fce) +{ + struct net *net = container_of(fc, struct net, xfrm.flow_cache); + struct xfrm_flow_cache_entry *xfc = + container_of(fce, struct xfrm_flow_cache_entry, fce); + + if (xfc->policy_genid != atomic_read(&net->xfrm.policy_genid)) + goto invalid; + if (xfc->policy == NULL || xfc->policy == XFRM_CACHE_NO_POLICY) + return; + if (xfc->policy->walk.dead) + goto invalid; + if (xfc->bundles_genid != atomic_read(&xfc->policy->bundles_genid)) + goto invalid_dst; + return; +invalid: + xfc->policy = NULL; +invalid_dst: + xfc->dst = NULL; +} /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. 
@@ -269,27 +298,26 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) atomic_dec(&policy->refcnt); - if (atomic_read(&policy->refcnt) > 1) - flow_cache_flush(); - xfrm_pol_put(policy); } static void xfrm_policy_gc_task(struct work_struct *work) { + struct net *net = container_of(work, struct net, xfrm.policy_gc_work); struct xfrm_policy *policy; struct hlist_node *entry, *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_policy_gc_lock); - gc_list.first = xfrm_policy_gc_list.first; - INIT_HLIST_HEAD(&xfrm_policy_gc_list); + gc_list.first = net->xfrm.policy_gc_list.first; + INIT_HLIST_HEAD(&net->xfrm.policy_gc_list); spin_unlock_bh(&xfrm_policy_gc_lock); + flow_cache_flush(&net->xfrm.flow_cache, xfrm_flow_cache_entry_validate); + hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst) xfrm_policy_gc_kill(policy); } -static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. @@ -297,6 +325,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); int dead; write_lock_bh(&policy->lock); @@ -310,10 +339,10 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) } spin_lock_bh(&xfrm_policy_gc_lock); - hlist_add_head(&policy->bydst, &xfrm_policy_gc_list); + hlist_add_head(&policy->bydst, &net->xfrm.policy_gc_list); spin_unlock_bh(&xfrm_policy_gc_lock); - schedule_work(&xfrm_policy_gc_work); + schedule_work(&net->xfrm.policy_gc_work); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; @@ -586,7 +615,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); if (delpol) __xfrm_policy_unlink(delpol, dir); policy->index = delpol ? 
delpol->index : xfrm_gen_index(net, dir); @@ -619,11 +648,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) gc_list = dst; policy->bundles = NULL; + atomic_inc(&policy->bundles_genid); } write_unlock(&policy->lock); } read_unlock_bh(&xfrm_policy_lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); while (gc_list) { struct dst_entry *dst = gc_list; @@ -669,7 +700,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u8 type, int dir, write_unlock_bh(&xfrm_policy_lock); if (ret && delete) { - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(ret); } return ret; @@ -710,7 +741,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u8 type, int dir, u32 id, write_unlock_bh(&xfrm_policy_lock); if (ret && delete) { - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(ret); } return ret; @@ -824,7 +855,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) } } - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); out: write_unlock_bh(&xfrm_policy_lock); return err; @@ -977,32 +1008,18 @@ fail: return ret; } -static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, - u8 dir, void **objp, atomic_t **obj_refp) +static struct xfrm_policy *xfrm_policy_lookup( + struct net *net, struct flowi *fl, + u16 family, u8 dir) { +#ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; - int err = 0; -#ifdef CONFIG_XFRM_SUB_POLICY pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } - if (pol || err) - goto end; + if (pol != NULL) + return pol; #endif - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } -#ifdef CONFIG_XFRM_SUB_POLICY -end: -#endif - if ((*objp = (void *) pol) != NULL) - *obj_refp = &pol->refcnt; - return err; + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); } static inline int policy_to_flow_dir(int dir) @@ -1083,12 +1100,14 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { + struct net *net = xp_net(pol); + write_lock_bh(&xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); write_unlock_bh(&xfrm_policy_lock); if (pol) { if (dir < XFRM_POLICY_MAX) - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(pol); return 0; } @@ -1512,13 +1531,34 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) #endif } -static int stale_bundle(struct dst_entry *dst); - /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. 
*/ + +static void xfrm_flow_cache_update(struct net *net, struct flowi *key, + u16 family, u8 dir, + struct xfrm_policy *pol, + struct xfrm_dst *dst) +{ + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + + fce = flow_cache_lookup(&net->xfrm.flow_cache, + key, family, dir); + if (fce == NULL) + return; + + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xf->policy_genid = atomic_read(&net->xfrm.policy_genid); + xf->policy = pol; + if (dst != NULL) + xf->bundles_genid = atomic_read(&pol->bundles_genid); + xf->dst = dst; + flow_cache_entry_put(fce); +} + int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags) { @@ -1537,8 +1577,10 @@ int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); restart: - genid = atomic_read(&flow_cache_genid); + family = dst_orig->ops->family; + genid = atomic_read(&net->xfrm.policy_genid); policy = NULL; + dst = NULL; for (pi = 0; pi < ARRAY_SIZE(pols); pi++) pols[pi] = NULL; npols = 0; @@ -1555,24 +1597,51 @@ restart: } if (!policy) { + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - policy = flow_cache_lookup(net, fl, dst_orig->ops->family, - dir, xfrm_policy_lookup); - err = PTR_ERR(policy); - if (IS_ERR(policy)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); - goto dropdst; + fce = flow_cache_lookup(&net->xfrm.flow_cache, + fl, family, dir); + if (fce == NULL) + goto no_cache; + + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce); + if (xf->policy != NULL) { + policy = xf->policy; + if (policy != XFRM_CACHE_NO_POLICY) + xfrm_pol_hold(policy); + if (xf->dst != NULL) + dst = dst_clone((struct dst_entry *) xf->dst); + } + flow_cache_entry_put(fce); + if (policy == XFRM_CACHE_NO_POLICY) + goto nopol; + if (dst && !xfrm_bundle_ok(policy, (struct xfrm_dst *) dst, fl, family, 0)) { + dst_release(dst); + dst = NULL; } } +no_cache: + if (!policy) { + policy = xfrm_policy_lookup(net, fl, family, dir); + if (!policy) { + xfrm_flow_cache_update( + net, fl, family, dir, + XFRM_CACHE_NO_POLICY, NULL); + goto nopol; + } + } + if (IS_ERR(policy)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + goto dropdst; + } - if (!policy) - goto nopol; - - family = dst_orig->ops->family; pols[0] = policy; npols ++; xfrm_nr += pols[0]->xfrm_nr; @@ -1583,6 +1652,9 @@ restart: policy->curlft.use_time = get_seconds(); + if (dst) + goto dst_found; + switch (policy->action) { default: case XFRM_POLICY_BLOCK: @@ -1593,18 +1665,11 @@ restart: case XFRM_POLICY_ALLOW: #ifndef CONFIG_XFRM_SUB_POLICY - if (policy->xfrm_nr == 0) { - /* Flow passes not transformed. */ - xfrm_pol_put(policy); - return 0; - } + if (policy->xfrm_nr == 0) + goto no_transform; #endif - /* Try to find matching bundle. - * - * LATER: help from flow cache. It is optional, this - * is required only for output policy. - */ + /* Try to find matching bundle the hard way. */ dst = xfrm_find_bundle(fl, policy, family); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); @@ -1644,12 +1709,8 @@ restart: * they are searched. See above not-transformed bypass * is surrounded by non-sub policy configuration, too. */ - if (xfrm_nr == 0) { - /* Flow passes not transformed. 
*/ - xfrm_pols_put(pols, npols); - return 0; - } - + if (xfrm_nr == 0) + goto no_transform; #endif nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); @@ -1680,7 +1741,7 @@ restart: goto error; } if (nx == -EAGAIN || - genid != atomic_read(&flow_cache_genid)) { + genid != atomic_read(&net->xfrm.policy_genid)) { xfrm_pols_put(pols, npols); goto restart; } @@ -1691,11 +1752,8 @@ restart: goto error; } } - if (nx == 0) { - /* Flow passes not transformed. */ - xfrm_pols_put(pols, npols); - return 0; - } + if (nx == 0) + goto no_transform; dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig); err = PTR_ERR(dst); @@ -1744,6 +1802,9 @@ restart: dst_hold(dst); write_unlock_bh(&policy->lock); } + xfrm_flow_cache_update(net, fl, family, dir, + policy, (struct xfrm_dst *) dst); +dst_found: *dst_p = dst; dst_release(dst_orig); xfrm_pols_put(pols, npols); @@ -1761,7 +1822,12 @@ nopol: if (flags & XFRM_LOOKUP_ICMP) goto dropdst; return 0; +no_transform: + /* Flow passes not transformed. */ + xfrm_pols_put(pols, npols); + return 0; } + EXPORT_SYMBOL(__xfrm_lookup); int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, @@ -1919,10 +1985,35 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } } - if (!pol) - pol = flow_cache_lookup(net, &fl, family, fl_dir, - xfrm_policy_lookup); - + if (!pol) { + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + + fce = flow_cache_lookup(&net->xfrm.flow_cache, + &fl, family, dir); + if (fce != NULL) { + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce); + if (xf->policy != NULL) { + pol = xf->policy; + if (pol != XFRM_CACHE_NO_POLICY) + xfrm_pol_hold(pol); + else + pol = NULL; + } else { + pol = xfrm_policy_lookup(net, &fl, family, dir); + if (!IS_ERR(pol)) { + if (pol) + xf->policy = pol; + else + xf->policy = XFRM_CACHE_NO_POLICY; + } + xf->dst = NULL; + xf->policy_genid = atomic_read(&net->xfrm.policy_genid); + } + flow_cache_entry_put(fce); + } + } if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -2121,6 +2212,7 @@ static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_ent dstp = &dst->next; } } + atomic_inc(&pol->bundles_genid); write_unlock(&pol->lock); } @@ -2148,6 +2240,7 @@ static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *)) } read_unlock_bh(&xfrm_policy_lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); while (gc_list) { struct dst_entry *dst = gc_list; gc_list = dst->next; @@ -2428,6 +2521,9 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_HLIST_HEAD(&net->xfrm.policy_gc_list); + INIT_WORK(&net->xfrm.policy_gc_work, xfrm_policy_gc_task); + flow_cache_init(&net->xfrm.flow_cache, sizeof(struct xfrm_flow_cache_entry)); if (net_eq(net, &init_net)) register_netdevice_notifier(&xfrm_dev_notifier); return 0; @@ -2461,7 +2557,7 @@ static void xfrm_policy_fini(struct net *net) audit_info.sessionid = -1; audit_info.secid = 0; xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); - flush_work(&xfrm_policy_gc_work); + flush_work(&net->xfrm.policy_gc_work); WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -2479,6 +2575,8 @@ static void xfrm_policy_fini(struct net *net) sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); + + 
flow_cache_fini(&net->xfrm.flow_cache); } static int __net_init xfrm_net_init(struct net *net) @@ -2685,8 +2783,9 @@ static int migrate_tmpl_match(struct xfrm_migrate *m, struct xfrm_tmpl *t) static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate) { + struct net *net = xp_net(pol); struct xfrm_migrate *mp; - struct dst_entry *dst; + struct dst_entry *gc_list = NULL, *tail; int i, j, n = 0; write_lock_bh(&pol->lock); @@ -2711,15 +2810,25 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ - while ((dst = pol->bundles) != NULL) { - pol->bundles = dst->next; - dst_free(dst); - } + tail = pol->bundles; + while (tail->next) + tail = tail->next; + tail->next = gc_list; + gc_list = pol->bundles; + pol->bundles = NULL; + atomic_inc(&pol->bundles_genid); } } - write_unlock_bh(&pol->lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); + while (gc_list) { + struct dst_entry *dst = gc_list; + + gc_list = dst->next; + dst_free(dst); + } + if (!n) return -ENODATA; -- 1.7.0.2
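
A minimal usage sketch of the flow cache API introduced above (illustration only, not part of the patch; the my_* names are hypothetical). Callers embed struct flow_cache_entry as the first member of their own entry type, size the cache with that type, and pair every non-NULL flow_cache_lookup() with flow_cache_entry_put(), the same way net/xfrm/xfrm_policy.c does in xfrm_flow_cache_update() and __xfrm_lookup():

#include <net/flow.h>

struct my_cache_entry {
	struct flow_cache_entry	fce;	/* must come first: the cache allocates
					 * entry_size bytes and treats the start
					 * as a struct flow_cache_entry */
	void			*object;
	u32			genid;
};

static struct flow_cache my_cache;

static int my_cache_setup(void)
{
	/* entry_size covers the embedded flow_cache_entry plus the private
	 * fields; flow_cache_init() BUG()s if it is smaller than the base
	 * entry. */
	return flow_cache_init(&my_cache, sizeof(struct my_cache_entry));
}

static void *my_cache_get(struct flowi *key, u16 family, u8 dir, u32 cur_genid)
{
	struct flow_cache_entry *fce;
	struct my_cache_entry *e;
	void *obj = NULL;

	/* Returns with BHs disabled on success, or NULL early in boot or on
	 * allocation failure.  A freshly created entry has its private
	 * fields zeroed (kmem_cache_zalloc). */
	fce = flow_cache_lookup(&my_cache, key, family, dir);
	if (fce == NULL)
		return NULL;

	e = container_of(fce, struct my_cache_entry, fce);
	if (e->object && e->genid == cur_genid)
		obj = e->object;

	/* Re-enables BHs; must follow every non-NULL lookup. */
	flow_cache_entry_put(fce);
	return obj;
}

On a miss the xfrm code resolves the policy/bundle first and then looks the entry up again to store the result (xfrm_flow_cache_update()), since an entry is only pinned while BHs are disabled. Per-cpu chains are trimmed once hash_count crosses the high watermark, and cached pointers are revalidated against policy_genid/bundles_genid through flow_cache_flush() callbacks such as xfrm_flow_cache_entry_validate().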