From 475d86ecf072a29b88dd7be8ca5cea8ecb045fb6 Mon Sep 17 00:00:00 2001 From: Cedric Schieli Date: Wed, 21 Apr 2010 08:05:41 +0000 Subject: main/linux-pae: sync with main/linux-grsec --- ...-hard-header-destination-address-checking.patch | 44 + ...ude-route-header_len-in-max_headroom-calc.patch | 39 + ...0004-arp-flush-arp-cache-on-device-change.patch | 29 + .../0005-r8169-fix-broken-register-writes.patch | 52 + ...al-fix-for-CVE-2009-4537-overlength-frame.patch | 120 +++ .../0007-r8169-Fix-rtl8169_rx_interrupt.patch | 89 ++ .../0008-r8169-clean-up-my-printk-uglyness.patch | 36 + .../0009-ipsec-Fix-bogus-bundle-flowi.patch | 110 ++ .../0010-xfrm-Remove-xfrm_state_genid.patch | 54 + ...erify-policy-direction-at-XFRM_MSG_POLEXP.patch | 35 + ...-policy-lock-when-accessing-policy-walk.d.patch | 105 ++ .../0013-flow-structurize-flow-cache.patch | 395 ++++++++ ...-flow-virtualize-flow-cache-entry-methods.patch | 513 ++++++++++ ...bundles-instead-of-policies-for-outgoing-.patch | 1068 ++++++++++++++++++++ ...016-xfrm-remove-policy-garbage-collection.patch | 91 ++ ...ow-delayed-deletion-of-flow-cache-entries.patch | 231 +++++ .../0018-xfrm-Fix-crashes-in-xfrm_lookup.patch | 46 + main/linux-pae/APKBUILD | 54 +- main/linux-pae/arp.patch | 14 - main/linux-pae/ip_gre.patch | 15 - main/linux-pae/ip_gre2.patch | 17 - ...t-78f1cd-r8169-fix-broken-register-writes.patch | 51 - ...x-for-CVE-2009-4537-overlength-frame-DMAs.patch | 119 --- main/linux-pae/xfrm-cache-size-revert.patch | 12 - 24 files changed, 3095 insertions(+), 244 deletions(-) create mode 100644 main/linux-pae/0002-gre-fix-hard-header-destination-address-checking.patch create mode 100644 main/linux-pae/0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch create mode 100644 main/linux-pae/0004-arp-flush-arp-cache-on-device-change.patch create mode 100644 main/linux-pae/0005-r8169-fix-broken-register-writes.patch create mode 100644 main/linux-pae/0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch 
create mode 100644 main/linux-pae/0007-r8169-Fix-rtl8169_rx_interrupt.patch create mode 100644 main/linux-pae/0008-r8169-clean-up-my-printk-uglyness.patch create mode 100644 main/linux-pae/0009-ipsec-Fix-bogus-bundle-flowi.patch create mode 100644 main/linux-pae/0010-xfrm-Remove-xfrm_state_genid.patch create mode 100644 main/linux-pae/0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch create mode 100644 main/linux-pae/0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch create mode 100644 main/linux-pae/0013-flow-structurize-flow-cache.patch create mode 100644 main/linux-pae/0014-flow-virtualize-flow-cache-entry-methods.patch create mode 100644 main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch create mode 100644 main/linux-pae/0016-xfrm-remove-policy-garbage-collection.patch create mode 100644 main/linux-pae/0017-flow-delayed-deletion-of-flow-cache-entries.patch create mode 100644 main/linux-pae/0018-xfrm-Fix-crashes-in-xfrm_lookup.patch delete mode 100644 main/linux-pae/arp.patch delete mode 100644 main/linux-pae/ip_gre.patch delete mode 100644 main/linux-pae/ip_gre2.patch delete mode 100644 main/linux-pae/net-git-78f1cd-r8169-fix-broken-register-writes.patch delete mode 100644 main/linux-pae/net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch delete mode 100644 main/linux-pae/xfrm-cache-size-revert.patch diff --git a/main/linux-pae/0002-gre-fix-hard-header-destination-address-checking.patch b/main/linux-pae/0002-gre-fix-hard-header-destination-address-checking.patch new file mode 100644 index 00000000..36a0ae44 --- /dev/null +++ b/main/linux-pae/0002-gre-fix-hard-header-destination-address-checking.patch @@ -0,0 +1,44 @@ +From 9082391046940c410eac3bad065c8701998b5cab Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 3 Mar 2010 04:01:13 +0000 +Subject: [PATCH 02/18] gre: fix hard header destination address checking + +ipgre_header() can be called with zero daddr 
when the gre device is +configured as multipoint tunnel and still has the NOARP flag set (which is +typically cleared by the userspace arp daemon). If the NOARP packets are +not dropped, ipgre_tunnel_xmit() will take rt->rt_gateway (= NBMA IP) and +use that for route look up (and may lead to bogus xfrm acquires). + +The multicast address check is removed as sending to multicast group should +be ok. In fact, if gre device has a multicast address as destination +ipgre_header is always called with multicast address. + +Signed-off-by: Timo Teras +Signed-off-by: David S. Miller +(cherry picked from commit 6d55cb91a0020ac0d78edcad61efd6c8cf5785a3) +--- + net/ipv4/ip_gre.c | 7 ++----- + 1 files changed, 2 insertions(+), 5 deletions(-) + +diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c +index 1433338..ac88ce5 100644 +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -1137,12 +1137,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, + + if (saddr) + memcpy(&iph->saddr, saddr, 4); +- +- if (daddr) { ++ if (daddr) + memcpy(&iph->daddr, daddr, 4); +- return t->hlen; +- } +- if (iph->daddr && !ipv4_is_multicast(iph->daddr)) ++ if (iph->daddr) + return t->hlen; + + return -t->hlen; +-- +1.7.0.2 + diff --git a/main/linux-pae/0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch b/main/linux-pae/0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch new file mode 100644 index 00000000..61d7c9a6 --- /dev/null +++ b/main/linux-pae/0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch @@ -0,0 +1,39 @@ +From cd0e9d08480e1e0648e17d099ecf50f6fd8714e5 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Sat, 20 Mar 2010 02:27:58 +0000 +Subject: [PATCH 03/18] ip_gre: include route header_len in max_headroom calculation + +Taking route's header_len into account, and updating gre device +needed_headroom will give better hints on upper bound of required +headroom. This is useful if the gre traffic is xfrm'ed. 
+ +Signed-off-by: Timo Teras +Acked-by: Herbert Xu +Signed-off-by: David S. Miller +(cherry picked from commit 243aad830e8a4cdda261626fbaeddde16b08d04a) +--- + net/ipv4/ip_gre.c | 4 +++- + 1 files changed, 3 insertions(+), 1 deletions(-) + +diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c +index ac88ce5..7f1ff73 100644 +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -803,11 +803,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev + tunnel->err_count = 0; + } + +- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; ++ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; + + if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); ++ if (max_headroom > dev->needed_headroom) ++ dev->needed_headroom = max_headroom; + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; +-- +1.7.0.2 + diff --git a/main/linux-pae/0004-arp-flush-arp-cache-on-device-change.patch b/main/linux-pae/0004-arp-flush-arp-cache-on-device-change.patch new file mode 100644 index 00000000..85161ea3 --- /dev/null +++ b/main/linux-pae/0004-arp-flush-arp-cache-on-device-change.patch @@ -0,0 +1,29 @@ +From 8a0e3ea4924059a7268446177d6869e3399adbb2 Mon Sep 17 00:00:00 2001 +From: Timo Teras +Date: Mon, 12 Apr 2010 13:46:45 +0000 +Subject: [PATCH 04/18] arp: flush arp cache on device change + +If IFF_NOARP is changed, we must flush the arp cache. 
+ +Signed-off-by: Timo Teras +--- + net/ipv4/arp.c | 3 +++ + 1 files changed, 3 insertions(+), 0 deletions(-) + +diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c +index 4e80f33..580bfc3 100644 +--- a/net/ipv4/arp.c ++++ b/net/ipv4/arp.c +@@ -1200,6 +1200,9 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo + neigh_changeaddr(&arp_tbl, dev); + rt_cache_flush(dev_net(dev), 0); + break; ++ case NETDEV_CHANGE: ++ neigh_changeaddr(&arp_tbl, dev); ++ break; + default: + break; + } +-- +1.7.0.2 + diff --git a/main/linux-pae/0005-r8169-fix-broken-register-writes.patch b/main/linux-pae/0005-r8169-fix-broken-register-writes.patch new file mode 100644 index 00000000..bfa8df29 --- /dev/null +++ b/main/linux-pae/0005-r8169-fix-broken-register-writes.patch @@ -0,0 +1,52 @@ +From 89f350c4ec426b4c1db6ef269546940365d918e1 Mon Sep 17 00:00:00 2001 +From: Francois Romieu +Date: Sat, 27 Mar 2010 19:35:46 -0700 +Subject: [PATCH 05/18] r8169: fix broken register writes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is quite similar to b39fe41f481d20c201012e4483e76c203802dda7 +though said registers are not even documented as 64-bit registers +- as opposed to the initial TxDescStartAddress ones - but as single +bytes which must be combined into 32 bits at the MMIO read/write +level before being merged into a 64 bit logical entity. + +Credits go to Ben Hutchings for the MAR +registers (aka "multicast is broken for ages on ARM) and to +Timo Teräs for the MAC registers. + +Signed-off-by: Francois Romieu +Signed-off-by: David S. 
Miller +(cherry picked from commit 78f1cd02457252e1ffbc6caa44a17424a45286b8) +--- + drivers/net/r8169.c | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c +index 0fe2fc9..24599b5 100644 +--- a/drivers/net/r8169.c ++++ b/drivers/net/r8169.c +@@ -2827,8 +2827,8 @@ static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) + spin_lock_irq(&tp->lock); + + RTL_W8(Cfg9346, Cfg9346_Unlock); +- RTL_W32(MAC0, low); + RTL_W32(MAC4, high); ++ RTL_W32(MAC0, low); + RTL_W8(Cfg9346, Cfg9346_Lock); + + spin_unlock_irq(&tp->lock); +@@ -4795,8 +4795,8 @@ static void rtl_set_rx_mode(struct net_device *dev) + mc_filter[1] = swab32(data); + } + +- RTL_W32(MAR0 + 0, mc_filter[0]); + RTL_W32(MAR0 + 4, mc_filter[1]); ++ RTL_W32(MAR0 + 0, mc_filter[0]); + + RTL_W32(RxConfig, tmp); + +-- +1.7.0.2 + diff --git a/main/linux-pae/0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch b/main/linux-pae/0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch new file mode 100644 index 00000000..03ea13fa --- /dev/null +++ b/main/linux-pae/0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch @@ -0,0 +1,120 @@ +From a60cfaf3df9cd0cddbc24695434ed5bfa917d505 Mon Sep 17 00:00:00 2001 +From: Neil Horman +Date: Mon, 29 Mar 2010 13:16:02 -0700 +Subject: [PATCH 06/18] r8169: offical fix for CVE-2009-4537 (overlength frame DMAs) + +Official patch to fix the r8169 frame length check error. + +Based on this initial thread: +http://marc.info/?l=linux-netdev&m=126202972828626&w=1 +This is the official patch to fix the frame length problems in the r8169 +driver. As noted in the previous thread, while this patch incurs a performance +hit on the driver, its possible to improve performance dynamically by updating +the mtu and rx_copybreak values at runtime to return performance to what it was +for those NICS which are unaffected by the ideosyncracy (if there are any). 
+ +Summary: + + A while back Eric submitted a patch for r8169 in which the proper +allocated frame size was written to RXMaxSize to prevent the NIC from dmaing too +much data. This was done in commit fdd7b4c3302c93f6833e338903ea77245eb510b4. A +long time prior to that however, Francois posted +126fa4b9ca5d9d7cb7d46f779ad3bd3631ca387c, which expiclitly disabled the MaxSize +setting due to the fact that the hardware behaved in odd ways when overlong +frames were received on NIC's supported by this driver. This was mentioned in a +security conference recently: +http://events.ccc.de/congress/2009/Fahrplan//events/3596.en.html + +It seems that if we can't enable frame size filtering, then, as Eric correctly +noticed, we can find ourselves DMA-ing too much data to a buffer, causing +corruption. As a result is seems that we are forced to allocate a frame which +is ready to handle a maximally sized receive. + +This obviously has performance issues with it, so to mitigate that issue, this +patch does two things: + +1) Raises the copybreak value to the frame allocation size, which should force +appropriately sized packets to get allocated on rx, rather than a full new 16k +buffer. + +2) This patch only disables frame filtering initially (i.e., during the NIC +open), changing the MTU results in ring buffer allocation of a size in relation +to the new mtu (along with a warning indicating that this is dangerous). + +Because of item (2), individuals who can't cope with the performance hit (or can +otherwise filter frames to prevent the bug), or who have hardware they are sure +is unaffected by this issue, can manually lower the copybreak and reset the mtu +such that performance is restored easily. + +Signed-off-by: Neil Horman +Signed-off-by: David S. 
Miller +(cherry picked from commit c0cd884af045338476b8e69a61fceb3f34ff22f1) +--- + drivers/net/r8169.c | 29 ++++++++++++++++++++++++----- + 1 files changed, 24 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c +index 24599b5..1484528 100644 +--- a/drivers/net/r8169.c ++++ b/drivers/net/r8169.c +@@ -186,7 +186,12 @@ static struct pci_device_id rtl8169_pci_tbl[] = { + + MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl); + +-static int rx_copybreak = 200; ++/* ++ * we set our copybreak very high so that we don't have ++ * to allocate 16k frames all the time (see note in ++ * rtl8169_open() ++ */ ++static int rx_copybreak = 16383; + static int use_dac; + static struct { + u32 msg_enable; +@@ -3245,9 +3250,13 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev) + } + + static void rtl8169_set_rxbufsize(struct rtl8169_private *tp, +- struct net_device *dev) ++ unsigned int mtu) + { +- unsigned int max_frame = dev->mtu + VLAN_ETH_HLEN + ETH_FCS_LEN; ++ unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN; ++ ++ if (max_frame != 16383) ++ printk(KERN_WARNING "WARNING! Changing of MTU on this NIC" ++ "May lead to frame reception errors!\n"); + + tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE; + } +@@ -3259,7 +3268,17 @@ static int rtl8169_open(struct net_device *dev) + int retval = -ENOMEM; + + +- rtl8169_set_rxbufsize(tp, dev); ++ /* ++ * Note that we use a magic value here, its wierd I know ++ * its done because, some subset of rtl8169 hardware suffers from ++ * a problem in which frames received that are longer than ++ * the size set in RxMaxSize register return garbage sizes ++ * when received. To avoid this we need to turn off filtering, ++ * which is done by setting a value of 16383 in the RxMaxSize register ++ * and allocating 16k frames to handle the largest possible rx value ++ * thats what the magic math below does. 
++ */ ++ rtl8169_set_rxbufsize(tp, 16383 - VLAN_ETH_HLEN - ETH_FCS_LEN); + + /* + * Rx and Tx desscriptors needs 256 bytes alignment. +@@ -3912,7 +3931,7 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu) + + rtl8169_down(dev); + +- rtl8169_set_rxbufsize(tp, dev); ++ rtl8169_set_rxbufsize(tp, dev->mtu); + + ret = rtl8169_init_ring(dev); + if (ret < 0) +-- +1.7.0.2 + diff --git a/main/linux-pae/0007-r8169-Fix-rtl8169_rx_interrupt.patch b/main/linux-pae/0007-r8169-Fix-rtl8169_rx_interrupt.patch new file mode 100644 index 00000000..fad27232 --- /dev/null +++ b/main/linux-pae/0007-r8169-Fix-rtl8169_rx_interrupt.patch @@ -0,0 +1,89 @@ +From 26654a966adb674afc30d285f7e79535d03c2492 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Wed, 31 Mar 2010 02:08:31 +0000 +Subject: [PATCH 07/18] r8169: Fix rtl8169_rx_interrupt() + +In case a reset is performed, rtl8169_rx_interrupt() is called from +process context instead of softirq context. Special care must be taken +to call appropriate network core services (netif_rx() instead of +netif_receive_skb()). VLAN handling also corrected. + +Reported-by: Sergey Senozhatsky +Tested-by: Sergey Senozhatsky +Diagnosed-by: Oleg Nesterov +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +(cherry picked from commit 630b943c182d1aed69f244405131902fbcba7ec6) +--- + drivers/net/r8169.c | 22 +++++++++++++++++----- + 1 files changed, 17 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c +index 1484528..bed1d47 100644 +--- a/drivers/net/r8169.c ++++ b/drivers/net/r8169.c +@@ -1047,14 +1047,14 @@ static void rtl8169_vlan_rx_register(struct net_device *dev, + } + + static int rtl8169_rx_vlan_skb(struct rtl8169_private *tp, struct RxDesc *desc, +- struct sk_buff *skb) ++ struct sk_buff *skb, int polling) + { + u32 opts2 = le32_to_cpu(desc->opts2); + struct vlan_group *vlgrp = tp->vlgrp; + int ret; + + if (vlgrp && (opts2 & RxVlanTag)) { +- vlan_hwaccel_receive_skb(skb, vlgrp, swab16(opts2 & 0xffff)); ++ __vlan_hwaccel_rx(skb, vlgrp, swab16(opts2 & 0xffff), polling); + ret = 0; + } else + ret = -1; +@@ -1071,7 +1071,7 @@ static inline u32 rtl8169_tx_vlan_tag(struct rtl8169_private *tp, + } + + static int rtl8169_rx_vlan_skb(struct rtl8169_private *tp, struct RxDesc *desc, +- struct sk_buff *skb) ++ struct sk_buff *skb, int polling) + { + return -1; + } +@@ -4480,12 +4480,20 @@ out: + return done; + } + ++/* ++ * Warning : rtl8169_rx_interrupt() might be called : ++ * 1) from NAPI (softirq) context ++ * (polling = 1 : we should call netif_receive_skb()) ++ * 2) from process context (rtl8169_reset_task()) ++ * (polling = 0 : we must call netif_rx() instead) ++ */ + static int rtl8169_rx_interrupt(struct net_device *dev, + struct rtl8169_private *tp, + void __iomem *ioaddr, u32 budget) + { + unsigned int cur_rx, rx_left; + unsigned int delta, count; ++ int polling = (budget != ~(u32)0) ? 
1 : 0; + + cur_rx = tp->cur_rx; + rx_left = NUM_RX_DESC + tp->dirty_rx - cur_rx; +@@ -4550,8 +4558,12 @@ static int rtl8169_rx_interrupt(struct net_device *dev, + skb_put(skb, pkt_size); + skb->protocol = eth_type_trans(skb, dev); + +- if (rtl8169_rx_vlan_skb(tp, desc, skb) < 0) +- netif_receive_skb(skb); ++ if (rtl8169_rx_vlan_skb(tp, desc, skb, polling) < 0) { ++ if (likely(polling)) ++ netif_receive_skb(skb); ++ else ++ netif_rx(skb); ++ } + + dev->stats.rx_bytes += pkt_size; + dev->stats.rx_packets++; +-- +1.7.0.2 + diff --git a/main/linux-pae/0008-r8169-clean-up-my-printk-uglyness.patch b/main/linux-pae/0008-r8169-clean-up-my-printk-uglyness.patch new file mode 100644 index 00000000..dff3fd21 --- /dev/null +++ b/main/linux-pae/0008-r8169-clean-up-my-printk-uglyness.patch @@ -0,0 +1,36 @@ +From d1c9ac562923fa0b1738fceb4c7bafac3ab936ba Mon Sep 17 00:00:00 2001 +From: Neil Horman +Date: Thu, 1 Apr 2010 07:30:07 +0000 +Subject: [PATCH 08/18] r8169: clean up my printk uglyness + +Fix formatting on r8169 printk + +Brandon Philips noted that I had a spacing issue in my printk for the +last r8169 patch that made it quite ugly. Fix that up and add the PFX +macro to it as well so it looks like the other r8169 printks + +Signed-off-by: Neil Horman +Signed-off-by: David S. Miller +(cherry picked from commit 93f4d91d879acfcb0ba9c2725e3133fcff2dfd1e) +--- + drivers/net/r8169.c | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c +index bed1d47..790555e 100644 +--- a/drivers/net/r8169.c ++++ b/drivers/net/r8169.c +@@ -3255,8 +3255,8 @@ static void rtl8169_set_rxbufsize(struct rtl8169_private *tp, + unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN; + + if (max_frame != 16383) +- printk(KERN_WARNING "WARNING! Changing of MTU on this NIC" +- "May lead to frame reception errors!\n"); ++ printk(KERN_WARNING PFX "WARNING! 
Changing of MTU on this " ++ "NIC may lead to frame reception errors!\n"); + + tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE; + } +-- +1.7.0.2 + diff --git a/main/linux-pae/0009-ipsec-Fix-bogus-bundle-flowi.patch b/main/linux-pae/0009-ipsec-Fix-bogus-bundle-flowi.patch new file mode 100644 index 00000000..d4de0e1d --- /dev/null +++ b/main/linux-pae/0009-ipsec-Fix-bogus-bundle-flowi.patch @@ -0,0 +1,110 @@ +From 21ee14f92ef1b6d4ca965c9b59135f3462919631 Mon Sep 17 00:00:00 2001 +From: Herbert Xu +Date: Tue, 2 Mar 2010 02:51:56 +0000 +Subject: [PATCH 09/18] ipsec: Fix bogus bundle flowi + +When I merged the bundle creation code, I introduced a bogus +flowi value in the bundle. Instead of getting from the caller, +it was instead set to the flow in the route object, which is +totally different. + +The end result is that the bundles we created never match, and +we instead end up with an ever growing bundle list. + +Thanks to Jamal for find this problem. + +Reported-by: Jamal Hadi Salim +Signed-off-by: Herbert Xu +Acked-by: Steffen Klassert +Acked-by: Jamal Hadi Salim +Signed-off-by: David S. 
Miller +(cherry picked from commit 87c1e12b5eeb7b30b4b41291bef8e0b41fc3dde9) +--- + include/net/xfrm.h | 3 ++- + net/ipv4/xfrm4_policy.c | 5 +++-- + net/ipv6/xfrm6_policy.c | 3 ++- + net/xfrm/xfrm_policy.c | 7 ++++--- + 4 files changed, 11 insertions(+), 7 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 223e90a..6960be2 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -273,7 +273,8 @@ struct xfrm_policy_afinfo { + struct dst_entry *dst, + int nfheader_len); + int (*fill_dst)(struct xfrm_dst *xdst, +- struct net_device *dev); ++ struct net_device *dev, ++ struct flowi *fl); + }; + + extern int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo); +diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c +index 74fb2eb..7009886 100644 +--- a/net/ipv4/xfrm4_policy.c ++++ b/net/ipv4/xfrm4_policy.c +@@ -92,11 +92,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, + return 0; + } + +-static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) ++static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, ++ struct flowi *fl) + { + struct rtable *rt = (struct rtable *)xdst->route; + +- xdst->u.rt.fl = rt->fl; ++ xdst->u.rt.fl = *fl; + + xdst->u.dst.dev = dev; + dev_hold(dev); +diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c +index 8ec3d45..3f89ab7 100644 +--- a/net/ipv6/xfrm6_policy.c ++++ b/net/ipv6/xfrm6_policy.c +@@ -117,7 +117,8 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, + return 0; + } + +-static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) ++static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, ++ struct flowi *fl) + { + struct rt6_info *rt = (struct rt6_info*)xdst->route; + +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index cb81ca3..d75047c 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -1341,7 +1341,8 @@ static inline int 
xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, + return err; + } + +-static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) ++static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, ++ struct flowi *fl) + { + struct xfrm_policy_afinfo *afinfo = + xfrm_policy_get_afinfo(xdst->u.dst.ops->family); +@@ -1350,7 +1351,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) + if (!afinfo) + return -EINVAL; + +- err = afinfo->fill_dst(xdst, dev); ++ err = afinfo->fill_dst(xdst, dev, fl); + + xfrm_policy_put_afinfo(afinfo); + +@@ -1454,7 +1455,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, + for (dst_prev = dst0; dst_prev != dst; dst_prev = dst_prev->child) { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst_prev; + +- err = xfrm_fill_dst(xdst, dev); ++ err = xfrm_fill_dst(xdst, dev, fl); + if (err) + goto free_dst; + +-- +1.7.0.2 + diff --git a/main/linux-pae/0010-xfrm-Remove-xfrm_state_genid.patch b/main/linux-pae/0010-xfrm-Remove-xfrm_state_genid.patch new file mode 100644 index 00000000..8cfffd73 --- /dev/null +++ b/main/linux-pae/0010-xfrm-Remove-xfrm_state_genid.patch @@ -0,0 +1,54 @@ +From f2c59932757a06851bb740dc757ce2ba1961fc08 Mon Sep 17 00:00:00 2001 +From: Herbert Xu +Date: Wed, 31 Mar 2010 01:19:49 +0000 +Subject: [PATCH 10/18] xfrm: Remove xfrm_state_genid +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The xfrm state genid only needs to be matched against the copy +saved in xfrm_dst. So we don't need a global genid at all. In +fact, we don't even need to initialise it. + +Based on observation by Timo Teräs. + +Signed-off-by: Herbert Xu +Signed-off-by: David S. 
Miller +(cherry picked from commit 34996cb91dd72f0b0456d8fd3fef4aaee62232f2) +--- + net/xfrm/xfrm_state.c | 5 +---- + 1 files changed, 1 insertions(+), 4 deletions(-) + +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index f2f7c63..8ee733f 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -34,7 +34,6 @@ + static DEFINE_SPINLOCK(xfrm_state_lock); + + static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; +-static unsigned int xfrm_state_genid; + + static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family); + static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); +@@ -903,8 +902,6 @@ static void __xfrm_state_insert(struct xfrm_state *x) + struct net *net = xs_net(x); + unsigned int h; + +- x->genid = ++xfrm_state_genid; +- + list_add(&x->km.all, &net->xfrm.state_all); + + h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, +@@ -948,7 +945,7 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) + x->props.reqid == reqid && + !xfrm_addr_cmp(&x->id.daddr, &xnew->id.daddr, family) && + !xfrm_addr_cmp(&x->props.saddr, &xnew->props.saddr, family)) +- x->genid = xfrm_state_genid; ++ x->genid++; + } + } + +-- +1.7.0.2 + diff --git a/main/linux-pae/0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch b/main/linux-pae/0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch new file mode 100644 index 00000000..ae2a0f91 --- /dev/null +++ b/main/linux-pae/0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch @@ -0,0 +1,35 @@ +From 5b3e87bccb0e48f2f8b78695e949c015a3695f8e Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 31 Mar 2010 00:17:04 +0000 +Subject: [PATCH 11/18] xfrm_user: verify policy direction at XFRM_MSG_POLEXPIRE handler + +Add missing check for policy direction verification. This is +especially important since without this xfrm_user may end up +deleting per-socket policy which is not allowed. 
+ +Signed-off-by: Timo Teras +Acked-by: Herbert Xu +Signed-off-by: David S. Miller +(cherry picked from commit c8bf4d04f970fafb3430d332533e1cf103f2a018) +--- + net/xfrm/xfrm_user.c | 4 ++++ + 1 files changed, 4 insertions(+), 0 deletions(-) + +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index b95a2d6..d1e9ee3 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -1589,6 +1589,10 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, + if (err) + return err; + ++ err = verify_policy_dir(p->dir); ++ if (err) ++ return err; ++ + if (p->index) + xp = xfrm_policy_byid(net, type, p->dir, p->index, 0, &err); + else { +-- +1.7.0.2 + diff --git a/main/linux-pae/0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch b/main/linux-pae/0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch new file mode 100644 index 00000000..222caadd --- /dev/null +++ b/main/linux-pae/0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch @@ -0,0 +1,105 @@ +From 7a400eb025dd53883c3560d0fdb069542f7ad3db Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 31 Mar 2010 00:17:05 +0000 +Subject: [PATCH 12/18] xfrm: remove policy lock when accessing policy->walk.dead + +All of the code considers ->dead as a hint that the cached policy +needs to get refreshed. The read side can just drop the read lock +without any side effects. + +The write side needs to make sure that it's written only exactly +once. Only possible race is at xfrm_policy_kill(). This is fixed +by checking result of __xfrm_policy_unlink() when needed. It will +always succeed if the policy object is looked up from the hash +list (so some checks are removed), but it needs to be checked if +we are trying to unlink policy via a reference (appropriate +checks added). + +Since policy->walk.dead is written exactly once, it no longer +needs to be protected with a write lock. 
+ +Signed-off-by: Timo Teras +Acked-by: Herbert Xu +Signed-off-by: David S. Miller +(backported from commit ea2dea9dacc256fe927857feb423872051642ae7) +--- + net/xfrm/xfrm_policy.c | 20 +++++--------------- + net/xfrm/xfrm_user.c | 6 +----- + 2 files changed, 6 insertions(+), 20 deletions(-) + +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index d75047c..110184f 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -156,7 +156,7 @@ static void xfrm_policy_timer(unsigned long data) + + read_lock(&xp->lock); + +- if (xp->walk.dead) ++ if (unlikely(xp->walk.dead)) + goto out; + + dir = xfrm_policy_id2dir(xp->index); +@@ -297,17 +297,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); + + static void xfrm_policy_kill(struct xfrm_policy *policy) + { +- int dead; +- +- write_lock_bh(&policy->lock); +- dead = policy->walk.dead; + policy->walk.dead = 1; +- write_unlock_bh(&policy->lock); +- +- if (unlikely(dead)) { +- WARN_ON(1); +- return; +- } + + spin_lock_bh(&xfrm_policy_gc_lock); + hlist_add_head(&policy->bydst, &xfrm_policy_gc_list); +@@ -1115,6 +1105,9 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) + __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); + } + if (old_pol) ++ /* Unlinking succeeds always. This is the only function ++ * allowed to delete or replace socket policy. 
++ */ + __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + +@@ -1705,11 +1698,8 @@ restart: + goto error; + } + +- for (pi = 0; pi < npols; pi++) { +- read_lock_bh(&pols[pi]->lock); ++ for (pi = 0; pi < npols; pi++) + pol_dead |= pols[pi]->walk.dead; +- read_unlock_bh(&pols[pi]->lock); +- } + + write_lock_bh(&policy->lock); + if (unlikely(pol_dead || stale_bundle(dst))) { +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +index d1e9ee3..f9c56e9 100644 +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -1617,13 +1617,9 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, + if (xp == NULL) + return -ENOENT; + +- read_lock(&xp->lock); +- if (xp->walk.dead) { +- read_unlock(&xp->lock); ++ if (unlikely(xp->walk.dead)) + goto out; +- } + +- read_unlock(&xp->lock); + err = 0; + if (up->hard) { + uid_t loginuid = NETLINK_CB(skb).loginuid; +-- +1.7.0.2 + diff --git a/main/linux-pae/0013-flow-structurize-flow-cache.patch b/main/linux-pae/0013-flow-structurize-flow-cache.patch new file mode 100644 index 00000000..68fa753a --- /dev/null +++ b/main/linux-pae/0013-flow-structurize-flow-cache.patch @@ -0,0 +1,395 @@ +From 884f6e44f0b405c06bd234b14cc228482291bb38 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 31 Mar 2010 00:17:06 +0000 +Subject: [PATCH 13/18] flow: structurize flow cache + +Group all per-cpu data to one structure instead of having many +globals. Also prepare the internals so that we can have multiple +instances of the flow cache if needed. + +Only the kmem_cache is left as a global as all flow caches share +the same element size, and benefit from using a common cache. + +Signed-off-by: Timo Teras +Acked-by: Herbert Xu +Signed-off-by: David S. 
Miller +(cherry picked from commit d7997fe1f4584da12e9c29fb682c18e9bdc13b73) +--- + net/core/flow.c | 223 +++++++++++++++++++++++++++++-------------------------- + 1 files changed, 119 insertions(+), 104 deletions(-) + +diff --git a/net/core/flow.c b/net/core/flow.c +index 9601587..1d27ca6 100644 +--- a/net/core/flow.c ++++ b/net/core/flow.c +@@ -35,104 +35,105 @@ struct flow_cache_entry { + atomic_t *object_ref; + }; + +-atomic_t flow_cache_genid = ATOMIC_INIT(0); +- +-static u32 flow_hash_shift; +-#define flow_hash_size (1 << flow_hash_shift) +-static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; +- +-#define flow_table(cpu) (per_cpu(flow_tables, cpu)) +- +-static struct kmem_cache *flow_cachep __read_mostly; +- +-static int flow_lwm, flow_hwm; +- +-struct flow_percpu_info { +- int hash_rnd_recalc; +- u32 hash_rnd; +- int count; ++struct flow_cache_percpu { ++ struct flow_cache_entry ** hash_table; ++ int hash_count; ++ u32 hash_rnd; ++ int hash_rnd_recalc; ++ struct tasklet_struct flush_tasklet; + }; +-static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; +- +-#define flow_hash_rnd_recalc(cpu) \ +- (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) +-#define flow_hash_rnd(cpu) \ +- (per_cpu(flow_hash_info, cpu).hash_rnd) +-#define flow_count(cpu) \ +- (per_cpu(flow_hash_info, cpu).count) +- +-static struct timer_list flow_hash_rnd_timer; +- +-#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + + struct flow_flush_info { +- atomic_t cpuleft; +- struct completion completion; ++ struct flow_cache * cache; ++ atomic_t cpuleft; ++ struct completion completion; + }; +-static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; + +-#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) ++struct flow_cache { ++ u32 hash_shift; ++ unsigned long order; ++ struct flow_cache_percpu * percpu; ++ struct notifier_block hotcpu_notifier; ++ int low_watermark; ++ int high_watermark; ++ struct timer_list rnd_timer; ++}; 
++ ++atomic_t flow_cache_genid = ATOMIC_INIT(0); ++static struct flow_cache flow_cache_global; ++static struct kmem_cache *flow_cachep; ++ ++#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) ++#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + + static void flow_cache_new_hashrnd(unsigned long arg) + { ++ struct flow_cache *fc = (void *) arg; + int i; + + for_each_possible_cpu(i) +- flow_hash_rnd_recalc(i) = 1; ++ per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; + +- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; +- add_timer(&flow_hash_rnd_timer); ++ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; ++ add_timer(&fc->rnd_timer); + } + +-static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) ++static void flow_entry_kill(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp, ++ struct flow_cache_entry *fle) + { + if (fle->object) + atomic_dec(fle->object_ref); + kmem_cache_free(flow_cachep, fle); +- flow_count(cpu)--; ++ fcp->hash_count--; + } + +-static void __flow_cache_shrink(int cpu, int shrink_to) ++static void __flow_cache_shrink(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp, ++ int shrink_to) + { + struct flow_cache_entry *fle, **flp; + int i; + +- for (i = 0; i < flow_hash_size; i++) { ++ for (i = 0; i < flow_cache_hash_size(fc); i++) { + int k = 0; + +- flp = &flow_table(cpu)[i]; ++ flp = &fcp->hash_table[i]; + while ((fle = *flp) != NULL && k < shrink_to) { + k++; + flp = &fle->next; + } + while ((fle = *flp) != NULL) { + *flp = fle->next; +- flow_entry_kill(cpu, fle); ++ flow_entry_kill(fc, fcp, fle); + } + } + } + +-static void flow_cache_shrink(int cpu) ++static void flow_cache_shrink(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp) + { +- int shrink_to = flow_lwm / flow_hash_size; ++ int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); + +- __flow_cache_shrink(cpu, shrink_to); ++ __flow_cache_shrink(fc, fcp, shrink_to); + } + +-static void flow_new_hash_rnd(int cpu) ++static void 
flow_new_hash_rnd(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp) + { +- get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); +- flow_hash_rnd_recalc(cpu) = 0; +- +- __flow_cache_shrink(cpu, 0); ++ get_random_bytes(&fcp->hash_rnd, sizeof(u32)); ++ fcp->hash_rnd_recalc = 0; ++ __flow_cache_shrink(fc, fcp, 0); + } + +-static u32 flow_hash_code(struct flowi *key, int cpu) ++static u32 flow_hash_code(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp, ++ struct flowi *key) + { + u32 *k = (u32 *) key; + +- return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & +- (flow_hash_size - 1)); ++ return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) ++ & (flow_cache_hash_size(fc) - 1)); + } + + #if (BITS_PER_LONG == 64) +@@ -168,24 +169,25 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) + void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver) + { ++ struct flow_cache *fc = &flow_cache_global; ++ struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle, **head; + unsigned int hash; +- int cpu; + + local_bh_disable(); +- cpu = smp_processor_id(); ++ fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + + fle = NULL; + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. 
--RR */ +- if (!flow_table(cpu)) ++ if (!fcp->hash_table) + goto nocache; + +- if (flow_hash_rnd_recalc(cpu)) +- flow_new_hash_rnd(cpu); +- hash = flow_hash_code(key, cpu); ++ if (fcp->hash_rnd_recalc) ++ flow_new_hash_rnd(fc, fcp); ++ hash = flow_hash_code(fc, fcp, key); + +- head = &flow_table(cpu)[hash]; ++ head = &fcp->hash_table[hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && +@@ -204,8 +206,8 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + } + + if (!fle) { +- if (flow_count(cpu) > flow_hwm) +- flow_cache_shrink(cpu); ++ if (fcp->hash_count > fc->high_watermark) ++ flow_cache_shrink(fc, fcp); + + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); + if (fle) { +@@ -215,7 +217,7 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fle->object = NULL; +- flow_count(cpu)++; ++ fcp->hash_count++; + } + } + +@@ -249,14 +251,15 @@ nocache: + static void flow_cache_flush_tasklet(unsigned long data) + { + struct flow_flush_info *info = (void *)data; ++ struct flow_cache *fc = info->cache; ++ struct flow_cache_percpu *fcp; + int i; +- int cpu; + +- cpu = smp_processor_id(); +- for (i = 0; i < flow_hash_size; i++) { ++ fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); ++ for (i = 0; i < flow_cache_hash_size(fc); i++) { + struct flow_cache_entry *fle; + +- fle = flow_table(cpu)[i]; ++ fle = fcp->hash_table[i]; + for (; fle; fle = fle->next) { + unsigned genid = atomic_read(&flow_cache_genid); + +@@ -272,7 +275,6 @@ static void flow_cache_flush_tasklet(unsigned long data) + complete(&info->completion); + } + +-static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); + static void flow_cache_flush_per_cpu(void *data) + { + struct flow_flush_info *info = data; +@@ -280,8 +282,7 @@ static void flow_cache_flush_per_cpu(void *data) + struct tasklet_struct *tasklet; + + cpu = 
smp_processor_id(); +- +- tasklet = flow_flush_tasklet(cpu); ++ tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet->data = (unsigned long)info; + tasklet_schedule(tasklet); + } +@@ -294,6 +295,7 @@ void flow_cache_flush(void) + /* Don't want cpus going down or up during this. */ + get_online_cpus(); + mutex_lock(&flow_flush_sem); ++ info.cache = &flow_cache_global; + atomic_set(&info.cpuleft, num_online_cpus()); + init_completion(&info.completion); + +@@ -307,62 +309,75 @@ void flow_cache_flush(void) + put_online_cpus(); + } + +-static void __init flow_cache_cpu_prepare(int cpu) ++static void __init flow_cache_cpu_prepare(struct flow_cache *fc, ++ struct flow_cache_percpu *fcp) + { +- struct tasklet_struct *tasklet; +- unsigned long order; +- +- for (order = 0; +- (PAGE_SIZE << order) < +- (sizeof(struct flow_cache_entry *)*flow_hash_size); +- order++) +- /* NOTHING */; +- +- flow_table(cpu) = (struct flow_cache_entry **) +- __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); +- if (!flow_table(cpu)) +- panic("NET: failed to allocate flow cache order %lu\n", order); +- +- flow_hash_rnd_recalc(cpu) = 1; +- flow_count(cpu) = 0; +- +- tasklet = flow_flush_tasklet(cpu); +- tasklet_init(tasklet, flow_cache_flush_tasklet, 0); ++ fcp->hash_table = (struct flow_cache_entry **) ++ __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); ++ if (!fcp->hash_table) ++ panic("NET: failed to allocate flow cache order %lu\n", fc->order); ++ ++ fcp->hash_rnd_recalc = 1; ++ fcp->hash_count = 0; ++ tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + } + + static int flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) + { ++ struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); ++ int cpu = (unsigned long) hcpu; ++ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); ++ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) +- __flow_cache_shrink((unsigned long)hcpu, 0); ++ 
__flow_cache_shrink(fc, fcp, 0); + return NOTIFY_OK; + } + +-static int __init flow_cache_init(void) ++static int flow_cache_init(struct flow_cache *fc) + { ++ unsigned long order; + int i; + +- flow_cachep = kmem_cache_create("flow_cache", +- sizeof(struct flow_cache_entry), +- 0, SLAB_PANIC, +- NULL); +- flow_hash_shift = 10; +- flow_lwm = 2 * flow_hash_size; +- flow_hwm = 4 * flow_hash_size; ++ fc->hash_shift = 10; ++ fc->low_watermark = 2 * flow_cache_hash_size(fc); ++ fc->high_watermark = 4 * flow_cache_hash_size(fc); ++ ++ for (order = 0; ++ (PAGE_SIZE << order) < ++ (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); ++ order++) ++ /* NOTHING */; ++ fc->order = order; ++ fc->percpu = alloc_percpu(struct flow_cache_percpu); + +- setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); +- flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; +- add_timer(&flow_hash_rnd_timer); ++ setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, ++ (unsigned long) fc); ++ fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; ++ add_timer(&fc->rnd_timer); + + for_each_possible_cpu(i) +- flow_cache_cpu_prepare(i); ++ flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i)); ++ ++ fc->hotcpu_notifier = (struct notifier_block){ ++ .notifier_call = flow_cache_cpu, ++ }; ++ register_hotcpu_notifier(&fc->hotcpu_notifier); + +- hotcpu_notifier(flow_cache_cpu, 0); + return 0; + } + +-module_init(flow_cache_init); ++static int __init flow_cache_init_global(void) ++{ ++ flow_cachep = kmem_cache_create("flow_cache", ++ sizeof(struct flow_cache_entry), ++ 0, SLAB_PANIC, NULL); ++ ++ return flow_cache_init(&flow_cache_global); ++} ++ ++module_init(flow_cache_init_global); + + EXPORT_SYMBOL(flow_cache_genid); + EXPORT_SYMBOL(flow_cache_lookup); +-- +1.7.0.2 + diff --git a/main/linux-pae/0014-flow-virtualize-flow-cache-entry-methods.patch b/main/linux-pae/0014-flow-virtualize-flow-cache-entry-methods.patch new file mode 100644 index 00000000..5c4a9ea5 --- /dev/null 
+++ b/main/linux-pae/0014-flow-virtualize-flow-cache-entry-methods.patch @@ -0,0 +1,513 @@ +From d56cd1c538e5448fe43acc69991aa842f382a622 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 7 Apr 2010 00:30:04 +0000 +Subject: [PATCH 14/18] flow: virtualize flow cache entry methods + +This allows to validate the cached object before returning it. +It also allows to destruct object properly, if the last reference +was held in flow cache. This is also a preparation for caching +bundles in the flow cache. + +In return for virtualizing the methods, we save on: +- not having to regenerate the whole flow cache on policy removal: + each flow matching a killed policy gets refreshed as the getter + function notices it smartly. +- we do not have to call flow_cache_flush from policy gc, since the + flow cache now properly deletes the object if it had any references + +Signed-off-by: Timo Teras +Acked-by: Herbert Xu +Signed-off-by: David S. Miller +(backported from commit fe1a5f031e76bd8761a7803d75b95ee96e84a574) +--- + include/net/flow.h | 23 +++++++-- + include/net/xfrm.h | 3 + + net/core/flow.c | 128 +++++++++++++++++++++++++---------------------- + net/xfrm/xfrm_policy.c | 111 ++++++++++++++++++++++++++++-------------- + 4 files changed, 164 insertions(+), 101 deletions(-) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 809970b..bb08692 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -86,11 +86,26 @@ struct flowi { + + struct net; + struct sock; +-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family, +- u8 dir, void **objp, atomic_t **obj_refp); ++struct flow_cache_ops; ++ ++struct flow_cache_object { ++ const struct flow_cache_ops *ops; ++}; ++ ++struct flow_cache_ops { ++ struct flow_cache_object *(*get)(struct flow_cache_object *); ++ int (*check)(struct flow_cache_object *); ++ void (*delete)(struct flow_cache_object *); ++}; ++ ++typedef struct flow_cache_object *(*flow_resolve_t)( ++ struct net
*net, struct flowi *key, u16 family, ++ u8 dir, struct flow_cache_object *oldobj, void *ctx); ++ ++extern struct flow_cache_object *flow_cache_lookup( ++ struct net *net, struct flowi *key, u16 family, ++ u8 dir, flow_resolve_t resolver, void *ctx); + +-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, +- u8 dir, flow_resolve_t resolver); + extern void flow_cache_flush(void); + extern atomic_t flow_cache_genid; + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 6960be2..6023a48 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -19,6 +19,8 @@ + #include + #include + #include ++#include ++ + #ifdef CONFIG_XFRM_STATISTICS + #include + #endif +@@ -482,6 +484,7 @@ struct xfrm_policy + atomic_t refcnt; + struct timer_list timer; + ++ struct flow_cache_object flo; + u32 priority; + u32 index; + struct xfrm_selector selector; +diff --git a/net/core/flow.c b/net/core/flow.c +index 1d27ca6..521df52 100644 +--- a/net/core/flow.c ++++ b/net/core/flow.c +@@ -26,17 +26,16 @@ + #include + + struct flow_cache_entry { +- struct flow_cache_entry *next; +- u16 family; +- u8 dir; +- u32 genid; +- struct flowi key; +- void *object; +- atomic_t *object_ref; ++ struct flow_cache_entry *next; ++ u16 family; ++ u8 dir; ++ u32 genid; ++ struct flowi key; ++ struct flow_cache_object *object; + }; + + struct flow_cache_percpu { +- struct flow_cache_entry ** hash_table; ++ struct flow_cache_entry **hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; +@@ -44,7 +43,7 @@ struct flow_cache_percpu { + }; + + struct flow_flush_info { +- struct flow_cache * cache; ++ struct flow_cache *cache; + atomic_t cpuleft; + struct completion completion; + }; +@@ -52,7 +51,7 @@ struct flow_flush_info { + struct flow_cache { + u32 hash_shift; + unsigned long order; +- struct flow_cache_percpu * percpu; ++ struct flow_cache_percpu *percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; +@@ -78,12 +77,21 @@ 
static void flow_cache_new_hashrnd(unsigned long arg) + add_timer(&fc->rnd_timer); + } + ++static int flow_entry_valid(struct flow_cache_entry *fle) ++{ ++ if (atomic_read(&flow_cache_genid) != fle->genid) ++ return 0; ++ if (fle->object && !fle->object->ops->check(fle->object)) ++ return 0; ++ return 1; ++} ++ + static void flow_entry_kill(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flow_cache_entry *fle) + { + if (fle->object) +- atomic_dec(fle->object_ref); ++ fle->object->ops->delete(fle->object); + kmem_cache_free(flow_cachep, fle); + fcp->hash_count--; + } +@@ -96,16 +104,18 @@ static void __flow_cache_shrink(struct flow_cache *fc, + int i; + + for (i = 0; i < flow_cache_hash_size(fc); i++) { +- int k = 0; ++ int saved = 0; + + flp = &fcp->hash_table[i]; +- while ((fle = *flp) != NULL && k < shrink_to) { +- k++; +- flp = &fle->next; +- } + while ((fle = *flp) != NULL) { +- *flp = fle->next; +- flow_entry_kill(fc, fcp, fle); ++ if (saved < shrink_to && ++ flow_entry_valid(fle)) { ++ saved++; ++ flp = &fle->next; ++ } else { ++ *flp = fle->next; ++ flow_entry_kill(fc, fcp, fle); ++ } + } + } + } +@@ -166,18 +176,21 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) + return 0; + } + +-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, +- flow_resolve_t resolver) ++struct flow_cache_object * ++flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, ++ flow_resolve_t resolver, void *ctx) + { + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle, **head; ++ struct flow_cache_object *flo; + unsigned int hash; + + local_bh_disable(); + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + + fle = NULL; ++ flo = NULL; + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. 
--RR */ + if (!fcp->hash_table) +@@ -185,27 +198,17 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); +- hash = flow_hash_code(fc, fcp, key); + ++ hash = flow_hash_code(fc, fcp, key); + head = &fcp->hash_table[hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && +- flow_key_compare(key, &fle->key) == 0) { +- if (fle->genid == atomic_read(&flow_cache_genid)) { +- void *ret = fle->object; +- +- if (ret) +- atomic_inc(fle->object_ref); +- local_bh_enable(); +- +- return ret; +- } ++ flow_key_compare(key, &fle->key) == 0) + break; +- } + } + +- if (!fle) { ++ if (unlikely(!fle)) { + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); + +@@ -219,33 +222,39 @@ void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + fle->object = NULL; + fcp->hash_count++; + } ++ } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { ++ flo = fle->object; ++ if (!flo) ++ goto ret_object; ++ flo = flo->ops->get(flo); ++ if (flo) ++ goto ret_object; ++ } else if (fle->object) { ++ flo = fle->object; ++ flo->ops->delete(flo); ++ fle->object = NULL; + } + + nocache: +- { +- int err; +- void *obj; +- atomic_t *obj_ref; +- +- err = resolver(net, key, family, dir, &obj, &obj_ref); +- +- if (fle && !err) { +- fle->genid = atomic_read(&flow_cache_genid); +- +- if (fle->object) +- atomic_dec(fle->object_ref); +- +- fle->object = obj; +- fle->object_ref = obj_ref; +- if (obj) +- atomic_inc(fle->object_ref); +- } +- local_bh_enable(); +- +- if (err) +- obj = ERR_PTR(err); +- return obj; ++ flo = NULL; ++ if (fle) { ++ flo = fle->object; ++ fle->object = NULL; ++ } ++ flo = resolver(net, key, family, dir, flo, ctx); ++ if (fle) { ++ fle->genid = atomic_read(&flow_cache_genid); ++ if (!IS_ERR(flo)) ++ fle->object = flo; ++ else ++ fle->genid--; ++ } else { ++ if (flo && !IS_ERR(flo)) ++ 
flo->ops->delete(flo); + } ++ret_object: ++ local_bh_enable(); ++ return flo; + } + + static void flow_cache_flush_tasklet(unsigned long data) +@@ -261,13 +270,12 @@ static void flow_cache_flush_tasklet(unsigned long data) + + fle = fcp->hash_table[i]; + for (; fle; fle = fle->next) { +- unsigned genid = atomic_read(&flow_cache_genid); +- +- if (!fle->object || fle->genid == genid) ++ if (flow_entry_valid(fle)) + continue; + ++ if (fle->object) ++ fle->object->ops->delete(fle->object); + fle->object = NULL; +- atomic_dec(fle->object_ref); + } + } + +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index 110184f..d1eb2b5 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -216,6 +216,35 @@ expired: + xfrm_pol_put(xp); + } + ++static struct flow_cache_object *xfrm_policy_flo_get(struct flow_cache_object *flo) ++{ ++ struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); ++ ++ if (unlikely(pol->walk.dead)) ++ flo = NULL; ++ else ++ xfrm_pol_hold(pol); ++ ++ return flo; ++} ++ ++static int xfrm_policy_flo_check(struct flow_cache_object *flo) ++{ ++ struct xfrm_policy *pol = container_of(flo, struct xfrm_policy, flo); ++ ++ return !pol->walk.dead; ++} ++ ++static void xfrm_policy_flo_delete(struct flow_cache_object *flo) ++{ ++ xfrm_pol_put(container_of(flo, struct xfrm_policy, flo)); ++} ++ ++static const struct flow_cache_ops xfrm_policy_fc_ops = { ++ .get = xfrm_policy_flo_get, ++ .check = xfrm_policy_flo_check, ++ .delete = xfrm_policy_flo_delete, ++}; + + /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 + * SPD calls. 
+@@ -236,6 +265,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) + atomic_set(&policy->refcnt, 1); + setup_timer(&policy->timer, xfrm_policy_timer, + (unsigned long)policy); ++ policy->flo.ops = &xfrm_policy_fc_ops; + } + return policy; + } +@@ -269,9 +299,6 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy) + if (del_timer(&policy->timer)) + atomic_dec(&policy->refcnt); + +- if (atomic_read(&policy->refcnt) > 1) +- flow_cache_flush(); +- + xfrm_pol_put(policy); + } + +@@ -658,10 +685,8 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u8 type, int dir, + } + write_unlock_bh(&xfrm_policy_lock); + +- if (ret && delete) { +- atomic_inc(&flow_cache_genid); ++ if (ret && delete) + xfrm_policy_kill(ret); +- } + return ret; + } + EXPORT_SYMBOL(xfrm_policy_bysel_ctx); +@@ -699,10 +724,8 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u8 type, int dir, u32 id, + } + write_unlock_bh(&xfrm_policy_lock); + +- if (ret && delete) { +- atomic_inc(&flow_cache_genid); ++ if (ret && delete) + xfrm_policy_kill(ret); +- } + return ret; + } + EXPORT_SYMBOL(xfrm_policy_byid); +@@ -967,32 +990,35 @@ fail: + return ret; + } + +-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, +- u8 dir, void **objp, atomic_t **obj_refp) ++static struct flow_cache_object * ++xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, ++ u8 dir, struct flow_cache_object *old_obj, void *ctx) + { + struct xfrm_policy *pol; +- int err = 0; ++ ++ if (old_obj) ++ xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); + + #ifdef CONFIG_XFRM_SUB_POLICY + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); +- if (IS_ERR(pol)) { +- err = PTR_ERR(pol); +- pol = NULL; +- } +- if (pol || err) +- goto end; ++ if (IS_ERR(pol)) ++ return ERR_CAST(pol); ++ if (pol) ++ goto found; + #endif + pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); +- if (IS_ERR(pol)) { +- err = 
PTR_ERR(pol); +- pol = NULL; +- } +-#ifdef CONFIG_XFRM_SUB_POLICY +-end: +-#endif +- if ((*objp = (void *) pol) != NULL) +- *obj_refp = &pol->refcnt; +- return err; ++ if (IS_ERR(pol)) ++ return ERR_CAST(pol); ++ if (pol) ++ goto found; ++ return NULL; ++ ++found: ++ /* Resolver returns two references: ++ * one for cache and one for caller of flow_cache_lookup() */ ++ xfrm_pol_hold(pol); ++ ++ return &pol->flo; + } + + static inline int policy_to_flow_dir(int dir) +@@ -1077,8 +1103,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) + pol = __xfrm_policy_unlink(pol, dir); + write_unlock_bh(&xfrm_policy_lock); + if (pol) { +- if (dir < XFRM_POLICY_MAX) +- atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + return 0; + } +@@ -1549,18 +1573,24 @@ restart: + } + + if (!policy) { ++ struct flow_cache_object *flo; ++ + /* To accelerate a bit... */ + if ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT]) + goto nopol; + +- policy = flow_cache_lookup(net, fl, dst_orig->ops->family, +- dir, xfrm_policy_lookup); +- err = PTR_ERR(policy); +- if (IS_ERR(policy)) { ++ flo = flow_cache_lookup(net, fl, dst_orig->ops->family, ++ dir, xfrm_policy_lookup, NULL); ++ err = PTR_ERR(flo); ++ if (IS_ERR(flo)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + goto dropdst; + } ++ if (flo) ++ policy = container_of(flo, struct xfrm_policy, flo); ++ else ++ policy = NULL; + } + + if (!policy) +@@ -1910,9 +1940,16 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, + } + } + +- if (!pol) +- pol = flow_cache_lookup(net, &fl, family, fl_dir, +- xfrm_policy_lookup); ++ if (!pol) { ++ struct flow_cache_object *flo; ++ ++ flo = flow_cache_lookup(net, &fl, family, fl_dir, ++ xfrm_policy_lookup, NULL); ++ if (flo == NULL || IS_ERR(flo)) ++ pol = ERR_CAST(flo); ++ else ++ pol = container_of(flo, struct xfrm_policy, flo); ++ } + + if (IS_ERR(pol)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); +-- +1.7.0.2 + diff --git 
a/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch b/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch new file mode 100644 index 00000000..0d066c84 --- /dev/null +++ b/main/linux-pae/0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch @@ -0,0 +1,1068 @@ +From f89d21648e6dc06db2aeabc8926c270894c41446 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 7 Apr 2010 00:30:05 +0000 +Subject: [PATCH 15/18] xfrm: cache bundles instead of policies for outgoing flows + +__xfrm_lookup() is called for each packet transmitted out of +system. The xfrm_find_bundle() does a linear search which can +kill system performance depending on how many bundles are +required per policy. + +This modifies __xfrm_lookup() to store bundles directly in +the flow cache. If we did not get a hit, we just create a new +bundle instead of doing slow search. This means that we can now +get multiple xfrm_dst's for same flow (on per-cpu basis). + +Signed-off-by: Timo Teras +Signed-off-by: David S. 
Miller +(backported from commit 80c802f3073e84c956846e921e8a0b02dfa3755f) +--- + include/net/xfrm.h | 10 +- + net/ipv4/xfrm4_policy.c | 22 -- + net/ipv6/xfrm6_policy.c | 31 -- + net/xfrm/xfrm_policy.c | 710 +++++++++++++++++++++++++---------------------- + 4 files changed, 383 insertions(+), 390 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index 6023a48..d51ef61 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -266,7 +266,6 @@ struct xfrm_policy_afinfo { + xfrm_address_t *saddr, + xfrm_address_t *daddr); + int (*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr); +- struct dst_entry *(*find_bundle)(struct flowi *fl, struct xfrm_policy *policy); + void (*decode_session)(struct sk_buff *skb, + struct flowi *fl, + int reverse); +@@ -485,12 +484,12 @@ struct xfrm_policy + struct timer_list timer; + + struct flow_cache_object flo; ++ atomic_t genid; + u32 priority; + u32 index; + struct xfrm_selector selector; + struct xfrm_lifetime_cfg lft; + struct xfrm_lifetime_cur curlft; +- struct dst_entry *bundles; + struct xfrm_policy_walk_entry walk; + u8 type; + u8 action; +@@ -883,11 +882,15 @@ struct xfrm_dst + struct rt6_info rt6; + } u; + struct dst_entry *route; ++ struct flow_cache_object flo; ++ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; ++ int num_pols, num_xfrms; + #ifdef CONFIG_XFRM_SUB_POLICY + struct flowi *origin; + struct xfrm_selector *partner; + #endif +- u32 genid; ++ u32 xfrm_genid; ++ u32 policy_genid; + u32 route_mtu_cached; + u32 child_mtu_cached; + u32 route_cookie; +@@ -897,6 +900,7 @@ struct xfrm_dst + #ifdef CONFIG_XFRM + static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) + { ++ xfrm_pols_put(xdst->pols, xdst->num_pols); + dst_release(xdst->route); + if (likely(xdst->u.dst.xfrm)) + xfrm_state_put(xdst->u.dst.xfrm); +diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c +index 7009886..651a3e7 100644 +--- a/net/ipv4/xfrm4_policy.c ++++ b/net/ipv4/xfrm4_policy.c +@@ 
-60,27 +60,6 @@ static int xfrm4_get_saddr(struct net *net, + return 0; + } + +-static struct dst_entry * +-__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +-{ +- struct dst_entry *dst; +- +- read_lock_bh(&policy->lock); +- for (dst = policy->bundles; dst; dst = dst->next) { +- struct xfrm_dst *xdst = (struct xfrm_dst *)dst; +- if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ +- xdst->u.rt.fl.fl4_dst == fl->fl4_dst && +- xdst->u.rt.fl.fl4_src == fl->fl4_src && +- xdst->u.rt.fl.fl4_tos == fl->fl4_tos && +- xfrm_bundle_ok(policy, xdst, fl, AF_INET, 0)) { +- dst_clone(dst); +- break; +- } +- } +- read_unlock_bh(&policy->lock); +- return dst; +-} +- + static int xfrm4_get_tos(struct flowi *fl) + { + return fl->fl4_tos; +@@ -258,7 +237,6 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { + .dst_ops = &xfrm4_dst_ops, + .dst_lookup = xfrm4_dst_lookup, + .get_saddr = xfrm4_get_saddr, +- .find_bundle = __xfrm4_find_bundle, + .decode_session = _decode_session4, + .get_tos = xfrm4_get_tos, + .init_path = xfrm4_init_path, +diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c +index 3f89ab7..fb2a5b7 100644 +--- a/net/ipv6/xfrm6_policy.c ++++ b/net/ipv6/xfrm6_policy.c +@@ -68,36 +68,6 @@ static int xfrm6_get_saddr(struct net *net, + return 0; + } + +-static struct dst_entry * +-__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +-{ +- struct dst_entry *dst; +- +- /* Still not clear if we should set fl->fl6_{src,dst}... 
*/ +- read_lock_bh(&policy->lock); +- for (dst = policy->bundles; dst; dst = dst->next) { +- struct xfrm_dst *xdst = (struct xfrm_dst*)dst; +- struct in6_addr fl_dst_prefix, fl_src_prefix; +- +- ipv6_addr_prefix(&fl_dst_prefix, +- &fl->fl6_dst, +- xdst->u.rt6.rt6i_dst.plen); +- ipv6_addr_prefix(&fl_src_prefix, +- &fl->fl6_src, +- xdst->u.rt6.rt6i_src.plen); +- if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && +- ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && +- xfrm_bundle_ok(policy, xdst, fl, AF_INET6, +- (xdst->u.rt6.rt6i_dst.plen != 128 || +- xdst->u.rt6.rt6i_src.plen != 128))) { +- dst_clone(dst); +- break; +- } +- } +- read_unlock_bh(&policy->lock); +- return dst; +-} +- + static int xfrm6_get_tos(struct flowi *fl) + { + return 0; +@@ -290,7 +260,6 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { + .dst_ops = &xfrm6_dst_ops, + .dst_lookup = xfrm6_dst_lookup, + .get_saddr = xfrm6_get_saddr, +- .find_bundle = __xfrm6_find_bundle, + .decode_session = _decode_session6, + .get_tos = xfrm6_get_tos, + .init_path = xfrm6_init_path, +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index d1eb2b5..0379d82 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -37,6 +37,8 @@ + DEFINE_MUTEX(xfrm_cfg_mutex); + EXPORT_SYMBOL(xfrm_cfg_mutex); + ++static DEFINE_SPINLOCK(xfrm_policy_sk_bundle_lock); ++static struct dst_entry *xfrm_policy_sk_bundles; + static DEFINE_RWLOCK(xfrm_policy_lock); + + static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); +@@ -50,6 +52,7 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); + static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); + static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); + static void xfrm_init_pmtu(struct dst_entry *dst); ++static int stale_bundle(struct dst_entry *dst); + + static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, + int dir); +@@ -277,8 +280,6 @@ void xfrm_policy_destroy(struct 
xfrm_policy *policy) + { + BUG_ON(!policy->walk.dead); + +- BUG_ON(policy->bundles); +- + if (del_timer(&policy->timer)) + BUG(); + +@@ -289,12 +290,7 @@ EXPORT_SYMBOL(xfrm_policy_destroy); + + static void xfrm_policy_gc_kill(struct xfrm_policy *policy) + { +- struct dst_entry *dst; +- +- while ((dst = policy->bundles) != NULL) { +- policy->bundles = dst->next; +- dst_free(dst); +- } ++ atomic_inc(&policy->genid); + + if (del_timer(&policy->timer)) + atomic_dec(&policy->refcnt); +@@ -572,7 +568,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) + struct xfrm_policy *delpol; + struct hlist_head *chain; + struct hlist_node *entry, *newpos; +- struct dst_entry *gc_list; + + write_lock_bh(&xfrm_policy_lock); + chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); +@@ -620,34 +615,6 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) + else if (xfrm_bydst_should_resize(net, dir, NULL)) + schedule_work(&net->xfrm.policy_hash_work); + +- read_lock_bh(&xfrm_policy_lock); +- gc_list = NULL; +- entry = &policy->bydst; +- hlist_for_each_entry_continue(policy, entry, bydst) { +- struct dst_entry *dst; +- +- write_lock(&policy->lock); +- dst = policy->bundles; +- if (dst) { +- struct dst_entry *tail = dst; +- while (tail->next) +- tail = tail->next; +- tail->next = gc_list; +- gc_list = dst; +- +- policy->bundles = NULL; +- } +- write_unlock(&policy->lock); +- } +- read_unlock_bh(&xfrm_policy_lock); +- +- while (gc_list) { +- struct dst_entry *dst = gc_list; +- +- gc_list = dst->next; +- dst_free(dst); +- } +- + return 0; + } + EXPORT_SYMBOL(xfrm_policy_insert); +@@ -990,6 +957,19 @@ fail: + return ret; + } + ++static struct xfrm_policy * ++__xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir) ++{ ++#ifdef CONFIG_XFRM_SUB_POLICY ++ struct xfrm_policy *pol; ++ ++ pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); ++ if (pol != NULL) ++ return pol; ++#endif ++ return 
xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); ++} ++ + static struct flow_cache_object * + xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, + u8 dir, struct flow_cache_object *old_obj, void *ctx) +@@ -999,21 +979,10 @@ xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, + if (old_obj) + xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo)); + +-#ifdef CONFIG_XFRM_SUB_POLICY +- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); +- if (IS_ERR(pol)) ++ pol = __xfrm_policy_lookup(net, fl, family, dir); ++ if (pol == NULL || IS_ERR(pol)) + return ERR_CAST(pol); +- if (pol) +- goto found; +-#endif +- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); +- if (IS_ERR(pol)) +- return ERR_CAST(pol); +- if (pol) +- goto found; +- return NULL; + +-found: + /* Resolver returns two references: + * one for cache and one for caller of flow_cache_lookup() */ + xfrm_pol_hold(pol); +@@ -1299,18 +1268,6 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, struct flowi *fl, + * still valid. 
+ */ + +-static struct dst_entry * +-xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family) +-{ +- struct dst_entry *x; +- struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); +- if (unlikely(afinfo == NULL)) +- return ERR_PTR(-EINVAL); +- x = afinfo->find_bundle(fl, policy); +- xfrm_policy_put_afinfo(afinfo); +- return x; +-} +- + static inline int xfrm_get_tos(struct flowi *fl, int family) + { + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); +@@ -1326,6 +1283,54 @@ static inline int xfrm_get_tos(struct flowi *fl, int family) + return tos; + } + ++static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo) ++{ ++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); ++ struct dst_entry *dst = &xdst->u.dst; ++ ++ if (xdst->route == NULL) { ++ /* Dummy bundle - if it has xfrms we were not ++ * able to build bundle as template resolution failed. ++ * It means we need to try again resolving. 
*/ ++ if (xdst->num_xfrms > 0) ++ return NULL; ++ } else { ++ /* Real bundle */ ++ if (stale_bundle(dst)) ++ return NULL; ++ } ++ ++ dst_hold(dst); ++ return flo; ++} ++ ++static int xfrm_bundle_flo_check(struct flow_cache_object *flo) ++{ ++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); ++ struct dst_entry *dst = &xdst->u.dst; ++ ++ if (!xdst->route) ++ return 0; ++ if (stale_bundle(dst)) ++ return 0; ++ ++ return 1; ++} ++ ++static void xfrm_bundle_flo_delete(struct flow_cache_object *flo) ++{ ++ struct xfrm_dst *xdst = container_of(flo, struct xfrm_dst, flo); ++ struct dst_entry *dst = &xdst->u.dst; ++ ++ dst_free(dst); ++} ++ ++static const struct flow_cache_ops xfrm_bundle_fc_ops = { ++ .get = xfrm_bundle_flo_get, ++ .check = xfrm_bundle_flo_check, ++ .delete = xfrm_bundle_flo_delete, ++}; ++ + static inline struct xfrm_dst *xfrm_alloc_dst(int family) + { + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); +@@ -1338,6 +1343,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(int family) + + xfrm_policy_put_afinfo(afinfo); + ++ xdst->flo.ops = &xfrm_bundle_fc_ops; ++ + return xdst; + } + +@@ -1375,6 +1382,7 @@ static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + return err; + } + ++ + /* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. 
+ */ +@@ -1437,7 +1445,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, + dst_hold(dst); + + dst1->xfrm = xfrm[i]; +- xdst->genid = xfrm[i]->genid; ++ xdst->xfrm_genid = xfrm[i]->genid; + + dst1->obsolete = -1; + dst1->flags |= DST_HOST; +@@ -1530,7 +1538,186 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) + #endif + } + +-static int stale_bundle(struct dst_entry *dst); ++static int xfrm_expand_policies(struct flowi *fl, u16 family, ++ struct xfrm_policy **pols, ++ int *num_pols, int *num_xfrms) ++{ ++ int i; ++ ++ if (*num_pols == 0 || !pols[0]) { ++ *num_pols = 0; ++ *num_xfrms = 0; ++ return 0; ++ } ++ if (IS_ERR(pols[0])) ++ return PTR_ERR(pols[0]); ++ ++ *num_xfrms = pols[0]->xfrm_nr; ++ ++#ifdef CONFIG_XFRM_SUB_POLICY ++ if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && ++ pols[0]->type != XFRM_POLICY_TYPE_MAIN) { ++ pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), ++ XFRM_POLICY_TYPE_MAIN, ++ fl, family, ++ XFRM_POLICY_OUT); ++ if (pols[1]) { ++ if (IS_ERR(pols[1])) { ++ xfrm_pols_put(pols, *num_pols); ++ return PTR_ERR(pols[1]); ++ } ++ (*num_pols) ++; ++ (*num_xfrms) += pols[1]->xfrm_nr; ++ } ++ } ++#endif ++ for (i = 0; i < *num_pols; i++) { ++ if (pols[i]->action != XFRM_POLICY_ALLOW) { ++ *num_xfrms = -1; ++ break; ++ } ++ } ++ ++ return 0; ++ ++} ++ ++static struct xfrm_dst * ++xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, ++ struct flowi *fl, u16 family, ++ struct dst_entry *dst_orig) ++{ ++ struct net *net = xp_net(pols[0]); ++ struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; ++ struct dst_entry *dst; ++ struct xfrm_dst *xdst; ++ int err; ++ ++ /* Try to instantiate a bundle */ ++ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); ++ if (err < 0) { ++ if (err != -EAGAIN) ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); ++ return ERR_PTR(err); ++ } ++ ++ dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig); ++ if (IS_ERR(dst)) { ++ XFRM_INC_STATS(net, 
LINUX_MIB_XFRMOUTBUNDLEGENERROR); ++ return ERR_CAST(dst); ++ } ++ ++ xdst = (struct xfrm_dst *)dst; ++ xdst->num_xfrms = err; ++ if (num_pols > 1) ++ err = xfrm_dst_update_parent(dst, &pols[1]->selector); ++ else ++ err = xfrm_dst_update_origin(dst, fl); ++ if (unlikely(err)) { ++ dst_free(dst); ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); ++ return ERR_PTR(err); ++ } ++ ++ xdst->num_pols = num_pols; ++ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); ++ xdst->policy_genid = atomic_read(&pols[0]->genid); ++ ++ return xdst; ++} ++ ++static struct flow_cache_object * ++xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir, ++ struct flow_cache_object *oldflo, void *ctx) ++{ ++ struct dst_entry *dst_orig = (struct dst_entry *)ctx; ++ struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; ++ struct xfrm_dst *xdst, *new_xdst; ++ int num_pols = 0, num_xfrms = 0, i, err, pol_dead; ++ ++ /* Check if the policies from old bundle are usable */ ++ xdst = NULL; ++ if (oldflo) { ++ xdst = container_of(oldflo, struct xfrm_dst, flo); ++ num_pols = xdst->num_pols; ++ num_xfrms = xdst->num_xfrms; ++ pol_dead = 0; ++ for (i = 0; i < num_pols; i++) { ++ pols[i] = xdst->pols[i]; ++ pol_dead |= pols[i]->walk.dead; ++ } ++ if (pol_dead) { ++ dst_free(&xdst->u.dst); ++ xdst = NULL; ++ num_pols = 0; ++ num_xfrms = 0; ++ oldflo = NULL; ++ } ++ } ++ ++ /* Resolve policies to use if we couldn't get them from ++ * previous cache entry */ ++ if (xdst == NULL) { ++ num_pols = 1; ++ pols[0] = __xfrm_policy_lookup(net, fl, family, dir); ++ err = xfrm_expand_policies(fl, family, pols, ++ &num_pols, &num_xfrms); ++ if (err < 0) ++ goto inc_error; ++ if (num_pols == 0) ++ return NULL; ++ if (num_xfrms <= 0) ++ goto make_dummy_bundle; ++ } ++ ++ new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, dst_orig); ++ if (IS_ERR(new_xdst)) { ++ err = PTR_ERR(new_xdst); ++ if (err != -EAGAIN) ++ goto error; ++ if (oldflo == NULL) ++ goto 
make_dummy_bundle; ++ dst_hold(&xdst->u.dst); ++ return oldflo; ++ } ++ ++ /* Kill the previous bundle */ ++ if (xdst) { ++ /* The policies were stolen for newly generated bundle */ ++ xdst->num_pols = 0; ++ dst_free(&xdst->u.dst); ++ } ++ ++ /* Flow cache does not have reference, it dst_free()'s, ++ * but we do need to return one reference for original caller */ ++ dst_hold(&new_xdst->u.dst); ++ return &new_xdst->flo; ++ ++make_dummy_bundle: ++ /* We found policies, but there's no bundles to instantiate: ++ * either because the policy blocks, has no transformations or ++ * we could not build template (no xfrm_states).*/ ++ xdst = xfrm_alloc_dst(family); ++ if (IS_ERR(xdst)) { ++ xfrm_pols_put(pols, num_pols); ++ return ERR_CAST(xdst); ++ } ++ xdst->num_pols = num_pols; ++ xdst->num_xfrms = num_xfrms; ++ memcpy(xdst->pols, pols, sizeof(struct xfrm_policy*) * num_pols); ++ ++ dst_hold(&xdst->u.dst); ++ return &xdst->flo; ++ ++inc_error: ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); ++error: ++ if (xdst != NULL) ++ dst_free(&xdst->u.dst); ++ else ++ xfrm_pols_put(pols, num_pols); ++ return ERR_PTR(err); ++} + + /* Main function: finds/creates a bundle for given flow. 
+ * +@@ -1540,248 +1727,152 @@ static int stale_bundle(struct dst_entry *dst); + int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, + struct sock *sk, int flags) + { +- struct xfrm_policy *policy; + struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; +- int npols; +- int pol_dead; +- int xfrm_nr; +- int pi; +- struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; +- struct dst_entry *dst, *dst_orig = *dst_p; +- int nx = 0; +- int err; +- u32 genid; +- u16 family; ++ struct flow_cache_object *flo; ++ struct xfrm_dst *xdst; ++ struct dst_entry *dst, *dst_orig = *dst_p, *route; ++ u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); ++ int i, err, num_pols, num_xfrms, drop_pols = 0; + + restart: +- genid = atomic_read(&flow_cache_genid); +- policy = NULL; +- for (pi = 0; pi < ARRAY_SIZE(pols); pi++) +- pols[pi] = NULL; +- npols = 0; +- pol_dead = 0; +- xfrm_nr = 0; ++ dst = NULL; ++ xdst = NULL; ++ route = NULL; + + if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { +- policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); +- err = PTR_ERR(policy); +- if (IS_ERR(policy)) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); ++ num_pols = 1; ++ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); ++ err = xfrm_expand_policies(fl, family, pols, ++ &num_pols, &num_xfrms); ++ if (err < 0) + goto dropdst; ++ ++ if (num_pols) { ++ if (num_xfrms <= 0) { ++ drop_pols = num_pols; ++ goto no_transform; ++ } ++ ++ xdst = xfrm_resolve_and_create_bundle( ++ pols, num_pols, fl, ++ family, dst_orig); ++ if (IS_ERR(xdst)) { ++ xfrm_pols_put(pols, num_pols); ++ err = PTR_ERR(xdst); ++ goto dropdst; ++ } ++ ++ spin_lock_bh(&xfrm_policy_sk_bundle_lock); ++ xdst->u.dst.next = xfrm_policy_sk_bundles; ++ xfrm_policy_sk_bundles = &xdst->u.dst; ++ spin_unlock_bh(&xfrm_policy_sk_bundle_lock); ++ ++ route = xdst->route; + } + } + +- if (!policy) { +- struct flow_cache_object *flo; +- ++ if (xdst == NULL) { + /* To accelerate a bit... 
*/ + if ((dst_orig->flags & DST_NOXFRM) || + !net->xfrm.policy_count[XFRM_POLICY_OUT]) + goto nopol; + +- flo = flow_cache_lookup(net, fl, dst_orig->ops->family, +- dir, xfrm_policy_lookup, NULL); +- err = PTR_ERR(flo); ++ flo = flow_cache_lookup(net, fl, family, dir, ++ xfrm_bundle_lookup, dst_orig); ++ if (flo == NULL) ++ goto nopol; + if (IS_ERR(flo)) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); ++ err = PTR_ERR(flo); + goto dropdst; + } +- if (flo) +- policy = container_of(flo, struct xfrm_policy, flo); +- else +- policy = NULL; ++ xdst = container_of(flo, struct xfrm_dst, flo); ++ ++ num_pols = xdst->num_pols; ++ num_xfrms = xdst->num_xfrms; ++ memcpy(pols, xdst->pols, sizeof(struct xfrm_policy*) * num_pols); ++ route = xdst->route; ++ } ++ ++ dst = &xdst->u.dst; ++ if (route == NULL && num_xfrms > 0) { ++ /* The only case when xfrm_bundle_lookup() returns a ++ * bundle with null route, is when the template could ++ * not be resolved. It means policies are there, but ++ * bundle could not be created, since we don't yet ++ * have the xfrm_state's. We need to wait for KM to ++ * negotiate new SA's or bail out with error.*/ ++ if (net->xfrm.sysctl_larval_drop) { ++ /* EREMOTE tells the caller to generate ++ * a one-shot blackhole route. 
*/ ++ dst_release(dst); ++ xfrm_pols_put(pols, num_pols); ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); ++ return -EREMOTE; ++ } ++ if (flags & XFRM_LOOKUP_WAIT) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(&net->xfrm.km_waitq, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&net->xfrm.km_waitq, &wait); ++ ++ if (!signal_pending(current)) { ++ dst_release(dst); ++ goto restart; ++ } ++ ++ err = -ERESTART; ++ } else ++ err = -EAGAIN; ++ ++ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); ++ goto error; + } + +- if (!policy) ++no_transform: ++ if (num_pols == 0) + goto nopol; + +- family = dst_orig->ops->family; +- pols[0] = policy; +- npols ++; +- xfrm_nr += pols[0]->xfrm_nr; +- +- err = -ENOENT; +- if ((flags & XFRM_LOOKUP_ICMP) && !(policy->flags & XFRM_POLICY_ICMP)) ++ if ((flags & XFRM_LOOKUP_ICMP) && ++ !(pols[0]->flags & XFRM_POLICY_ICMP)) { ++ err = -ENOENT; + goto error; ++ } + +- policy->curlft.use_time = get_seconds(); ++ for (i = 0; i < num_pols; i++) ++ pols[i]->curlft.use_time = get_seconds(); + +- switch (policy->action) { +- default: +- case XFRM_POLICY_BLOCK: ++ if (num_xfrms < 0) { + /* Prohibit the flow */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); + err = -EPERM; + goto error; +- +- case XFRM_POLICY_ALLOW: +-#ifndef CONFIG_XFRM_SUB_POLICY +- if (policy->xfrm_nr == 0) { +- /* Flow passes not transformed. */ +- xfrm_pol_put(policy); +- return 0; +- } +-#endif +- +- /* Try to find matching bundle. +- * +- * LATER: help from flow cache. It is optional, this +- * is required only for output policy. 
+- */ +- dst = xfrm_find_bundle(fl, policy, family); +- if (IS_ERR(dst)) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); +- err = PTR_ERR(dst); +- goto error; +- } +- +- if (dst) +- break; +- +-#ifdef CONFIG_XFRM_SUB_POLICY +- if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { +- pols[1] = xfrm_policy_lookup_bytype(net, +- XFRM_POLICY_TYPE_MAIN, +- fl, family, +- XFRM_POLICY_OUT); +- if (pols[1]) { +- if (IS_ERR(pols[1])) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); +- err = PTR_ERR(pols[1]); +- goto error; +- } +- if (pols[1]->action == XFRM_POLICY_BLOCK) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); +- err = -EPERM; +- goto error; +- } +- npols ++; +- xfrm_nr += pols[1]->xfrm_nr; +- } +- } +- +- /* +- * Because neither flowi nor bundle information knows about +- * transformation template size. On more than one policy usage +- * we can realize whether all of them is bypass or not after +- * they are searched. See above not-transformed bypass +- * is surrounded by non-sub policy configuration, too. +- */ +- if (xfrm_nr == 0) { +- /* Flow passes not transformed. */ +- xfrm_pols_put(pols, npols); +- return 0; +- } +- +-#endif +- nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); +- +- if (unlikely(nx<0)) { +- err = nx; +- if (err == -EAGAIN && net->xfrm.sysctl_larval_drop) { +- /* EREMOTE tells the caller to generate +- * a one-shot blackhole route. 
+- */ +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); +- xfrm_pol_put(policy); +- return -EREMOTE; +- } +- if (err == -EAGAIN && (flags & XFRM_LOOKUP_WAIT)) { +- DECLARE_WAITQUEUE(wait, current); +- +- add_wait_queue(&net->xfrm.km_waitq, &wait); +- set_current_state(TASK_INTERRUPTIBLE); +- schedule(); +- set_current_state(TASK_RUNNING); +- remove_wait_queue(&net->xfrm.km_waitq, &wait); +- +- nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); +- +- if (nx == -EAGAIN && signal_pending(current)) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); +- err = -ERESTART; +- goto error; +- } +- if (nx == -EAGAIN || +- genid != atomic_read(&flow_cache_genid)) { +- xfrm_pols_put(pols, npols); +- goto restart; +- } +- err = nx; +- } +- if (err < 0) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); +- goto error; +- } +- } +- if (nx == 0) { +- /* Flow passes not transformed. */ +- xfrm_pols_put(pols, npols); +- return 0; +- } +- +- dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig); +- err = PTR_ERR(dst); +- if (IS_ERR(dst)) { +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); +- goto error; +- } +- +- for (pi = 0; pi < npols; pi++) +- pol_dead |= pols[pi]->walk.dead; +- +- write_lock_bh(&policy->lock); +- if (unlikely(pol_dead || stale_bundle(dst))) { +- /* Wow! While we worked on resolving, this +- * policy has gone. Retry. It is not paranoia, +- * we just cannot enlist new bundle to dead object. +- * We can't enlist stable bundles either. 
+- */ +- write_unlock_bh(&policy->lock); +- dst_free(dst); +- +- if (pol_dead) +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLDEAD); +- else +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); +- err = -EHOSTUNREACH; +- goto error; +- } +- +- if (npols > 1) +- err = xfrm_dst_update_parent(dst, &pols[1]->selector); +- else +- err = xfrm_dst_update_origin(dst, fl); +- if (unlikely(err)) { +- write_unlock_bh(&policy->lock); +- dst_free(dst); +- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); +- goto error; +- } +- +- dst->next = policy->bundles; +- policy->bundles = dst; +- dst_hold(dst); +- write_unlock_bh(&policy->lock); ++ } else if (num_xfrms > 0) { ++ /* Flow transformed */ ++ *dst_p = dst; ++ dst_release(dst_orig); ++ } else { ++ /* Flow passes untransformed */ ++ dst_release(dst); + } +- *dst_p = dst; +- dst_release(dst_orig); +- xfrm_pols_put(pols, npols); ++ok: ++ xfrm_pols_put(pols, drop_pols); + return 0; + ++nopol: ++ if (!(flags & XFRM_LOOKUP_ICMP)) ++ goto ok; ++ err = -ENOENT; + error: +- xfrm_pols_put(pols, npols); ++ dst_release(dst); + dropdst: + dst_release(dst_orig); + *dst_p = NULL; ++ xfrm_pols_put(pols, drop_pols); + return err; +- +-nopol: +- err = -ENOENT; +- if (flags & XFRM_LOOKUP_ICMP) +- goto dropdst; +- return 0; + } + EXPORT_SYMBOL(__xfrm_lookup); + +@@ -2134,71 +2225,24 @@ static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) + return dst; + } + +-static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_entry *), struct dst_entry **gc_list_p) +-{ +- struct dst_entry *dst, **dstp; +- +- write_lock(&pol->lock); +- dstp = &pol->bundles; +- while ((dst=*dstp) != NULL) { +- if (func(dst)) { +- *dstp = dst->next; +- dst->next = *gc_list_p; +- *gc_list_p = dst; +- } else { +- dstp = &dst->next; +- } +- } +- write_unlock(&pol->lock); +-} +- +-static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *)) ++static void __xfrm_garbage_collect(struct net *net) + { +- struct dst_entry 
*gc_list = NULL; +- int dir; ++ struct dst_entry *head, *next; + +- read_lock_bh(&xfrm_policy_lock); +- for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { +- struct xfrm_policy *pol; +- struct hlist_node *entry; +- struct hlist_head *table; +- int i; ++ flow_cache_flush(); + +- hlist_for_each_entry(pol, entry, +- &net->xfrm.policy_inexact[dir], bydst) +- prune_one_bundle(pol, func, &gc_list); ++ spin_lock_bh(&xfrm_policy_sk_bundle_lock); ++ head = xfrm_policy_sk_bundles; ++ xfrm_policy_sk_bundles = NULL; ++ spin_unlock_bh(&xfrm_policy_sk_bundle_lock); + +- table = net->xfrm.policy_bydst[dir].table; +- for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { +- hlist_for_each_entry(pol, entry, table + i, bydst) +- prune_one_bundle(pol, func, &gc_list); +- } +- } +- read_unlock_bh(&xfrm_policy_lock); +- +- while (gc_list) { +- struct dst_entry *dst = gc_list; +- gc_list = dst->next; +- dst_free(dst); ++ while (head) { ++ next = head->next; ++ dst_free(head); ++ head = next; + } + } + +-static int unused_bundle(struct dst_entry *dst) +-{ +- return !atomic_read(&dst->__refcnt); +-} +- +-static void __xfrm_garbage_collect(struct net *net) +-{ +- xfrm_prune_bundles(net, unused_bundle); +-} +- +-static int xfrm_flush_bundles(struct net *net) +-{ +- xfrm_prune_bundles(net, stale_bundle); +- return 0; +-} +- + static void xfrm_init_pmtu(struct dst_entry *dst) + { + do { +@@ -2256,7 +2300,9 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, + return 0; + if (dst->xfrm->km.state != XFRM_STATE_VALID) + return 0; +- if (xdst->genid != dst->xfrm->genid) ++ if (xdst->xfrm_genid != dst->xfrm->genid) ++ return 0; ++ if (xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) + return 0; + + if (strict && fl && +@@ -2383,7 +2429,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void + + switch (event) { + case NETDEV_DOWN: +- xfrm_flush_bundles(dev_net(dev)); ++ __xfrm_garbage_collect(dev_net(dev)); + } + return NOTIFY_DONE; + } 
+@@ -2714,7 +2760,6 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, + struct xfrm_migrate *m, int num_migrate) + { + struct xfrm_migrate *mp; +- struct dst_entry *dst; + int i, j, n = 0; + + write_lock_bh(&pol->lock); +@@ -2739,10 +2784,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, + sizeof(pol->xfrm_vec[i].saddr)); + pol->xfrm_vec[i].encap_family = mp->new_family; + /* flush bundles */ +- while ((dst = pol->bundles) != NULL) { +- pol->bundles = dst->next; +- dst_free(dst); +- } ++ atomic_inc(&pol->genid); + } + } + +-- +1.7.0.2 + diff --git a/main/linux-pae/0016-xfrm-remove-policy-garbage-collection.patch b/main/linux-pae/0016-xfrm-remove-policy-garbage-collection.patch new file mode 100644 index 00000000..4a45c7f4 --- /dev/null +++ b/main/linux-pae/0016-xfrm-remove-policy-garbage-collection.patch @@ -0,0 +1,91 @@ +From 4c53c9239069f48ec9a86f8e596c163b72e8bc4d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 7 Apr 2010 00:30:06 +0000 +Subject: [PATCH 16/18] xfrm: remove policy garbage collection + +Policies are now properly reference counted and destroyed from +all code paths. The delayed gc is just an overhead now and can +be removed. + +Signed-off-by: Timo Teras +Signed-off-by: David S. 
Miller +(cherry picked from commit 285ead175c5dd5075cab5b6c94f35a3e6c0a3ae6) +--- + net/xfrm/xfrm_policy.c | 39 +++++---------------------------------- + 1 files changed, 5 insertions(+), 34 deletions(-) + +diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c +index 0379d82..5606841 100644 +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -46,9 +46,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; + + static struct kmem_cache *xfrm_dst_cache __read_mostly; + +-static HLIST_HEAD(xfrm_policy_gc_list); +-static DEFINE_SPINLOCK(xfrm_policy_gc_lock); +- + static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); + static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); + static void xfrm_init_pmtu(struct dst_entry *dst); +@@ -288,32 +285,6 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) + } + EXPORT_SYMBOL(xfrm_policy_destroy); + +-static void xfrm_policy_gc_kill(struct xfrm_policy *policy) +-{ +- atomic_inc(&policy->genid); +- +- if (del_timer(&policy->timer)) +- atomic_dec(&policy->refcnt); +- +- xfrm_pol_put(policy); +-} +- +-static void xfrm_policy_gc_task(struct work_struct *work) +-{ +- struct xfrm_policy *policy; +- struct hlist_node *entry, *tmp; +- struct hlist_head gc_list; +- +- spin_lock_bh(&xfrm_policy_gc_lock); +- gc_list.first = xfrm_policy_gc_list.first; +- INIT_HLIST_HEAD(&xfrm_policy_gc_list); +- spin_unlock_bh(&xfrm_policy_gc_lock); +- +- hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst) +- xfrm_policy_gc_kill(policy); +-} +-static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); +- + /* Rule must be locked. Release descentant resources, announce + * entry dead. The rule must be unlinked from lists to the moment. 
+ */ +@@ -322,11 +293,12 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) + { + policy->walk.dead = 1; + +- spin_lock_bh(&xfrm_policy_gc_lock); +- hlist_add_head(&policy->bydst, &xfrm_policy_gc_list); +- spin_unlock_bh(&xfrm_policy_gc_lock); ++ atomic_inc(&policy->genid); + +- schedule_work(&xfrm_policy_gc_work); ++ if (del_timer(&policy->timer)) ++ xfrm_pol_put(policy); ++ ++ xfrm_pol_put(policy); + } + + static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; +@@ -2535,7 +2507,6 @@ static void xfrm_policy_fini(struct net *net) + audit_info.sessionid = -1; + audit_info.secid = 0; + xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); +- flush_work(&xfrm_policy_gc_work); + + WARN_ON(!list_empty(&net->xfrm.policy_all)); + +-- +1.7.0.2 + diff --git a/main/linux-pae/0017-flow-delayed-deletion-of-flow-cache-entries.patch b/main/linux-pae/0017-flow-delayed-deletion-of-flow-cache-entries.patch new file mode 100644 index 00000000..7d17d41a --- /dev/null +++ b/main/linux-pae/0017-flow-delayed-deletion-of-flow-cache-entries.patch @@ -0,0 +1,231 @@ +From fede05e99e2d860e97bc877b8b77fb9e63f55cc8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Wed, 7 Apr 2010 00:30:07 +0000 +Subject: [PATCH 17/18] flow: delayed deletion of flow cache entries + +Speed up lookups by freeing flow cache entries later. After +virtualizing flow cache entry operations, the flow cache may now +end up calling policy or bundle destructor which can be slowish. + +As gc_list is more effective with double linked list, the flow cache +is converted to use common hlist and list macroes where appropriate. + +Signed-off-by: Timo Teras +Signed-off-by: David S. 
Miller +(cherry picked from commit 8e4795605d1e1b39113818ad7c147b8a867a1f6a) +--- + net/core/flow.c | 100 ++++++++++++++++++++++++++++++++++++++----------------- + 1 files changed, 69 insertions(+), 31 deletions(-) + +diff --git a/net/core/flow.c b/net/core/flow.c +index 521df52..1619006 100644 +--- a/net/core/flow.c ++++ b/net/core/flow.c +@@ -26,7 +26,10 @@ + #include + + struct flow_cache_entry { +- struct flow_cache_entry *next; ++ union { ++ struct hlist_node hlist; ++ struct list_head gc_list; ++ } u; + u16 family; + u8 dir; + u32 genid; +@@ -35,7 +38,7 @@ struct flow_cache_entry { + }; + + struct flow_cache_percpu { +- struct flow_cache_entry **hash_table; ++ struct hlist_head *hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; +@@ -62,6 +65,9 @@ atomic_t flow_cache_genid = ATOMIC_INIT(0); + static struct flow_cache flow_cache_global; + static struct kmem_cache *flow_cachep; + ++static DEFINE_SPINLOCK(flow_cache_gc_lock); ++static LIST_HEAD(flow_cache_gc_list); ++ + #define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) + #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +@@ -86,38 +92,66 @@ static int flow_entry_valid(struct flow_cache_entry *fle) + return 1; + } + +-static void flow_entry_kill(struct flow_cache *fc, +- struct flow_cache_percpu *fcp, +- struct flow_cache_entry *fle) ++static void flow_entry_kill(struct flow_cache_entry *fle) + { + if (fle->object) + fle->object->ops->delete(fle->object); + kmem_cache_free(flow_cachep, fle); +- fcp->hash_count--; ++} ++ ++static void flow_cache_gc_task(struct work_struct *work) ++{ ++ struct list_head gc_list; ++ struct flow_cache_entry *fce, *n; ++ ++ INIT_LIST_HEAD(&gc_list); ++ spin_lock_bh(&flow_cache_gc_lock); ++ list_splice_tail_init(&flow_cache_gc_list, &gc_list); ++ spin_unlock_bh(&flow_cache_gc_lock); ++ ++ list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) ++ flow_entry_kill(fce); ++} ++static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); ++ ++static void 
flow_cache_queue_garbage(struct flow_cache_percpu *fcp, ++ int deleted, struct list_head *gc_list) ++{ ++ if (deleted) { ++ fcp->hash_count -= deleted; ++ spin_lock_bh(&flow_cache_gc_lock); ++ list_splice_tail(gc_list, &flow_cache_gc_list); ++ spin_unlock_bh(&flow_cache_gc_lock); ++ schedule_work(&flow_cache_gc_work); ++ } + } + + static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) + { +- struct flow_cache_entry *fle, **flp; +- int i; ++ struct flow_cache_entry *fle; ++ struct hlist_node *entry, *tmp; ++ LIST_HEAD(gc_list); ++ int i, deleted = 0; + + for (i = 0; i < flow_cache_hash_size(fc); i++) { + int saved = 0; + +- flp = &fcp->hash_table[i]; +- while ((fle = *flp) != NULL) { ++ hlist_for_each_entry_safe(fle, entry, tmp, ++ &fcp->hash_table[i], u.hlist) { + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; +- flp = &fle->next; + } else { +- *flp = fle->next; +- flow_entry_kill(fc, fcp, fle); ++ deleted++; ++ hlist_del(&fle->u.hlist); ++ list_add_tail(&fle->u.gc_list, &gc_list); + } + } + } ++ ++ flow_cache_queue_garbage(fcp, deleted, &gc_list); + } + + static void flow_cache_shrink(struct flow_cache *fc, +@@ -182,7 +216,8 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + { + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; +- struct flow_cache_entry *fle, **head; ++ struct flow_cache_entry *fle, *tfle; ++ struct hlist_node *entry; + struct flow_cache_object *flo; + unsigned int hash; + +@@ -200,12 +235,13 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + flow_new_hash_rnd(fc, fcp); + + hash = flow_hash_code(fc, fcp, key); +- head = &fcp->hash_table[hash]; +- for (fle = *head; fle; fle = fle->next) { +- if (fle->family == family && +- fle->dir == dir && +- flow_key_compare(key, &fle->key) == 0) ++ hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { ++ if (tfle->family == family && ++ tfle->dir == 
dir && ++ flow_key_compare(key, &tfle->key) == 0) { ++ fle = tfle; + break; ++ } + } + + if (unlikely(!fle)) { +@@ -214,12 +250,11 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, + + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); + if (fle) { +- fle->next = *head; +- *head = fle; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fle->object = NULL; ++ hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); + fcp->hash_count++; + } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { +@@ -262,23 +297,26 @@ static void flow_cache_flush_tasklet(unsigned long data) + struct flow_flush_info *info = (void *)data; + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; +- int i; ++ struct flow_cache_entry *fle; ++ struct hlist_node *entry, *tmp; ++ LIST_HEAD(gc_list); ++ int i, deleted = 0; + + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + for (i = 0; i < flow_cache_hash_size(fc); i++) { +- struct flow_cache_entry *fle; +- +- fle = fcp->hash_table[i]; +- for (; fle; fle = fle->next) { ++ hlist_for_each_entry_safe(fle, entry, tmp, ++ &fcp->hash_table[i], u.hlist) { + if (flow_entry_valid(fle)) + continue; + +- if (fle->object) +- fle->object->ops->delete(fle->object); +- fle->object = NULL; ++ deleted++; ++ hlist_del(&fle->u.hlist); ++ list_add_tail(&fle->u.gc_list, &gc_list); + } + } + ++ flow_cache_queue_garbage(fcp, deleted, &gc_list); ++ + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); + } +@@ -320,7 +358,7 @@ void flow_cache_flush(void) + static void __init flow_cache_cpu_prepare(struct flow_cache *fc, + struct flow_cache_percpu *fcp) + { +- fcp->hash_table = (struct flow_cache_entry **) ++ fcp->hash_table = (struct hlist_head *) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); + if (!fcp->hash_table) + panic("NET: failed to allocate flow cache order %lu\n", fc->order); +@@ -354,7 +392,7 @@ static int flow_cache_init(struct flow_cache 
*fc) + + for (order = 0; + (PAGE_SIZE << order) < +- (sizeof(struct flow_cache_entry *)*flow_cache_hash_size(fc)); ++ (sizeof(struct hlist_head)*flow_cache_hash_size(fc)); + order++) + /* NOTHING */; + fc->order = order; +-- +1.7.0.2 + diff --git a/main/linux-pae/0018-xfrm-Fix-crashes-in-xfrm_lookup.patch b/main/linux-pae/0018-xfrm-Fix-crashes-in-xfrm_lookup.patch new file mode 100644 index 00000000..6f0dc912 --- /dev/null +++ b/main/linux-pae/0018-xfrm-Fix-crashes-in-xfrm_lookup.patch @@ -0,0 +1,46 @@ +From e0c0800740cdf64fe7b121c2ef235c01f1957af0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Timo=20Ter=C3=A4s?= +Date: Thu, 8 Apr 2010 11:27:42 -0700 +Subject: [PATCH 18/18] xfrm: Fix crashes in xfrm_lookup() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Timo Teräs + +Happens because CONFIG_XFRM_SUB_POLICY is not enabled, and one of +the helper functions I used did unexpected things in that case. + +Signed-off-by: David S. Miller +(cherry picked from commit e4077e018b5ead3de9951fc01d8bf12eeeeeefed) +--- + include/net/xfrm.h | 7 ------- + 1 files changed, 0 insertions(+), 7 deletions(-) + +diff --git a/include/net/xfrm.h b/include/net/xfrm.h +index d51ef61..280f46f 100644 +--- a/include/net/xfrm.h ++++ b/include/net/xfrm.h +@@ -738,19 +738,12 @@ static inline void xfrm_pol_put(struct xfrm_policy *policy) + xfrm_policy_destroy(policy); + } + +-#ifdef CONFIG_XFRM_SUB_POLICY + static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) + { + int i; + for (i = npols - 1; i >= 0; --i) + xfrm_pol_put(pols[i]); + } +-#else +-static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) +-{ +- xfrm_pol_put(pols[0]); +-} +-#endif + + extern void __xfrm_state_destroy(struct xfrm_state *); + +-- +1.7.0.2 + diff --git a/main/linux-pae/APKBUILD b/main/linux-pae/APKBUILD index 971800f4..95d20f1a 100644 --- a/main/linux-pae/APKBUILD +++ b/main/linux-pae/APKBUILD @@ -4,7 +4,7 @@ _flavor=pae 
pkgname=linux-${_flavor} pkgver=2.6.32.11 _kernver=2.6.32 -pkgrel=0 +pkgrel=1 pkgdesc="Linux kernel with PAE enabled" url=http://www.kernel.org depends="mkinitfs linux-firmware" @@ -14,12 +14,23 @@ _config=${config:-kernelconfig.${CARCH:-x86}} install= source="ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-$_kernver.tar.bz2 ftp://ftp.kernel.org/pub/linux/kernel/v2.6/patch-$pkgver.bz2 - ip_gre.patch - ip_gre2.patch - arp.patch - xfrm-cache-size-revert.patch - net-git-78f1cd-r8169-fix-broken-register-writes.patch - net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch + 0002-gre-fix-hard-header-destination-address-checking.patch + 0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch + 0004-arp-flush-arp-cache-on-device-change.patch + 0005-r8169-fix-broken-register-writes.patch + 0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch + 0007-r8169-Fix-rtl8169_rx_interrupt.patch + 0008-r8169-clean-up-my-printk-uglyness.patch + 0009-ipsec-Fix-bogus-bundle-flowi.patch + 0010-xfrm-Remove-xfrm_state_genid.patch + 0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch + 0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch + 0013-flow-structurize-flow-cache.patch + 0014-flow-virtualize-flow-cache-entry-methods.patch + 0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch + 0016-xfrm-remove-policy-garbage-collection.patch + 0017-flow-delayed-deletion-of-flow-cache-entries.patch + 0018-xfrm-Fix-crashes-in-xfrm_lookup.patch kernelconfig.x86 " subpackages="$pkgname-dev" @@ -42,7 +53,7 @@ prepare() { mkdir -p "$srcdir"/build cp "$srcdir"/$_config "$srcdir"/build/.config echo "-${_flavor}" > "$srcdir"/linux-$_kernver/localversion-${_flavor} - make -C "$srcdir"/linux-$_kernver O="$srcdir"/build HOSTCC="$CC" \ + make -C "$srcdir"/linux-$_kernver O="$srcdir"/build HOSTCC="${CC:-gcc}" \ silentoldconfig } @@ -55,7 +66,7 @@ menuconfig() { build() { cd "$srcdir"/build - make CC="$CC" || return 1 + make 
CC="${CC:-gcc}" || return 1 } package() { @@ -88,7 +99,7 @@ dev() { # external modules, and create the scripts mkdir -p "$dir" cp "$srcdir"/$_config "$dir"/.config - make -j1 -C "$srcdir"/linux-$_kernver O="$dir" HOSTCC="$CC" \ + make -j1 -C "$srcdir"/linux-$_kernver O="$dir" HOSTCC="${CC:-gcc}" \ silentoldconfig prepare scripts # remove the stuff that poits to real sources. we want 3rd party @@ -121,10 +132,21 @@ dev() { md5sums="260551284ac224c3a43c4adac7df4879 linux-2.6.32.tar.bz2 855c248334a71ef5ca3d8cb89d51334f patch-2.6.32.11.bz2 -3ef822f3a2723b9a80c3f12954457225 ip_gre.patch -13ca9e91700e459da269c957062bbea7 ip_gre2.patch -4c39a161d918e7f274292ecfd168b891 arp.patch -329fcab881425e001d3243caa4648478 xfrm-cache-size-revert.patch -21ed38773d846097b7315e1e0801d87a net-git-78f1cd-r8169-fix-broken-register-writes.patch -962a6dd7c639612fc8bdaeb836388b0b net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch +437317f88ec13ace8d39c31983a41696 0002-gre-fix-hard-header-destination-address-checking.patch +151b29a161178ed39d62a08f21f3484d 0003-ip_gre-include-route-header_len-in-max_headroom-calc.patch +776adeeb5272093574f8836c5037dd7d 0004-arp-flush-arp-cache-on-device-change.patch +afa06334c81f21c20571286a83d3d928 0005-r8169-fix-broken-register-writes.patch +c538c0f735d79fd71b47dde02bf1f790 0006-r8169-offical-fix-for-CVE-2009-4537-overlength-frame.patch +5f8b9a76d95319c5b1aa26b54a42e6b5 0007-r8169-Fix-rtl8169_rx_interrupt.patch +f878c802700e3babd03be3505119c5c2 0008-r8169-clean-up-my-printk-uglyness.patch +cf168620efa63479a6e03da78906e32f 0009-ipsec-Fix-bogus-bundle-flowi.patch +3af4b5ae1afae3278b0070f585b874e3 0010-xfrm-Remove-xfrm_state_genid.patch +9f284c3fd5ab38cef4544efc1f50c6ba 0011-xfrm_user-verify-policy-direction-at-XFRM_MSG_POLEXP.patch +b035114e893883cf67530350678e00f5 0012-xfrm-remove-policy-lock-when-accessing-policy-walk.d.patch +9dea03ec19aaf9a384e4f56f57009257 0013-flow-structurize-flow-cache.patch 
+fc9ab26abbfec0d3f20000b5e695620b 0014-flow-virtualize-flow-cache-entry-methods.patch +c09b82b89a49ba2a3836a0bc3a3312f4 0015-xfrm-cache-bundles-instead-of-policies-for-outgoing-.patch +41618efb65ab9ddacfb59a1cde9b4edd 0016-xfrm-remove-policy-garbage-collection.patch +3b83f0972ab715819d1119b120a987e7 0017-flow-delayed-deletion-of-flow-cache-entries.patch +45a676c7a1759fec60b724d557b4e295 0018-xfrm-Fix-crashes-in-xfrm_lookup.patch bf15e3ee69e03319dab0d59e08b67195 kernelconfig.x86" diff --git a/main/linux-pae/arp.patch b/main/linux-pae/arp.patch deleted file mode 100644 index d2682690..00000000 --- a/main/linux-pae/arp.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c -index c95cd93..71ab56f 100644 ---- a/net/ipv4/arp.c -+++ b/net/ipv4/arp.c -@@ -1200,6 +1200,9 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo - neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(dev_net(dev), 0); - break; -+ case NETDEV_CHANGE: -+ neigh_changeaddr(&arp_tbl, dev); -+ break; - default: - break; - } diff --git a/main/linux-pae/ip_gre.patch b/main/linux-pae/ip_gre.patch deleted file mode 100644 index ba5f19b3..00000000 --- a/main/linux-pae/ip_gre.patch +++ /dev/null @@ -1,15 +0,0 @@ ---- a/net/ipv4/ip_gre.c.orig -+++ b/net/ipv4/ip_gre.c -@@ -1137,11 +1137,8 @@ - - if (saddr) - memcpy(&iph->saddr, saddr, 4); -- -- if (daddr) { -+ if (daddr) - memcpy(&iph->daddr, daddr, 4); -- return t->hlen; -- } - if (iph->daddr && !ipv4_is_multicast(iph->daddr)) - return t->hlen; - diff --git a/main/linux-pae/ip_gre2.patch b/main/linux-pae/ip_gre2.patch deleted file mode 100644 index 52c44076..00000000 --- a/main/linux-pae/ip_gre2.patch +++ /dev/null @@ -1,17 +0,0 @@ ---- linux-2.6.32/net/ipv4/ip_gre.c.orig -+++ linux-2.6.32/net/ipv4/ip_gre.c -@@ -803,11 +803,13 @@ - tunnel->err_count = 0; - } - -- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; -+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; - - if 
(skb_headroom(skb) < max_headroom || skb_shared(skb)|| - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); -+ if (max_headroom > dev->needed_headroom) -+ dev->needed_headroom = max_headroom; - if (!new_skb) { - ip_rt_put(rt); - stats->tx_dropped++; diff --git a/main/linux-pae/net-git-78f1cd-r8169-fix-broken-register-writes.patch b/main/linux-pae/net-git-78f1cd-r8169-fix-broken-register-writes.patch deleted file mode 100644 index f5f72acc..00000000 --- a/main/linux-pae/net-git-78f1cd-r8169-fix-broken-register-writes.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 78f1cd02457252e1ffbc6caa44a17424a45286b8 Mon Sep 17 00:00:00 2001 -From: Francois Romieu -Date: Sat, 27 Mar 2010 19:35:46 -0700 -Subject: [PATCH] r8169: fix broken register writes -MIME-Version: 1.0 -Content-Type: text/plain; charset=utf8 -Content-Transfer-Encoding: 8bit - -This is quite similar to b39fe41f481d20c201012e4483e76c203802dda7 -though said registers are not even documented as 64-bit registers -- as opposed to the initial TxDescStartAddress ones - but as single -bytes which must be combined into 32 bits at the MMIO read/write -level before being merged into a 64 bit logical entity. - -Credits go to Ben Hutchings for the MAR -registers (aka "multicast is broken for ages on ARM) and to -Timo Teräs for the MAC registers. - -Signed-off-by: Francois Romieu -Signed-off-by: David S. 
Miller ---- - drivers/net/r8169.c | 4 ++-- - 1 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c -index b93fd23..7193afc 100644 ---- a/drivers/net/r8169.c -+++ b/drivers/net/r8169.c -@@ -2820,8 +2820,8 @@ static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) - spin_lock_irq(&tp->lock); - - RTL_W8(Cfg9346, Cfg9346_Unlock); -- RTL_W32(MAC0, low); - RTL_W32(MAC4, high); -+ RTL_W32(MAC0, low); - RTL_W8(Cfg9346, Cfg9346_Lock); - - spin_unlock_irq(&tp->lock); -@@ -4747,8 +4747,8 @@ static void rtl_set_rx_mode(struct net_device *dev) - mc_filter[1] = swab32(data); - } - -- RTL_W32(MAR0 + 0, mc_filter[0]); - RTL_W32(MAR0 + 4, mc_filter[1]); -+ RTL_W32(MAR0 + 0, mc_filter[0]); - - RTL_W32(RxConfig, tmp); - --- -1.7.0.3 - diff --git a/main/linux-pae/net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch b/main/linux-pae/net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch deleted file mode 100644 index 250c85d6..00000000 --- a/main/linux-pae/net-git-c0cd88-r8169-offical-fix-for-CVE-2009-4537-overlength-frame-DMAs.patch +++ /dev/null @@ -1,119 +0,0 @@ -From c0cd884af045338476b8e69a61fceb3f34ff22f1 Mon Sep 17 00:00:00 2001 -From: Neil Horman -Date: Mon, 29 Mar 2010 13:16:02 -0700 -Subject: [PATCH] r8169: offical fix for CVE-2009-4537 (overlength frame DMAs) - -Official patch to fix the r8169 frame length check error. - -Based on this initial thread: -http://marc.info/?l=linux-netdev&m=126202972828626&w=1 -This is the official patch to fix the frame length problems in the r8169 -driver. As noted in the previous thread, while this patch incurs a performance -hit on the driver, its possible to improve performance dynamically by updating -the mtu and rx_copybreak values at runtime to return performance to what it was -for those NICS which are unaffected by the ideosyncracy (if there are any). 
- -Summary: - - A while back Eric submitted a patch for r8169 in which the proper -allocated frame size was written to RXMaxSize to prevent the NIC from dmaing too -much data. This was done in commit fdd7b4c3302c93f6833e338903ea77245eb510b4. A -long time prior to that however, Francois posted -126fa4b9ca5d9d7cb7d46f779ad3bd3631ca387c, which expiclitly disabled the MaxSize -setting due to the fact that the hardware behaved in odd ways when overlong -frames were received on NIC's supported by this driver. This was mentioned in a -security conference recently: -http://events.ccc.de/congress/2009/Fahrplan//events/3596.en.html - -It seems that if we can't enable frame size filtering, then, as Eric correctly -noticed, we can find ourselves DMA-ing too much data to a buffer, causing -corruption. As a result is seems that we are forced to allocate a frame which -is ready to handle a maximally sized receive. - -This obviously has performance issues with it, so to mitigate that issue, this -patch does two things: - -1) Raises the copybreak value to the frame allocation size, which should force -appropriately sized packets to get allocated on rx, rather than a full new 16k -buffer. - -2) This patch only disables frame filtering initially (i.e., during the NIC -open), changing the MTU results in ring buffer allocation of a size in relation -to the new mtu (along with a warning indicating that this is dangerous). - -Because of item (2), individuals who can't cope with the performance hit (or can -otherwise filter frames to prevent the bug), or who have hardware they are sure -is unaffected by this issue, can manually lower the copybreak and reset the mtu -such that performance is restored easily. - -Signed-off-by: Neil Horman -Signed-off-by: David S. 
Miller ---- - drivers/net/r8169.c | 29 ++++++++++++++++++++++++----- - 1 files changed, 24 insertions(+), 5 deletions(-) - -diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c -index 7193afc..9674005 100644 ---- a/drivers/net/r8169.c -+++ b/drivers/net/r8169.c -@@ -186,7 +186,12 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8169_pci_tbl) = { - - MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl); - --static int rx_copybreak = 200; -+/* -+ * we set our copybreak very high so that we don't have -+ * to allocate 16k frames all the time (see note in -+ * rtl8169_open() -+ */ -+static int rx_copybreak = 16383; - static int use_dac; - static struct { - u32 msg_enable; -@@ -3217,9 +3222,13 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev) - } - - static void rtl8169_set_rxbufsize(struct rtl8169_private *tp, -- struct net_device *dev) -+ unsigned int mtu) - { -- unsigned int max_frame = dev->mtu + VLAN_ETH_HLEN + ETH_FCS_LEN; -+ unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN; -+ -+ if (max_frame != 16383) -+ printk(KERN_WARNING "WARNING! Changing of MTU on this NIC" -+ "May lead to frame reception errors!\n"); - - tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE; - } -@@ -3231,7 +3240,17 @@ static int rtl8169_open(struct net_device *dev) - int retval = -ENOMEM; - - -- rtl8169_set_rxbufsize(tp, dev); -+ /* -+ * Note that we use a magic value here, its wierd I know -+ * its done because, some subset of rtl8169 hardware suffers from -+ * a problem in which frames received that are longer than -+ * the size set in RxMaxSize register return garbage sizes -+ * when received. To avoid this we need to turn off filtering, -+ * which is done by setting a value of 16383 in the RxMaxSize register -+ * and allocating 16k frames to handle the largest possible rx value -+ * thats what the magic math below does. -+ */ -+ rtl8169_set_rxbufsize(tp, 16383 - VLAN_ETH_HLEN - ETH_FCS_LEN); - - /* - * Rx and Tx desscriptors needs 256 bytes alignment. 
-@@ -3884,7 +3903,7 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu) - - rtl8169_down(dev); - -- rtl8169_set_rxbufsize(tp, dev); -+ rtl8169_set_rxbufsize(tp, dev->mtu); - - ret = rtl8169_init_ring(dev); - if (ret < 0) --- -1.7.0.3 - diff --git a/main/linux-pae/xfrm-cache-size-revert.patch b/main/linux-pae/xfrm-cache-size-revert.patch deleted file mode 100644 index c8fcbd0d..00000000 --- a/main/linux-pae/xfrm-cache-size-revert.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c -index 74fb2eb..e158860 100644 ---- a/net/ipv4/xfrm4_policy.c -+++ b/net/ipv4/xfrm4_policy.c -@@ -308,7 +308,6 @@ void __init xfrm4_init(int rt_max_size) - * That will let us store an ipsec connection per route table entry, - * and start cleaning when were 1/2 full - */ -- xfrm4_dst_ops.gc_thresh = rt_max_size/2; - #ifdef CONFIG_SYSCTL - sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, - xfrm4_policy_table); -- cgit v1.2.3