diff options
Diffstat (limited to 'testing')
-rw-r--r-- | testing/ipt-netflow/APKBUILD | 49 | ||||
-rw-r--r-- | testing/ipt-netflow/git.patch | 4308 | ||||
-rw-r--r-- | testing/ipt-netflow/kernel-vs-userspace.patch | 56 |
3 files changed, 4413 insertions, 0 deletions
diff --git a/testing/ipt-netflow/APKBUILD b/testing/ipt-netflow/APKBUILD new file mode 100644 index 0000000000..2adb8e03ae --- /dev/null +++ b/testing/ipt-netflow/APKBUILD @@ -0,0 +1,49 @@ +# Contributor: Rush Future <rush.zlo@gmail.com> +# Maintainer: Natanael Copa <ncopa@alpinelinux.org> +pkgname=ipt-netflow +pkgver=1.8_git20140103 +_ver=${pkgver%_git*} +pkgrel=0 +pkgdesc="Kernel netflow sensor" +url="http://ipt-netflow.sourceforge.net." +arch="all" +license="GPL3+" +makedepends="iptables-dev bash" +source="$pkgname-$_ver.tar.gz::https://github.com/aabc/ipt-netflow/archive/v$_ver.tar.gz + git.patch + kernel-vs-userspace.patch + " + +_builddir="$srcdir/$pkgname-$_ver" +prepare() { + cd "$_builddir" + for i in $source; do + case $i in + *.patch) msg $i; patch -p1 -i "$srcdir"/$i || return 1;; + esac + done +} + +build() { + cd "$_builddir" + ./configure \ + --ipt-inc=/usr/include \ + --disable-kernel \ + || return 1 + make libipt_NETFLOW.so || return 1 +} + +package() { + cd "$_builddir" + make linstall DESTDIR=$pkgdir || return 1 +} + +md5sums="922a7d9dd17f28a2c3551a1c5e9849eb ipt-netflow-1.8.tar.gz +77875a5979589bca8c49a8331e7cc22b git.patch +59adcee5cd4cec4e09f432f567ff8243 kernel-vs-userspace.patch" +sha256sums="eefe766eda6d4d918e4cc919b5d5acd2b681a258246017ab7948b80f0cb95112 ipt-netflow-1.8.tar.gz +75ea3afe9532699d4fd1bef7f198dfaa97b3a310234a86b48371522ae0704a60 git.patch +a897d88212b56c3d06645f8fc9fe7ce5b7accfa1d45b96917b3ba747605a82bc kernel-vs-userspace.patch" +sha512sums="c413515deeac1d604e36a53b39edcf340d6c3f78c29e53979fede8aa013e324ada4622b571ac5270f7495ab6982096cd2bd9e9283c9cc2cc7360aa5c3954792d ipt-netflow-1.8.tar.gz +3fade2ceb00ddae15dd6b1eacebc36480efe0c5c2d050e050e0aef37382ad72ca454f4d23080b7263ab2f078f72db4572e2514d3faca113d8e437437f7c3afcf git.patch +dfc37ccc010559994a94290aee99dbc75c47732cbc06492366bff081bafced7bccc5144b5119dc8f0ad8dea1392684ef7f592d15131da2fa1227516dc7d8d3ce kernel-vs-userspace.patch" diff --git a/testing/ipt-netflow/git.patch b/testing/ipt-netflow/git.patch new file mode 100644 index 0000000000..faa49ffdc7 --- /dev/null +++ b/testing/ipt-netflow/git.patch @@ -0,0 +1,4308 @@ +diff --git a/Makefile.in b/Makefile.in +index 30ebbfe..0a82c67 100644 +--- a/Makefile.in ++++ b/Makefile.in +@@ -1,43 +1,73 @@ +-# ++# Edit Makefile.in and run ./configure + + KVERSION = @KVERSION@ + KDIR = @KDIR@ ++KINSTDIR = $(shell dirname @KDIR@) + IPTABLES_CFLAGS = @IPTABLES_CFLAGS@ + IPTABLES_MODULES = @IPTABLES_MODULES@ ++DEPMOD = depmod -a + ++# https://www.kernel.org/doc/Documentation/kbuild/modules.txt ++# https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt + obj-m = ipt_NETFLOW.o + +-ipt_NETFLOW.ko: ipt_NETFLOW.c ipt_NETFLOW.h ++all: ipt_NETFLOW.ko libipt_NETFLOW.so libip6t_NETFLOW.so ++ipt_NETFLOW.ko: version.h ipt_NETFLOW.c ipt_NETFLOW.h Makefile + @echo Compiling for kernel $(KVERSION) + make -C $(KDIR) M=$(CURDIR) modules +-all: ipt_NETFLOW.ko libipt_NETFLOW.so +-minstall: +- make -C $(KDIR) M=$(CURDIR) modules_install ++ @touch $@ ++sparse: | version.h ipt_NETFLOW.c ipt_NETFLOW.h Makefile ++ @rm -f ipt_NETFLOW.ko ipt_NETFLOW.o ++ @echo Compiling for kernel $(KVERSION) ++ make -C $(KDIR) M=$(CURDIR) modules C=1 ++ @touch ipt_NETFLOW.ko ++minstall: | ipt_NETFLOW.ko ++ make -C $(KDIR) M=$(CURDIR) modules_install INSTALL_MOD_PATH=$(DESTDIR) ++ $(DEPMOD) + mclean: + make -C $(KDIR) M=$(CURDIR) clean + lclean: + -rm -f *.so *_sh.o + clean: mclean lclean +- -rm -f *.so *.o modules.order ++ -rm -f *.so *.o modules.order version.h ++ ++%_sh.o: libipt_NETFLOW.c ++ gcc -O2 -Wall -Wunused $(IPTABLES_CFLAGS) -fPIC -o $@ -c libipt_NETFLOW.c ++ ++%.so: %_sh.o ++ gcc -shared -o $@ $< + +-libipt_NETFLOW.so: libipt_NETFLOW.c +- gcc -O2 -Wall -Wunused -I$(KDIR)/include $(IPTABLES_CFLAGS) -fPIC -o libipt_NETFLOW_sh.o -c libipt_NETFLOW.c +- gcc -shared -o libipt_NETFLOW.so libipt_NETFLOW_sh.o ++version.h: ipt_NETFLOW.c ipt_NETFLOW.h Makefile ++ @if [ -d .git ] && type git >/dev/null 2>&1; then \ ++ echo "#define GITVERSION \"`git describe --dirty`\""; \ ++ fi > version.h + +-linstall: ipt_NETFLOW.ko libipt_NETFLOW.so +- cp -a libipt_NETFLOW.so $(IPTABLES_MODULES) ++linstall: | libipt_NETFLOW.so libip6t_NETFLOW.so ++ install -D libipt_NETFLOW.so $(DESTDIR)$(IPTABLES_MODULES)/libipt_NETFLOW.so ++ install -D libip6t_NETFLOW.so $(DESTDIR)$(IPTABLES_MODULES)/libip6t_NETFLOW.so + + install: minstall linstall + ++uninstall: ++ -rm -f $(DESTDIR)$(IPTABLES_MODULES)/libipt_NETFLOW.so ++ -rm -f $(DESTDIR)$(IPTABLES_MODULES)/libip6t_NETFLOW.so ++ -rm -f $(DESTDIR)$(KINSTDIR)/extra/ipt_NETFLOW.ko ++ + Makefile: Makefile.in configure + ./configure --make + + load: all +- insmod ipt_NETFLOW.ko active_timeout=5 +- iptables -A OUTPUT -d 0/0 -j NETFLOW +- iptables -A INPUT -d 0/0 -j NETFLOW ++ -insmod ipt_NETFLOW.ko active_timeout=5 protocol=9 ++ -iptables -I OUTPUT -j NETFLOW ++ -iptables -I INPUT -j NETFLOW ++ -ip6tables -I OUTPUT -j NETFLOW ++ -ip6tables -I INPUT -j NETFLOW + + unload: +- iptables -D OUTPUT -d 0/0 -j NETFLOW +- iptables -D INPUT -d 0/0 -j NETFLOW +- rmmod ipt_NETFLOW.ko ++ -iptables -D OUTPUT -j NETFLOW ++ -iptables -D INPUT -j NETFLOW ++ -ip6tables -D OUTPUT -j NETFLOW ++ -ip6tables -D INPUT -j NETFLOW ++ -rmmod ipt_NETFLOW.ko ++ ++reload: unload load +diff --git a/README b/README +index 213f02c..56a4fde 100644 +--- a/README ++++ b/README +@@ -1,10 +1,17 @@ +-ipt_NETFLOW linux 2.6 kernel module by <abc@telekom.ru> -- 11 Feb 2008 ++ipt_NETFLOW linux 2.6.x-3.x kernel module by <abc@telekom.ru> -- 2008-2013. ++ ++ High performance NetFlow v5, v9, IPFIX flow data export module for Linux ++ kernel. Supporting IPv4 and IPv6. Created to be useful for highly loaded ++ linux router. It should be used as iptables target. Also can export NAT ++ translation events using NetFlow Event Logging (NEL) for v9, IPFIX, or ++ specially crafted v5 flows. ++ + + ============================ + = OBTAINING LATEST VERSION = + ============================ + +- $ git clone git://ipt-netflow.git.sourceforge.net/gitroot/ipt-netflow/ipt-netflow ++ $ git clone git://git.code.sf.net/p/ipt-netflow/code ipt-netflow + $ cd ipt-netflow + + +@@ -12,94 +19,220 @@ ipt_NETFLOW linux 2.6 kernel module by <abc@telekom.ru> -- 11 Feb 2008 + = INSTALLATION = + ================ + +-1. Besides kernel you will need iptables/netfilter source matching your +- installation or just fresh install from there: ftp://ftp.netfilter.org/pub/iptables/snapshot/ +- I have this: ftp://ftp.netfilter.org/pub/iptables/snapshot/iptables-1.3.7-20070329.tar.bz2 +- Unpack it somewhere and build with make. ++ Four easy steps. ++ ++** 1. Prepare Kernel source ++ ++ If you have package system install kernel-devel package, otherwise install ++ raw kernel source from http://kernel.org matching _exactly_ version of your ++ installed kernel. ++ ++ a) What to do for Centos: ++ ++ ~# yum install kernel-devel ++ ++ b) What to do for Debian: ++ ++ ~# apt-get install module-assistant ++ ~# m-a prepare ++ ++ c) Otherwise, if you downloaded raw kernel sources don't forget to create ++ .config by copying it from your distribution's kernel. Its copy could reside ++ in /boot or sometimes in /proc, examples: ++ ++ kernel-src-dir/# cp /boot/config-`uname -r` .config ++ or ++ kernel-src-dir/# zcat /proc/config.gz > .config ++ ++ Assuming you unpacked kernel source into `kernel-src-dir/' directory. ++ Then run: ++ ++ kernel-src-dir/# make oldconfig ++ ++ After that you'll need to prepare kernel for modules build: ++ ++ kernel-src-dir/# make prepare modules_prepare ++ ++ Note: Don't try to `make prepare' in Centos kernel-devel package directory ++ (which is usually something like /usr/src/kernels/2.6.32-431.el6.x86_64) ++ as this is wrong and meaningless. ++ ++** 2. Prepare Iptables ++ ++ Before this step it also would be useful to install pkg-config if don't ++ already have. ++ ++ If you have package system just install iptables-devel (or iptables-dev) ++ package, otherwise install iptables source matching version of your ++ installation from ftp://ftp.netfilter.org/pub/iptables/ ++ ++ a) What to do for Centos: ++ ++ # yum install iptables-devel ++ ++ b) What to do for Debian: ++ ++ # apt-get install iptables-dev pkg-config + +-2. Run ./configure script and it will create Makefile ++ c) Otherwise, for raw iptables source build it and make install. + +-3. make all install; depmod +- This will install kernel module and iptable specific library. ++** 3. Now, to actually build the module run: + +-Troubleshooting: +- 1) Sometimes you will want to add CC=gcc-3 to make command. +- Example: make CC=gcc-3.3 ++ ~/ipt-netflow# ./configure ++ ~/ipt-netflow# make all install ++ ~/ipt-netflow# depmod + +- 2) Compile module with actual kernel source compiled. +- I.e. first compile kernel and boot into it, and then compile module. ++ This will install kernel module and iptables specific library. + +- 3) For autoloading module after reboot: set net.netflow.destination (or load +- module, if idestination set on load) after interfaces are up. Becasue module +- needs exporting interface (usually lo) to establish export connection. ++ Troubleshooting: + +-4. After this point you should be able to load module +- and use -j NETFLOW target in your iptables. See next section. ++ a) Sometimes you will want to add CC=gcc-3 to make command. ++ Example: make CC=gcc-3.3 ++ ++ b) Compile module with actual kernel source compiled. ++ I.e. first compile kernel and boot into it, and then compile module. ++ If you are using kernel-devel package check that its version matches ++ your kernel package. ++ ++ c) If you have sources in non-standard places or configure isn't able to ++ find something run ./configure --help to see how to specify paths manually. ++ ++** 4. After this point you should be able to load module and ++ use -j NETFLOW target in your iptables. See next section. + + + =========== + = RUNNING = + =========== + +-1. You can load module by insmod like this: +- # insmod ipt_NETFLOW.ko destination=127.0.0.1:2055 debug=1 ++1. You can load module directly by insmod like this: ++ ++ # insmod ipt_NETFLOW.ko destination=127.0.0.1:2055 debug=1 + + Or if properly installed (make install; depmod) by this: +- # modprobe ipt_NETFLOW destination=127.0.0.1:2055 ++ ++ # modprobe ipt_NETFLOW destination=127.0.0.1:2055 + + See, you may add options in insmod/modprobe command line, or add +- them in /etc/ to modules.conf or modprobe.conf like thus: +- options ipt_NETFLOW destination=127.0.0.1:2055 ++ them in /etc/modprobe.conf or /etc/modprobe.d/ipt_NETFLOW.conf ++ like thus: ++ ++ options ipt_NETFLOW destination=127.0.0.1:2055 protocol=9 natevents=1 + + 2. Statistics is in /proc/net/stat/ipt_netflow +- To view slab statistics: grep ipt_netflow /proc/slabinfo ++ To view boring slab statistics: grep ipt_netflow /proc/slabinfo + + 3. You can view parameters and control them via sysctl, example: +- # sysctl -w net.netflow.hashsize=32768 + +-4. Example of directing all traffic into module: +- # iptables -A FORWARD -j NETFLOW +- # iptables -A INPUT -j NETFLOW +- # iptables -A OUTPUT -j NETFLOW ++ # sysctl net.netflow ++ # sysctl net.netflow.hashsize=32768 ++ ++ Note: For after-reboot configuration I recommend to store module parameters ++ in modprobe configs instead of storing them in /etc/sysctl.conf, as it's ++ less clear when init process will apply sysctl.conf, before of after ++ module's load. ++ ++4. Example of directing all IPv4 traffic into the module: ++ ++ # iptables -I FORWARD -j NETFLOW ++ # iptables -I INPUT -j NETFLOW ++ # iptables -I OUTPUT -j NETFLOW ++ ++ Note: It is preferable (because easier to understand) to _insert_ ++ NETFLOW target at the top of the chain, otherwise not all traffic may ++ reach NETFLOW if your iptables configuration is complicated and some ++ other rule inadvertently consume the traffic (dropping or acepting before ++ NETFLOW is reached). It's always good to test your configuration. ++ Use iptables -L -nvx to check pkts/bytes counters on the rules. ++ ++5. If you want to account IPv6 traffic you should use protocol 9 or 10. ++ Example of directing all IPv6 traffic into the module: + ++ # sysctl net.netflow.protocol=10 ++ # ip6tables -I FORWARD -j NETFLOW ++ # ip6tables -I INPUT -j NETFLOW ++ # ip6tables -I OUTPUT -j NETFLOW ++ ++ Note: First enable right version of protocol and after that add ip6tables ++ rules, otherwise you will get errors in dmesg. ++ ++6. If you want to account NAT events (NEL): ++ ++ # sysctl net.netflow.natevents=1 ++ ++ Note that natevents feature is completely independent from traffic accounting ++ (it's using so called conntrack events), thus you don't need to set or change ++ any iptables rules to use that. You may need to enable kernel config option ++ CONFIG_NF_CONNTRACK_EVENTS though (if it isn't already enabled). ++ For details on how they are exported for different protocol versions see ++ below. + + =========== + = OPTIONS = + =========== + ++ protocol=5 ++ - what version of NetFlow protocol to use. Default is 5. ++ You can choose from 5, 9, or 10 (where 10 is IPFIX). If you plan ++ to account IPv6 traffic you should use protocol 9 or 10 (IPFIX), ++ because NetFlow v5 isn't compatible with IPv6. ++ + destination=127.0.0.1:2055 + - where to export netflow, to this ip address + You will see this connection in netstat like this: + udp 0 0 127.0.0.1:32772 127.0.0.1:2055 ESTABLISHED + + destination=127.0.0.1:2055,192.0.0.1:2055 +- - mirror flows to two (can be more) addresses, +- separate addresses with comma. ++ - mirror flows to two (can be more) addresses, separate addresses ++ with comma. ++ ++ natevents=1 ++ - Collect and send NAT translation events as NetFlow Event Logging (NEL) ++ for NetFlow v9/IPFIX, or as dummy flows compatible with NetFlow v5. ++ Default is 0 (don't send). ++ ++ For NetFlow v5 protocol meaning of fields in dummy flows are such: ++ Src IP, Src Port is Pre-nat source address. ++ Dst IP, Dst Port is Post-nat destination address. ++ - These two fields made equal to data flows catched in FORWARD chain. ++ Nexthop, Src AS is Post-nat source address for SNAT. Or, ++ Nexthop, Dst AS is Pre-nat destination address for DNAT. ++ TCP Flags is SYN+SCK for start event, RST+FIN for stop event. ++ Pkt/Traffic size is 0 (zero), so it won't interfere with accounting. + + inactive_timeout=15 + - export flow after it's inactive 15 seconds. Default value is 15. + + active_timeout=1800 +- - export flow after it's active 1800 seconds (30 minutes). Default value is 1800. ++ - export flow after it's active 1800 seconds (30 minutes). Default valuae ++ is 1800. ++ ++ refresh-rate=20 ++ - for NetFlow v9 and IPFIX it's rate how frequently to re-send templates ++ (per packets). You probably don't need to change default (which is 20). ++ ++ timeout-rate=30 ++ - for NetFlow v9 and IPFIX it's rate when to re-send old templates (in ++ minutes). No need to change it. + + debug=0 + - debug level (none). + + sndbuf=number +- - size of output socket buffer in bytes. Recommend you to put ++ - size of output socket buffer in bytes. I recommend you to put + higher value if you experience netflow packet drops (can be + seen in statistics as 'sock: fail' number.) + Default value is system default. + + hashsize=number + - Hash table bucket size. Used for performance tuning. +- Abstractly speaking, it should be two times bigger than flows ++ Abstractly speaking, it should be minimum two times bigger than flows + you usually have, but not need to. + Default is system memory dependent small enough value. + + maxflows=2000000 +- - Maximum number of flows to account. It's here to prevent DOS attacks. After +- this limit reached new flows will not be accounted. Default is ++ - Maximum number of flows to account. It's here to prevent DOS attacks. ++ After this limit reached new flows will not be accounted. Default is + 2000000, zero is unlimited. + + aggregation=string.. +@@ -130,14 +263,15 @@ Troubleshooting: + = HOW TO READ STAT = + ==================== + +- Statistics is your friend to fine tune and understand netflow module performance. ++ Statistics is your friend to fine tune and understand netflow module ++ performance. + + To see stat: + # cat /proc/net/stat/ipt_netflow + + How to interpret the data: + +-> Flows: active 5187 (peak 83905 reached 0d0h1m ago, maxflows 2000000), mem 283K ++> Flows: active 5187 (peak 83905 reached 0d0h1m ago, maxflows 2000000), mem 283K, worker delay 100/1000. + + active X: currently active flows in memory cache. + - for optimum CPU performance it is recommended to set hash table size to +@@ -146,8 +280,9 @@ Troubleshooting: + mem XK: how much kilobytes of memory currently taken by active flows. + - one active flow taking 56 bytes of memory. + - there is system limit on cache size too. ++ worker delay X/HZ: how frequently exporter scan flows table per second. + +-> Hash: size 8192 (mem 32K), metric 1.0, 1.0, 1.0, 1.0. MemTraf: 1420 pkt, 364 K (pdu 0, 0). ++> Hash: size 8192 (mem 32K), metric 1.00, [1.00, 1.00, 1.00]. MemTraf: 1420 pkt, 364 K (pdu 0, 0). + + Hash: size X: current hash size/limit. + - you can control this by sysctl net.netflow.hashsize variable. +@@ -156,18 +291,22 @@ Troubleshooting: + - optimal value is twice of average of active flows. + mem XK: how much memory occupied by hash table. + - hash table is fixed size by nature, taking 4 bytes per entry. +- metric X, X, X, X: how optimal is your hash table being used. ++ metric X, [X, X, X]: how optimal is your hash table being used. + - lesser value mean more optimal hash table use, min is 1.0. +- - this is moving average (EWMA) of hash table access divided +- by match rate (searches / matches) for 4sec, and 1, 5, 15 minutes. +- Sort of hash table load average. ++ - last three numbers in squares is moving average (EWMA) of hash table ++ access divided by match rate (searches / matches) for 4sec, and 1, 5, and ++ 15 minutes. Sort of hash table load average. First value is instantaneous. ++ You can try to increase hashsize if averages more than 1 (increase ++ certainly if >= 2). + MemTraf: X pkt, X K: how much traffic accounted for flows that are in memory. + - these flows that are residing in internal hash table. + pdu X, X: how much traffic in flows preparing to be exported. + - it is included already in aforementioned MemTraf total. + +-> Timeout: active 1800, inactive 15. Maxflows 2000000 ++> Protocol version 10 (ipfix), refresh-rate 20, timeout-rate 30, (templates 2, active 2). Timeouts: active 5, inactive 15. Maxflows 2000000 + ++ Protocol version currently in use. Refresh-rate and timeout-rate ++ for v9 and IPFIX. Total templates generated and currently active. + Timeout: active X: how much seconds to wait before exporting active flow. + - same as sysctl net.netflow.active_timeout variable. + inactive X: how much seconds to wait before exporting inactive flow. +@@ -180,20 +319,22 @@ Troubleshooting: + + - Module throughput values for 1 second, 1 minute, and 5 minutes. + +-> cpu# stat: <search found new, trunc frag alloc maxflows>, sock: <ok fail cberr, bytes>, traffic: <pkt, bytes>, drop: <pkt, bytes> +-> cpu0 stat: 980540 10473 180600, 0 0 0 0, sock: 4983 928 0, 7124 K, traffic: 188765, 14 MB, drop: 27863, 1142 K ++> cpu# stat: <search found new [metric], trunc frag alloc maxflows>, sock: <ok fail cberr, bytes>, traffic: <pkt, bytes>, drop: <pkt, bytes> ++> cpu0 stat: 980540 10473 180600 [1.03], 0 0 0 0, sock: 4983 928 0, 7124 K, traffic: 188765, 14 MB, drop: 27863, 1142 K + + cpu#: this is Total and per CPU statistics for: + stat: <search found new, trunc frag alloc maxflows>: internal stat for: + search found new: hash table searched, found, and not found counters. +- trunc: how much truncated packets is ignored ++ [metric]: average hash metric since module load. ++ trunc: how much truncated packets are ignored + - these are that possible don't have valid IP header. + - accounted in drop packets counter but not in drop bytes. + frag: how much fragmented packets have seen. + - kernel always defragments INPUT/OUTPUT chains for us. + - these packets are not ignored but not reassembled either, so: +- - if there is no enough data in fragment (ex. tcp ports) it is considered zero. +- alloc: how much cache memory allocations is failed. ++ - if there is no enough data in fragment (ex. tcp ports) it is considered ++ zero. ++ alloc: how much cache memory allocations are failed. + - packets ignored and accounted in drop stat. + - probably increase system memory if this ever happen. + maxflows: how much packets ignored on maxflows (maximum active flows reached). +@@ -203,7 +344,8 @@ Troubleshooting: + sock: <ok fail cberr, bytes>: table of exporting stats for: + ok: how much Netflow PDUs are exported (i.e. UDP packets sent by module). + fail: how much socket errors (i.e. packets failed to be sent). +- - packets dropped and their internal statistics cumulatively accounted in drop stat. ++ - packets dropped and their internal statistics cumulatively accounted in ++ drop stat. + cberr: how much connection refused ICMP errors we got from export target. + - probably you not launched collector software on destination, + - or specified wrong destination address. +@@ -225,20 +367,34 @@ Troubleshooting: + packet is for new flow but maxflows is already reached, + all flows in export packets that got socket error. + +-> sock0: 10.0.0.2:2055, sndbuf 106496, filled 0, peak 106848; err: sndbuf reached 928, other 0 ++> Natevents disabled, count start 0, stop 0. ++ ++ - Natevents mode disabled or enabled, and how much start or stop events ++ are reported. ++ ++> sock0: 10.0.0.2:2055 unconnected (1 attempts). ++ ++ If socket is unconnected (for example if module loaded before interfaces is ++ up) it shows now much connection attempts was failed. It will try to connect ++ until success. ++ ++> sock0: 10.0.0.2:2055, sndbuf 106496, filled 0, peak 106848; err: sndbuf reached 928, connect 0, other 0 + + sockX: per destination stats for: + X.X.X.X:Y: destination ip address and port. + - controlled by sysctl net.netflow.destination variable. + sndbuf X: how much data socket can hold in buffers. + - controlled by sysctl net.netflow.sndbuf variable. +- - if you have packet drops due to sndbuf reached (error -11) increase this value. ++ - if you have packet drops due to sndbuf reached (error -11) increase this ++ value. + filled X: how much data in socket buffers right now. + peak X: peak value of how much data in socket buffers was. + - you will be interested to keep it below sndbuf value. + err: how much packets are dropped due to errors. + - all flows from them will be accounted in drop stat. +- sndbuf reached X: how much packets dropped due to sndbuf being too small (error -11). ++ sndbuf reached X: how much packets dropped due to sndbuf being too small ++ (error -11). ++ connect X: how much connection attempts was failed. + other X: dropped due to other possible errors. + + > aggr0: ... +diff --git a/README.promisc b/README.promisc +index 60ca922..31d774f 100644 +--- a/README.promisc ++++ b/README.promisc +@@ -2,9 +2,14 @@ Hello, + + If you wish to account with netflow module traffic mirrored on switch you may follow this example: + +-************** +-* Solution 1 * +-************** ++ ++ Solution 1: General kernel patch. ++ Solution 2: Alternative w/o kernel patch. ++ ++ ++ ************** ++ * Solution 1 * ++ ************** + + 1. Patch your kernel with `raw_promisc.patch' to enable raw table to see promisc traffic. + +@@ -33,17 +38,7 @@ If you wish to account with netflow module traffic mirrored on switch you may fo + # /sbin/vconfig add eth1 47 + # /sbin/ifconfig eth1.47 up + +-5. Recompile ipt_netflow module with #define RAW_PROMISC_HACK uncommented: +- +- Find this line in ipt_NETFLOW.c (should be line 7): +- +-//#define RAW_PROMISC_HACK +- +- And remove two slashes at beginning of the line, so it become like this: +- +-#define RAW_PROMISC_HACK +- +- Re-compile module: ++5. Compile module: + + # make clean all install + +@@ -55,13 +50,14 @@ If you wish to account with netflow module traffic mirrored on switch you may fo + + # /sbin/iptables -A PREROUTING -t raw -i eth1.47 -j NETFLOW + +- + Voila. + ++ps. For Debian Squeeze instructions look at raw_promisc_debian_squeeze6.patch + +-************** +-* Solution 2 * +-************** ++ ++ ************** ++ * Solution 2 * ++ ************** + + By Anonymous. + +@@ -81,4 +77,3 @@ Sometimes you may need to run: + + for this scheme to work. + +- +diff --git a/configure b/configure +index 677dd7f..3f10e2a 100755 +--- a/configure ++++ b/configure +@@ -3,7 +3,7 @@ + PATH=$PATH:/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/sbin + + error() { +- echo "! Error: $@" ++ echo -e "! Error: $@" + exit 1 + } + +@@ -56,19 +56,20 @@ get_lib_from_lib() { + } + + iptables_inc() { +- echo -n "Iptables include path: " ++ echo -n "Iptables include flags: " + if [ "$IPTINC" ]; then +- echo "$IPTINC (user specified)" + IPTINC="-I$IPTINC" ++ echo "$IPTINC (user specified)" ++ elif [ "$PKGVER" ]; then ++ IPTINC="$PKGINC" ++ echo "$IPTINC (pkg-config)" ++ elif [ "$NOIPTSRC" ]; then ++ IPTINC= ++ echo "none (default)" + else +- if [ "$PKGINC" ]; then +- IPTINC="$PKGINC" +- echo "$IPTINC (pkg-config)" +- else +- IPTINC="$IPTSRC/include" +- echo "$IPTINC (from source)" +- IPTINC="-I$IPTINC" +- fi ++ IPTINC="$IPTSRC/include" ++ IPTINC="-I$IPTINC" ++ echo "$IPTINC (from source)" + fi + } + +@@ -109,7 +110,16 @@ try_dir2() { + test -d "$1" && try_dir `dirname $1` && return 0 + } + +-iptables_ver() { ++check_pkg_config() { ++ test "$PKGWARN" && return 1 ++ if ! which pkg-config >/dev/null 2>&1; then ++ echo "! You don't have pkg-config, it may be useful to install it." ++ PKGWARN=1 ++ return 1 ++ fi ++ return 0 ++} ++iptables_find_version() { + echo -n "Iptables binary version: " + if [ "$IPTVER" ]; then + echo "$IPTVER (user specified)" +@@ -121,6 +131,7 @@ iptables_ver() { + else + echo "no iptables binary found" + fi ++ check_pkg_config + PKGVER=`pkg-config --modversion xtables 2>/dev/null` + if [ "$PKGVER" ]; then + IPTVER="$PKGVER" +@@ -131,44 +142,90 @@ iptables_ver() { + fi + } + +-iptables_dir() { +- test "$IPTINC" && return 1 +- test "$PKGINC" && return 1 +- +- VER="iptables-$IPTVER" +- if [ "$IPTSRC" ]; then +- echo "User specified source directory: $IPTSRC" +- try_dir $IPTSRC || error "Specified directory is not iptables source.." ++compile_libitp_test() { ++ echo -n "Checking for presence of $@... " ++ echo " ++#define __EXPORTED_HEADERS__ ++#include <$*>" > test.c ++ gcc -c test.c >/dev/null 2>&1 ++ RET=$? ++ if [ $RET = 0 ]; then ++ echo Yes; + else +- echo "Searching for $VER sources.." +- try_dir "./$VER" && return 0 +- try_dir "../$VER" && return 0 +- try_dir "/usr/src/$VER" && return 0 +- try_dirg "iptables" && return 0 +- try_dirg "../iptables" && return 0 +- try_dirg "/usr/src/iptables" && return 0 +- try_dir2 `locate $VER/extensions | head -1` && return 0 +- error "Can not find iptables source directory, try setting it with --ipt-src=" ++ echo No; + fi ++ rm -f test.c test.o ++ return $RET + } + +-iptables_pkg_config() { ++iptables_try_pkgconfig() { + if [ ! "$PKGVER" ]; then ++ check_pkg_config ++ PKGVER=`pkg-config --modversion xtables 2>/dev/null` ++ TRYPKGVER=`pkg-config --modversion xtables 2>/dev/null` + echo -n "pkg-config for version $IPTVER exists: " +- PKGVER=`pkg-config --exact-version=$IPTVER --modversion xtables 2>/dev/null` ++ pkg-config --exact-version=$IPTVER xtables 2>/dev/null + if [ $? = 0 ]; then + echo "Yes" ++ PKGVER=$TRYPKGVER + else +- echo "No (reported: $PKGVER)" +- unset PKGVER ++ if [ "$TRYPKGVER" ]; then ++ echo "No (reported: $TRYPKGVER)" ++ else ++ echo "No" ++ fi + fi + fi + if [ "$PKGVER" ]; then ++ check_pkg_config ++ PKGVER=`pkg-config --modversion xtables 2>/dev/null` + PKGINC=`pkg-config --cflags xtables` + PKGLIB=`pkg-config --variable=xtlibdir xtables` +- IPTCFLAGS="-DXTABLES" + else +- IPTCFLAGS="-DIPTABLES_VERSION=\\\\\"$IPTVER\\\\\"" ++ # Newer versions of iptables should not have -I/kernel/include! ++ # So I assume that newer version will have correct pkg-config set up ++ # and if not, then it's older who need it. ++ IPTCFLAGS="-I$KDIR/include -DIPTABLES_VERSION=\\\\\"$IPTVER\\\\\"" ++ fi ++ if compile_libitp_test xtables.h; then ++ IPTCFLAGS="-DXTABLES $IPTCFLAGS" ++ elif ! compile_libitp_test iptables.h; then ++ echo "! Iptables headers not found. You may need to specify --ipt-inc=..." ++ if [ -s /etc/debian_version ]; then ++ echo "! " ++ echo "! Under Debian simply run this:" ++ echo "! root# apt-get install iptables-dev pkg-config" ++ elif [ -s /etc/redhat-release ]; then ++ echo "! " ++ arch=.`uname -m` ++ echo "! Under Centos simply run this:" ++ echo "! root# yum install iptables-devel$arch pkgconfig" ++ fi ++ exit 1 ++ fi ++ ++} ++ ++iptables_find_src() { ++ test "$IPTINC" && return 1 ++ test "$PKGVER" && return 1 ++ ++ VER="iptables-$IPTVER" ++ if [ "$IPTSRC" ]; then ++ echo "User specified source directory: $IPTSRC" ++ try_dir $IPTSRC || error "Specified directory is not iptables source.." ++ else ++ echo "Searching for $VER sources.." ++ try_dir "./$VER" && return 0 ++ try_dir "../$VER" && return 0 ++ try_dir "/usr/src/$VER" && return 0 ++ try_dirg "iptables" && return 0 ++ try_dirg "../iptables" && return 0 ++ try_dirg "/usr/src/iptables" && return 0 ++ try_dir2 `locate $VER/extensions 2>/dev/null | head -1` && return 0 ++ echo "! Can not find iptables source directory, you may try setting it with --ipt-src=" ++ echo "! This is not fatal error, yet. Will be just using default include dir." ++ NOIPTSRC=1 + fi + } + +@@ -206,18 +263,110 @@ do + esac + done + +-test "$KVERSION" || KVERSION=`uname -r` +-echo Kernel version: $KVERSION ++kernel_find_version() { ++ KHOW=requested ++ test "$KVERSION" && return 0 ++ ++ if grep -q '#.*Debian' /proc/version; then ++ KHOW=proc ++ KVERSION=`sed -n 's/.*#.*Debian \([0-9\.]\+\)-.*/\1/p' /proc/version` ++ KLIBMOD=`uname -r` ++ else ++ KHOW=uname ++ KVERSION=`uname -r` ++ fi ++ test "$KDIR" || return 0 ++ ++ test -s $KDIR/Makefile || return 1 ++ test -s $KDIR/include/config/kernel.release || return 1 ++ KVERSION=`cat $KDIR/include/config/kernel.release` ++ KHOW=sources ++} ++ ++kernel_check_src() { ++ if [ -s "$1/Makefile" ]; then ++ KDIR="$1" ++ return 0 ++ fi ++ return 1 ++} ++ ++kernel_find_source() { ++ KSHOW=requested ++ test "$KDIR" && return 0 ++ KSHOW=found ++ kernel_check_src /lib/modules/$KLIBMOD/build && return 0 ++ kernel_check_src /lib/modules/$KVERSION/build && return 0 ++ kernel_check_src /usr/src/kernels/$KVERSION && return 0 ++ kernel_check_src /usr/src/linux-$KVERSION && return 0 ++ echo "! Linux source not found. Don't panic. You may specify kernel source" ++ echo "! directory with --kdir=..., or try to install kernel-devel package," ++ echo "! or just raw sources for linux-$KVERSION from kernel.org." ++ if grep -q -i centos /proc/version 2>/dev/null; then ++ echo "! " ++ arch=.`uname -m` ++ echo "! Under Centos simply run this:" ++ echo "! root# yum install kernel-devel iptables-devel$arch pkgconfig" ++ fi ++ if grep -q -i debian /proc/version 2>/dev/null; then ++ echo "! " ++ echo "! Under Debian simply run this:" ++ echo "! root# apt-get install module-assistant iptables-dev pkg-config" ++ echo "! root# m-a prepare" ++ fi ++ exit 1 ++} ++ ++kernel_check_consistency() { ++ if test -s $KDIR/include/config/kernel.release; then ++ SRCVER=`cat $KDIR/include/config/kernel.release` ++ test "$KVERSION" != "$SRCVER" && error "$KHOW kernel version ($KVERSION) and $KSHOW version of kernel source ($SRCVER) doesn't match!\n!" \ ++ "You may try to specify only kernel source tree with --kdir=$KDIR\n!" \ ++ "and configure will pick up version properly." ++ else ++ test -e "$KDIR/.config" || error ".config in kernel source not found, run make menuconfig in $KDIR" ++ test -d "$KDIR/include/config" || error "kernel is not prepared, run make prepare modules_prepare in $KDIR" ++ fi ++} ++ ++kconfig() { ++ KCONFIG=$KDIR/.config ++ if ! grep -q "^$1=" $KCONFIG 2>/dev/null; then ++ if [ "$KCONFIGREPORTED" != true ]; then ++ KCONFIGREPORTED=true ++ echo Kernel config file checked: $KCONFIG ++ echo ++ fi ++ echo "! Attention: $1 is undefined in your kernel configuration" ++ echo "! Without this option enabled $2 will not work." ++ echo ++ fi ++} ++ ++kernel_check_config() { ++ kconfig CONFIG_SYSCTL "sysctl interface" ++ kconfig CONFIG_PROC_FS "proc interface" ++ kconfig CONFIG_NF_NAT_NEEDED "natevents" ++ kconfig CONFIG_NF_CONNTRACK_EVENTS "natevents" ++ kconfig CONFIG_NF_CONNTRACK_MARK "connmark tracking" ++ kconfig CONFIG_IPV6 "IPv6" ++ kconfig CONFIG_IP6_NF_IPTABLES "ip6tables target" ++} + +-test "$KDIR" || KDIR=/lib/modules/$KVERSION/build +-echo Kernel sources: $KDIR ++kernel_find_version #KVERSION ++test "$KLIBMOD" || KLIBMOD=$KVERSION ++echo "Kernel version: $KVERSION ($KHOW)" ++kernel_find_source #KDIR ++echo "Kernel sources: $KDIR ($KSHOW)" ++kernel_check_consistency ++kernel_check_config + + test "$IPTBIN" || IPTBIN=`which iptables` + +-iptables_ver #IPTVER +-iptables_pkg_config +-iptables_dir #IPTSRC +-iptables_src_version #check IPTSRC match to IPTVER ++iptables_find_version #IPTVER ++iptables_try_pkgconfig #try to configure from pkg-config ++iptables_find_src #IPTSRC ++iptables_src_version #check that IPTSRC match to IPTVER + iptables_inc #IPTINC + iptables_modules #IPTLIB + +@@ -225,7 +374,6 @@ REPLACE="\ + s!@KVERSION@!$KVERSION!;\ + s!@KDIR@!$KDIR!;\ + s!@IPTABLES_VERSION@!$IPTVER!;\ +-s!@IPTABLES_INCLUDES@!$IPTINC!;\ + s!@IPTABLES_CFLAGS@!$IPTCFLAGS $IPTINC!;\ + s!@IPTABLES_MODULES@!$IPTLIB!" + +diff --git a/ipt_NETFLOW.c b/ipt_NETFLOW.c +index d4c91e1..ad974c5 100644 +--- a/ipt_NETFLOW.c ++++ b/ipt_NETFLOW.c +@@ -1,6 +1,6 @@ + /* + * This is NetFlow exporting module (NETFLOW target) for linux +- * (c) 2008-2012 <abc@telekom.ru> ++ * (c) 2008-2013 <abc@telekom.ru> + * + * + * This program is free software: you can redistribute it and/or modify +@@ -18,8 +18,6 @@ + * + */ + +-//#define RAW_PROMISC_HACK +- + #include <linux/module.h> + #include <linux/skbuff.h> + #include <linux/proc_fs.h> +@@ -31,16 +29,26 @@ + #include <linux/icmp.h> + #include <linux/igmp.h> + #include <linux/inetdevice.h> +-#include <linux/jhash.h> ++#include <linux/hash.h> ++#include <linux/delay.h> ++#include <linux/spinlock_types.h> + #include <net/icmp.h> + #include <net/ip.h> ++#include <net/ipv6.h> + #include <net/tcp.h> + #include <net/route.h> ++#include <net/ip6_fib.h> + #include <net/dst.h> + #include <linux/netfilter_ipv4/ip_tables.h> ++#if defined(CONFIG_NF_NAT_NEEDED) || defined(CONFIG_NF_CONNTRACK_MARK) ++#include <linux/notifier.h> ++#include <net/netfilter/nf_conntrack.h> ++#include <net/netfilter/nf_conntrack_core.h> ++#endif + #include <linux/version.h> + #include <asm/unaligned.h> + #include "ipt_NETFLOW.h" ++#include "murmur3.h" + #ifdef CONFIG_BRIDGE_NETFILTER + #include <linux/netfilter_bridge.h> + #endif +@@ -74,41 +82,66 @@ + #define ipt_target xt_target + #endif + +-#define IPT_NETFLOW_VERSION "1.8" ++#define IPT_NETFLOW_VERSION "1.8.2" /* Note that if you are using git, you ++ will see version in other format. */ ++#include "version.h" ++#ifdef GITVERSION ++#undef IPT_NETFLOW_VERSION ++#define IPT_NETFLOW_VERSION GITVERSION ++#endif + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("<abc@telekom.ru>"); + MODULE_DESCRIPTION("iptables NETFLOW target module"); + MODULE_VERSION(IPT_NETFLOW_VERSION); ++MODULE_ALIAS("ip6t_NETFLOW"); + + #define DST_SIZE 256 + static char destination_buf[DST_SIZE] = "127.0.0.1:2055"; + static char *destination = destination_buf; +-module_param(destination, charp, 0400); ++module_param(destination, charp, 0444); + MODULE_PARM_DESC(destination, "export destination ipaddress:port"); + + static int inactive_timeout = 15; +-module_param(inactive_timeout, int, 0600); ++module_param(inactive_timeout, int, 0644); + MODULE_PARM_DESC(inactive_timeout, "inactive flows timeout in seconds"); + + static int active_timeout = 30 * 60; +-module_param(active_timeout, int, 0600); ++module_param(active_timeout, int, 0644); + MODULE_PARM_DESC(active_timeout, "active flows timeout in seconds"); + + static int debug = 0; +-module_param(debug, int, 0600); ++module_param(debug, int, 0644); + MODULE_PARM_DESC(debug, "debug verbosity level"); + + static int sndbuf; +-module_param(sndbuf, int, 0400); ++module_param(sndbuf, int, 0444); + MODULE_PARM_DESC(sndbuf, "udp socket SNDBUF size"); + ++static int protocol = 5; ++module_param(protocol, int, 0444); ++MODULE_PARM_DESC(protocol, "netflow protocol version (5, 9, 10)"); ++ ++static unsigned int refresh_rate = 20; ++module_param(refresh_rate, uint, 0644); ++MODULE_PARM_DESC(refresh_rate, "NetFlow v9/IPFIX refresh rate (packets)"); ++ ++static unsigned int timeout_rate = 30; ++module_param(timeout_rate, uint, 0644); ++MODULE_PARM_DESC(timeout_rate, "NetFlow v9/IPFIX timeout rate (minutes)"); ++ ++#ifdef CONFIG_NF_NAT_NEEDED ++static int natevents = 0; ++module_param(natevents, int, 0444); ++MODULE_PARM_DESC(natevents, "send NAT Events"); ++#endif ++ + static int hashsize; +-module_param(hashsize, int, 0400); ++module_param(hashsize, int, 0444); + MODULE_PARM_DESC(hashsize, "hash table size"); + + static int maxflows = 2000000; +-module_param(maxflows, int, 0600); ++module_param(maxflows, int, 0644); + MODULE_PARM_DESC(maxflows, "maximum number of flows"); + static int peakflows = 0; + static unsigned long peakflows_at; +@@ -121,22 +154,52 @@ MODULE_PARM_DESC(aggregation, "aggregation ruleset"); + + static DEFINE_PER_CPU(struct ipt_netflow_stat, ipt_netflow_stat); + static LIST_HEAD(usock_list); +-static DEFINE_RWLOCK(sock_lock); ++static DEFINE_MUTEX(sock_lock); + ++#define LOCK_COUNT (1<<8) ++#define LOCK_COUNT_MASK (LOCK_COUNT-1) ++static spinlock_t htable_locks[LOCK_COUNT] = { ++ [0 ... LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(htable_locks) ++}; ++static DEFINE_RWLOCK(htable_rwlock); /* global lock to protect htable_locks change */ + static unsigned int ipt_netflow_hash_rnd; +-struct hlist_head *ipt_netflow_hash __read_mostly; /* hash table memory */ ++static struct hlist_head *ipt_netflow_hash __read_mostly; /* hash table memory */ + static unsigned int ipt_netflow_hash_size __read_mostly = 0; /* buckets */ + static LIST_HEAD(ipt_netflow_list); /* all flows */ ++static DEFINE_SPINLOCK(hlist_lock); /* should almost always be locked w/o _bh */ + static LIST_HEAD(aggr_n_list); + static LIST_HEAD(aggr_p_list); + static DEFINE_RWLOCK(aggr_lock); ++#ifdef CONFIG_NF_NAT_NEEDED ++static LIST_HEAD(nat_list); /* nat events */ ++static DEFINE_SPINLOCK(nat_lock); ++static unsigned long nat_events_start = 0; ++static unsigned long nat_events_stop = 0; ++#endif + static struct kmem_cache *ipt_netflow_cachep __read_mostly; /* ipt_netflow memory */ + static atomic_t ipt_netflow_count = ATOMIC_INIT(0); +-static DEFINE_SPINLOCK(ipt_netflow_lock); /* hash table lock */ + +-static long long pdu_packets = 0, pdu_traf = 0; +-static struct netflow5_pdu pdu; +-static unsigned long pdu_ts_mod; ++static long long pdu_packets = 0, pdu_traf = 0; /* how much accounted traffic in pdu */ ++static unsigned int pdu_count = 0; ++static unsigned int pdu_seq = 0; ++static unsigned int pdu_data_records = 0; ++static unsigned int pdu_tpl_records = 0; ++static unsigned long pdu_ts_mod; /* ts of last flow */ ++static union { ++ struct netflow5_pdu v5; ++ struct netflow9_pdu v9; ++ struct ipfix_pdu ipfix; ++} pdu; ++static int engine_id = 0; /* Observation Domain */ ++static __u8 *pdu_data_used; ++static __u8 *pdu_high_wm; /* high watermark */ ++static unsigned int pdu_max_size; /* sizeof pdu */ ++static struct flowset_data *pdu_flowset = NULL; /* current data flowset */ ++ ++static void (*netflow_export_flow)(struct ipt_netflow *nf); ++static void (*netflow_export_pdu)(void); /* called on timeout */ ++static void netflow_switch_version(int ver); ++ + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + static void netflow_work_fn(void *work); + static DECLARE_WORK(netflow_work, netflow_work_fn, NULL); +@@ -146,19 +209,26 @@ static DECLARE_DELAYED_WORK(netflow_work, netflow_work_fn); + #endif + static struct timer_list rate_timer; + ++#define TCP_SYN_ACK 0x12 + #define TCP_FIN_RST 0x05 + + static long long sec_prate = 0, sec_brate = 0; + static long long min_prate = 0, min_brate = 0; + static long long min5_prate = 0, min5_brate = 0; +-static unsigned int metric = 10, min15_metric = 10, min5_metric = 10, min_metric = 10; /* hash metrics */ ++static unsigned int metric = 100, min15_metric = 100, min5_metric = 100, min_metric = 100; /* hash metrics */ + + static int set_hashsize(int new_size); + static void destination_removeall(void); + static int add_destinations(char *ptr); + static void aggregation_remove(struct list_head *list); + static int add_aggregation(char *ptr); +-static void netflow_scan_and_export(int flush); ++static int netflow_scan_and_export(int flush); ++enum { ++ DONT_FLUSH, AND_FLUSH ++}; ++static int template_ids = FLOWSET_DATA_FIRST; ++static int tpl_count = 0; /* how much active templates */ ++ + + static inline __be32 bits2mask(int bits) { + return (bits? 0xffffffff << (32 - bits) : 0); +@@ -175,28 +245,46 @@ static inline int mask2bits(__be32 mask) { + /* under that lock worker is always stopped and not rescheduled, + * and we can call worker sub-functions manually */ + static DEFINE_MUTEX(worker_lock); +-static inline void __start_scan_worker(void) ++#define MIN_DELAY 1 ++#define MAX_DELAY (HZ / 10) ++static int worker_delay = HZ / 10; ++static inline void _schedule_scan_worker(const int status) + { +- schedule_delayed_work(&netflow_work, HZ / 10); ++ /* rudimentary congestion avoidance */ ++ if (status > 0) ++ worker_delay -= status; ++ else if (status < 0) ++ worker_delay /= 2; ++ else ++ worker_delay++; ++ if (worker_delay < MIN_DELAY) ++ worker_delay = MIN_DELAY; ++ else if (worker_delay > MAX_DELAY) ++ worker_delay = MAX_DELAY; ++ schedule_delayed_work(&netflow_work, worker_delay); + } + +-static inline void start_scan_worker(void) ++/* This is only called soon after pause_scan_worker. */ ++static inline void cont_scan_worker(void) + { +- __start_scan_worker(); ++ _schedule_scan_worker(0); + mutex_unlock(&worker_lock); + } + +-/* we always stop scanner before write_lock(&sock_lock) +- * to let it never hold that spin lock */ +-static inline void __stop_scan_worker(void) ++static inline void _unschedule_scan_worker(void) + { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) ++ cancel_rearming_delayed_work(&netflow_work); ++#else + cancel_delayed_work_sync(&netflow_work); ++#endif + } + +-static inline void stop_scan_worker(void) ++/* This is only used for quick pause (in procctl). */ ++static inline void pause_scan_worker(void) + { + mutex_lock(&worker_lock); +- __stop_scan_worker(); ++ _unschedule_scan_worker(); + } + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +@@ -223,11 +311,14 @@ static int nf_seq_show(struct seq_file *seq, void *v) + int snum = 0; + int peak = (jiffies - peakflows_at) / HZ; + +- seq_printf(seq, "Flows: active %u (peak %u reached %ud%uh%um ago), mem %uK\n", ++ seq_printf(seq, "ipt_NETFLOW version " IPT_NETFLOW_VERSION ", srcversion %s\n", ++ THIS_MODULE->srcversion); ++ seq_printf(seq, "Flows: active %u (peak %u reached %ud%uh%um ago), mem %uK, worker delay %d/%d.\n", + nr_flows, + peakflows, + peak / (60 * 60 * 24), (peak / (60 * 60)) % 24, (peak / 60) % 60, +- (unsigned int)((nr_flows * sizeof(struct ipt_netflow)) >> 10)); ++ (unsigned int)((nr_flows * sizeof(struct ipt_netflow)) >> 10), ++ worker_delay, HZ); + + for_each_present_cpu(cpu) { + struct ipt_netflow_stat *st = &per_cpu(ipt_netflow_stat, cpu); +@@ -252,93 +343,123 @@ static int nf_seq_show(struct seq_file *seq, void *v) + } + + #define FFLOAT(x, prec) (int)(x) / prec, (int)(x) % prec +- seq_printf(seq, "Hash: size %u (mem %uK), metric %d.%d, %d.%d, %d.%d, %d.%d. MemTraf: %llu pkt, %llu K (pdu %llu, %llu).\n", +- ipt_netflow_hash_size, +- (unsigned int)((ipt_netflow_hash_size * sizeof(struct hlist_head)) >> 10), +- FFLOAT(metric, 10), +- FFLOAT(min_metric, 10), +- FFLOAT(min5_metric, 10), +- FFLOAT(min15_metric, 10), +- pkt_total - pkt_out + pdu_packets, +- (traf_total - traf_out + pdu_traf) >> 10, +- pdu_packets, +- pdu_traf); +- +- seq_printf(seq, "Timeout: active %d, inactive %d. Maxflows %u\n", +- active_timeout, +- inactive_timeout, +- maxflows); +- +- seq_printf(seq, "Rate: %llu bits/sec, %llu packets/sec; Avg 1 min: %llu bps, %llu pps; 5 min: %llu bps, %llu pps\n", +- sec_brate, sec_prate, min_brate, min_prate, min5_brate, min5_prate); +- +- seq_printf(seq, "cpu# stat: <search found new, trunc frag alloc maxflows>, sock: <ok fail cberr, bytes>, traffic: <pkt, bytes>, drop: <pkt, bytes>\n"); +- +- seq_printf(seq, "Total stat: %6llu %6llu %6llu, %4u %4u %4u %4u, sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", +- (unsigned long long)searched, +- (unsigned long long)found, +- (unsigned long long)notfound, +- truncated, frags, alloc_err, maxflows_err, +- send_success, send_failed, sock_errors, +- (unsigned long long)exported_size >> 10, +- (unsigned long long)pkt_total, (unsigned long long)traf_total >> 20, +- (unsigned long long)pkt_drop, (unsigned long long)traf_drop >> 10); ++ seq_printf(seq, "Hash: size %u (mem %uK), metric %d.%02d [%d.%02d, %d.%02d, %d.%02d]." ++ " MemTraf: %llu pkt, %llu K (pdu %llu, %llu), Out %llu pkt, %llu K.\n", ++ ipt_netflow_hash_size, ++ (unsigned int)((ipt_netflow_hash_size * sizeof(struct hlist_head)) >> 10), ++ FFLOAT(metric, 100), ++ FFLOAT(min_metric, 100), ++ FFLOAT(min5_metric, 100), ++ FFLOAT(min15_metric, 100), ++ pkt_total - pkt_out + pdu_packets, ++ (traf_total - traf_out + pdu_traf) >> 10, ++ pdu_packets, ++ pdu_traf, ++ pkt_out, ++ traf_out >> 10); ++ ++ seq_printf(seq, "Rate: %llu bits/sec, %llu packets/sec;" ++ " Avg 1 min: %llu bps, %llu pps; 5 min: %llu bps, %llu pps\n", ++ sec_brate, sec_prate, min_brate, min_prate, min5_brate, min5_prate); ++ ++ seq_printf(seq, "cpu# stat: <search found new [metric], trunc frag alloc maxflows>," ++ " sock: <ok fail cberr, bytes>, traffic: <pkt, bytes>, drop: <pkt, bytes>\n"); ++ ++#define SAFEDIV(x,y) ((y)? ({ u64 __tmp = x; do_div(__tmp, y); (int)__tmp; }) : 0) ++ seq_printf(seq, "Total stat: %6llu %6llu %6llu [%d.%02d], %4u %4u %4u %4u," ++ " sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", ++ searched, ++ (unsigned long long)found, ++ (unsigned long long)notfound, ++ FFLOAT(SAFEDIV(100LL * (searched + found + notfound), (found + notfound)), 100), ++ truncated, frags, alloc_err, maxflows_err, ++ send_success, send_failed, sock_errors, ++ (unsigned long long)exported_size >> 10, ++ (unsigned long long)pkt_total, (unsigned long long)traf_total >> 20, ++ (unsigned long long)pkt_drop, (unsigned long long)traf_drop >> 10); + + if (num_present_cpus() > 1) { + for_each_present_cpu(cpu) { + struct ipt_netflow_stat *st; + + st = &per_cpu(ipt_netflow_stat, cpu); +- seq_printf(seq, "cpu%u stat: %6llu %6llu %6llu, %4u %4u %4u %4u, sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", +- cpu, +- (unsigned long long)st->searched, +- (unsigned long long)st->found, +- (unsigned long long)st->notfound, +- st->truncated, st->frags, st->alloc_err, st->maxflows_err, +- st->send_success, st->send_failed, st->sock_errors, +- (unsigned long long)st->exported_size >> 10, +- (unsigned long long)st->pkt_total, (unsigned long long)st->traf_total >> 20, +- (unsigned long long)st->pkt_drop, (unsigned long long)st->traf_drop >> 10); ++ seq_printf(seq, "cpu%u stat: %6llu %6llu %6llu [%d.%02d], %4u %4u %4u %4u," ++ " sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", ++ cpu, ++ (unsigned long long)st->searched, ++ (unsigned long long)st->found, ++ (unsigned long long)st->notfound, ++ FFLOAT(SAFEDIV(100LL * (st->searched + st->found + st->notfound), (st->found + st->notfound)), 100), ++ st->truncated, st->frags, st->alloc_err, st->maxflows_err, ++ st->send_success, st->send_failed, st->sock_errors, ++ (unsigned long long)st->exported_size >> 10, ++ (unsigned long long)st->pkt_total, (unsigned long long)st->traf_total >> 20, ++ (unsigned long long)st->pkt_drop, (unsigned long long)st->traf_drop >> 10); + } + } + +- read_lock(&sock_lock); ++ seq_printf(seq, "Protocol version %d", protocol); ++ if (protocol == 10) ++ seq_printf(seq, " (ipfix)"); ++ else ++ seq_printf(seq, " (netflow)"); ++ if (protocol >= 9) ++ seq_printf(seq, ", refresh-rate %u, timeout-rate %u, (templates %d, active %d)", ++ refresh_rate, timeout_rate, template_ids - FLOWSET_DATA_FIRST, tpl_count); ++ ++ seq_printf(seq, ". Timeouts: active %d, inactive %d. Maxflows %u\n", ++ active_timeout, ++ inactive_timeout, ++ maxflows); ++ ++#ifdef CONFIG_NF_NAT_NEEDED ++ seq_printf(seq, "Natevents %s, count start %lu, stop %lu.\n", natevents? "enabled" : "disabled", ++ nat_events_start, nat_events_stop); ++#endif ++ ++ mutex_lock(&sock_lock); + list_for_each_entry(usock, &usock_list, list) { +- struct sock *sk = usock->sock->sk; +- +- seq_printf(seq, "sock%d: %u.%u.%u.%u:%u, sndbuf %u, filled %u, peak %u; err: sndbuf reached %u, other %u\n", +- snum, +- usock->ipaddr >> 24, +- (usock->ipaddr >> 16) & 255, +- (usock->ipaddr >> 8) & 255, +- usock->ipaddr & 255, +- usock->port, +- sk->sk_sndbuf, +- atomic_read(&sk->sk_wmem_alloc), +- atomic_read(&usock->wmem_peak), +- atomic_read(&usock->err_full), +- atomic_read(&usock->err_other)); ++ seq_printf(seq, "sock%d: %u.%u.%u.%u:%u", ++ snum, ++ HIPQUAD(usock->ipaddr), ++ usock->port); ++ if (usock->sock) { ++ struct sock *sk = usock->sock->sk; ++ ++ seq_printf(seq, ", sndbuf %u, filled %u, peak %u;" ++ " err: sndbuf reached %u, connect %u, other %u\n", ++ sk->sk_sndbuf, ++ atomic_read(&sk->sk_wmem_alloc), ++ atomic_read(&usock->wmem_peak), ++ atomic_read(&usock->err_full), ++ atomic_read(&usock->err_connect), ++ atomic_read(&usock->err_other)); ++ } else ++ seq_printf(seq, " unconnected (%u attempts).\n", ++ atomic_read(&usock->err_connect)); + snum++; + } +- read_unlock(&sock_lock); ++ mutex_unlock(&sock_lock); + + read_lock_bh(&aggr_lock); + snum = 0; + list_for_each_entry(aggr_n, &aggr_n_list, list) { +- seq_printf(seq, "aggr#%d net: match %u.%u.%u.%u/%d strip %d\n", +- snum, +- HIPQUAD(aggr_n->addr), +- mask2bits(aggr_n->mask), +- mask2bits(aggr_n->aggr_mask)); ++ seq_printf(seq, "aggr#%d net: match %u.%u.%u.%u/%d strip %d (usage %u)\n", ++ snum, ++ HIPQUAD(aggr_n->addr), ++ mask2bits(aggr_n->mask), ++ mask2bits(aggr_n->aggr_mask), ++ atomic_read(&aggr_n->usage)); + snum++; + } + snum = 0; + list_for_each_entry(aggr_p, &aggr_p_list, list) { +- seq_printf(seq, "aggr#%d port: ports %u-%u replace %u\n", +- snum, +- aggr_p->port1, +- aggr_p->port2, +- aggr_p->aggr_port); ++ seq_printf(seq, "aggr#%d port: ports %u-%u replace %u (usage %u)\n", ++ snum, ++ aggr_p->port1, ++ aggr_p->port2, ++ aggr_p->aggr_port, ++ atomic_read(&aggr_p->usage)); + snum++; + } + read_unlock_bh(&aggr_lock); +@@ -367,8 +488,13 @@ static struct file_operations nf_seq_fops = { + #define BEFORE2632(x,y) + #endif + ++/* PAX need to know that we are allowed to write */ ++#ifndef CONSTIFY_PLUGIN ++#define ctl_table_no_const ctl_table ++#endif ++ + /* sysctl /proc/sys/net/netflow */ +-static int hsize_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) ++static int hsize_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) + void __user *buffer, size_t *lenp, loff_t *fpos) + { + void *orig = ctl->data; +@@ -386,20 +512,21 @@ static int hsize_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp + return ret; + } + +-static int sndbuf_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) ++static int sndbuf_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) + void __user *buffer, size_t *lenp, loff_t *fpos) + { + int ret; + struct ipt_netflow_sock *usock; +- +- read_lock(&sock_lock); ++ ++ mutex_lock(&sock_lock); + if (list_empty(&usock_list)) { +- read_unlock(&sock_lock); ++ mutex_unlock(&sock_lock); + return -ENOENT; + } + usock = list_first_entry(&usock_list, struct ipt_netflow_sock, list); +- sndbuf = usock->sock->sk->sk_sndbuf; +- read_unlock(&sock_lock); ++ if (usock->sock) ++ sndbuf = usock->sock->sk->sk_sndbuf; ++ mutex_unlock(&sock_lock); + + ctl->data = &sndbuf; + ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); +@@ -407,13 +534,14 @@ static int sndbuf_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *fil + return ret; + if (sndbuf < SOCK_MIN_SNDBUF) + sndbuf = SOCK_MIN_SNDBUF; +- stop_scan_worker(); +- write_lock(&sock_lock); ++ pause_scan_worker(); ++ mutex_lock(&sock_lock); + list_for_each_entry(usock, &usock_list, list) { +- usock->sock->sk->sk_sndbuf = sndbuf; ++ if (usock->sock) ++ usock->sock->sk->sk_sndbuf = sndbuf; + } +- write_unlock(&sock_lock); +- start_scan_worker(); ++ mutex_unlock(&sock_lock); ++ cont_scan_worker(); + return ret; + } + +@@ -424,10 +552,10 @@ static int destination_procctl(ctl_table *ctl, int write, BEFORE2632(struct file + + ret = proc_dostring(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); + if (ret >= 0 && write) { +- stop_scan_worker(); ++ pause_scan_worker(); + destination_removeall(); + add_destinations(destination_buf); +- start_scan_worker(); ++ cont_scan_worker(); + } + return ret; + } +@@ -446,13 +574,12 @@ static int aggregation_procctl(ctl_table *ctl, int write, BEFORE2632(struct file + return ret; + } + +-static int flush_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) ++static int flush_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) + void __user *buffer, size_t *lenp, loff_t *fpos) + { + int ret; +- int val; ++ int val = 0; + +- val = 0; + ctl->data = &val; + ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); + +@@ -461,14 +588,67 @@ static int flush_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp + + if (val > 0) { + printk(KERN_INFO "ipt_NETFLOW: forced flush\n"); +- stop_scan_worker(); +- netflow_scan_and_export(1); +- start_scan_worker(); ++ pause_scan_worker(); ++ netflow_scan_and_export(AND_FLUSH); ++ cont_scan_worker(); ++ } ++ ++ return ret; ++} ++ ++static int protocol_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) ++ void __user *buffer, size_t *lenp, loff_t *fpos) ++{ ++ int ret; ++ int ver = protocol; ++ ++ ctl->data = &ver; ++ ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); ++ ++ if (!write) ++ return ret; ++ ++ switch (ver) { ++ case 5: ++ case 9: ++ case 10: ++ printk(KERN_INFO "ipt_NETFLOW: forced flush (protocol version change)\n"); ++ pause_scan_worker(); ++ netflow_scan_and_export(AND_FLUSH); ++ netflow_switch_version(ver); ++ cont_scan_worker(); ++ break; ++ default: ++ return -EPERM; + } + + return ret; + } + ++#ifdef CONFIG_NF_NAT_NEEDED ++static void register_ct_events(void); ++static void unregister_ct_events(void); ++static int natevents_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) ++ void __user *buffer, size_t *lenp, loff_t *fpos) ++{ ++ int ret; ++ int val = natevents; ++ ++ ctl->data = &val; ++ ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); ++ ++ if (!write) ++ return ret; ++ ++ if (natevents && !val) ++ unregister_ct_events(); ++ else if (!natevents && val) ++ register_ct_events(); ++ ++ return ret; ++} ++#endif ++ + static struct ctl_table_header *netflow_sysctl_header; + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) +@@ -547,6 +727,38 @@ static struct ctl_table netflow_sysctl_table[] = { + .maxlen = sizeof(int), + .proc_handler = &flush_procctl, + }, ++ { ++ _CTL_NAME(10) ++ .procname = "protocol", ++ .mode = 0644, ++ .maxlen = sizeof(int), ++ .proc_handler = &protocol_procctl, ++ }, ++ { ++ _CTL_NAME(11) ++ .procname = "refresh-rate", ++ .mode = 0644, ++ .data = &refresh_rate, ++ .maxlen = sizeof(int), ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ _CTL_NAME(12) ++ .procname = "timeout-rate", ++ .mode = 0644, ++ .data = &timeout_rate, ++ .maxlen = sizeof(int), ++ .proc_handler = &proc_dointvec, ++ }, ++#ifdef CONFIG_NF_NAT_NEEDED ++ { ++ _CTL_NAME(13) ++ .procname = "natevents", ++ .mode = 0644, ++ .maxlen = sizeof(int), ++ .proc_handler = &natevents_procctl, ++ }, ++#endif + { } + }; + +@@ -588,18 +800,69 @@ static struct ctl_path netflow_sysctl_path[] = { + static void sk_error_report(struct sock *sk) + { + /* clear connection refused errors if any */ +- write_lock_bh(&sk->sk_callback_lock); + if (debug > 1) +- printk(KERN_INFO "NETFLOW: socket error <%d>\n", sk->sk_err); ++ printk(KERN_INFO "ipt_NETFLOW: socket error <%d>\n", sk->sk_err); + sk->sk_err = 0; + NETFLOW_STAT_INC(sock_errors); +- write_unlock_bh(&sk->sk_callback_lock); + return; + } + ++static struct socket *_usock_alloc(const __be32 ipaddr, const unsigned short port) ++{ ++ struct sockaddr_in sin; ++ struct socket *sock; ++ int error; ++ ++ if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { ++ printk(KERN_ERR "ipt_NETFLOW: sock_create_kern error %d\n", -error); ++ return NULL; ++ } ++ sock->sk->sk_allocation = GFP_ATOMIC; ++ sock->sk->sk_prot->unhash(sock->sk); /* hidden from input */ ++ sock->sk->sk_error_report = &sk_error_report; /* clear ECONNREFUSED */ ++ if (sndbuf) ++ sock->sk->sk_sndbuf = sndbuf; ++ else ++ sndbuf = sock->sk->sk_sndbuf; ++ memset(&sin, 0, sizeof(sin)); ++ sin.sin_family = AF_INET; ++ sin.sin_addr.s_addr = htonl(ipaddr); ++ sin.sin_port = htons(port); ++ if ((error = sock->ops->connect(sock, (struct sockaddr *)&sin, ++ sizeof(sin), 0)) < 0) { ++ printk(KERN_ERR "ipt_NETFLOW: error connecting UDP socket %d," ++ " don't worry, will try reconnect later.\n", -error); ++ /* ENETUNREACH when no interfaces */ ++ sock_release(sock); ++ return NULL; ++ } ++ return sock; ++} ++ ++static void usock_connect(struct ipt_netflow_sock *usock, const int sendmsg) ++{ ++ usock->sock = _usock_alloc(usock->ipaddr, usock->port); ++ if (usock->sock) { ++ if (sendmsg || debug) ++ printk(KERN_INFO "ipt_NETFLOW: connected %u.%u.%u.%u:%u\n", ++ HIPQUAD(usock->ipaddr), ++ usock->port); ++ } else { ++ atomic_inc(&usock->err_connect); ++ if (debug) ++ printk(KERN_INFO "ipt_NETFLOW: connect to %u.%u.%u.%u:%u failed%s.\n", ++ HIPQUAD(usock->ipaddr), ++ usock->port, ++ (sendmsg)? " (pdu lost)" : ""); ++ } ++ atomic_set(&usock->wmem_peak, 0); ++ atomic_set(&usock->err_full, 0); ++ atomic_set(&usock->err_other, 0); ++} ++ + // return numbers of sends succeded, 0 if none + /* only called in scan worker path */ +-static int netflow_send_pdu(void *buffer, int len) ++static void netflow_sendmsg(void *buffer, const int len) + { + struct msghdr msg = { .msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL }; + struct kvec iov = { buffer, len }; +@@ -607,9 +870,16 @@ static int netflow_send_pdu(void *buffer, int len) + int snum = 0; + struct ipt_netflow_sock *usock; + ++ mutex_lock(&sock_lock); + list_for_each_entry(usock, &usock_list, list) { ++ if (!usock->sock) ++ usock_connect(usock, 1); ++ if (!usock->sock) { ++ NETFLOW_STAT_INC_ATOMIC(send_failed); ++ continue; ++ } + if (debug) +- printk(KERN_INFO "netflow_send_pdu: sendmsg(%d, %d) [%u %u]\n", ++ printk(KERN_INFO "netflow_sendmsg: sendmsg(%d, %d) [%u %u]\n", + snum, + len, + atomic_read(&usock->sock->sk->sk_wmem_alloc), +@@ -624,7 +894,7 @@ static int netflow_send_pdu(void *buffer, int len) + suggestion = ": increase sndbuf!"; + } else + atomic_inc(&usock->err_other); +- printk(KERN_ERR "netflow_send_pdu[%d]: sendmsg error %d: data loss %llu pkt, %llu bytes%s\n", ++ printk(KERN_ERR "ipt_NETFLOW: sendmsg[%d] error %d: data loss %llu pkt, %llu bytes%s\n", + snum, ret, pdu_packets, pdu_traf, suggestion); + } else { + unsigned int wmem = atomic_read(&usock->sock->sk->sk_wmem_alloc); +@@ -636,98 +906,67 @@ static int netflow_send_pdu(void *buffer, int len) + } + snum++; + } +- return retok; ++ mutex_unlock(&sock_lock); ++ if (retok == 0) { ++ /* not least one send succeded, account stat for dropped packets */ ++ NETFLOW_STAT_ADD_ATOMIC(pkt_drop, pdu_packets); ++ NETFLOW_STAT_ADD_ATOMIC(traf_drop, pdu_traf); ++ } + } + +-static void usock_free(struct ipt_netflow_sock *usock) ++static void usock_close_free(struct ipt_netflow_sock *usock) + { +- printk(KERN_INFO "netflow: remove destination %u.%u.%u.%u:%u (%p)\n", ++ printk(KERN_INFO "ipt_NETFLOW: removed destination %u.%u.%u.%u:%u\n", + HIPQUAD(usock->ipaddr), +- usock->port, +- usock->sock); ++ usock->port); + if (usock->sock) + sock_release(usock->sock); + usock->sock = NULL; +- vfree(usock); ++ vfree(usock); + } + + static void destination_removeall(void) + { +- write_lock(&sock_lock); ++ mutex_lock(&sock_lock); + while (!list_empty(&usock_list)) { + struct ipt_netflow_sock *usock; + + usock = list_entry(usock_list.next, struct ipt_netflow_sock, list); + list_del(&usock->list); +- write_unlock(&sock_lock); +- usock_free(usock); +- write_lock(&sock_lock); ++ mutex_unlock(&sock_lock); ++ usock_close_free(usock); ++ mutex_lock(&sock_lock); + } +- write_unlock(&sock_lock); ++ mutex_unlock(&sock_lock); + } + + static void add_usock(struct ipt_netflow_sock *usock) + { + struct ipt_netflow_sock *sk; + +- /* don't need empty sockets */ +- if (!usock->sock) { +- usock_free(usock); +- return; +- } +- +- write_lock(&sock_lock); ++ mutex_lock(&sock_lock); + /* don't need duplicated sockets */ + list_for_each_entry(sk, &usock_list, list) { + if (sk->ipaddr == usock->ipaddr && + sk->port == usock->port) { +- write_unlock(&sock_lock); +- usock_free(usock); ++ mutex_unlock(&sock_lock); ++ usock_close_free(usock); + return; + } + } + list_add_tail(&usock->list, &usock_list); +- printk(KERN_INFO "netflow: added destination %u.%u.%u.%u:%u\n", ++ printk(KERN_INFO "ipt_NETFLOW: added destination %u.%u.%u.%u:%u%s\n", + HIPQUAD(usock->ipaddr), +- usock->port); +- write_unlock(&sock_lock); +-} +- +-static struct socket *usock_alloc(__be32 ipaddr, unsigned short port) +-{ +- struct sockaddr_in sin; +- struct socket *sock; +- int error; +- +- if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { +- printk(KERN_ERR "netflow: sock_create_kern error %d\n", error); +- return NULL; +- } +- sock->sk->sk_allocation = GFP_ATOMIC; +- sock->sk->sk_prot->unhash(sock->sk); /* hidden from input */ +- sock->sk->sk_error_report = &sk_error_report; /* clear ECONNREFUSED */ +- if (sndbuf) +- sock->sk->sk_sndbuf = sndbuf; +- else +- sndbuf = sock->sk->sk_sndbuf; +- memset(&sin, 0, sizeof(sin)); +- sin.sin_family = AF_INET; +- sin.sin_addr.s_addr = htonl(ipaddr); +- sin.sin_port = htons(port); +- if ((error = sock->ops->connect(sock, (struct sockaddr *)&sin, +- sizeof(sin), 0)) < 0) { +- printk(KERN_ERR "netflow: error connecting UDP socket %d\n", error); +- sock_release(sock); +- return NULL; +- } +- return sock; ++ usock->port, ++ (!usock->sock)? " (unconnected)" : ""); ++ mutex_unlock(&sock_lock); + } + + #define SEPARATORS " ,;\t\n" + static int add_destinations(char *ptr) + { + while (ptr) { +- unsigned char ip[4]; ++ unsigned char ip[4]; + unsigned short port; + + ptr += strspn(ptr, SEPARATORS); +@@ -737,17 +976,15 @@ static int add_destinations(char *ptr) + struct ipt_netflow_sock *usock; + + if (!(usock = vmalloc(sizeof(*usock)))) { +- printk(KERN_ERR "netflow: can't vmalloc socket\n"); ++ printk(KERN_ERR "ipt_NETFLOW: can't vmalloc socket\n"); + return -ENOMEM; + } + + memset(usock, 0, sizeof(*usock)); ++ atomic_set(&usock->err_connect, 0); + usock->ipaddr = ntohl(*(__be32 *)ip); + usock->port = port; +- usock->sock = usock_alloc(usock->ipaddr, port); +- atomic_set(&usock->wmem_peak, 0); +- atomic_set(&usock->err_full, 0); +- atomic_set(&usock->err_other, 0); ++ usock_connect(usock, 0); + add_usock(usock); + } else + break; +@@ -781,7 +1018,7 @@ static int add_aggregation(char *ptr) + LIST_HEAD(old_aggr_list); + + while (ptr && *ptr) { +- unsigned char ip[4]; ++ unsigned char ip[4]; + unsigned int mask; + unsigned int port1, port2; + unsigned int aggr_to; +@@ -792,16 +1029,16 @@ static int add_aggregation(char *ptr) + ip, ip + 1, ip + 2, ip + 3, &mask, &aggr_to) == 6) { + + if (!(aggr_n = vmalloc(sizeof(*aggr_n)))) { +- printk(KERN_ERR "netflow: can't vmalloc aggr\n"); ++ printk(KERN_ERR "ipt_NETFLOW: can't vmalloc aggr\n"); + return -ENOMEM; + } + memset(aggr_n, 0, sizeof(*aggr_n)); + +- aggr_n->addr = ntohl(*(__be32 *)ip); + aggr_n->mask = bits2mask(mask); ++ aggr_n->addr = ntohl(*(__be32 *)ip) & aggr_n->mask; + aggr_n->aggr_mask = bits2mask(aggr_to); + aggr_n->prefix = mask; +- printk(KERN_INFO "netflow: add aggregation [%u.%u.%u.%u/%u=%u]\n", ++ printk(KERN_INFO "ipt_NETFLOW: add aggregation [%u.%u.%u.%u/%u=%u]\n", + HIPQUAD(aggr_n->addr), mask, aggr_to); + list_add_tail(&aggr_n->list, &new_aggr_n_list); + +@@ -809,7 +1046,7 @@ static int add_aggregation(char *ptr) + sscanf(ptr, "%u=%u", &port2, &aggr_to) == 2) { + + if (!(aggr_p = vmalloc(sizeof(*aggr_p)))) { +- printk(KERN_ERR "netflow: can't vmalloc aggr\n"); ++ printk(KERN_ERR "ipt_NETFLOW: can't vmalloc aggr\n"); + return -ENOMEM; + } + memset(aggr_p, 0, sizeof(*aggr_p)); +@@ -817,11 +1054,11 @@ static int add_aggregation(char *ptr) + aggr_p->port1 = port1; + aggr_p->port2 = port2; + aggr_p->aggr_port = aggr_to; +- printk(KERN_INFO "netflow: add aggregation [%u-%u=%u]\n", ++ printk(KERN_INFO "ipt_NETFLOW: add aggregation [%u-%u=%u]\n", + port1, port2, aggr_to); + list_add_tail(&aggr_p->list, &new_aggr_p_list); + } else { +- printk(KERN_ERR "netflow: bad aggregation rule: %s (ignoring)\n", ptr); ++ printk(KERN_ERR "ipt_NETFLOW: bad aggregation rule: %s (ignoring)\n", ptr); + break; + } + +@@ -846,17 +1083,23 @@ static int add_aggregation(char *ptr) + + static inline u_int32_t hash_netflow(const struct ipt_netflow_tuple *tuple) + { +- /* tuple is rounded to u32s */ +- return jhash2((u32 *)tuple, NETFLOW_TUPLE_SIZE, ipt_netflow_hash_rnd) % ipt_netflow_hash_size; ++ return murmur3(tuple, sizeof(struct ipt_netflow_tuple), ipt_netflow_hash_rnd) % ipt_netflow_hash_size; + } + + static struct ipt_netflow * +-ipt_netflow_find(const struct ipt_netflow_tuple *tuple, unsigned int hash) ++ipt_netflow_find(const struct ipt_netflow_tuple *tuple, const unsigned int hash) + { + struct ipt_netflow *nf; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) ++#define compat_hlist_for_each_entry hlist_for_each_entry ++#define compat_hlist_for_each_entry_safe hlist_for_each_entry_safe + struct hlist_node *pos; ++#else /* since 3.9.0 */ ++#define compat_hlist_for_each_entry(a,pos,c,d) hlist_for_each_entry(a,c,d) ++#define compat_hlist_for_each_entry_safe(a,pos,c,d,e) hlist_for_each_entry_safe(a,c,d,e) ++#endif + +- hlist_for_each_entry(nf, pos, &ipt_netflow_hash[hash], hlist) { ++ compat_hlist_for_each_entry(nf, pos, &ipt_netflow_hash[hash], hlist) { + if (ipt_netflow_tuple_equal(tuple, &nf->tuple) && + nf->nr_bytes < FLOW_FULL_WATERMARK) { + NETFLOW_STAT_INC(found); +@@ -868,7 +1111,7 @@ ipt_netflow_find(const struct ipt_netflow_tuple *tuple, unsigned int hash) + return NULL; + } + +-static struct hlist_head *alloc_hashtable(int size) ++static struct hlist_head *alloc_hashtable(const int size) + { + struct hlist_head *hash; + +@@ -879,19 +1122,18 @@ static struct hlist_head *alloc_hashtable(int size) + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&hash[i]); + } else +- printk(KERN_ERR "netflow: unable to vmalloc hash table.\n"); ++ printk(KERN_ERR "ipt_NETFLOW: unable to vmalloc hash table.\n"); + + return hash; + } + +-static int set_hashsize(int new_size) ++static int set_hashsize(const int new_size) + { + struct hlist_head *new_hash, *old_hash; +- unsigned int hash; + struct ipt_netflow *nf; + int rnd; + +- printk(KERN_INFO "netflow: allocating new hash table %u -> %u buckets\n", ++ printk(KERN_INFO "ipt_NETFLOW: allocating new hash table %u -> %u buckets\n", + ipt_netflow_hash_size, new_size); + new_hash = alloc_hashtable(new_size); + if (!new_hash) +@@ -900,19 +1142,24 @@ static int set_hashsize(int new_size) + get_random_bytes(&rnd, 4); + + /* rehash */ +- spin_lock_bh(&ipt_netflow_lock); ++ write_lock_bh(&htable_rwlock); + old_hash = ipt_netflow_hash; + ipt_netflow_hash = new_hash; + ipt_netflow_hash_size = new_size; + ipt_netflow_hash_rnd = rnd; + /* hash_netflow() is dependent on ipt_netflow_hash_* values */ ++ spin_lock(&hlist_lock); + list_for_each_entry(nf, &ipt_netflow_list, list) { ++ unsigned int hash; ++ + hash = hash_netflow(&nf->tuple); + /* hlist_add_head overwrites hlist pointers for this node + * so it's good */ + hlist_add_head(&nf->hlist, &new_hash[hash]); ++ nf->lock = &htable_locks[hash & LOCK_COUNT_MASK]; + } +- spin_unlock_bh(&ipt_netflow_lock); ++ spin_unlock(&hlist_lock); ++ write_unlock_bh(&htable_rwlock); + + vfree(old_hash); + +@@ -920,14 +1167,14 @@ static int set_hashsize(int new_size) + } + + static struct ipt_netflow * +-ipt_netflow_alloc(struct ipt_netflow_tuple *tuple) ++ipt_netflow_alloc(const struct ipt_netflow_tuple *tuple) + { + struct ipt_netflow *nf; + long count; + + nf = kmem_cache_alloc(ipt_netflow_cachep, GFP_ATOMIC); + if (!nf) { +- printk(KERN_ERR "Can't allocate netflow.\n"); ++ printk(KERN_ERR "ipt_NETFLOW: Can't allocate flow.\n"); + return NULL; + } + +@@ -945,13 +1192,15 @@ ipt_netflow_alloc(struct ipt_netflow_tuple *tuple) + + static void ipt_netflow_free(struct ipt_netflow *nf) + { ++ if (IS_DUMMY_FLOW(nf)) ++ return; + atomic_dec(&ipt_netflow_count); + kmem_cache_free(ipt_netflow_cachep, nf); + } + + static struct ipt_netflow * +-init_netflow(struct ipt_netflow_tuple *tuple, +- struct sk_buff *skb, unsigned int hash) ++init_netflow(const struct ipt_netflow_tuple *tuple, ++ const struct sk_buff *skb, const unsigned int hash) + { + struct ipt_netflow *nf; + +@@ -959,93 +1208,774 @@ init_netflow(struct ipt_netflow_tuple *tuple, + if (!nf) + return NULL; + ++ nf->lock = &htable_locks[hash & LOCK_COUNT_MASK]; + hlist_add_head(&nf->hlist, &ipt_netflow_hash[hash]); ++ spin_lock(&hlist_lock); + list_add(&nf->list, &ipt_netflow_list); ++ spin_unlock(&hlist_lock); + + return nf; + } + + /* cook pdu, send, and clean */ + /* only called in scan worker path */ +-static void netflow_export_pdu(void) ++static void netflow_export_pdu_v5(void) + { + struct timeval tv; + int pdusize; + +- if (!pdu.nr_records) ++ if (!pdu_data_records) + return; + + if (debug > 1) +- printk(KERN_INFO "netflow_export_pdu with %d records\n", pdu.nr_records); +- do_gettimeofday(&tv); +- +- pdu.version = htons(5); +- pdu.ts_uptime = htonl(jiffies_to_msecs(jiffies)); +- pdu.ts_usecs = htonl(tv.tv_sec); +- pdu.ts_unsecs = htonl(tv.tv_usec); +- //pdu.eng_type = 0; +- //pdu.eng_id = 0; +- //pdu.padding = 0; ++ printk(KERN_INFO "netflow_export_pdu_v5 with %d records\n", pdu_data_records); + +- pdusize = NETFLOW5_HEADER_SIZE + sizeof(struct netflow5_record) * pdu.nr_records; +- +- /* especially fix nr_records before export */ +- pdu.nr_records = htons(pdu.nr_records); ++ pdu.v5.version = htons(5); ++ pdu.v5.nr_records = htons(pdu_data_records); ++ pdu.v5.ts_uptime = htonl(jiffies_to_msecs(jiffies)); ++ do_gettimeofday(&tv); ++ pdu.v5.ts_usecs = htonl(tv.tv_sec); ++ pdu.v5.ts_unsecs = htonl(tv.tv_usec); ++ pdu.v5.seq = htonl(pdu_seq); ++ //pdu.v5.eng_type = 0; ++ pdu.v5.eng_id = engine_id; ++ //pdu.v5.padding = 0; + +- if (netflow_send_pdu(&pdu, pdusize) == 0) { +- /* not least one send succeded, account stat for dropped packets */ +- NETFLOW_STAT_ADD_ATOMIC(pkt_drop, pdu_packets); +- NETFLOW_STAT_ADD_ATOMIC(traf_drop, pdu_traf); +- } ++ pdusize = NETFLOW5_HEADER_SIZE + sizeof(struct netflow5_record) * pdu_data_records; + +- pdu.seq = htonl(ntohl(pdu.seq) + ntohs(pdu.nr_records)); ++ netflow_sendmsg(&pdu.v5, pdusize); + +- pdu.nr_records = 0; + pdu_packets = 0; +- pdu_traf = 0; ++ pdu_traf = 0; ++ ++ pdu_seq += pdu_data_records; ++ pdu_count++; ++ pdu_data_records = 0; + } + + /* only called in scan worker path */ +-static void netflow_export_flow(struct ipt_netflow *nf) ++static void netflow_export_flow_v5(struct ipt_netflow *nf) + { + struct netflow5_record *rec; + +- if (debug > 2) +- printk(KERN_INFO "adding flow to export (%d)\n", pdu.nr_records); ++ if (unlikely(debug > 2)) ++ printk(KERN_INFO "adding flow to export (%d)\n", pdu_data_records); + + pdu_packets += nf->nr_packets; + pdu_traf += nf->nr_bytes; + pdu_ts_mod = jiffies; +- rec = &pdu.flow[pdu.nr_records++]; ++ rec = &pdu.v5.flow[pdu_data_records++]; + + /* make V5 flow record */ +- rec->s_addr = nf->tuple.s_addr; +- rec->d_addr = nf->tuple.d_addr; +- //rec->nexthop = 0; ++ rec->s_addr = nf->tuple.src.ip; ++ rec->d_addr = nf->tuple.dst.ip; ++ rec->nexthop = nf->nh.ip; + rec->i_ifc = htons(nf->tuple.i_ifc); + rec->o_ifc = htons(nf->o_ifc); + rec->nr_packets = htonl(nf->nr_packets); + rec->nr_octets = htonl(nf->nr_bytes); +- rec->ts_first = htonl(jiffies_to_msecs(nf->ts_first)); +- rec->ts_last = htonl(jiffies_to_msecs(nf->ts_last)); ++ rec->first_ms = htonl(jiffies_to_msecs(nf->ts_first)); ++ rec->last_ms = htonl(jiffies_to_msecs(nf->ts_last)); + rec->s_port = nf->tuple.s_port; + rec->d_port = nf->tuple.d_port; +- //rec->reserved = 0; ++ //rec->reserved = 0; /* pdu is always zeroized for v5 in netflow_switch_version */ + rec->tcp_flags = nf->tcp_flags; + rec->protocol = nf->tuple.protocol; + rec->tos = nf->tuple.tos; +- //rec->s_as = 0; +- //rec->d_as = 0; ++#ifdef CONFIG_NF_NAT_NEEDED ++ rec->s_as = nf->s_as; ++ rec->d_as = nf->d_as; ++#endif + rec->s_mask = nf->s_mask; + rec->d_mask = nf->d_mask; + //rec->padding = 0; + ipt_netflow_free(nf); + +- if (pdu.nr_records == NETFLOW5_RECORDS_MAX) ++ if (pdu_data_records == NETFLOW5_RECORDS_MAX) ++ netflow_export_pdu_v5(); ++} ++ ++/* pdu is initially blank, export current pdu, and prepare next for filling. */ ++static void netflow_export_pdu_v9(void) ++{ ++ struct timeval tv; ++ int pdusize; ++ ++ if (pdu_data_used <= pdu.v9.data) ++ return; ++ ++ if (debug > 1) ++ printk(KERN_INFO "netflow_export_pdu_v9 with %d records\n", ++ pdu_data_records + pdu_tpl_records); ++ ++ pdu.v9.version = htons(9); ++ pdu.v9.nr_records = htons(pdu_data_records + pdu_tpl_records); ++ pdu.v9.sys_uptime_ms = htonl(jiffies_to_msecs(jiffies)); ++ do_gettimeofday(&tv); ++ pdu.v9.export_time_s = htonl(tv.tv_sec); ++ pdu.v9.seq = htonl(pdu_seq); ++ pdu.v9.source_id = engine_id; ++ ++ pdusize = pdu_data_used - (unsigned char *)&pdu.v9; ++ ++ netflow_sendmsg(&pdu.v9, pdusize); ++ ++ pdu_packets = 0; ++ pdu_traf = 0; ++ ++ pdu_seq++; ++ pdu_count++; ++ pdu_data_records = pdu_tpl_records = 0; ++ pdu_data_used = pdu.v9.data; ++ pdu_flowset = NULL; ++} ++ ++static void netflow_export_pdu_ipfix(void) ++{ ++ struct timeval tv; ++ int pdusize; ++ ++ if (pdu_data_used <= pdu.ipfix.data) ++ return; ++ ++ if (debug > 1) ++ printk(KERN_INFO "netflow_export_pduX with %d records\n", ++ pdu_data_records); ++ ++ pdu.ipfix.version = htons(10); ++ do_gettimeofday(&tv); ++ pdu.ipfix.export_time_s = htonl(tv.tv_sec); ++ pdu.ipfix.seq = htonl(pdu_seq); ++ pdu.ipfix.odomain_id = engine_id; ++ pdusize = pdu_data_used - (unsigned char *)&pdu; ++ pdu.ipfix.length = htons(pdusize); ++ ++ netflow_sendmsg(&pdu.ipfix, pdusize); ++ ++ pdu_packets = 0; ++ pdu_traf = 0; ++ ++ pdu_seq += pdu_data_records; ++ pdu_count++; ++ pdu_data_records = pdu_tpl_records = 0; ++ pdu_data_used = pdu.ipfix.data; ++ pdu_flowset = NULL; ++} ++ ++static inline int pdu_have_space(const size_t size) ++{ ++ return ((pdu_data_used + size) <= pdu_high_wm); ++} ++ ++static inline unsigned char *pdu_grab_space(const size_t size) ++{ ++ unsigned char *ptr = pdu_data_used; ++ pdu_data_used += size; ++ return ptr; ++} ++ ++// allocate data space in pdu, or fail if pdu is reallocated. ++static inline unsigned char *pdu_alloc_fail(const size_t size) ++{ ++ if (!pdu_have_space(size)) { + netflow_export_pdu(); ++ return NULL; ++ } ++ return pdu_grab_space(size); ++} ++ ++/* doesn't fail, but can provide empty pdu. */ ++static unsigned char *pdu_alloc(const size_t size) ++{ ++ return pdu_alloc_fail(size) ?: pdu_grab_space(size); ++} ++ ++/* global table of sizes of template field types */ ++static u_int8_t tpl_element_sizes[] = { ++ [IN_BYTES] = 4, ++ [IN_PKTS] = 4, ++ [PROTOCOL] = 1, ++ [TOS] = 1, ++ [TCP_FLAGS] = 1, ++ [L4_SRC_PORT] = 2, ++ [IPV4_SRC_ADDR] = 4, ++ [SRC_MASK] = 1, ++ [INPUT_SNMP] = 2, ++ [L4_DST_PORT] = 2, ++ [IPV4_DST_ADDR] = 4, ++ [DST_MASK] = 1, ++ [OUTPUT_SNMP] = 2, ++ [IPV4_NEXT_HOP] = 4, ++ //[SRC_AS] = 2, ++ //[DST_AS] = 2, ++ //[BGP_IPV4_NEXT_HOP] = 4, ++ //[MUL_DST_PKTS] = 4, ++ //[MUL_DST_BYTES] = 4, ++ [LAST_SWITCHED] = 4, ++ [FIRST_SWITCHED]= 4, ++ [IPV6_SRC_ADDR] = 16, ++ [IPV6_DST_ADDR] = 16, ++ [IPV6_FLOW_LABEL] = 3, ++ [ICMP_TYPE] = 2, ++ [MUL_IGMP_TYPE] = 1, ++ //[TOTAL_BYTES_EXP] = 4, ++ //[TOTAL_PKTS_EXP] = 4, ++ //[TOTAL_FLOWS_EXP] = 4, ++ [IPV6_NEXT_HOP] = 16, ++ [IPV6_OPTION_HEADERS] = 2, ++ [commonPropertiesId] = 4, ++ [ipv4Options] = 4, ++ [tcpOptions] = 4, ++ [postNATSourceIPv4Address] = 4, ++ [postNATDestinationIPv4Address] = 4, ++ [postNAPTSourceTransportPort] = 2, ++ [postNAPTDestinationTransportPort] = 2, ++ [natEvent] = 1, ++ [postNATSourceIPv6Address] = 16, ++ [postNATDestinationIPv6Address] = 16, ++ [IPSecSPI] = 4, ++ [observationTimeMilliseconds] = 8, ++ [observationTimeMicroseconds] = 8, ++ [observationTimeNanoseconds] = 8, ++}; ++ ++#define TEMPLATES_HASH_BSIZE 8 ++#define TEMPLATES_HASH_SIZE (1<<TEMPLATES_HASH_BSIZE) ++static struct hlist_head templates_hash[TEMPLATES_HASH_SIZE]; ++ ++struct base_template { ++ int length; /* number of elements in template */ ++ u_int16_t types[]; /* {type, size} pairs */ ++}; ++ ++/* base templates */ ++#define BTPL_BASE 0x00000001 /* base stat */ ++#define BTPL_IP4 0x00000002 /* IPv4 */ ++#define BTPL_MASK4 0x00000004 /* Aggregated */ ++#define BTPL_PORTS 0x00000008 /* UDP&TCP */ ++#define BTPL_IP6 0x00000010 /* IPv6 */ ++#define BTPL_ICMP 0x00000020 /* ICMP */ ++#define BTPL_IGMP 0x00000040 /* IGMP */ ++#define BTPL_IPSEC 0x00000080 /* AH&ESP */ ++#define BTPL_NAT4 0x00000100 /* NAT IPv4 */ ++#define BTPL_MARK 0x00000400 /* connmark */ ++#define BTPL_LABEL6 0x00000800 /* IPv6 flow label */ ++#define BTPL_OPTIONS4 0x00001000 /* IPv4 Options */ ++#define BTPL_OPTIONS6 0x00002000 /* IPv6 Options */ ++#define BTPL_TCPOPTIONS 0x00004000 /* TCP Options */ ++#define BTPL_MAX 32 ++ ++static struct base_template template_base = { ++ .types = { ++ INPUT_SNMP, ++ OUTPUT_SNMP, ++ IN_PKTS, ++ IN_BYTES, ++ FIRST_SWITCHED, ++ LAST_SWITCHED, ++ PROTOCOL, ++ TOS, ++ 0 ++ } ++}; ++static struct base_template template_ipv4 = { ++ .types = { ++ IPV4_SRC_ADDR, ++ IPV4_DST_ADDR, ++ IPV4_NEXT_HOP, ++ 0 ++ } ++}; ++static struct base_template template_options4 = { ++ .types = { ipv4Options, 0 } ++}; ++static struct base_template template_tcpoptions = { ++ .types = { tcpOptions, 0 } ++}; ++static struct base_template template_ipv6 = { ++ .types = { ++ IPV6_SRC_ADDR, ++ IPV6_DST_ADDR, ++ IPV6_NEXT_HOP, ++ 0 ++ } ++}; ++static struct base_template template_options6 = { ++ .types = { IPV6_OPTION_HEADERS, 0 } ++}; ++static struct base_template template_label6 = { ++ .types = { IPV6_FLOW_LABEL, 0 } ++}; ++static struct base_template template_ipv4_mask = { ++ .types = { ++ SRC_MASK, ++ DST_MASK, ++ 0 ++ } ++}; ++static struct base_template template_ports = { ++ .types = { ++ L4_SRC_PORT, ++ L4_DST_PORT, ++ TCP_FLAGS, ++ 0 ++ } ++}; ++static struct base_template template_icmp = { ++ .types = { ICMP_TYPE, 0 } ++}; ++static struct base_template template_igmp = { ++ .types = { MUL_IGMP_TYPE, 0 } ++}; ++static struct base_template template_ipsec = { ++ .types = { IPSecSPI, 0 } ++}; ++static struct base_template template_nat4 = { ++ .types = { ++ observationTimeMilliseconds, ++ IPV4_SRC_ADDR, ++ IPV4_DST_ADDR, ++ postNATSourceIPv4Address, ++ postNATDestinationIPv4Address, ++ L4_SRC_PORT, ++ L4_DST_PORT, ++ postNAPTSourceTransportPort, ++ postNAPTDestinationTransportPort, ++ PROTOCOL, ++ natEvent, ++ 0 ++ } ++}; ++static struct base_template template_mark = { ++ .types = { commonPropertiesId, 0 } ++}; ++ ++struct data_template { ++ struct hlist_node hlist; ++ int tpl_mask; ++ ++ int length; /* number of elements in template */ ++ int tpl_size; /* summary size of template with flowset header */ ++ int rec_size; /* summary size of all recods of template (w/o flowset header) */ ++ int template_id_n; /* assigned from template_ids, network order. */ ++ int exported_cnt; ++ unsigned long exported_ts; /* jiffies */ ++ u_int16_t fields[]; /* {type, size} pairs */ ++} __attribute__ ((packed)); ++ ++#define TPL_FIELD_NSIZE 4 /* one complete template field's network size */ ++ ++static void free_templates(void) ++{ ++ int i; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) ++ struct hlist_node *pos; ++#endif ++ struct hlist_node *tmp; ++ ++ for (i = 0; i < TEMPLATES_HASH_SIZE; i++) { ++ struct hlist_head *thead = &templates_hash[i]; ++ struct data_template *tpl; ++ ++ compat_hlist_for_each_entry_safe(tpl, pos, tmp, thead, hlist) ++ kfree(tpl); ++ INIT_HLIST_HEAD(thead); ++ } ++ tpl_count = 0; ++} ++ ++/* create combined template from mask */ ++static struct data_template *get_template(const int tmask) ++{ ++ struct base_template *tlist[BTPL_MAX]; ++ struct data_template *tpl; ++ int tnum; ++ int length; ++ int i, j, k; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) ++ struct hlist_node *pos; ++#endif ++ int hash = hash_long(tmask, TEMPLATES_HASH_BSIZE); ++ ++ compat_hlist_for_each_entry(tpl, pos, &templates_hash[hash], hlist) ++ if (tpl->tpl_mask == tmask) ++ return tpl; ++ ++ tnum = 0; ++ if (tmask & BTPL_IP4) { ++ tlist[tnum++] = &template_ipv4; ++ if (tmask & BTPL_OPTIONS4) ++ tlist[tnum++] = &template_options4; ++ if (tmask & BTPL_MASK4) ++ tlist[tnum++] = &template_ipv4_mask; ++ } else if (tmask & BTPL_IP6) { ++ tlist[tnum++] = &template_ipv6; ++ if (tmask & BTPL_LABEL6) ++ tlist[tnum++] = &template_label6; ++ if (tmask & BTPL_OPTIONS6) ++ tlist[tnum++] = &template_options6; ++ } else if (tmask & BTPL_NAT4) ++ tlist[tnum++] = &template_nat4; ++ if (tmask & BTPL_PORTS) ++ tlist[tnum++] = &template_ports; ++ if (tmask & BTPL_BASE) ++ tlist[tnum++] = &template_base; ++ if (tmask & BTPL_TCPOPTIONS) ++ tlist[tnum++] = &template_tcpoptions; ++ if (tmask & BTPL_ICMP) ++ tlist[tnum++] = &template_icmp; ++ if (tmask & BTPL_IGMP) ++ tlist[tnum++] = &template_igmp; ++ if (tmask & BTPL_IPSEC) ++ tlist[tnum++] = &template_ipsec; ++ if (tmask & BTPL_MARK) ++ tlist[tnum++] = &template_mark; ++ ++ /* calc memory size */ ++ length = 0; ++ for (i = 0; i < tnum; i++) { ++ if (!tlist[i]->length) { ++ for (k = 0; tlist[i]->types[k]; k++); ++ tlist[i]->length = k; ++ } ++ length += tlist[i]->length; ++ } ++ /* elements are pairs + one termiantor */ ++ tpl = kmalloc(sizeof(struct data_template) + (length * 2 + 1) * sizeof(u_int16_t), GFP_KERNEL); ++ if (!tpl) { ++ printk(KERN_ERR "ipt_NETFLOW: unable to kmalloc template.\n"); ++ return NULL; ++ } ++ tpl->tpl_mask = tmask; ++ tpl->length = length; ++ tpl->tpl_size = sizeof(struct flowset_template); ++ tpl->rec_size = 0; ++ tpl->template_id_n = htons(template_ids++); ++ tpl->exported_cnt = 0; ++ tpl->exported_ts = 0; ++ ++ j = 0; ++ for (i = 0; i < tnum; i++) { ++ struct base_template *btpl = tlist[i]; ++ ++ for (k = 0; k < btpl->length; k++) { ++ int size; ++ int type = btpl->types[k]; ++ ++ tpl->fields[j++] = type; ++ size = tpl_element_sizes[type]; ++ tpl->fields[j++] = size; ++ tpl->rec_size += size; ++ } ++ tpl->tpl_size += btpl->length * TPL_FIELD_NSIZE; ++ } ++ tpl->fields[j++] = 0; ++ ++ hlist_add_head(&tpl->hlist, &templates_hash[hash]); ++ tpl_count++; ++ ++ return tpl; ++} ++ ++static void pdu_add_template(struct data_template *tpl) ++{ ++ int i; ++ unsigned char *ptr; ++ struct flowset_template *ntpl; ++ __be16 *sptr; ++ ++ ptr = pdu_alloc(tpl->tpl_size); ++ ntpl = (struct flowset_template *)ptr; ++ ntpl->flowset_id = protocol == 9? htons(FLOWSET_TEMPLATE) : htons(IPFIX_TEMPLATE); ++ ntpl->length = htons(tpl->tpl_size); ++ ntpl->template_id = tpl->template_id_n; ++ ntpl->field_count = htons(tpl->length); ++ ptr += sizeof(struct flowset_template); ++ sptr = (__be16 *)ptr; ++ for (i = 0; ; ) { ++ int type = tpl->fields[i++]; ++ if (!type) ++ break; ++ *sptr++ = htons(type); ++ *sptr++ = htons(tpl->fields[i++]); ++ } ++ ++ tpl->exported_cnt = pdu_count; ++ tpl->exported_ts = jiffies; ++ ++ pdu_flowset = NULL; ++ pdu_tpl_records++; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ++static inline s64 portable_ktime_to_ms(const ktime_t kt) ++{ ++ struct timeval tv = ktime_to_timeval(kt); ++ return (s64) tv.tv_sec * MSEC_PER_SEC + tv.tv_usec / USEC_PER_MSEC; ++} ++#define ktime_to_ms portable_ktime_to_ms ++#endif ++ ++/* encode one field */ ++typedef struct in6_addr in6_t; ++static inline void add_ipv4_field(__u8 *ptr, const int type, const struct ipt_netflow *nf) ++{ ++ switch (type) { ++ case IN_BYTES: *(__be32 *)ptr = htonl(nf->nr_bytes); break; ++ case IN_PKTS: *(__be32 *)ptr = htonl(nf->nr_packets); break; ++ case FIRST_SWITCHED: *(__be32 *)ptr = htonl(jiffies_to_msecs(nf->ts_first)); break; ++ case LAST_SWITCHED: *(__be32 *)ptr = htonl(jiffies_to_msecs(nf->ts_last)); break; ++ case IPV4_SRC_ADDR: *(__be32 *)ptr = nf->tuple.src.ip; break; ++ case IPV4_DST_ADDR: *(__be32 *)ptr = nf->tuple.dst.ip; break; ++ case IPV4_NEXT_HOP: *(__be32 *)ptr = nf->nh.ip; break; ++ case L4_SRC_PORT: *(__be16 *)ptr = nf->tuple.s_port; break; ++ case L4_DST_PORT: *(__be16 *)ptr = nf->tuple.d_port; break; ++ case INPUT_SNMP: *(__be16 *)ptr = htons(nf->tuple.i_ifc); break; ++ case OUTPUT_SNMP: *(__be16 *)ptr = htons(nf->o_ifc); break; ++ case PROTOCOL: *ptr = nf->tuple.protocol; break; ++ case TCP_FLAGS: *ptr = nf->tcp_flags; break; ++ case TOS: *ptr = nf->tuple.tos; break; ++ case IPV6_SRC_ADDR: *(in6_t *)ptr = nf->tuple.src.in6; break; ++ case IPV6_DST_ADDR: *(in6_t *)ptr = nf->tuple.dst.in6; break; ++ case IPV6_NEXT_HOP: *(in6_t *)ptr = nf->nh.in6; break; ++ case IPV6_FLOW_LABEL: *ptr++ = nf->flow_label >> 16; ++ *(__be16 *)ptr = nf->flow_label; ++ break; ++ case tcpOptions: *(__be32 *)ptr = htonl(nf->tcpoptions); break; ++ case ipv4Options: *(__be32 *)ptr = htonl(nf->options); break; ++ case IPV6_OPTION_HEADERS: *(__be16 *)ptr = htons(nf->options); break; ++#ifdef CONFIG_NF_CONNTRACK_MARK ++ case commonPropertiesId: ++ *(__be32 *)ptr = htonl(nf->mark); break; ++#endif ++ case SRC_MASK: *ptr = nf->s_mask; break; ++ case DST_MASK: *ptr = nf->d_mask; break; ++ case ICMP_TYPE: *(__be16 *)ptr = nf->tuple.d_port; break; ++ case MUL_IGMP_TYPE: *ptr = nf->tuple.d_port; break; ++#ifdef CONFIG_NF_NAT_NEEDED ++ case postNATSourceIPv4Address: *(__be32 *)ptr = nf->nat->post.s_addr; break; ++ case postNATDestinationIPv4Address: *(__be32 *)ptr = nf->nat->post.d_addr; break; ++ case postNAPTSourceTransportPort: *(__be16 *)ptr = nf->nat->post.s_port; break; ++ case postNAPTDestinationTransportPort: *(__be16 *)ptr = nf->nat->post.d_port; break; ++ case natEvent: *ptr = nf->nat->nat_event; break; ++#endif ++ case IPSecSPI: *(__u32 *)ptr = (nf->tuple.s_port << 16) | nf->tuple.d_port; break; ++ case observationTimeMilliseconds: ++ *(__be64 *)ptr = cpu_to_be64(ktime_to_ms(nf->ts_obs)); break; ++ case observationTimeMicroseconds: ++ *(__be64 *)ptr = cpu_to_be64(ktime_to_us(nf->ts_obs)); break; ++ case observationTimeNanoseconds: ++ *(__be64 *)ptr = cpu_to_be64(ktime_to_ns(nf->ts_obs)); break; ++ default: ++ memset(ptr, 0, tpl_element_sizes[type]); ++ } ++} ++ ++#define PAD_SIZE 4 /* rfc prescribes flowsets to be padded */ ++ ++/* cache timeout_rate in jiffies */ ++static inline unsigned long timeout_rate_j(void) ++{ ++ static unsigned int t_rate = 0; ++ static unsigned long t_rate_j = 0; ++ ++ if (unlikely(timeout_rate != t_rate)) { ++ struct timeval tv = { .tv_sec = timeout_rate * 60, .tv_usec = 0 }; ++ ++ t_rate = timeout_rate; ++ t_rate_j = timeval_to_jiffies(&tv); ++ } ++ return t_rate_j; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++#define IPPROTO_UDPLITE 136 ++#endif ++ ++#ifndef time_is_before_jiffies ++#define time_is_before_jiffies(a) time_after(jiffies, a) ++#endif ++ ++static void netflow_export_flow_tpl(struct ipt_netflow *nf) ++{ ++ unsigned char *ptr; ++ int i; ++ struct data_template *tpl; ++ int tpl_mask = BTPL_BASE; ++ ++ if (unlikely(debug > 2)) ++ printk(KERN_INFO "adding flow to export (%d)\n", ++ pdu_data_records + pdu_tpl_records); ++ ++ if (likely(nf->tuple.l3proto == AF_INET)) { ++ tpl_mask |= BTPL_IP4; ++ if (unlikely(nf->options)) ++ tpl_mask |= BTPL_OPTIONS4; ++ } else { ++ tpl_mask |= BTPL_IP6; ++ if (unlikely(nf->options)) ++ tpl_mask |= BTPL_OPTIONS6; ++ if (unlikely(nf->flow_label)) ++ tpl_mask |= BTPL_LABEL6; ++ } ++ if (unlikely(nf->tcpoptions)) ++ tpl_mask |= BTPL_TCPOPTIONS; ++ if (unlikely(nf->s_mask || nf->d_mask)) ++ tpl_mask |= BTPL_MASK4; ++ if (likely(nf->tuple.protocol == IPPROTO_TCP || ++ nf->tuple.protocol == IPPROTO_UDP || ++ nf->tuple.protocol == IPPROTO_SCTP || ++ nf->tuple.protocol == IPPROTO_UDPLITE)) ++ tpl_mask |= BTPL_PORTS; ++ else if (nf->tuple.protocol == IPPROTO_ICMP) ++ tpl_mask |= BTPL_ICMP; ++ else if (nf->tuple.protocol == IPPROTO_IGMP) ++ tpl_mask |= BTPL_IGMP; ++#ifdef CONFIG_NF_CONNTRACK_MARK ++ if (nf->mark) ++ tpl_mask |= BTPL_MARK; ++#endif ++#ifdef CONFIG_NF_NAT_NEEDED ++ if (nf->nat) ++ tpl_mask = BTPL_NAT4; ++#endif ++ ++ tpl = get_template(tpl_mask); ++ if (unlikely(!tpl)) { ++ printk(KERN_INFO "ipt_NETFLOW: template allocation failed.\n"); ++ NETFLOW_STAT_INC(alloc_err); ++ NETFLOW_STAT_ADD_ATOMIC(pkt_drop, nf->nr_packets); ++ NETFLOW_STAT_ADD_ATOMIC(traf_drop, nf->nr_bytes); ++ ipt_netflow_free(nf); ++ return; ++ } ++ ++ if (unlikely(!pdu_flowset || ++ pdu_flowset->flowset_id != tpl->template_id_n || ++ !(ptr = pdu_alloc_fail(tpl->rec_size)))) { ++ ++ /* if there was previous data template we should pad it to 4 bytes */ ++ if (pdu_flowset) { ++ int padding = (PAD_SIZE - ntohs(pdu_flowset->length) % PAD_SIZE) % PAD_SIZE; ++ if (padding && (ptr = pdu_alloc_fail(padding))) { ++ pdu_flowset->length = htons(ntohs(pdu_flowset->length) + padding); ++ for (; padding; padding--) ++ *ptr++ = 0; ++ } ++ } ++ ++ if (!tpl->exported_ts || ++ pdu_count > (tpl->exported_cnt + refresh_rate) || ++ time_is_before_jiffies(tpl->exported_ts + timeout_rate_j())) { ++ pdu_add_template(tpl); ++ } ++ ++ ptr = pdu_alloc(sizeof(struct flowset_data) + tpl->rec_size); ++ pdu_flowset = (struct flowset_data *)ptr; ++ pdu_flowset->flowset_id = tpl->template_id_n; ++ pdu_flowset->length = htons(sizeof(struct flowset_data)); ++ ptr += sizeof(struct flowset_data); ++ } ++ ++ /* encode all fields */ ++ for (i = 0; ; ) { ++ int type = tpl->fields[i++]; ++ ++ if (!type) ++ break; ++ add_ipv4_field(ptr, type, nf); ++ ptr += tpl->fields[i++]; ++ } ++ ++ pdu_data_records++; ++ pdu_flowset->length = htons(ntohs(pdu_flowset->length) + tpl->rec_size); ++ ++ pdu_packets += nf->nr_packets; ++ pdu_traf += nf->nr_bytes; ++ ++ ipt_netflow_free(nf); ++ pdu_ts_mod = jiffies; ++} ++ ++static void netflow_switch_version(const int ver) ++{ ++ protocol = ver; ++ if (protocol == 5) { ++ memset(&pdu, 0, sizeof(pdu)); ++ netflow_export_flow = &netflow_export_flow_v5; ++ netflow_export_pdu = &netflow_export_pdu_v5; ++ } else if (protocol == 9) { ++ pdu_data_used = pdu.v9.data; ++ pdu_max_size = sizeof(pdu.v9); ++ pdu_high_wm = (unsigned char *)&pdu + pdu_max_size; ++ netflow_export_flow = &netflow_export_flow_tpl; ++ netflow_export_pdu = &netflow_export_pdu_v9; ++ } else { /* IPFIX */ ++ pdu_data_used = pdu.ipfix.data; ++ pdu_max_size = sizeof(pdu.ipfix); ++ pdu_high_wm = (unsigned char *)&pdu + pdu_max_size; ++ netflow_export_flow = &netflow_export_flow_tpl; ++ netflow_export_pdu = &netflow_export_pdu_ipfix; ++ } ++ if (protocol != 5) ++ free_templates(); ++ pdu_data_records = pdu_tpl_records = 0; ++ pdu_flowset = NULL; ++ printk(KERN_INFO "ipt_NETFLOW protocol version %d (%s) enabled.\n", ++ protocol, protocol == 10? "IPFIX" : "NetFlow"); ++} ++ ++#ifdef CONFIG_NF_NAT_NEEDED ++static void export_nat_event(struct nat_event *nel) ++{ ++ static struct ipt_netflow nf = { { NULL } }; ++ ++ nf.tuple.l3proto = AF_INET; ++ nf.tuple.protocol = nel->protocol; ++ nf.nat = nel; /* this is also flag of dummy flow */ ++ nf.tcp_flags = (nel->nat_event == NAT_DESTROY)? TCP_FIN_RST : TCP_SYN_ACK; ++ if (protocol >= 9) { ++ nf.ts_obs = nel->ts_ktime; ++ nf.tuple.src.ip = nel->pre.s_addr; ++ nf.tuple.dst.ip = nel->pre.d_addr; ++ nf.tuple.s_port = nel->pre.s_port; ++ nf.tuple.d_port = nel->pre.d_port; ++ netflow_export_flow(&nf); ++ } else { /* v5 */ ++ /* The weird v5 packet(s). ++ * src and dst will be same as in data flow from the FORWARD chain ++ * where src is pre-nat src ip and dst is post-nat dst ip. ++ * What we lacking here is external src ip for SNAT, or ++ * pre-nat dst ip for DNAT. We will put this into Nexthop field ++ * with port into src/dst AS field. tcp_flags will distinguish it's ++ * start or stop event. Two flows in case of full nat. */ ++ nf.tuple.src.ip = nel->pre.s_addr; ++ nf.tuple.s_port = nel->pre.s_port; ++ nf.tuple.dst.ip = nel->post.d_addr; ++ nf.tuple.d_port = nel->post.d_port; ++ ++ nf.ts_first = nel->ts_jiffies; ++ nf.ts_last = nel->ts_jiffies; ++ if (nel->pre.s_addr != nel->post.s_addr || ++ nel->pre.s_port != nel->post.s_port) { ++ nf.nh.ip = nel->post.s_addr; ++ nf.s_as = nel->post.s_port; ++ nf.d_as = 0; ++ netflow_export_flow(&nf); ++ } ++ if (nel->pre.d_addr != nel->post.d_addr || ++ nel->pre.d_port != nel->post.d_port) { ++ nf.nh.ip = nel->pre.d_addr; ++ nf.s_as = 0; ++ nf.d_as = nel->pre.d_port; ++ netflow_export_flow(&nf); ++ } ++ } ++ kfree(nel); + } ++#endif /* CONFIG_NF_NAT_NEEDED */ + +-static inline int active_needs_export(struct ipt_netflow *nf, long a_timeout) ++static inline int active_needs_export(const struct ipt_netflow *nf, const long a_timeout) + { + /* active too long, finishing, or having too much bytes */ + return ((jiffies - nf->ts_first) > a_timeout) || +@@ -1057,42 +1987,77 @@ static inline int active_needs_export(struct ipt_netflow *nf, long a_timeout) + + /* could be called with zero to flush cache and pdu */ + /* this function is guaranteed to be called non-concurrently */ +-static void netflow_scan_and_export(int flush) ++/* return -1 is trylockfailed, 0 if nothin gexported, >=1 if exported something */ ++static int netflow_scan_and_export(const int flush) + { + long i_timeout = inactive_timeout * HZ; + long a_timeout = active_timeout * HZ; ++ int trylock_failed = 0; ++ int pdu_c = pdu_count; + + if (flush) + i_timeout = 0; + +- spin_lock_bh(&ipt_netflow_lock); +- while (!list_empty(&ipt_netflow_list)) { ++ local_bh_disable(); ++ spin_lock(&hlist_lock); ++ /* This is different order of locking than elsewhere, ++ * so we trylock&break to avoid deadlock. */ ++ ++ while (likely(!list_empty(&ipt_netflow_list))) { + struct ipt_netflow *nf; +- ++ ++ /* Last entry, which is usually oldest. */ + nf = list_entry(ipt_netflow_list.prev, struct ipt_netflow, list); ++ if (!spin_trylock(nf->lock)) { ++ trylock_failed = 1; ++ break; ++ } + /* Note: i_timeout checked with >= to allow specifying zero timeout + * to purge all flows on module unload */ + if (((jiffies - nf->ts_last) >= i_timeout) || + active_needs_export(nf, a_timeout)) { + hlist_del(&nf->hlist); ++ spin_unlock(nf->lock); ++ + list_del(&nf->list); ++ spin_unlock(&hlist_lock); ++ local_bh_enable(); ++ + NETFLOW_STAT_ADD(pkt_out, nf->nr_packets); + NETFLOW_STAT_ADD(traf_out, nf->nr_bytes); +- spin_unlock_bh(&ipt_netflow_lock); + netflow_export_flow(nf); +- spin_lock_bh(&ipt_netflow_lock); ++ ++ local_bh_disable(); ++ spin_lock(&hlist_lock); + } else { ++ spin_unlock(nf->lock); + /* all flows which need to be exported is always at the tail + * so if no more exportable flows we can break */ + break; + } + } +- spin_unlock_bh(&ipt_netflow_lock); +- ++ spin_unlock(&hlist_lock); ++ local_bh_enable(); ++ ++#ifdef CONFIG_NF_NAT_NEEDED ++ spin_lock_bh(&nat_lock); ++ while (!list_empty(&nat_list)) { ++ struct nat_event *nel; ++ ++ nel = list_entry(nat_list.next, struct nat_event, list); ++ list_del(&nel->list); ++ spin_unlock_bh(&nat_lock); ++ export_nat_event(nel); ++ spin_lock_bh(&nat_lock); ++ } ++ spin_unlock_bh(&nat_lock); ++#endif + /* flush flows stored in pdu if there no new flows for too long */ + /* Note: using >= to allow flow purge on zero timeout */ + if ((jiffies - pdu_ts_mod) >= i_timeout) + netflow_export_pdu(); ++ ++ return trylock_failed? -1 : pdu_count - pdu_c; + } + + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) +@@ -1101,8 +2066,10 @@ static void netflow_work_fn(void *dummy) + static void netflow_work_fn(struct work_struct *dummy) + #endif + { +- netflow_scan_and_export(0); +- __start_scan_worker(); ++ int status; ++ ++ status = netflow_scan_and_export(DONT_FLUSH); ++ _schedule_scan_worker(status); + } + + #define RATESHIFT 2 +@@ -1154,7 +2121,7 @@ static void rate_timer_calc(unsigned long dummy) + old_found = found; + old_notfound = notfound; + /* if there is no access to hash keep rate steady */ +- metric = (dfnd + dnfnd)? 10 * (dsrch + dfnd + dnfnd) / (dfnd + dnfnd) : metric; ++ metric = (dfnd + dnfnd)? 100 * (dsrch + dfnd + dnfnd) / (dfnd + dnfnd) : metric; + CALC_RATE(min15_metric, (unsigned long long)metric, 15); + CALC_RATE(min5_metric, (unsigned long long)metric, 5); + CALC_RATE(min_metric, (unsigned long long)metric, 1); +@@ -1162,6 +2129,262 @@ static void rate_timer_calc(unsigned long dummy) + mod_timer(&rate_timer, jiffies + (HZ * SAMPLERATE)); + } + ++#ifdef CONFIG_NF_NAT_NEEDED ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++static struct nf_ct_event_notifier *saved_event_cb __read_mostly = NULL; ++static int netflow_conntrack_event(const unsigned int events, struct nf_ct_event *item) ++#else ++static int netflow_conntrack_event(struct notifier_block *this, unsigned long events, void *ptr) ++#endif ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++ struct nf_conn *ct = item->ct; ++#else ++ struct nf_conn *ct = (struct nf_conn *)ptr; ++#endif ++ struct nat_event *nel; ++ const struct nf_conntrack_tuple *t; ++ int ret = NOTIFY_DONE; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++ struct nf_ct_event_notifier *notifier; ++ ++ /* Call netlink first. */ ++ notifier = rcu_dereference(saved_event_cb); ++ if (likely(notifier)) ++ ret = notifier->fcn(events, item); ++#endif ++ if (unlikely(!natevents)) ++ return ret; ++ ++ if (!(events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED) | (1 << IPCT_DESTROY)))) ++ return ret; ++ ++ if (!(ct->status & IPS_NAT_MASK)) ++ return ret; ++ ++ if (unlikely(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num != AF_INET || ++ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num != AF_INET)) { ++ /* Well, there is no linux NAT for IPv6 anyway. */ ++ return ret; ++ } ++ ++ if (!(nel = kmalloc(sizeof(struct nat_event), GFP_ATOMIC))) { ++ printk(KERN_ERR "ipt_NETFLOW: can't kmalloc nat event\n"); ++ return ret; ++ } ++ memset(nel, 0, sizeof(struct nat_event)); ++ nel->ts_ktime = ktime_get_real(); ++ nel->ts_jiffies = jiffies; ++ t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; ++ nel->protocol = t->dst.protonum; ++ nel->pre.s_addr = t->src.u3.ip; ++ nel->pre.d_addr = t->dst.u3.ip; ++ nel->pre.s_port = t->src.u.all; ++ nel->pre.d_port = t->dst.u.all; ++ t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; ++ /* reply is reversed */ ++ nel->post.s_addr = t->dst.u3.ip; ++ nel->post.d_addr = t->src.u3.ip; ++ nel->post.s_port = t->dst.u.all; ++ nel->post.d_port = t->src.u.all; ++ if (events & (1 << IPCT_DESTROY)) { ++ nel->nat_event = NAT_DESTROY; ++ nat_events_stop++; ++ } else { ++ nel->nat_event = NAT_CREATE; ++ nat_events_start++; ++ } ++ ++ spin_lock_bh(&nat_lock); ++ list_add_tail(&nel->list, &nat_list); ++ spin_unlock_bh(&nat_lock); ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) ++static struct notifier_block ctnl_notifier = { ++ .notifier_call = netflow_conntrack_event ++}; ++#else ++static struct nf_ct_event_notifier ctnl_notifier = { ++ .fcn = netflow_conntrack_event ++}; ++#endif /* since 2.6.31 */ ++#endif /* CONFIG_NF_NAT_NEEDED */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) && \ ++ LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ++static bool ++#else ++static int ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) ++netflow_target_check(const char *tablename, const void *entry, const struct xt_target *target, ++ void *targinfo, ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) ++ unsigned int targinfosize, ++#endif ++ unsigned int hook_mask) ++{ ++#else ++netflow_target_check(const struct xt_tgchk_param *par) ++{ ++ const char *tablename = par->table; ++ const struct xt_target *target = par->target; ++#endif ++ if (strcmp("nat", tablename) == 0) { ++ /* In the nat table we only see single packet per flow, which is useless. */ ++ printk(KERN_ERR "%s target: is not valid in %s table\n", target->name, tablename); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ++#define CHECK_FAIL 0 ++#define CHECK_OK 1 ++#else ++#define CHECK_FAIL -EINVAL ++#define CHECK_OK 0 ++#endif ++ return CHECK_FAIL; ++ } ++ if (target->family == AF_INET6 && protocol == 5) { ++ printk(KERN_ERR "ip6tables NETFLOW target is meaningful for protocol 9 or 10 only.\n"); ++ return CHECK_FAIL; ++ } ++ return CHECK_OK; ++} ++ ++#define SetXBit(x) (0x8000 >> (x)) /* Proper bit for htons later. */ ++#ifndef IPPROTO_MH ++#define IPPROTO_MH 135 ++#endif ++static inline __u16 observed_hdrs(const __u8 currenthdr) ++{ ++ switch (currenthdr) { ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ /* For speed, in case switch is not optimized. */ ++ return 0; ++ case IPPROTO_DSTOPTS: return SetXBit(0); ++ case IPPROTO_HOPOPTS: return SetXBit(1); ++ case IPPROTO_ROUTING: return SetXBit(5); ++ case IPPROTO_MH: return SetXBit(12); ++ case IPPROTO_ESP: return SetXBit(13); ++ case IPPROTO_AH: return SetXBit(14); ++ case IPPROTO_COMP: return SetXBit(15); ++ case IPPROTO_FRAGMENT: /* Handled elsewhere. */ ++ /* Next is known headers. */ ++ case IPPROTO_ICMPV6: ++ case IPPROTO_UDPLITE: ++ case IPPROTO_IPIP: ++ case IPPROTO_PIM: ++ case IPPROTO_GRE: ++ case IPPROTO_SCTP: ++#ifdef IPPROTO_L2TP ++ case IPPROTO_L2TP: ++#endif ++ case IPPROTO_DCCP: ++ return 0; ++ } ++ return SetXBit(3); /* Unknown header. */ ++} ++ ++/* http://www.iana.org/assignments/ip-parameters/ip-parameters.xhtml */ ++static const __u8 ip4_opt_table[] = { ++ [7] = 0, /* RR */ /* parsed manually becasue of 0 */ ++ [134] = 1, /* CIPSO */ ++ [133] = 2, /* E-SEC */ ++ [68] = 3, /* TS */ ++ [131] = 4, /* LSR */ ++ [130] = 5, /* SEC */ ++ [1] = 6, /* NOP */ ++ [0] = 7, /* EOOL */ ++ [15] = 8, /* ENCODE */ ++ [142] = 9, /* VISA */ ++ [205] = 10, /* FINN */ ++ [12] = 11, /* MTUR */ ++ [11] = 12, /* MTUP */ ++ [10] = 13, /* ZSU */ ++ [137] = 14, /* SSR */ ++ [136] = 15, /* SID */ ++ [151] = 16, /* DPS */ ++ [150] = 17, /* NSAPA */ ++ [149] = 18, /* SDB */ ++ [147] = 19, /* ADDEXT */ ++ [148] = 20, /* RTRALT */ ++ [82] = 21, /* TR */ ++ [145] = 22, /* EIP */ ++ [144] = 23, /* IMITD */ ++ [30] = 25, /* EXP */ ++ [94] = 25, /* EXP */ ++ [158] = 25, /* EXP */ ++ [222] = 25, /* EXP */ ++ [25] = 30, /* QS */ ++ [152] = 31, /* UMP */ ++}; ++/* Parse IPv4 Options array int ipv4Options IPFIX value. */ ++static inline __u32 ip4_options(const u_int8_t *p, const unsigned int optsize) ++{ ++ __u32 ret = 0; ++ unsigned int i; ++ ++ for (i = 0; likely(i < optsize); ) { ++ u_int8_t op = p[i++]; ++ ++ if (op == 7) /* RR: bit 0 */ ++ ret |= 1; ++ else if (likely(op < ARRAY_SIZE(ip4_opt_table))) { ++ /* Btw, IANA doc is messed up in a crazy way: ++ * http://www.ietf.org/mail-archive/web/ipfix/current/msg06008.html (2011) ++ * I decided to follow IANA _text_ description from ++ * http://www.iana.org/assignments/ipfix/ipfix.xhtml (2013-09-18) ++ * ++ * Set proper bit for htonl later. */ ++ if (ip4_opt_table[op]) ++ ret |= 1 << (32 - ip4_opt_table[op]); ++ } ++ if (likely(i >= optsize || op == 0)) ++ break; ++ else if (unlikely(op == 1)) ++ continue; ++ else if (unlikely(p[i] < 2)) ++ break; ++ else ++ i += p[i] - 1; ++ } ++ return ret; ++} ++ ++#define TCPHDR_MAXSIZE (4 * 15) ++/* List of options: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml */ ++static inline __u32 tcp_options(const struct sk_buff *skb, const unsigned int ptr, const struct tcphdr *th) ++{ ++ const unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); ++ __u8 _opt[TCPHDR_MAXSIZE]; ++ const u_int8_t *p; ++ __u32 ret; ++ unsigned int i; ++ ++ p = skb_header_pointer(skb, ptr + sizeof(struct tcphdr), optsize, _opt); ++ if (unlikely(!p)) ++ return 0; ++ ret = 0; ++ for (i = 0; likely(i < optsize); ) { ++ u_int8_t opt = p[i++]; ++ ++ if (likely(opt < 32)) { ++ /* IANA doc is messed up, see above. */ ++ ret |= 1 << (32 - opt); ++ } ++ if (likely(i >= optsize || opt == 0)) ++ break; ++ else if (unlikely(opt == 1)) ++ continue; ++ else if (unlikely(p[i] < 2)) /* "silly options" */ ++ break; ++ else ++ i += p[i] - 1; ++ } ++ return ret; ++} + /* packet receiver */ + static unsigned int netflow_target( + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +@@ -1192,27 +2415,38 @@ static unsigned int netflow_target( + ) + { + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +- struct sk_buff *skb = *pskb; ++ const struct sk_buff *skb = *pskb; ++#endif ++ union { ++ struct iphdr ip; ++ struct ipv6hdr ip6; ++ } _iph, *iph; ++ unsigned int hash; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) ++ const int family = target->family; ++#else ++ const int family = par->family; + #endif +- struct iphdr _iph, *iph; + struct ipt_netflow_tuple tuple; + struct ipt_netflow *nf; + __u8 tcp_flags; + struct netflow_aggr_n *aggr_n; + struct netflow_aggr_p *aggr_p; + __u8 s_mask, d_mask; +- unsigned int hash; +- +- iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); //iph = ip_hdr(skb); +- +- if (iph == NULL) { ++ unsigned int ptr; ++ int fragment; ++ size_t pkt_len; ++ int options = 0; ++ int tcpoptions = 0; ++ ++ iph = skb_header_pointer(skb, 0, (likely(family == AF_INET))? sizeof(_iph.ip) : sizeof(_iph.ip6), &iph); ++ if (unlikely(iph == NULL)) { + NETFLOW_STAT_INC(truncated); + NETFLOW_STAT_INC(pkt_drop); + return IPT_CONTINUE; + } + +- tuple.s_addr = iph->saddr; +- tuple.d_addr = iph->daddr; ++ tuple.l3proto = family; + tuple.s_port = 0; + tuple.d_port = 0; + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) +@@ -1220,30 +2454,118 @@ static unsigned int netflow_target( + #else + tuple.i_ifc = par->in? par->in->ifindex : -1; + #endif +- tuple.protocol = iph->protocol; +- tuple.tos = iph->tos; + tcp_flags = 0; /* Cisco sometimes have TCP ACK for non TCP packets, don't get it */ + s_mask = 0; + d_mask = 0; + +- if (iph->frag_off & htons(IP_OFFSET)) ++ if (likely(family == AF_INET)) { ++ tuple.src = (union nf_inet_addr){ .ip = iph->ip.saddr }; ++ tuple.dst = (union nf_inet_addr){ .ip = iph->ip.daddr }; ++ tuple.tos = iph->ip.tos; ++ tuple.protocol = iph->ip.protocol; ++ fragment = unlikely(iph->ip.frag_off & htons(IP_OFFSET)); ++ ptr = iph->ip.ihl * 4; ++ pkt_len = ntohs(iph->ip.tot_len); ++ ++#define IPHDR_MAXSIZE (4 * 15) ++ if (unlikely(iph->ip.ihl * 4 > sizeof(struct iphdr))) { ++ u_int8_t _opt[IPHDR_MAXSIZE - sizeof(struct iphdr)]; ++ const u_int8_t *op; ++ unsigned int optsize = iph->ip.ihl * 4 - sizeof(struct iphdr); ++ ++ op = skb_header_pointer(skb, sizeof(struct iphdr), optsize, _opt); ++ if (likely(op)) ++ options = ip4_options(op, optsize); ++ } ++ } else { ++ __u8 currenthdr; ++ ++ tuple.src.in6 = iph->ip6.saddr; ++ tuple.dst.in6 = iph->ip6.daddr; ++ tuple.tos = iph->ip6.priority; ++ fragment = 0; ++ ptr = sizeof(struct ipv6hdr); ++ pkt_len = ntohs(iph->ip6.payload_len) + sizeof(struct ipv6hdr); ++ ++ currenthdr = iph->ip6.nexthdr; ++ while (currenthdr != NEXTHDR_NONE && ipv6_ext_hdr(currenthdr)) { ++ struct ipv6_opt_hdr _hdr; ++ const struct ipv6_opt_hdr *hp; ++ unsigned int hdrlen = 0; ++ ++ options |= observed_hdrs(currenthdr); ++ hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); ++ if (hp == NULL) { ++ /* We have src/dst, so must account something. */ ++ tuple.protocol = currenthdr; ++ fragment = 3; ++ goto do_protocols; ++ } ++ ++ switch (currenthdr) { ++ case IPPROTO_FRAGMENT: { ++ struct frag_hdr _fhdr; ++ const struct frag_hdr *fh; ++ ++ fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), ++ &_fhdr); ++ if (fh == NULL) { ++ tuple.protocol = currenthdr; ++ fragment = 2; ++ goto do_protocols; ++ } ++ fragment = 1; ++#define FRA0 SetXBit(4) /* Fragment header - first fragment */ ++#define FRA1 SetXBit(6) /* Fragmentation header - not first fragment */ ++ options |= (ntohs(fh->frag_off) & 0xFFF8)? FRA1 : FRA0; ++ hdrlen = 8; ++ break; ++ } ++ case IPPROTO_AH: { ++ struct ip_auth_hdr _hdr, *hp; ++ ++ if (likely(hp = skb_header_pointer(skb, ptr, 8, &_hdr))) { ++ tuple.s_port = hp->spi >> 16; ++ tuple.d_port = hp->spi; ++ } ++ hdrlen = (hp->hdrlen + 2) << 2; ++ break; ++ } ++ default: ++ hdrlen = ipv6_optlen(hp); ++ } ++ currenthdr = hp->nexthdr; ++ ptr += hdrlen; ++ } ++ tuple.protocol = currenthdr; ++ options |= observed_hdrs(currenthdr); ++ } ++ ++do_protocols: ++ if (fragment) { ++ /* if conntrack is enabled it should defrag on pre-routing and local-out */ + NETFLOW_STAT_INC(frags); +- else { ++ } else { + switch (tuple.protocol) { + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + +- if ((hp = skb_header_pointer(skb, iph->ihl * 4, 14, &_hdr))) { ++ if (likely(hp = skb_header_pointer(skb, ptr, 14, &_hdr))) { + tuple.s_port = hp->source; + tuple.d_port = hp->dest; + tcp_flags = (u_int8_t)(ntohl(tcp_flag_word(hp)) >> 16); ++ ++ if (unlikely(hp->doff * 4 > sizeof(struct tcphdr))) ++ tcpoptions = tcp_options(skb, ptr, hp); + } + break; + } +- case IPPROTO_UDP: { ++ case IPPROTO_UDP: ++ case IPPROTO_UDPLITE: ++ case IPPROTO_SCTP: { + struct udphdr _hdr, *hp; + +- if ((hp = skb_header_pointer(skb, iph->ihl * 4, 4, &_hdr))) { ++ if (likely(hp = skb_header_pointer(skb, ptr, 4, &_hdr))) { + tuple.s_port = hp->source; + tuple.d_port = hp->dest; + } +@@ -1252,72 +2574,111 @@ static unsigned int netflow_target( + case IPPROTO_ICMP: { + struct icmphdr _hdr, *hp; + +- if ((hp = skb_header_pointer(skb, iph->ihl * 4, 2, &_hdr))) +- tuple.d_port = (hp->type << 8) | hp->code; ++ if (likely(family == AF_INET && ++ (hp = skb_header_pointer(skb, ptr, 2, &_hdr)))) ++ tuple.d_port = htons((hp->type << 8) | hp->code); + break; + } ++ case IPPROTO_ICMPV6: { ++ struct icmp6hdr _icmp6h, *ic; ++ ++ if (likely(family == AF_INET6 && ++ (ic = skb_header_pointer(skb, ptr, 2, &_icmp6h)))) ++ tuple.d_port = htons((ic->icmp6_type << 8) | ic->icmp6_code); ++ break; ++ } + case IPPROTO_IGMP: { +- struct igmphdr *_hdr, *hp; ++ struct igmphdr _hdr, *hp; + +- if ((hp = skb_header_pointer(skb, iph->ihl * 4, 1, &_hdr))) ++ if (likely(hp = skb_header_pointer(skb, ptr, 1, &_hdr))) + tuple.d_port = hp->type; + } + break; ++ case IPPROTO_AH: { /* IPSEC */ ++ struct ip_auth_hdr _hdr, *hp; ++ ++ if (likely(family == AF_INET && /* For IPv6 it's parsed above. */ ++ (hp = skb_header_pointer(skb, ptr, 8, &_hdr)))) { ++ tuple.s_port = hp->spi >> 16; ++ tuple.d_port = hp->spi; ++ } ++ break; ++ } ++ case IPPROTO_ESP: { ++ struct ip_esp_hdr _hdr, *hp; ++ ++ if (likely(hp = skb_header_pointer(skb, ptr, 4, &_hdr))) ++ tuple.s_port = hp->spi >> 16; ++ tuple.d_port = hp->spi; ++ } ++ break; + } + } /* not fragmented */ + + /* aggregate networks */ + read_lock_bh(&aggr_lock); +- list_for_each_entry(aggr_n, &aggr_n_list, list) +- if ((ntohl(tuple.s_addr) & aggr_n->mask) == aggr_n->addr) { +- tuple.s_addr &= htonl(aggr_n->aggr_mask); +- s_mask = aggr_n->prefix; +- break; +- } +- list_for_each_entry(aggr_n, &aggr_n_list, list) +- if ((ntohl(tuple.d_addr) & aggr_n->mask) == aggr_n->addr) { +- tuple.d_addr &= htonl(aggr_n->aggr_mask); +- d_mask = aggr_n->prefix; +- break; +- } ++ if (family == AF_INET) { ++ list_for_each_entry(aggr_n, &aggr_n_list, list) ++ if (unlikely((ntohl(tuple.src.ip) & aggr_n->mask) == aggr_n->addr)) { ++ tuple.src.ip &= htonl(aggr_n->aggr_mask); ++ s_mask = aggr_n->prefix; ++ atomic_inc(&aggr_n->usage); ++ break; ++ } ++ list_for_each_entry(aggr_n, &aggr_n_list, list) ++ if (unlikely((ntohl(tuple.dst.ip) & aggr_n->mask) == aggr_n->addr)) { ++ tuple.dst.ip &= htonl(aggr_n->aggr_mask); ++ d_mask = aggr_n->prefix; ++ atomic_inc(&aggr_n->usage); ++ break; ++ } ++ } + +- /* aggregate ports */ +- list_for_each_entry(aggr_p, &aggr_p_list, list) +- if (ntohs(tuple.s_port) >= aggr_p->port1 && +- ntohs(tuple.s_port) <= aggr_p->port2) { +- tuple.s_port = htons(aggr_p->aggr_port); +- break; +- } ++ if (tuple.protocol == IPPROTO_TCP || ++ tuple.protocol == IPPROTO_UDP || ++ tuple.protocol == IPPROTO_SCTP || ++ tuple.protocol == IPPROTO_UDPLITE) { ++ /* aggregate ports */ ++ list_for_each_entry(aggr_p, &aggr_p_list, list) ++ if (unlikely(ntohs(tuple.s_port) >= aggr_p->port1 && ++ ntohs(tuple.s_port) <= aggr_p->port2)) { ++ tuple.s_port = htons(aggr_p->aggr_port); ++ atomic_inc(&aggr_p->usage); ++ break; ++ } + +- list_for_each_entry(aggr_p, &aggr_p_list, list) +- if (ntohs(tuple.d_port) >= aggr_p->port1 && +- ntohs(tuple.d_port) <= aggr_p->port2) { +- tuple.d_port = htons(aggr_p->aggr_port); +- break; +- } ++ list_for_each_entry(aggr_p, &aggr_p_list, list) ++ if (unlikely(ntohs(tuple.d_port) >= aggr_p->port1 && ++ ntohs(tuple.d_port) <= aggr_p->port2)) { ++ tuple.d_port = htons(aggr_p->aggr_port); ++ atomic_inc(&aggr_p->usage); ++ break; ++ } ++ } + read_unlock_bh(&aggr_lock); + + hash = hash_netflow(&tuple); +- spin_lock_bh(&ipt_netflow_lock); ++ read_lock_bh(&htable_rwlock); ++ spin_lock(&htable_locks[hash & LOCK_COUNT_MASK]); + /* record */ + nf = ipt_netflow_find(&tuple, hash); +- if (!nf) { +- if (maxflows > 0 && atomic_read(&ipt_netflow_count) >= maxflows) { ++ if (unlikely(!nf)) { ++ struct rtable *rt; ++ ++ if (unlikely(maxflows > 0 && atomic_read(&ipt_netflow_count) >= maxflows)) { + /* This is DOS attack prevention */ + NETFLOW_STAT_INC(maxflows_err); + NETFLOW_STAT_INC(pkt_drop); +- NETFLOW_STAT_ADD(traf_drop, ntohs(iph->tot_len)); +- spin_unlock_bh(&ipt_netflow_lock); +- return IPT_CONTINUE; ++ NETFLOW_STAT_ADD(traf_drop, pkt_len); ++ goto unlock_return; + } + + nf = init_netflow(&tuple, skb, hash); +- if (!nf || IS_ERR(nf)) { ++ if (unlikely(!nf || IS_ERR(nf))) { + NETFLOW_STAT_INC(alloc_err); + NETFLOW_STAT_INC(pkt_drop); +- NETFLOW_STAT_ADD(traf_drop, ntohs(iph->tot_len)); +- spin_unlock_bh(&ipt_netflow_lock); +- return IPT_CONTINUE; ++ NETFLOW_STAT_ADD(traf_drop, pkt_len); ++ goto unlock_return; + } + + nf->ts_first = jiffies; +@@ -1330,31 +2691,68 @@ static unsigned int netflow_target( + nf->s_mask = s_mask; + nf->d_mask = d_mask; + +- if (debug > 2) +- printk(KERN_INFO "ipt_netflow: new (%u) %hd:%hd SRC=%u.%u.%u.%u:%u DST=%u.%u.%u.%u:%u\n", ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++ rt = (struct rtable *)skb->dst; ++#else /* since 2.6.26 */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) ++ rt = skb->rtable; ++#else /* since 2.6.31 */ ++ rt = skb_rtable(skb); ++#endif ++#endif ++ if (likely(family == AF_INET)) { ++ if (rt) ++ nf->nh.ip = rt->rt_gateway; ++ } else { ++ if (rt) ++ nf->nh.in6 = ((struct rt6_info *)rt)->rt6i_gateway; ++ nf->flow_label = (iph->ip6.flow_lbl[0] << 16) | ++ (iph->ip6.flow_lbl[1] << 8) | (iph->ip6.flow_lbl[2]); ++ } ++#if 0 ++ if (unlikely(debug > 2)) ++ printk(KERN_INFO "ipt_NETFLOW: new (%u) %hd:%hd SRC=%u.%u.%u.%u:%u DST=%u.%u.%u.%u:%u\n", + atomic_read(&ipt_netflow_count), + tuple.i_ifc, nf->o_ifc, + NIPQUAD(tuple.s_addr), ntohs(tuple.s_port), + NIPQUAD(tuple.d_addr), ntohs(tuple.d_port)); ++#endif + } else { + /* ipt_netflow_list is sorted by access time: + * most recently accessed flows are at head, old flows remain at tail + * this function bubble up flow to the head */ ++ spin_lock(&hlist_lock); + list_move(&nf->list, &ipt_netflow_list); ++ spin_unlock(&hlist_lock); + } + ++#ifdef CONFIG_NF_CONNTRACK_MARK ++ { ++ struct nf_conn *ct; ++ enum ip_conntrack_info ctinfo; ++ ct = nf_ct_get(skb, &ctinfo); ++ if (ct) ++ nf->mark = ct->mark; ++ } ++#endif ++ + nf->nr_packets++; +- nf->nr_bytes += ntohs(iph->tot_len); ++ nf->nr_bytes += pkt_len; + nf->ts_last = jiffies; + nf->tcp_flags |= tcp_flags; ++ nf->options |= options; ++ if (tuple.protocol == IPPROTO_TCP) ++ nf->tcpoptions |= tcpoptions; + + NETFLOW_STAT_INC(pkt_total); +- NETFLOW_STAT_ADD(traf_total, ntohs(iph->tot_len)); ++ NETFLOW_STAT_ADD(traf_total, pkt_len); + +- if (active_needs_export(nf, active_timeout * HZ)) { ++ if (likely(active_needs_export(nf, active_timeout * HZ))) { + /* ok, if this active flow to be exported + * bubble it to the tail */ ++ spin_lock(&hlist_lock); + list_move_tail(&nf->list, &ipt_netflow_list); ++ spin_unlock(&hlist_lock); + + /* Blog: I thought about forcing timer to wake up sooner if we have + * enough exportable flows, but in fact this doesn't have much sense, +@@ -1363,35 +2761,194 @@ static unsigned int netflow_target( + * limited size). But yes, this is disputable. */ + } + +- spin_unlock_bh(&ipt_netflow_lock); ++unlock_return: ++ spin_unlock(&htable_locks[hash & LOCK_COUNT_MASK]); ++ read_unlock_bh(&htable_rwlock); + + return IPT_CONTINUE; + } + +-static struct ipt_target ipt_netflow_reg = { +- .name = "NETFLOW", +- .target = netflow_target, +- .family = AF_INET, +-#ifndef RAW_PROMISC_HACK +- .table = "filter", +-#ifndef NF_IP_LOCAL_IN /* 2.6.25 */ +- .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | +- (1 << NF_INET_LOCAL_OUT), +-#else +- .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | +- (1 << NF_IP_LOCAL_OUT), +-#endif /* NF_IP_LOCAL_IN */ ++#ifdef CONFIG_NF_NAT_NEEDED ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++ /* Below 2.6.31 we don't need to handle callback chain manually. */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) ++#define NET_STRUCT struct net *net ++#define NET_ARG net, ++#define nf_conntrack_event_cb net->ct.nf_conntrack_event_cb + #else +- .table = "raw", +-#ifndef NF_IP_LOCAL_IN +- .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | +- (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING), ++#define NET_STRUCT void ++#define NET_ARG ++#endif ++static int set_notifier_cb(NET_STRUCT) ++{ ++ struct nf_ct_event_notifier *notifier; ++ ++ notifier = rcu_dereference(nf_conntrack_event_cb); ++ if (notifier == NULL) { ++ /* Polite mode. */ ++ nf_conntrack_register_notifier(NET_ARG &ctnl_notifier); ++ } else if (notifier != &ctnl_notifier) { ++ if (!saved_event_cb) ++ saved_event_cb = notifier; ++ else if (saved_event_cb != notifier) ++ printk(KERN_ERR "natevents_net_init: %p != %p (report error.)\n", ++ saved_event_cb, notifier); ++ rcu_assign_pointer(nf_conntrack_event_cb, &ctnl_notifier); ++ } else ++ printk(KERN_ERR "ipt_NETFLOW: natevents already enabled.\n"); ++ return 0; ++} ++static void unset_notifier_cb(NET_STRUCT) ++{ ++ struct nf_ct_event_notifier *notifier; ++ ++ notifier = rcu_dereference(nf_conntrack_event_cb); ++ if (notifier == &ctnl_notifier) { ++ if (saved_event_cb == NULL) ++ nf_conntrack_unregister_notifier(NET_ARG &ctnl_notifier); ++ else ++ rcu_assign_pointer(nf_conntrack_event_cb, saved_event_cb); ++ } else ++ printk(KERN_ERR "ipt_NETFLOW: natevents already disabled.\n"); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) ++#undef nf_conntrack_event_cb ++static struct pernet_operations natevents_net_ops = { ++ .init = set_notifier_cb, ++ .exit = unset_notifier_cb ++}; ++#endif ++#endif /* since 2.6.31 */ ++ ++static DEFINE_MUTEX(events_lock); ++/* Both functions may be called multiple times. */ ++static void register_ct_events(void) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++#define NETLINK_M "nf_conntrack_netlink" ++ struct module *netlink_m; ++ static int referenced = 0; ++#endif ++ ++ printk(KERN_INFO "ipt_NETFLOW: enable natevents.\n"); ++ mutex_lock(&events_lock); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++ /* Pre-load netlink module who will be first notifier ++ * user, and then hijack nf_conntrack_event_cb from it. */ ++ if ( ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) ++ !rcu_dereference(nf_conntrack_event_cb) || ++#endif ++ !(netlink_m = find_module(NETLINK_M))) { ++ printk("Loading " NETLINK_M "\n"); ++ request_module(NETLINK_M); ++ } ++ /* Reference netlink module to prevent it's unsafe unload before us. */ ++ if (!referenced && (netlink_m = find_module(NETLINK_M))) { ++ referenced++; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) ++#define use_module ref_module ++#endif ++ use_module(THIS_MODULE, netlink_m); ++ } ++ ++ /* Register ct events callback. */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) ++ register_pernet_subsys(&natevents_net_ops); + #else +- .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | +- (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_PRE_ROUTING), +-#endif /* NF_IP_LOCAL_IN */ +-#endif /* !RAW_PROMISC_HACK */ +- .me = THIS_MODULE ++ set_notifier_cb(); ++#endif ++#else /* below v2.6.31 */ ++ if (!natevents && nf_conntrack_register_notifier(&ctnl_notifier) < 0) ++ printk(KERN_ERR "Can't register conntrack notifier, natevents disabled.\n"); ++ else ++#endif ++ natevents = 1; ++ mutex_unlock(&events_lock); ++} ++ ++static void unregister_ct_events(void) ++{ ++ printk(KERN_INFO "ipt_NETFLOW: disable natevents.\n"); ++ mutex_lock(&events_lock); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) ++ unregister_pernet_subsys(&natevents_net_ops); ++#else /* < v3.2 */ ++ unset_notifier_cb(); ++#endif /* v3.2 */ ++ rcu_assign_pointer(saved_event_cb, NULL); ++#else /* < v2.6.31 */ ++ nf_conntrack_unregister_notifier(&ctnl_notifier); ++#endif ++ natevents = 0; ++ mutex_unlock(&events_lock); ++} ++#endif /* CONFIG_NF_NAT_NEEDED */ ++ ++#ifndef NF_IP_LOCAL_IN /* 2.6.25 */ ++#define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING ++#define NF_IP_LOCAL_IN NF_INET_LOCAL_IN ++#define NF_IP_FORWARD NF_INET_FORWARD ++#define NF_IP_LOCAL_OUT NF_INET_LOCAL_OUT ++#define NF_IP_POST_ROUTING NF_INET_POST_ROUTING ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++/* net/netfilter/x_tables.c */ ++static void xt_unregister_targets(struct xt_target *target, unsigned int n) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < n; i++) ++ xt_unregister_target(&target[i]); ++} ++static int xt_register_targets(struct xt_target *target, unsigned int n) ++{ ++ unsigned int i; ++ ++ int err = 0; ++ for (i = 0; i < n; i++) ++ if ((err = xt_register_target(&target[i]))) ++ goto err; ++ return err; ++err: ++ if (i > 0) ++ xt_unregister_targets(target, i); ++ return err; ++} ++#endif ++ ++static struct ipt_target ipt_netflow_reg[] __read_mostly = { ++ { ++ .name = "NETFLOW", ++ .target = netflow_target, ++ .checkentry = netflow_target_check, ++ .family = AF_INET, ++ .hooks = ++ (1 << NF_IP_PRE_ROUTING) | ++ (1 << NF_IP_LOCAL_IN) | ++ (1 << NF_IP_FORWARD) | ++ (1 << NF_IP_LOCAL_OUT) | ++ (1 << NF_IP_POST_ROUTING), ++ .me = THIS_MODULE ++ }, ++ { ++ .name = "NETFLOW", ++ .target = netflow_target, ++ .checkentry = netflow_target_check, ++ .family = AF_INET6, ++ .hooks = ++ (1 << NF_IP_PRE_ROUTING) | ++ (1 << NF_IP_LOCAL_IN) | ++ (1 << NF_IP_FORWARD) | ++ (1 << NF_IP_LOCAL_OUT) | ++ (1 << NF_IP_POST_ROUTING), ++ .me = THIS_MODULE ++ }, + }; + + static int __init ipt_netflow_init(void) +@@ -1399,11 +2956,16 @@ static int __init ipt_netflow_init(void) + #ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc_stat; + #endif ++ printk(KERN_INFO "ipt_NETFLOW version %s, srcversion %s\n", ++ IPT_NETFLOW_VERSION, THIS_MODULE->srcversion); + + get_random_bytes(&ipt_netflow_hash_rnd, 4); + + /* determine hash size (idea from nf_conntrack_core.c) */ + if (!hashsize) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) ++#define num_physpages totalram_pages ++#endif + hashsize = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) +@@ -1411,8 +2973,7 @@ static int __init ipt_netflow_init(void) + } + if (hashsize < 16) + hashsize = 16; +- printk(KERN_INFO "ipt_netflow version %s (%u buckets)\n", +- IPT_NETFLOW_VERSION, hashsize); ++ printk(KERN_INFO "ipt_NETFLOW: hashsize %u\n", hashsize); + + ipt_netflow_hash_size = hashsize; + ipt_netflow_hash = alloc_hashtable(ipt_netflow_hash_size); +@@ -1434,12 +2995,18 @@ static int __init ipt_netflow_init(void) + } + + #ifdef CONFIG_PROC_FS ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) + proc_stat = create_proc_entry("ipt_netflow", S_IRUGO, INIT_NET(proc_net_stat)); ++#else ++ proc_stat = proc_create("ipt_netflow", S_IRUGO, INIT_NET(proc_net_stat), &nf_seq_fops); ++#endif + if (!proc_stat) { + printk(KERN_ERR "Unable to create /proc/net/stat/ipt_netflow entry\n"); + goto err_free_netflow_slab; + } ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) + proc_stat->proc_fops = &nf_seq_fops; ++#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) + proc_stat->owner = THIS_MODULE; + #endif +@@ -1480,21 +3047,28 @@ static int __init ipt_netflow_init(void) + } + add_aggregation(aggregation); + +- __start_scan_worker(); ++ netflow_switch_version(protocol); ++ _schedule_scan_worker(0); + setup_timer(&rate_timer, rate_timer_calc, 0); + mod_timer(&rate_timer, jiffies + (HZ * SAMPLERATE)); + +- if (xt_register_target(&ipt_netflow_reg)) ++ peakflows_at = jiffies; ++ if (xt_register_targets(ipt_netflow_reg, ARRAY_SIZE(ipt_netflow_reg))) + goto err_stop_timer; + +- peakflows_at = jiffies; ++#ifdef CONFIG_NF_NAT_NEEDED ++ if (natevents) ++ register_ct_events(); ++#endif + +- printk(KERN_INFO "ipt_netflow loaded.\n"); ++ printk(KERN_INFO "ipt_NETFLOW is loaded.\n"); + return 0; + + err_stop_timer: +- __stop_scan_worker(); ++ _unschedule_scan_worker(); ++ netflow_scan_and_export(AND_FLUSH); + del_timer_sync(&rate_timer); ++ free_templates(); + destination_removeall(); + aggregation_remove(&aggr_n_list); + aggregation_remove(&aggr_p_list); +@@ -1506,17 +3080,18 @@ err_free_proc_stat: + #ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_netflow", INIT_NET(proc_net_stat)); + err_free_netflow_slab: +-#endif ++#endif + kmem_cache_destroy(ipt_netflow_cachep); + err_free_hash: + vfree(ipt_netflow_hash); + err: ++ printk(KERN_INFO "ipt_NETFLOW is not loaded.\n"); + return -ENOMEM; + } + + static void __exit ipt_netflow_fini(void) + { +- printk(KERN_INFO "ipt_netflow unloading..\n"); ++ printk(KERN_INFO "ipt_NETFLOW unloading..\n"); + + #ifdef CONFIG_SYSCTL + unregister_sysctl_table(netflow_sysctl_header); +@@ -1524,14 +3099,18 @@ static void __exit ipt_netflow_fini(void) + #ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_netflow", INIT_NET(proc_net_stat)); + #endif +- +- xt_unregister_target(&ipt_netflow_reg); +- __stop_scan_worker(); +- netflow_scan_and_export(1); ++ xt_unregister_targets(ipt_netflow_reg, ARRAY_SIZE(ipt_netflow_reg)); ++#ifdef CONFIG_NF_NAT_NEEDED ++ if (natevents) ++ unregister_ct_events(); ++#endif ++ _unschedule_scan_worker(); ++ netflow_scan_and_export(AND_FLUSH); + del_timer_sync(&rate_timer); + + synchronize_sched(); + ++ free_templates(); + destination_removeall(); + aggregation_remove(&aggr_n_list); + aggregation_remove(&aggr_p_list); +@@ -1539,7 +3118,7 @@ static void __exit ipt_netflow_fini(void) + kmem_cache_destroy(ipt_netflow_cachep); + vfree(ipt_netflow_hash); + +- printk(KERN_INFO "ipt_netflow unloaded.\n"); ++ printk(KERN_INFO "ipt_NETFLOW unloaded.\n"); + } + + module_init(ipt_netflow_init); +diff --git a/ipt_NETFLOW.h b/ipt_NETFLOW.h +index 4a7b645..749f985 100644 +--- a/ipt_NETFLOW.h ++++ b/ipt_NETFLOW.h +@@ -35,8 +35,8 @@ struct netflow5_record { + __be16 o_ifc; + __be32 nr_packets; + __be32 nr_octets; +- __be32 ts_first; +- __be32 ts_last; ++ __be32 first_ms; ++ __be32 last_ms; + __be16 s_port; + __be16 d_port; + __u8 reserved; +@@ -54,9 +54,9 @@ struct netflow5_record { + struct netflow5_pdu { + __be16 version; + __be16 nr_records; +- __be32 ts_uptime; +- __be32 ts_usecs; +- __be32 ts_unsecs; ++ __be32 ts_uptime; /* ms */ ++ __be32 ts_usecs; /* s */ ++ __be32 ts_unsecs; /* ns */ + __be32 seq; + __u8 eng_type; + __u8 eng_id; +@@ -65,42 +65,185 @@ struct netflow5_pdu { + } __attribute__ ((packed)); + #define NETFLOW5_HEADER_SIZE (sizeof(struct netflow5_pdu) - NETFLOW5_RECORDS_MAX * sizeof(struct netflow5_record)) + ++/* NetFlow v9 RFC http://www.ietf.org/rfc/rfc3954.txt */ ++enum { ++ IN_BYTES = 1, ++ IN_PKTS = 2, ++ PROTOCOL = 4, ++ TOS = 5, ++ TCP_FLAGS = 6, ++ L4_SRC_PORT = 7, ++ IPV4_SRC_ADDR = 8, ++ SRC_MASK = 9, ++ INPUT_SNMP = 10, ++ L4_DST_PORT = 11, ++ IPV4_DST_ADDR = 12, ++ DST_MASK = 13, ++ OUTPUT_SNMP = 14, ++ IPV4_NEXT_HOP = 15, ++ //SRC_AS = 16, ++ //DST_AS = 17, ++ //BGP_IPV4_NEXT_HOP = 18, ++ //MUL_DST_PKTS = 19, ++ //MUL_DST_BYTES = 20, ++ LAST_SWITCHED = 21, ++ FIRST_SWITCHED = 22, ++ IPV6_SRC_ADDR = 27, ++ IPV6_DST_ADDR = 28, ++ IPV6_FLOW_LABEL = 31, ++ ICMP_TYPE = 32, ++ MUL_IGMP_TYPE = 33, ++ //TOTAL_BYTES_EXP = 40, ++ //TOTAL_PKTS_EXP = 41, ++ //TOTAL_FLOWS_EXP = 42, ++ IPV6_NEXT_HOP = 62, ++ IPV6_OPTION_HEADERS = 64, ++ commonPropertiesId = 137, /* for MARK */ ++ ipv4Options = 208, ++ tcpOptions = 209, ++ postNATSourceIPv4Address = 225, ++ postNATDestinationIPv4Address = 226, ++ postNAPTSourceTransportPort = 227, ++ postNAPTDestinationTransportPort = 228, ++ natEvent = 230, ++ postNATSourceIPv6Address = 281, ++ postNATDestinationIPv6Address = 282, ++ IPSecSPI = 295, ++ observationTimeMilliseconds = 323, ++ observationTimeMicroseconds = 324, ++ observationTimeNanoseconds = 325, ++}; ++ ++enum { ++ FLOWSET_TEMPLATE = 0, ++ FLOWSET_OPTIONS = 1, ++ IPFIX_TEMPLATE = 2, ++ IPFIX_OPTIONS = 3, ++ FLOWSET_DATA_FIRST = 256, ++}; ++ ++struct flowset_template { ++ __be16 flowset_id; ++ __be16 length; ++ __be16 template_id; ++ __be16 field_count; ++} __attribute__ ((packed)); ++ ++struct flowset_data { ++ __be16 flowset_id; ++ __be16 length; ++} __attribute__ ((packed)); ++ ++/* NetFlow v9 packet. */ ++struct netflow9_pdu { ++ __be16 version; ++ __be16 nr_records; ++ __be32 sys_uptime_ms; ++ __be32 export_time_s; ++ __be32 seq; ++ __be32 source_id; /* Exporter Observation Domain */ ++ __u8 data[1400]; ++} __attribute__ ((packed)); ++ ++/* IPFIX packet. */ ++struct ipfix_pdu { ++ __be16 version; ++ __be16 length; ++ __be32 export_time_s; ++ __be32 seq; ++ __be32 odomain_id; /* Observation Domain ID */ ++ __u8 data[1400]; ++} __attribute__ ((packed)); ++ ++/* Maximum bytes flow can have, after it's reached flow will become ++ * not searchable and will be exported soon. */ ++#define FLOW_FULL_WATERMARK 0xffefffff ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) ++union nf_inet_addr { ++ __be32 ip; ++ __be32 ip6[4]; ++ struct in_addr in; ++ struct in6_addr in6; ++}; ++#endif ++ + /* hashed data which identify unique flow */ ++/* 16+16 + 2+2 + 2+1+1+1 = 41 */ + struct ipt_netflow_tuple { +- __be32 s_addr; // Network byte order +- __be32 d_addr; // -"- +- __be16 s_port; // -"- ++ union nf_inet_addr src; ++ union nf_inet_addr dst; ++ __be16 s_port; // Network byte order + __be16 d_port; // -"- +- __be16 i_ifc; // Local byte order ++ __u16 i_ifc; // Host byte order + __u8 protocol; + __u8 tos; ++ __u8 l3proto; + }; +-/* tuple size is rounded to u32s */ +-#define NETFLOW_TUPLE_SIZE (sizeof(struct ipt_netflow_tuple) / 4) +- +-/* maximum bytes flow can have, after it reached flow become not searchable and will be exported soon */ +-#define FLOW_FULL_WATERMARK 0xffefffff + +-/* flow entry */ ++/* hlist[2] + tuple[]: 8+8 + 41 = 57 (less than usual cache line, 64) */ + struct ipt_netflow { + struct hlist_node hlist; // hashtable search chain +- struct list_head list; // all flows chain + + /* unique per flow data (hashed, NETFLOW_TUPLE_SIZE) */ + struct ipt_netflow_tuple tuple; + + /* volatile data */ +- __be16 o_ifc; ++ union nf_inet_addr nh; ++ __u16 o_ifc; + __u8 s_mask; + __u8 d_mask; ++ __u8 tcp_flags; /* `OR' of all tcp flags */ + + /* flow statistics */ + u_int32_t nr_packets; + u_int32_t nr_bytes; +- unsigned long ts_first; +- unsigned long ts_last; +- __u8 tcp_flags; /* `OR' of all tcp flags */ ++ union { ++ struct { ++ unsigned long first; ++ unsigned long last; ++ } ts; ++ ktime_t ts_obs; ++ } _ts_un; ++#define ts_first _ts_un.ts.first ++#define ts_last _ts_un.ts.last ++#define ts_obs _ts_un.ts_obs ++ u_int32_t flow_label; /* IPv6 */ ++ u_int32_t options; /* IPv4(16) & IPv6(32) Options */ ++ u_int32_t tcpoptions; ++#ifdef CONFIG_NF_CONNTRACK_MARK ++ u_int32_t mark; /* Exported as commonPropertiesId */ ++#endif ++#ifdef CONFIG_NF_NAT_NEEDED ++ __be32 s_as; ++ __be32 d_as; ++ struct nat_event *nat; ++#endif ++ struct list_head list; // all flows chain ++ spinlock_t *lock; ++}; ++ ++#ifdef CONFIG_NF_NAT_NEEDED ++enum { ++ NAT_CREATE, NAT_DESTROY, NAT_POOLEXHAUSTED + }; ++struct nat_event { ++ struct list_head list; ++ struct { ++ __be32 s_addr; ++ __be32 d_addr; ++ __be16 s_port; ++ __be16 d_port; ++ } pre, post; ++ ktime_t ts_ktime; ++ unsigned long ts_jiffies; ++ __u8 protocol; ++ __u8 nat_event; ++}; ++#define IS_DUMMY_FLOW(nf) (nf->nat) ++#else ++#define IS_DUMMY_FLOW(nf) 0 ++#endif + + static inline int ipt_netflow_tuple_equal(const struct ipt_netflow_tuple *t1, + const struct ipt_netflow_tuple *t2) +@@ -115,11 +258,13 @@ struct ipt_netflow_sock { + unsigned short port; + atomic_t wmem_peak; // sk_wmem_alloc peak value + atomic_t err_full; // socket filled error ++ atomic_t err_connect; // connect errors + atomic_t err_other; // other socket errors + }; + + struct netflow_aggr_n { + struct list_head list; ++ atomic_t usage; + __u32 mask; + __u32 addr; + __u32 aggr_mask; +@@ -128,6 +273,7 @@ struct netflow_aggr_n { + + struct netflow_aggr_p { + struct list_head list; ++ atomic_t usage; + __u16 port1; + __u16 port2; + __u16 aggr_port; +diff --git a/libipt_NETFLOW.c b/libipt_NETFLOW.c +index d85b6d9..a0f9e5d 100644 +--- a/libipt_NETFLOW.c ++++ b/libipt_NETFLOW.c +@@ -58,24 +58,24 @@ + #define _IPT_IP struct ipt_ip + #endif + ++#ifndef IPTABLES_VERSION ++#define IPTABLES_VERSION XTABLES_VERSION ++#endif ++ + static struct option opts[] = { +- {0} ++ { 0 } + }; + + static void help(void) + { +- printf( "NETFLOW target\n"); ++ printf("NETFLOW target\n"); + } + +-//static int parse(int c, char **argv, int invert, unsigned int *flags, +-// const _IPT_ENTRY *entry, +-// struct ipt_entry_target **target) + static int parse(int c, char **argv, int invert, unsigned int *flags, + const _IPT_ENTRY *entry, + struct ipt_entry_target **targetinfo) + + { +- + return 1; + } + +@@ -95,16 +95,9 @@ static void print(const _IPT_IP *ip, + } + + static struct iptables_target netflow = { +-#ifdef MOD140 +- .family = AF_INET, +-#endif + .next = NULL, + .name = "NETFLOW", +-#ifdef XTABLES_VERSION +- .version = XTABLES_VERSION, +-#else + .version = IPTABLES_VERSION, +-#endif + .size = IPT_ALIGN(0), + .userspacesize = IPT_ALIGN(0), + .help = &help, +diff --git a/murmur3.h b/murmur3.h +new file mode 100644 +index 0000000..57a6006 +--- /dev/null ++++ b/murmur3.h +@@ -0,0 +1,42 @@ ++/* MurmurHash3, based on https://code.google.com/p/smhasher of Austin Appleby. */ ++ ++static __always_inline uint32_t rotl32(const uint32_t x, const int8_t r) ++{ ++ return (x << r) | (x >> (32 - r)); ++} ++ ++static __always_inline uint32_t fmix32(register uint32_t h) ++{ ++ h ^= h >> 16; ++ h *= 0x85ebca6b; ++ h ^= h >> 13; ++ h *= 0xc2b2ae35; ++ h ^= h >> 16; ++ return h; ++} ++ ++static inline uint32_t murmur3(const void *key, const uint32_t len, const uint32_t seed) ++{ ++ const uint32_t c1 = 0xcc9e2d51; ++ const uint32_t c2 = 0x1b873593; ++ const uint32_t *blocks; ++ const uint8_t *tail; ++ register uint32_t h1 = seed; ++ uint32_t k1 = 0; ++ uint32_t i; ++ ++ blocks = (const uint32_t *)key; ++ for (i = len / 4; i; --i) { ++ h1 ^= rotl32(*blocks++ * c1, 15) * c2; ++ h1 = rotl32(h1, 13) * 5 + 0xe6546b64; ++ } ++ tail = (const uint8_t*)blocks; ++ switch (len & 3) { ++ case 3: k1 ^= tail[2] << 16; ++ case 2: k1 ^= tail[1] << 8; ++ case 1: k1 ^= tail[0]; ++ h1 ^= rotl32(k1 * c1, 15) * c2; ++ } ++ return fmix32(h1^ len); ++} ++ +diff --git a/raw_promisc_debian_squeeze6.patch b/raw_promisc_debian_squeeze6.patch +new file mode 100644 +index 0000000..69d0d35 +--- /dev/null ++++ b/raw_promisc_debian_squeeze6.patch +@@ -0,0 +1,37 @@ ++ ++ Short manual and patch for Debian Squeeze ++ suggested by Pavel Odintsov: ++ ++On Thu, Dec 27, 2012 at 07:46:30PM +0400, Pavel Odintsov wrote: ++> ++> краткий мануал для патчинга Debian Squeeze ядра патчем promisc. ++> ++> cd /usr/src ++> apt-get install -y dpkg-dev ++> apt-get build-dep linux-image-2.6.32-5-amd64 ++> cd linux-2.6-2.6.32/ ++> apt-get source linux-image-2.6.32-5-amd64 ++> ++> wget .... /root/raw_promisc_debian_squeeze6.patch ++> patch -p1 < raw_promisc_debian_squeeze6.patch ++> Накладываем патчи дебияна: ++> debian/rules source ++> ++> Запускаем сборку: ++> debian/rules binary ++> ++ ++diff -rupN linux-2.6-2.6.32/net/ipv4/ip_input.c linux-2.6-2.6.32_promisc_raw//net/ipv4/ip_input.c ++--- linux-2.6-2.6.32/net/ipv4/ip_input.c 2009-12-03 04:51:21.000000000 +0100 +++++ linux-2.6-2.6.32_promisc_raw//net/ipv4/ip_input.c 2012-06-25 19:13:49.000000000 +0200 ++@@ -383,8 +383,8 @@ int ip_rcv(struct sk_buff *skb, struct n ++ /* When the interface is in promisc. mode, drop all the crap ++ * that it receives, do not try to analyse it. ++ */ ++- if (skb->pkt_type == PACKET_OTHERHOST) ++- goto drop; +++ //if (skb->pkt_type == PACKET_OTHERHOST) +++ // goto drop; ++ ++ ++ IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); diff --git a/testing/ipt-netflow/kernel-vs-userspace.patch b/testing/ipt-netflow/kernel-vs-userspace.patch new file mode 100644 index 0000000000..8f9857fa8b --- /dev/null +++ b/testing/ipt-netflow/kernel-vs-userspace.patch @@ -0,0 +1,56 @@ +diff --git a/configure b/configure +index 3f10e2a..b43f024 100755 +--- a/configure ++++ b/configure +@@ -256,6 +256,8 @@ do + --ipt-inc=*) IPTINC="$ac_optarg" ;; + --kver=*) KVERSION="$ac_optarg" ;; + --kdir=*) KDIR="$ac_optarg" ;; ++ --disable-kernel) NOKERNEL=1;; ++ --disable-ipt) NOIPT=1;; + --make) echo called from make ;; + --help) show_help ;; + -*) echo Invalid option: $ac_option; exit 1 ;; +@@ -353,22 +355,26 @@ kernel_check_config() { + kconfig CONFIG_IP6_NF_IPTABLES "ip6tables target" + } + +-kernel_find_version #KVERSION +-test "$KLIBMOD" || KLIBMOD=$KVERSION +-echo "Kernel version: $KVERSION ($KHOW)" +-kernel_find_source #KDIR +-echo "Kernel sources: $KDIR ($KSHOW)" +-kernel_check_consistency +-kernel_check_config +- +-test "$IPTBIN" || IPTBIN=`which iptables` +- +-iptables_find_version #IPTVER +-iptables_try_pkgconfig #try to configure from pkg-config +-iptables_find_src #IPTSRC +-iptables_src_version #check that IPTSRC match to IPTVER +-iptables_inc #IPTINC +-iptables_modules #IPTLIB ++if ! test "$NOKERNEL"; then ++ kernel_find_version #KVERSION ++ test "$KLIBMOD" || KLIBMOD=$KVERSION ++ echo "Kernel version: $KVERSION ($KHOW)" ++ kernel_find_source #KDIR ++ echo "Kernel sources: $KDIR ($KSHOW)" ++ kernel_check_consistency ++ kernel_check_config ++fi ++ ++if ! test "$NOIPT"; then ++ test "$IPTBIN" || IPTBIN=`which iptables` ++ ++ iptables_find_version #IPTVER ++ iptables_try_pkgconfig #try to configure from pkg-config ++ iptables_find_src #IPTSRC ++ iptables_src_version #check that IPTSRC match to IPTVER ++ iptables_inc #IPTINC ++ iptables_modules #IPTLIB ++fi + + REPLACE="\ + s!@KVERSION@!$KVERSION!;\ |