diff --git a/Makefile.in b/Makefile.in index 30ebbfe..0a82c67 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,43 +1,73 @@ -# +# Edit Makefile.in and run ./configure KVERSION = @KVERSION@ KDIR = @KDIR@ +KINSTDIR = $(shell dirname @KDIR@) IPTABLES_CFLAGS = @IPTABLES_CFLAGS@ IPTABLES_MODULES = @IPTABLES_MODULES@ +DEPMOD = depmod -a +# https://www.kernel.org/doc/Documentation/kbuild/modules.txt +# https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt obj-m = ipt_NETFLOW.o -ipt_NETFLOW.ko: ipt_NETFLOW.c ipt_NETFLOW.h +all: ipt_NETFLOW.ko libipt_NETFLOW.so libip6t_NETFLOW.so +ipt_NETFLOW.ko: version.h ipt_NETFLOW.c ipt_NETFLOW.h Makefile @echo Compiling for kernel $(KVERSION) make -C $(KDIR) M=$(CURDIR) modules -all: ipt_NETFLOW.ko libipt_NETFLOW.so -minstall: - make -C $(KDIR) M=$(CURDIR) modules_install + @touch $@ +sparse: | version.h ipt_NETFLOW.c ipt_NETFLOW.h Makefile + @rm -f ipt_NETFLOW.ko ipt_NETFLOW.o + @echo Compiling for kernel $(KVERSION) + make -C $(KDIR) M=$(CURDIR) modules C=1 + @touch ipt_NETFLOW.ko +minstall: | ipt_NETFLOW.ko + make -C $(KDIR) M=$(CURDIR) modules_install INSTALL_MOD_PATH=$(DESTDIR) + $(DEPMOD) mclean: make -C $(KDIR) M=$(CURDIR) clean lclean: -rm -f *.so *_sh.o clean: mclean lclean - -rm -f *.so *.o modules.order + -rm -f *.so *.o modules.order version.h + +%_sh.o: libipt_NETFLOW.c + gcc -O2 -Wall -Wunused $(IPTABLES_CFLAGS) -fPIC -o $@ -c libipt_NETFLOW.c + +%.so: %_sh.o + gcc -shared -o $@ $< -libipt_NETFLOW.so: libipt_NETFLOW.c - gcc -O2 -Wall -Wunused -I$(KDIR)/include $(IPTABLES_CFLAGS) -fPIC -o libipt_NETFLOW_sh.o -c libipt_NETFLOW.c - gcc -shared -o libipt_NETFLOW.so libipt_NETFLOW_sh.o +version.h: ipt_NETFLOW.c ipt_NETFLOW.h Makefile + @if [ -d .git ] && type git >/dev/null 2>&1; then \ + echo "#define GITVERSION \"`git describe --dirty`\""; \ + fi > version.h -linstall: ipt_NETFLOW.ko libipt_NETFLOW.so - cp -a libipt_NETFLOW.so $(IPTABLES_MODULES) +linstall: | libipt_NETFLOW.so libip6t_NETFLOW.so + install -D libipt_NETFLOW.so $(DESTDIR)$(IPTABLES_MODULES)/libipt_NETFLOW.so + install -D libip6t_NETFLOW.so $(DESTDIR)$(IPTABLES_MODULES)/libip6t_NETFLOW.so install: minstall linstall +uninstall: + -rm -f $(DESTDIR)$(IPTABLES_MODULES)/libipt_NETFLOW.so + -rm -f $(DESTDIR)$(IPTABLES_MODULES)/libip6t_NETFLOW.so + -rm -f $(DESTDIR)$(KINSTDIR)/extra/ipt_NETFLOW.ko + Makefile: Makefile.in configure ./configure --make load: all - insmod ipt_NETFLOW.ko active_timeout=5 - iptables -A OUTPUT -d 0/0 -j NETFLOW - iptables -A INPUT -d 0/0 -j NETFLOW + -insmod ipt_NETFLOW.ko active_timeout=5 protocol=9 + -iptables -I OUTPUT -j NETFLOW + -iptables -I INPUT -j NETFLOW + -ip6tables -I OUTPUT -j NETFLOW + -ip6tables -I INPUT -j NETFLOW unload: - iptables -D OUTPUT -d 0/0 -j NETFLOW - iptables -D INPUT -d 0/0 -j NETFLOW - rmmod ipt_NETFLOW.ko + -iptables -D OUTPUT -j NETFLOW + -iptables -D INPUT -j NETFLOW + -ip6tables -D OUTPUT -j NETFLOW + -ip6tables -D INPUT -j NETFLOW + -rmmod ipt_NETFLOW.ko + +reload: unload load diff --git a/README b/README index 213f02c..56a4fde 100644 --- a/README +++ b/README @@ -1,10 +1,17 @@ -ipt_NETFLOW linux 2.6 kernel module by -- 11 Feb 2008 +ipt_NETFLOW linux 2.6.x-3.x kernel module by -- 2008-2013. + + High performance NetFlow v5, v9, IPFIX flow data export module for Linux + kernel. Supporting IPv4 and IPv6. Created to be useful for highly loaded + linux router. It should be used as iptables target. Also can export NAT + translation events using NetFlow Event Logging (NEL) for v9, IPFIX, or + specially crafted v5 flows. 
+ ============================ = OBTAINING LATEST VERSION = ============================ - $ git clone git://ipt-netflow.git.sourceforge.net/gitroot/ipt-netflow/ipt-netflow + $ git clone git://git.code.sf.net/p/ipt-netflow/code ipt-netflow $ cd ipt-netflow @@ -12,94 +19,220 @@ ipt_NETFLOW linux 2.6 kernel module by -- 11 Feb 2008 = INSTALLATION = ================ -1. Besides kernel you will need iptables/netfilter source matching your - installation or just fresh install from there: ftp://ftp.netfilter.org/pub/iptables/snapshot/ - I have this: ftp://ftp.netfilter.org/pub/iptables/snapshot/iptables-1.3.7-20070329.tar.bz2 - Unpack it somewhere and build with make. + Four easy steps. + +** 1. Prepare Kernel source + + If you have package system install kernel-devel package, otherwise install + raw kernel source from http://kernel.org matching _exactly_ version of your + installed kernel. + + a) What to do for Centos: + + ~# yum install kernel-devel + + b) What to do for Debian: + + ~# apt-get install module-assistant + ~# m-a prepare + + c) Otherwise, if you downloaded raw kernel sources don't forget to create + .config by copying it from your distribution's kernel. Its copy could reside + in /boot or sometimes in /proc, examples: + + kernel-src-dir/# cp /boot/config-`uname -r` .config + or + kernel-src-dir/# zcat /proc/config.gz > .config + + Assuming you unpacked kernel source into `kernel-src-dir/' directory. + Then run: + + kernel-src-dir/# make oldconfig + + After that you'll need to prepare kernel for modules build: + + kernel-src-dir/# make prepare modules_prepare + + Note: Don't try to `make prepare' in Centos kernel-devel package directory + (which is usually something like /usr/src/kernels/2.6.32-431.el6.x86_64) + as this is wrong and meaningless. + +** 2. Prepare Iptables + + Before this step it also would be useful to install pkg-config if don't + already have. + + If you have package system just install iptables-devel (or iptables-dev) + package, otherwise install iptables source matching version of your + installation from ftp://ftp.netfilter.org/pub/iptables/ + + a) What to do for Centos: + + # yum install iptables-devel + + b) What to do for Debian: + + # apt-get install iptables-dev pkg-config -2. Run ./configure script and it will create Makefile + c) Otherwise, for raw iptables source build it and make install. -3. make all install; depmod - This will install kernel module and iptable specific library. +** 3. Now, to actually build the module run: -Troubleshooting: - 1) Sometimes you will want to add CC=gcc-3 to make command. - Example: make CC=gcc-3.3 + ~/ipt-netflow# ./configure + ~/ipt-netflow# make all install + ~/ipt-netflow# depmod - 2) Compile module with actual kernel source compiled. - I.e. first compile kernel and boot into it, and then compile module. + This will install kernel module and iptables specific library. - 3) For autoloading module after reboot: set net.netflow.destination (or load - module, if idestination set on load) after interfaces are up. Becasue module - needs exporting interface (usually lo) to establish export connection. + Troubleshooting: -4. After this point you should be able to load module - and use -j NETFLOW target in your iptables. See next section. + a) Sometimes you will want to add CC=gcc-3 to make command. + Example: make CC=gcc-3.3 + + b) Compile module with actual kernel source compiled. + I.e. first compile kernel and boot into it, and then compile module. 
+ If you are using kernel-devel package check that its version matches + your kernel package. + + c) If you have sources in non-standard places or configure isn't able to + find something run ./configure --help to see how to specify paths manually. + +** 4. After this point you should be able to load module and + use -j NETFLOW target in your iptables. See next section. =========== = RUNNING = =========== -1. You can load module by insmod like this: - # insmod ipt_NETFLOW.ko destination=127.0.0.1:2055 debug=1 +1. You can load module directly by insmod like this: + + # insmod ipt_NETFLOW.ko destination=127.0.0.1:2055 debug=1 Or if properly installed (make install; depmod) by this: - # modprobe ipt_NETFLOW destination=127.0.0.1:2055 + + # modprobe ipt_NETFLOW destination=127.0.0.1:2055 See, you may add options in insmod/modprobe command line, or add - them in /etc/ to modules.conf or modprobe.conf like thus: - options ipt_NETFLOW destination=127.0.0.1:2055 + them in /etc/modprobe.conf or /etc/modprobe.d/ipt_NETFLOW.conf + like thus: + + options ipt_NETFLOW destination=127.0.0.1:2055 protocol=9 natevents=1 2. Statistics is in /proc/net/stat/ipt_netflow - To view slab statistics: grep ipt_netflow /proc/slabinfo + To view boring slab statistics: grep ipt_netflow /proc/slabinfo 3. You can view parameters and control them via sysctl, example: - # sysctl -w net.netflow.hashsize=32768 -4. Example of directing all traffic into module: - # iptables -A FORWARD -j NETFLOW - # iptables -A INPUT -j NETFLOW - # iptables -A OUTPUT -j NETFLOW + # sysctl net.netflow + # sysctl net.netflow.hashsize=32768 + + Note: For after-reboot configuration I recommend to store module parameters + in modprobe configs instead of storing them in /etc/sysctl.conf, as it's + less clear when init process will apply sysctl.conf, before of after + module's load. + +4. Example of directing all IPv4 traffic into the module: + + # iptables -I FORWARD -j NETFLOW + # iptables -I INPUT -j NETFLOW + # iptables -I OUTPUT -j NETFLOW + + Note: It is preferable (because easier to understand) to _insert_ + NETFLOW target at the top of the chain, otherwise not all traffic may + reach NETFLOW if your iptables configuration is complicated and some + other rule inadvertently consume the traffic (dropping or acepting before + NETFLOW is reached). It's always good to test your configuration. + Use iptables -L -nvx to check pkts/bytes counters on the rules. + +5. If you want to account IPv6 traffic you should use protocol 9 or 10. + Example of directing all IPv6 traffic into the module: + # sysctl net.netflow.protocol=10 + # ip6tables -I FORWARD -j NETFLOW + # ip6tables -I INPUT -j NETFLOW + # ip6tables -I OUTPUT -j NETFLOW + + Note: First enable right version of protocol and after that add ip6tables + rules, otherwise you will get errors in dmesg. + +6. If you want to account NAT events (NEL): + + # sysctl net.netflow.natevents=1 + + Note that natevents feature is completely independent from traffic accounting + (it's using so called conntrack events), thus you don't need to set or change + any iptables rules to use that. You may need to enable kernel config option + CONFIG_NF_CONNTRACK_EVENTS though (if it isn't already enabled). + For details on how they are exported for different protocol versions see + below. =========== = OPTIONS = =========== + protocol=5 + - what version of NetFlow protocol to use. Default is 5. + You can choose from 5, 9, or 10 (where 10 is IPFIX). 
If you plan + to account IPv6 traffic you should use protocol 9 or 10 (IPFIX), + because NetFlow v5 isn't compatible with IPv6. + destination=127.0.0.1:2055 - where to export netflow, to this ip address You will see this connection in netstat like this: udp 0 0 127.0.0.1:32772 127.0.0.1:2055 ESTABLISHED destination=127.0.0.1:2055,192.0.0.1:2055 - - mirror flows to two (can be more) addresses, - separate addresses with comma. + - mirror flows to two (can be more) addresses, separate addresses + with comma. + + natevents=1 + - Collect and send NAT translation events as NetFlow Event Logging (NEL) + for NetFlow v9/IPFIX, or as dummy flows compatible with NetFlow v5. + Default is 0 (don't send). + + For NetFlow v5 protocol meaning of fields in dummy flows are such: + Src IP, Src Port is Pre-nat source address. + Dst IP, Dst Port is Post-nat destination address. + - These two fields made equal to data flows catched in FORWARD chain. + Nexthop, Src AS is Post-nat source address for SNAT. Or, + Nexthop, Dst AS is Pre-nat destination address for DNAT. + TCP Flags is SYN+SCK for start event, RST+FIN for stop event. + Pkt/Traffic size is 0 (zero), so it won't interfere with accounting. inactive_timeout=15 - export flow after it's inactive 15 seconds. Default value is 15. active_timeout=1800 - - export flow after it's active 1800 seconds (30 minutes). Default value is 1800. + - export flow after it's active 1800 seconds (30 minutes). Default valuae + is 1800. + + refresh-rate=20 + - for NetFlow v9 and IPFIX it's rate how frequently to re-send templates + (per packets). You probably don't need to change default (which is 20). + + timeout-rate=30 + - for NetFlow v9 and IPFIX it's rate when to re-send old templates (in + minutes). No need to change it. debug=0 - debug level (none). sndbuf=number - - size of output socket buffer in bytes. Recommend you to put + - size of output socket buffer in bytes. I recommend you to put higher value if you experience netflow packet drops (can be seen in statistics as 'sock: fail' number.) Default value is system default. hashsize=number - Hash table bucket size. Used for performance tuning. - Abstractly speaking, it should be two times bigger than flows + Abstractly speaking, it should be minimum two times bigger than flows you usually have, but not need to. Default is system memory dependent small enough value. maxflows=2000000 - - Maximum number of flows to account. It's here to prevent DOS attacks. After - this limit reached new flows will not be accounted. Default is + - Maximum number of flows to account. It's here to prevent DOS attacks. + After this limit reached new flows will not be accounted. Default is 2000000, zero is unlimited. aggregation=string.. @@ -130,14 +263,15 @@ Troubleshooting: = HOW TO READ STAT = ==================== - Statistics is your friend to fine tune and understand netflow module performance. + Statistics is your friend to fine tune and understand netflow module + performance. To see stat: # cat /proc/net/stat/ipt_netflow How to interpret the data: -> Flows: active 5187 (peak 83905 reached 0d0h1m ago, maxflows 2000000), mem 283K +> Flows: active 5187 (peak 83905 reached 0d0h1m ago, maxflows 2000000), mem 283K, worker delay 100/1000. active X: currently active flows in memory cache. - for optimum CPU performance it is recommended to set hash table size to @@ -146,8 +280,9 @@ Troubleshooting: mem XK: how much kilobytes of memory currently taken by active flows. - one active flow taking 56 bytes of memory. 
- there is system limit on cache size too. + worker delay X/HZ: how frequently exporter scan flows table per second. -> Hash: size 8192 (mem 32K), metric 1.0, 1.0, 1.0, 1.0. MemTraf: 1420 pkt, 364 K (pdu 0, 0). +> Hash: size 8192 (mem 32K), metric 1.00, [1.00, 1.00, 1.00]. MemTraf: 1420 pkt, 364 K (pdu 0, 0). Hash: size X: current hash size/limit. - you can control this by sysctl net.netflow.hashsize variable. @@ -156,18 +291,22 @@ Troubleshooting: - optimal value is twice of average of active flows. mem XK: how much memory occupied by hash table. - hash table is fixed size by nature, taking 4 bytes per entry. - metric X, X, X, X: how optimal is your hash table being used. + metric X, [X, X, X]: how optimal is your hash table being used. - lesser value mean more optimal hash table use, min is 1.0. - - this is moving average (EWMA) of hash table access divided - by match rate (searches / matches) for 4sec, and 1, 5, 15 minutes. - Sort of hash table load average. + - last three numbers in squares is moving average (EWMA) of hash table + access divided by match rate (searches / matches) for 4sec, and 1, 5, and + 15 minutes. Sort of hash table load average. First value is instantaneous. + You can try to increase hashsize if averages more than 1 (increase + certainly if >= 2). MemTraf: X pkt, X K: how much traffic accounted for flows that are in memory. - these flows that are residing in internal hash table. pdu X, X: how much traffic in flows preparing to be exported. - it is included already in aforementioned MemTraf total. -> Timeout: active 1800, inactive 15. Maxflows 2000000 +> Protocol version 10 (ipfix), refresh-rate 20, timeout-rate 30, (templates 2, active 2). Timeouts: active 5, inactive 15. Maxflows 2000000 + Protocol version currently in use. Refresh-rate and timeout-rate + for v9 and IPFIX. Total templates generated and currently active. Timeout: active X: how much seconds to wait before exporting active flow. - same as sysctl net.netflow.active_timeout variable. inactive X: how much seconds to wait before exporting inactive flow. @@ -180,20 +319,22 @@ Troubleshooting: - Module throughput values for 1 second, 1 minute, and 5 minutes. -> cpu# stat: , sock: , traffic: , drop: -> cpu0 stat: 980540 10473 180600, 0 0 0 0, sock: 4983 928 0, 7124 K, traffic: 188765, 14 MB, drop: 27863, 1142 K +> cpu# stat: , sock: , traffic: , drop: +> cpu0 stat: 980540 10473 180600 [1.03], 0 0 0 0, sock: 4983 928 0, 7124 K, traffic: 188765, 14 MB, drop: 27863, 1142 K cpu#: this is Total and per CPU statistics for: stat: : internal stat for: search found new: hash table searched, found, and not found counters. - trunc: how much truncated packets is ignored + [metric]: average hash metric since module load. + trunc: how much truncated packets are ignored - these are that possible don't have valid IP header. - accounted in drop packets counter but not in drop bytes. frag: how much fragmented packets have seen. - kernel always defragments INPUT/OUTPUT chains for us. - these packets are not ignored but not reassembled either, so: - - if there is no enough data in fragment (ex. tcp ports) it is considered zero. - alloc: how much cache memory allocations is failed. + - if there is no enough data in fragment (ex. tcp ports) it is considered + zero. + alloc: how much cache memory allocations are failed. - packets ignored and accounted in drop stat. - probably increase system memory if this ever happen. maxflows: how much packets ignored on maxflows (maximum active flows reached). 
@@ -203,7 +344,8 @@ Troubleshooting: sock: : table of exporting stats for: ok: how much Netflow PDUs are exported (i.e. UDP packets sent by module). fail: how much socket errors (i.e. packets failed to be sent). - - packets dropped and their internal statistics cumulatively accounted in drop stat. + - packets dropped and their internal statistics cumulatively accounted in + drop stat. cberr: how much connection refused ICMP errors we got from export target. - probably you not launched collector software on destination, - or specified wrong destination address. @@ -225,20 +367,34 @@ Troubleshooting: packet is for new flow but maxflows is already reached, all flows in export packets that got socket error. -> sock0: 10.0.0.2:2055, sndbuf 106496, filled 0, peak 106848; err: sndbuf reached 928, other 0 +> Natevents disabled, count start 0, stop 0. + + - Natevents mode disabled or enabled, and how much start or stop events + are reported. + +> sock0: 10.0.0.2:2055 unconnected (1 attempts). + + If socket is unconnected (for example if module loaded before interfaces is + up) it shows now much connection attempts was failed. It will try to connect + until success. + +> sock0: 10.0.0.2:2055, sndbuf 106496, filled 0, peak 106848; err: sndbuf reached 928, connect 0, other 0 sockX: per destination stats for: X.X.X.X:Y: destination ip address and port. - controlled by sysctl net.netflow.destination variable. sndbuf X: how much data socket can hold in buffers. - controlled by sysctl net.netflow.sndbuf variable. - - if you have packet drops due to sndbuf reached (error -11) increase this value. + - if you have packet drops due to sndbuf reached (error -11) increase this + value. filled X: how much data in socket buffers right now. peak X: peak value of how much data in socket buffers was. - you will be interested to keep it below sndbuf value. err: how much packets are dropped due to errors. - all flows from them will be accounted in drop stat. - sndbuf reached X: how much packets dropped due to sndbuf being too small (error -11). + sndbuf reached X: how much packets dropped due to sndbuf being too small + (error -11). + connect X: how much connection attempts was failed. other X: dropped due to other possible errors. > aggr0: ... diff --git a/README.promisc b/README.promisc index 60ca922..31d774f 100644 --- a/README.promisc +++ b/README.promisc @@ -2,9 +2,14 @@ Hello, If you wish to account with netflow module traffic mirrored on switch you may follow this example: -************** -* Solution 1 * -************** + + Solution 1: General kernel patch. + Solution 2: Alternative w/o kernel patch. + + + ************** + * Solution 1 * + ************** 1. Patch your kernel with `raw_promisc.patch' to enable raw table to see promisc traffic. @@ -33,17 +38,7 @@ If you wish to account with netflow module traffic mirrored on switch you may fo # /sbin/vconfig add eth1 47 # /sbin/ifconfig eth1.47 up -5. Recompile ipt_netflow module with #define RAW_PROMISC_HACK uncommented: - - Find this line in ipt_NETFLOW.c (should be line 7): - -//#define RAW_PROMISC_HACK - - And remove two slashes at beginning of the line, so it become like this: - -#define RAW_PROMISC_HACK - - Re-compile module: +5. Compile module: # make clean all install @@ -55,13 +50,14 @@ If you wish to account with netflow module traffic mirrored on switch you may fo # /sbin/iptables -A PREROUTING -t raw -i eth1.47 -j NETFLOW - Voila. +ps. 
For Debian Squeeze instructions look at raw_promisc_debian_squeeze6.patch -************** -* Solution 2 * -************** + + ************** + * Solution 2 * + ************** By Anonymous. @@ -81,4 +77,3 @@ Sometimes you may need to run: for this scheme to work. - diff --git a/configure b/configure index 677dd7f..3f10e2a 100755 --- a/configure +++ b/configure @@ -3,7 +3,7 @@ PATH=$PATH:/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/sbin error() { - echo "! Error: $@" + echo -e "! Error: $@" exit 1 } @@ -56,19 +56,20 @@ get_lib_from_lib() { } iptables_inc() { - echo -n "Iptables include path: " + echo -n "Iptables include flags: " if [ "$IPTINC" ]; then - echo "$IPTINC (user specified)" IPTINC="-I$IPTINC" + echo "$IPTINC (user specified)" + elif [ "$PKGVER" ]; then + IPTINC="$PKGINC" + echo "$IPTINC (pkg-config)" + elif [ "$NOIPTSRC" ]; then + IPTINC= + echo "none (default)" else - if [ "$PKGINC" ]; then - IPTINC="$PKGINC" - echo "$IPTINC (pkg-config)" - else - IPTINC="$IPTSRC/include" - echo "$IPTINC (from source)" - IPTINC="-I$IPTINC" - fi + IPTINC="$IPTSRC/include" + IPTINC="-I$IPTINC" + echo "$IPTINC (from source)" fi } @@ -109,7 +110,16 @@ try_dir2() { test -d "$1" && try_dir `dirname $1` && return 0 } -iptables_ver() { +check_pkg_config() { + test "$PKGWARN" && return 1 + if ! which pkg-config >/dev/null 2>&1; then + echo "! You don't have pkg-config, it may be useful to install it." + PKGWARN=1 + return 1 + fi + return 0 +} +iptables_find_version() { echo -n "Iptables binary version: " if [ "$IPTVER" ]; then echo "$IPTVER (user specified)" @@ -121,6 +131,7 @@ iptables_ver() { else echo "no iptables binary found" fi + check_pkg_config PKGVER=`pkg-config --modversion xtables 2>/dev/null` if [ "$PKGVER" ]; then IPTVER="$PKGVER" @@ -131,44 +142,90 @@ iptables_ver() { fi } -iptables_dir() { - test "$IPTINC" && return 1 - test "$PKGINC" && return 1 - - VER="iptables-$IPTVER" - if [ "$IPTSRC" ]; then - echo "User specified source directory: $IPTSRC" - try_dir $IPTSRC || error "Specified directory is not iptables source.." +compile_libitp_test() { + echo -n "Checking for presence of $@... " + echo " +#define __EXPORTED_HEADERS__ +#include <$*>" > test.c + gcc -c test.c >/dev/null 2>&1 + RET=$? + if [ $RET = 0 ]; then + echo Yes; else - echo "Searching for $VER sources.." - try_dir "./$VER" && return 0 - try_dir "../$VER" && return 0 - try_dir "/usr/src/$VER" && return 0 - try_dirg "iptables" && return 0 - try_dirg "../iptables" && return 0 - try_dirg "/usr/src/iptables" && return 0 - try_dir2 `locate $VER/extensions | head -1` && return 0 - error "Can not find iptables source directory, try setting it with --ipt-src=" + echo No; fi + rm -f test.c test.o + return $RET } -iptables_pkg_config() { +iptables_try_pkgconfig() { if [ ! "$PKGVER" ]; then + check_pkg_config + PKGVER=`pkg-config --modversion xtables 2>/dev/null` + TRYPKGVER=`pkg-config --modversion xtables 2>/dev/null` echo -n "pkg-config for version $IPTVER exists: " - PKGVER=`pkg-config --exact-version=$IPTVER --modversion xtables 2>/dev/null` + pkg-config --exact-version=$IPTVER xtables 2>/dev/null if [ $? 
= 0 ]; then echo "Yes" + PKGVER=$TRYPKGVER else - echo "No (reported: $PKGVER)" - unset PKGVER + if [ "$TRYPKGVER" ]; then + echo "No (reported: $TRYPKGVER)" + else + echo "No" + fi fi fi if [ "$PKGVER" ]; then + check_pkg_config + PKGVER=`pkg-config --modversion xtables 2>/dev/null` PKGINC=`pkg-config --cflags xtables` PKGLIB=`pkg-config --variable=xtlibdir xtables` - IPTCFLAGS="-DXTABLES" else - IPTCFLAGS="-DIPTABLES_VERSION=\\\\\"$IPTVER\\\\\"" + # Newer versions of iptables should not have -I/kernel/include! + # So I assume that newer version will have correct pkg-config set up + # and if not, then it's older who need it. + IPTCFLAGS="-I$KDIR/include -DIPTABLES_VERSION=\\\\\"$IPTVER\\\\\"" + fi + if compile_libitp_test xtables.h; then + IPTCFLAGS="-DXTABLES $IPTCFLAGS" + elif ! compile_libitp_test iptables.h; then + echo "! Iptables headers not found. You may need to specify --ipt-inc=..." + if [ -s /etc/debian_version ]; then + echo "! " + echo "! Under Debian simply run this:" + echo "! root# apt-get install iptables-dev pkg-config" + elif [ -s /etc/redhat-release ]; then + echo "! " + arch=.`uname -m` + echo "! Under Centos simply run this:" + echo "! root# yum install iptables-devel$arch pkgconfig" + fi + exit 1 + fi + +} + +iptables_find_src() { + test "$IPTINC" && return 1 + test "$PKGVER" && return 1 + + VER="iptables-$IPTVER" + if [ "$IPTSRC" ]; then + echo "User specified source directory: $IPTSRC" + try_dir $IPTSRC || error "Specified directory is not iptables source.." + else + echo "Searching for $VER sources.." + try_dir "./$VER" && return 0 + try_dir "../$VER" && return 0 + try_dir "/usr/src/$VER" && return 0 + try_dirg "iptables" && return 0 + try_dirg "../iptables" && return 0 + try_dirg "/usr/src/iptables" && return 0 + try_dir2 `locate $VER/extensions 2>/dev/null | head -1` && return 0 + echo "! Can not find iptables source directory, you may try setting it with --ipt-src=" + echo "! This is not fatal error, yet. Will be just using default include dir." + NOIPTSRC=1 fi } @@ -206,18 +263,110 @@ do esac done -test "$KVERSION" || KVERSION=`uname -r` -echo Kernel version: $KVERSION +kernel_find_version() { + KHOW=requested + test "$KVERSION" && return 0 + + if grep -q '#.*Debian' /proc/version; then + KHOW=proc + KVERSION=`sed -n 's/.*#.*Debian \([0-9\.]\+\)-.*/\1/p' /proc/version` + KLIBMOD=`uname -r` + else + KHOW=uname + KVERSION=`uname -r` + fi + test "$KDIR" || return 0 + + test -s $KDIR/Makefile || return 1 + test -s $KDIR/include/config/kernel.release || return 1 + KVERSION=`cat $KDIR/include/config/kernel.release` + KHOW=sources +} + +kernel_check_src() { + if [ -s "$1/Makefile" ]; then + KDIR="$1" + return 0 + fi + return 1 +} + +kernel_find_source() { + KSHOW=requested + test "$KDIR" && return 0 + KSHOW=found + kernel_check_src /lib/modules/$KLIBMOD/build && return 0 + kernel_check_src /lib/modules/$KVERSION/build && return 0 + kernel_check_src /usr/src/kernels/$KVERSION && return 0 + kernel_check_src /usr/src/linux-$KVERSION && return 0 + echo "! Linux source not found. Don't panic. You may specify kernel source" + echo "! directory with --kdir=..., or try to install kernel-devel package," + echo "! or just raw sources for linux-$KVERSION from kernel.org." + if grep -q -i centos /proc/version 2>/dev/null; then + echo "! " + arch=.`uname -m` + echo "! Under Centos simply run this:" + echo "! root# yum install kernel-devel iptables-devel$arch pkgconfig" + fi + if grep -q -i debian /proc/version 2>/dev/null; then + echo "! " + echo "! 
Under Debian simply run this:" + echo "! root# apt-get install module-assistant iptables-dev pkg-config" + echo "! root# m-a prepare" + fi + exit 1 +} + +kernel_check_consistency() { + if test -s $KDIR/include/config/kernel.release; then + SRCVER=`cat $KDIR/include/config/kernel.release` + test "$KVERSION" != "$SRCVER" && error "$KHOW kernel version ($KVERSION) and $KSHOW version of kernel source ($SRCVER) doesn't match!\n!" \ + "You may try to specify only kernel source tree with --kdir=$KDIR\n!" \ + "and configure will pick up version properly." + else + test -e "$KDIR/.config" || error ".config in kernel source not found, run make menuconfig in $KDIR" + test -d "$KDIR/include/config" || error "kernel is not prepared, run make prepare modules_prepare in $KDIR" + fi +} + +kconfig() { + KCONFIG=$KDIR/.config + if ! grep -q "^$1=" $KCONFIG 2>/dev/null; then + if [ "$KCONFIGREPORTED" != true ]; then + KCONFIGREPORTED=true + echo Kernel config file checked: $KCONFIG + echo + fi + echo "! Attention: $1 is undefined in your kernel configuration" + echo "! Without this option enabled $2 will not work." + echo + fi +} + +kernel_check_config() { + kconfig CONFIG_SYSCTL "sysctl interface" + kconfig CONFIG_PROC_FS "proc interface" + kconfig CONFIG_NF_NAT_NEEDED "natevents" + kconfig CONFIG_NF_CONNTRACK_EVENTS "natevents" + kconfig CONFIG_NF_CONNTRACK_MARK "connmark tracking" + kconfig CONFIG_IPV6 "IPv6" + kconfig CONFIG_IP6_NF_IPTABLES "ip6tables target" +} -test "$KDIR" || KDIR=/lib/modules/$KVERSION/build -echo Kernel sources: $KDIR +kernel_find_version #KVERSION +test "$KLIBMOD" || KLIBMOD=$KVERSION +echo "Kernel version: $KVERSION ($KHOW)" +kernel_find_source #KDIR +echo "Kernel sources: $KDIR ($KSHOW)" +kernel_check_consistency +kernel_check_config test "$IPTBIN" || IPTBIN=`which iptables` -iptables_ver #IPTVER -iptables_pkg_config -iptables_dir #IPTSRC -iptables_src_version #check IPTSRC match to IPTVER +iptables_find_version #IPTVER +iptables_try_pkgconfig #try to configure from pkg-config +iptables_find_src #IPTSRC +iptables_src_version #check that IPTSRC match to IPTVER iptables_inc #IPTINC iptables_modules #IPTLIB @@ -225,7 +374,6 @@ REPLACE="\ s!@KVERSION@!$KVERSION!;\ s!@KDIR@!$KDIR!;\ s!@IPTABLES_VERSION@!$IPTVER!;\ -s!@IPTABLES_INCLUDES@!$IPTINC!;\ s!@IPTABLES_CFLAGS@!$IPTCFLAGS $IPTINC!;\ s!@IPTABLES_MODULES@!$IPTLIB!" diff --git a/ipt_NETFLOW.c b/ipt_NETFLOW.c index d4c91e1..ad974c5 100644 --- a/ipt_NETFLOW.c +++ b/ipt_NETFLOW.c @@ -1,6 +1,6 @@ /* * This is NetFlow exporting module (NETFLOW target) for linux - * (c) 2008-2012 + * (c) 2008-2013 * * * This program is free software: you can redistribute it and/or modify @@ -18,8 +18,6 @@ * */ -//#define RAW_PROMISC_HACK - #include #include #include @@ -31,16 +29,26 @@ #include #include #include -#include +#include +#include +#include #include #include +#include #include #include +#include #include #include +#if defined(CONFIG_NF_NAT_NEEDED) || defined(CONFIG_NF_CONNTRACK_MARK) +#include +#include +#include +#endif #include #include #include "ipt_NETFLOW.h" +#include "murmur3.h" #ifdef CONFIG_BRIDGE_NETFILTER #include #endif @@ -74,41 +82,66 @@ #define ipt_target xt_target #endif -#define IPT_NETFLOW_VERSION "1.8" +#define IPT_NETFLOW_VERSION "1.8.2" /* Note that if you are using git, you + will see version in other format. 
*/ +#include "version.h" +#ifdef GITVERSION +#undef IPT_NETFLOW_VERSION +#define IPT_NETFLOW_VERSION GITVERSION +#endif MODULE_LICENSE("GPL"); MODULE_AUTHOR(""); MODULE_DESCRIPTION("iptables NETFLOW target module"); MODULE_VERSION(IPT_NETFLOW_VERSION); +MODULE_ALIAS("ip6t_NETFLOW"); #define DST_SIZE 256 static char destination_buf[DST_SIZE] = "127.0.0.1:2055"; static char *destination = destination_buf; -module_param(destination, charp, 0400); +module_param(destination, charp, 0444); MODULE_PARM_DESC(destination, "export destination ipaddress:port"); static int inactive_timeout = 15; -module_param(inactive_timeout, int, 0600); +module_param(inactive_timeout, int, 0644); MODULE_PARM_DESC(inactive_timeout, "inactive flows timeout in seconds"); static int active_timeout = 30 * 60; -module_param(active_timeout, int, 0600); +module_param(active_timeout, int, 0644); MODULE_PARM_DESC(active_timeout, "active flows timeout in seconds"); static int debug = 0; -module_param(debug, int, 0600); +module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug verbosity level"); static int sndbuf; -module_param(sndbuf, int, 0400); +module_param(sndbuf, int, 0444); MODULE_PARM_DESC(sndbuf, "udp socket SNDBUF size"); +static int protocol = 5; +module_param(protocol, int, 0444); +MODULE_PARM_DESC(protocol, "netflow protocol version (5, 9, 10)"); + +static unsigned int refresh_rate = 20; +module_param(refresh_rate, uint, 0644); +MODULE_PARM_DESC(refresh_rate, "NetFlow v9/IPFIX refresh rate (packets)"); + +static unsigned int timeout_rate = 30; +module_param(timeout_rate, uint, 0644); +MODULE_PARM_DESC(timeout_rate, "NetFlow v9/IPFIX timeout rate (minutes)"); + +#ifdef CONFIG_NF_NAT_NEEDED +static int natevents = 0; +module_param(natevents, int, 0444); +MODULE_PARM_DESC(natevents, "send NAT Events"); +#endif + static int hashsize; -module_param(hashsize, int, 0400); +module_param(hashsize, int, 0444); MODULE_PARM_DESC(hashsize, "hash table size"); static int maxflows = 2000000; -module_param(maxflows, int, 0600); +module_param(maxflows, int, 0644); MODULE_PARM_DESC(maxflows, "maximum number of flows"); static int peakflows = 0; static unsigned long peakflows_at; @@ -121,22 +154,52 @@ MODULE_PARM_DESC(aggregation, "aggregation ruleset"); static DEFINE_PER_CPU(struct ipt_netflow_stat, ipt_netflow_stat); static LIST_HEAD(usock_list); -static DEFINE_RWLOCK(sock_lock); +static DEFINE_MUTEX(sock_lock); +#define LOCK_COUNT (1<<8) +#define LOCK_COUNT_MASK (LOCK_COUNT-1) +static spinlock_t htable_locks[LOCK_COUNT] = { + [0 ... 
LOCK_COUNT - 1] = __SPIN_LOCK_UNLOCKED(htable_locks) +}; +static DEFINE_RWLOCK(htable_rwlock); /* global lock to protect htable_locks change */ static unsigned int ipt_netflow_hash_rnd; -struct hlist_head *ipt_netflow_hash __read_mostly; /* hash table memory */ +static struct hlist_head *ipt_netflow_hash __read_mostly; /* hash table memory */ static unsigned int ipt_netflow_hash_size __read_mostly = 0; /* buckets */ static LIST_HEAD(ipt_netflow_list); /* all flows */ +static DEFINE_SPINLOCK(hlist_lock); /* should almost always be locked w/o _bh */ static LIST_HEAD(aggr_n_list); static LIST_HEAD(aggr_p_list); static DEFINE_RWLOCK(aggr_lock); +#ifdef CONFIG_NF_NAT_NEEDED +static LIST_HEAD(nat_list); /* nat events */ +static DEFINE_SPINLOCK(nat_lock); +static unsigned long nat_events_start = 0; +static unsigned long nat_events_stop = 0; +#endif static struct kmem_cache *ipt_netflow_cachep __read_mostly; /* ipt_netflow memory */ static atomic_t ipt_netflow_count = ATOMIC_INIT(0); -static DEFINE_SPINLOCK(ipt_netflow_lock); /* hash table lock */ -static long long pdu_packets = 0, pdu_traf = 0; -static struct netflow5_pdu pdu; -static unsigned long pdu_ts_mod; +static long long pdu_packets = 0, pdu_traf = 0; /* how much accounted traffic in pdu */ +static unsigned int pdu_count = 0; +static unsigned int pdu_seq = 0; +static unsigned int pdu_data_records = 0; +static unsigned int pdu_tpl_records = 0; +static unsigned long pdu_ts_mod; /* ts of last flow */ +static union { + struct netflow5_pdu v5; + struct netflow9_pdu v9; + struct ipfix_pdu ipfix; +} pdu; +static int engine_id = 0; /* Observation Domain */ +static __u8 *pdu_data_used; +static __u8 *pdu_high_wm; /* high watermark */ +static unsigned int pdu_max_size; /* sizeof pdu */ +static struct flowset_data *pdu_flowset = NULL; /* current data flowset */ + +static void (*netflow_export_flow)(struct ipt_netflow *nf); +static void (*netflow_export_pdu)(void); /* called on timeout */ +static void netflow_switch_version(int ver); + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) static void netflow_work_fn(void *work); static DECLARE_WORK(netflow_work, netflow_work_fn, NULL); @@ -146,19 +209,26 @@ static DECLARE_DELAYED_WORK(netflow_work, netflow_work_fn); #endif static struct timer_list rate_timer; +#define TCP_SYN_ACK 0x12 #define TCP_FIN_RST 0x05 static long long sec_prate = 0, sec_brate = 0; static long long min_prate = 0, min_brate = 0; static long long min5_prate = 0, min5_brate = 0; -static unsigned int metric = 10, min15_metric = 10, min5_metric = 10, min_metric = 10; /* hash metrics */ +static unsigned int metric = 100, min15_metric = 100, min5_metric = 100, min_metric = 100; /* hash metrics */ static int set_hashsize(int new_size); static void destination_removeall(void); static int add_destinations(char *ptr); static void aggregation_remove(struct list_head *list); static int add_aggregation(char *ptr); -static void netflow_scan_and_export(int flush); +static int netflow_scan_and_export(int flush); +enum { + DONT_FLUSH, AND_FLUSH +}; +static int template_ids = FLOWSET_DATA_FIRST; +static int tpl_count = 0; /* how much active templates */ + static inline __be32 bits2mask(int bits) { return (bits? 
0xffffffff << (32 - bits) : 0); @@ -175,28 +245,46 @@ static inline int mask2bits(__be32 mask) { /* under that lock worker is always stopped and not rescheduled, * and we can call worker sub-functions manually */ static DEFINE_MUTEX(worker_lock); -static inline void __start_scan_worker(void) +#define MIN_DELAY 1 +#define MAX_DELAY (HZ / 10) +static int worker_delay = HZ / 10; +static inline void _schedule_scan_worker(const int status) { - schedule_delayed_work(&netflow_work, HZ / 10); + /* rudimentary congestion avoidance */ + if (status > 0) + worker_delay -= status; + else if (status < 0) + worker_delay /= 2; + else + worker_delay++; + if (worker_delay < MIN_DELAY) + worker_delay = MIN_DELAY; + else if (worker_delay > MAX_DELAY) + worker_delay = MAX_DELAY; + schedule_delayed_work(&netflow_work, worker_delay); } -static inline void start_scan_worker(void) +/* This is only called soon after pause_scan_worker. */ +static inline void cont_scan_worker(void) { - __start_scan_worker(); + _schedule_scan_worker(0); mutex_unlock(&worker_lock); } -/* we always stop scanner before write_lock(&sock_lock) - * to let it never hold that spin lock */ -static inline void __stop_scan_worker(void) +static inline void _unschedule_scan_worker(void) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) + cancel_rearming_delayed_work(&netflow_work); +#else cancel_delayed_work_sync(&netflow_work); +#endif } -static inline void stop_scan_worker(void) +/* This is only used for quick pause (in procctl). */ +static inline void pause_scan_worker(void) { mutex_lock(&worker_lock); - __stop_scan_worker(); + _unschedule_scan_worker(); } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) @@ -223,11 +311,14 @@ static int nf_seq_show(struct seq_file *seq, void *v) int snum = 0; int peak = (jiffies - peakflows_at) / HZ; - seq_printf(seq, "Flows: active %u (peak %u reached %ud%uh%um ago), mem %uK\n", + seq_printf(seq, "ipt_NETFLOW version " IPT_NETFLOW_VERSION ", srcversion %s\n", + THIS_MODULE->srcversion); + seq_printf(seq, "Flows: active %u (peak %u reached %ud%uh%um ago), mem %uK, worker delay %d/%d.\n", nr_flows, peakflows, peak / (60 * 60 * 24), (peak / (60 * 60)) % 24, (peak / 60) % 60, - (unsigned int)((nr_flows * sizeof(struct ipt_netflow)) >> 10)); + (unsigned int)((nr_flows * sizeof(struct ipt_netflow)) >> 10), + worker_delay, HZ); for_each_present_cpu(cpu) { struct ipt_netflow_stat *st = &per_cpu(ipt_netflow_stat, cpu); @@ -252,93 +343,123 @@ static int nf_seq_show(struct seq_file *seq, void *v) } #define FFLOAT(x, prec) (int)(x) / prec, (int)(x) % prec - seq_printf(seq, "Hash: size %u (mem %uK), metric %d.%d, %d.%d, %d.%d, %d.%d. MemTraf: %llu pkt, %llu K (pdu %llu, %llu).\n", - ipt_netflow_hash_size, - (unsigned int)((ipt_netflow_hash_size * sizeof(struct hlist_head)) >> 10), - FFLOAT(metric, 10), - FFLOAT(min_metric, 10), - FFLOAT(min5_metric, 10), - FFLOAT(min15_metric, 10), - pkt_total - pkt_out + pdu_packets, - (traf_total - traf_out + pdu_traf) >> 10, - pdu_packets, - pdu_traf); - - seq_printf(seq, "Timeout: active %d, inactive %d. 
Maxflows %u\n", - active_timeout, - inactive_timeout, - maxflows); - - seq_printf(seq, "Rate: %llu bits/sec, %llu packets/sec; Avg 1 min: %llu bps, %llu pps; 5 min: %llu bps, %llu pps\n", - sec_brate, sec_prate, min_brate, min_prate, min5_brate, min5_prate); - - seq_printf(seq, "cpu# stat: , sock: , traffic: , drop: \n"); - - seq_printf(seq, "Total stat: %6llu %6llu %6llu, %4u %4u %4u %4u, sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", - (unsigned long long)searched, - (unsigned long long)found, - (unsigned long long)notfound, - truncated, frags, alloc_err, maxflows_err, - send_success, send_failed, sock_errors, - (unsigned long long)exported_size >> 10, - (unsigned long long)pkt_total, (unsigned long long)traf_total >> 20, - (unsigned long long)pkt_drop, (unsigned long long)traf_drop >> 10); + seq_printf(seq, "Hash: size %u (mem %uK), metric %d.%02d [%d.%02d, %d.%02d, %d.%02d]." + " MemTraf: %llu pkt, %llu K (pdu %llu, %llu), Out %llu pkt, %llu K.\n", + ipt_netflow_hash_size, + (unsigned int)((ipt_netflow_hash_size * sizeof(struct hlist_head)) >> 10), + FFLOAT(metric, 100), + FFLOAT(min_metric, 100), + FFLOAT(min5_metric, 100), + FFLOAT(min15_metric, 100), + pkt_total - pkt_out + pdu_packets, + (traf_total - traf_out + pdu_traf) >> 10, + pdu_packets, + pdu_traf, + pkt_out, + traf_out >> 10); + + seq_printf(seq, "Rate: %llu bits/sec, %llu packets/sec;" + " Avg 1 min: %llu bps, %llu pps; 5 min: %llu bps, %llu pps\n", + sec_brate, sec_prate, min_brate, min_prate, min5_brate, min5_prate); + + seq_printf(seq, "cpu# stat: ," + " sock: , traffic: , drop: \n"); + +#define SAFEDIV(x,y) ((y)? ({ u64 __tmp = x; do_div(__tmp, y); (int)__tmp; }) : 0) + seq_printf(seq, "Total stat: %6llu %6llu %6llu [%d.%02d], %4u %4u %4u %4u," + " sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", + searched, + (unsigned long long)found, + (unsigned long long)notfound, + FFLOAT(SAFEDIV(100LL * (searched + found + notfound), (found + notfound)), 100), + truncated, frags, alloc_err, maxflows_err, + send_success, send_failed, sock_errors, + (unsigned long long)exported_size >> 10, + (unsigned long long)pkt_total, (unsigned long long)traf_total >> 20, + (unsigned long long)pkt_drop, (unsigned long long)traf_drop >> 10); if (num_present_cpus() > 1) { for_each_present_cpu(cpu) { struct ipt_netflow_stat *st; st = &per_cpu(ipt_netflow_stat, cpu); - seq_printf(seq, "cpu%u stat: %6llu %6llu %6llu, %4u %4u %4u %4u, sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", - cpu, - (unsigned long long)st->searched, - (unsigned long long)st->found, - (unsigned long long)st->notfound, - st->truncated, st->frags, st->alloc_err, st->maxflows_err, - st->send_success, st->send_failed, st->sock_errors, - (unsigned long long)st->exported_size >> 10, - (unsigned long long)st->pkt_total, (unsigned long long)st->traf_total >> 20, - (unsigned long long)st->pkt_drop, (unsigned long long)st->traf_drop >> 10); + seq_printf(seq, "cpu%u stat: %6llu %6llu %6llu [%d.%02d], %4u %4u %4u %4u," + " sock: %6u %u %u, %llu K, traffic: %llu, %llu MB, drop: %llu, %llu K\n", + cpu, + (unsigned long long)st->searched, + (unsigned long long)st->found, + (unsigned long long)st->notfound, + FFLOAT(SAFEDIV(100LL * (st->searched + st->found + st->notfound), (st->found + st->notfound)), 100), + st->truncated, st->frags, st->alloc_err, st->maxflows_err, + st->send_success, st->send_failed, st->sock_errors, + (unsigned long long)st->exported_size >> 10, + (unsigned long long)st->pkt_total, (unsigned long 
long)st->traf_total >> 20, + (unsigned long long)st->pkt_drop, (unsigned long long)st->traf_drop >> 10); } } - read_lock(&sock_lock); + seq_printf(seq, "Protocol version %d", protocol); + if (protocol == 10) + seq_printf(seq, " (ipfix)"); + else + seq_printf(seq, " (netflow)"); + if (protocol >= 9) + seq_printf(seq, ", refresh-rate %u, timeout-rate %u, (templates %d, active %d)", + refresh_rate, timeout_rate, template_ids - FLOWSET_DATA_FIRST, tpl_count); + + seq_printf(seq, ". Timeouts: active %d, inactive %d. Maxflows %u\n", + active_timeout, + inactive_timeout, + maxflows); + +#ifdef CONFIG_NF_NAT_NEEDED + seq_printf(seq, "Natevents %s, count start %lu, stop %lu.\n", natevents? "enabled" : "disabled", + nat_events_start, nat_events_stop); +#endif + + mutex_lock(&sock_lock); list_for_each_entry(usock, &usock_list, list) { - struct sock *sk = usock->sock->sk; - - seq_printf(seq, "sock%d: %u.%u.%u.%u:%u, sndbuf %u, filled %u, peak %u; err: sndbuf reached %u, other %u\n", - snum, - usock->ipaddr >> 24, - (usock->ipaddr >> 16) & 255, - (usock->ipaddr >> 8) & 255, - usock->ipaddr & 255, - usock->port, - sk->sk_sndbuf, - atomic_read(&sk->sk_wmem_alloc), - atomic_read(&usock->wmem_peak), - atomic_read(&usock->err_full), - atomic_read(&usock->err_other)); + seq_printf(seq, "sock%d: %u.%u.%u.%u:%u", + snum, + HIPQUAD(usock->ipaddr), + usock->port); + if (usock->sock) { + struct sock *sk = usock->sock->sk; + + seq_printf(seq, ", sndbuf %u, filled %u, peak %u;" + " err: sndbuf reached %u, connect %u, other %u\n", + sk->sk_sndbuf, + atomic_read(&sk->sk_wmem_alloc), + atomic_read(&usock->wmem_peak), + atomic_read(&usock->err_full), + atomic_read(&usock->err_connect), + atomic_read(&usock->err_other)); + } else + seq_printf(seq, " unconnected (%u attempts).\n", + atomic_read(&usock->err_connect)); snum++; } - read_unlock(&sock_lock); + mutex_unlock(&sock_lock); read_lock_bh(&aggr_lock); snum = 0; list_for_each_entry(aggr_n, &aggr_n_list, list) { - seq_printf(seq, "aggr#%d net: match %u.%u.%u.%u/%d strip %d\n", - snum, - HIPQUAD(aggr_n->addr), - mask2bits(aggr_n->mask), - mask2bits(aggr_n->aggr_mask)); + seq_printf(seq, "aggr#%d net: match %u.%u.%u.%u/%d strip %d (usage %u)\n", + snum, + HIPQUAD(aggr_n->addr), + mask2bits(aggr_n->mask), + mask2bits(aggr_n->aggr_mask), + atomic_read(&aggr_n->usage)); snum++; } snum = 0; list_for_each_entry(aggr_p, &aggr_p_list, list) { - seq_printf(seq, "aggr#%d port: ports %u-%u replace %u\n", - snum, - aggr_p->port1, - aggr_p->port2, - aggr_p->aggr_port); + seq_printf(seq, "aggr#%d port: ports %u-%u replace %u (usage %u)\n", + snum, + aggr_p->port1, + aggr_p->port2, + aggr_p->aggr_port, + atomic_read(&aggr_p->usage)); snum++; } read_unlock_bh(&aggr_lock); @@ -367,8 +488,13 @@ static struct file_operations nf_seq_fops = { #define BEFORE2632(x,y) #endif +/* PAX need to know that we are allowed to write */ +#ifndef CONSTIFY_PLUGIN +#define ctl_table_no_const ctl_table +#endif + /* sysctl /proc/sys/net/netflow */ -static int hsize_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) +static int hsize_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) void __user *buffer, size_t *lenp, loff_t *fpos) { void *orig = ctl->data; @@ -386,20 +512,21 @@ static int hsize_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp return ret; } -static int sndbuf_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) +static int sndbuf_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) void __user *buffer, 
size_t *lenp, loff_t *fpos) { int ret; struct ipt_netflow_sock *usock; - - read_lock(&sock_lock); + + mutex_lock(&sock_lock); if (list_empty(&usock_list)) { - read_unlock(&sock_lock); + mutex_unlock(&sock_lock); return -ENOENT; } usock = list_first_entry(&usock_list, struct ipt_netflow_sock, list); - sndbuf = usock->sock->sk->sk_sndbuf; - read_unlock(&sock_lock); + if (usock->sock) + sndbuf = usock->sock->sk->sk_sndbuf; + mutex_unlock(&sock_lock); ctl->data = &sndbuf; ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); @@ -407,13 +534,14 @@ static int sndbuf_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *fil return ret; if (sndbuf < SOCK_MIN_SNDBUF) sndbuf = SOCK_MIN_SNDBUF; - stop_scan_worker(); - write_lock(&sock_lock); + pause_scan_worker(); + mutex_lock(&sock_lock); list_for_each_entry(usock, &usock_list, list) { - usock->sock->sk->sk_sndbuf = sndbuf; + if (usock->sock) + usock->sock->sk->sk_sndbuf = sndbuf; } - write_unlock(&sock_lock); - start_scan_worker(); + mutex_unlock(&sock_lock); + cont_scan_worker(); return ret; } @@ -424,10 +552,10 @@ static int destination_procctl(ctl_table *ctl, int write, BEFORE2632(struct file ret = proc_dostring(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); if (ret >= 0 && write) { - stop_scan_worker(); + pause_scan_worker(); destination_removeall(); add_destinations(destination_buf); - start_scan_worker(); + cont_scan_worker(); } return ret; } @@ -446,13 +574,12 @@ static int aggregation_procctl(ctl_table *ctl, int write, BEFORE2632(struct file return ret; } -static int flush_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) +static int flush_procctl(ctl_table_no_const *ctl, int write, BEFORE2632(struct file *filp,) void __user *buffer, size_t *lenp, loff_t *fpos) { int ret; - int val; + int val = 0; - val = 0; ctl->data = &val; ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); @@ -461,14 +588,67 @@ static int flush_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp if (val > 0) { printk(KERN_INFO "ipt_NETFLOW: forced flush\n"); - stop_scan_worker(); - netflow_scan_and_export(1); - start_scan_worker(); + pause_scan_worker(); + netflow_scan_and_export(AND_FLUSH); + cont_scan_worker(); + } + + return ret; +} + +static int protocol_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) + void __user *buffer, size_t *lenp, loff_t *fpos) +{ + int ret; + int ver = protocol; + + ctl->data = &ver; + ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); + + if (!write) + return ret; + + switch (ver) { + case 5: + case 9: + case 10: + printk(KERN_INFO "ipt_NETFLOW: forced flush (protocol version change)\n"); + pause_scan_worker(); + netflow_scan_and_export(AND_FLUSH); + netflow_switch_version(ver); + cont_scan_worker(); + break; + default: + return -EPERM; } return ret; } +#ifdef CONFIG_NF_NAT_NEEDED +static void register_ct_events(void); +static void unregister_ct_events(void); +static int natevents_procctl(ctl_table *ctl, int write, BEFORE2632(struct file *filp,) + void __user *buffer, size_t *lenp, loff_t *fpos) +{ + int ret; + int val = natevents; + + ctl->data = &val; + ret = proc_dointvec(ctl, write, BEFORE2632(filp,) buffer, lenp, fpos); + + if (!write) + return ret; + + if (natevents && !val) + unregister_ct_events(); + else if (!natevents && val) + register_ct_events(); + + return ret; +} +#endif + static struct ctl_table_header *netflow_sysctl_header; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) @@ -547,6 +727,38 @@ static struct 
ctl_table netflow_sysctl_table[] = { .maxlen = sizeof(int), .proc_handler = &flush_procctl, }, + { + _CTL_NAME(10) + .procname = "protocol", + .mode = 0644, + .maxlen = sizeof(int), + .proc_handler = &protocol_procctl, + }, + { + _CTL_NAME(11) + .procname = "refresh-rate", + .mode = 0644, + .data = &refresh_rate, + .maxlen = sizeof(int), + .proc_handler = &proc_dointvec, + }, + { + _CTL_NAME(12) + .procname = "timeout-rate", + .mode = 0644, + .data = &timeout_rate, + .maxlen = sizeof(int), + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_NF_NAT_NEEDED + { + _CTL_NAME(13) + .procname = "natevents", + .mode = 0644, + .maxlen = sizeof(int), + .proc_handler = &natevents_procctl, + }, +#endif { } }; @@ -588,18 +800,69 @@ static struct ctl_path netflow_sysctl_path[] = { static void sk_error_report(struct sock *sk) { /* clear connection refused errors if any */ - write_lock_bh(&sk->sk_callback_lock); if (debug > 1) - printk(KERN_INFO "NETFLOW: socket error <%d>\n", sk->sk_err); + printk(KERN_INFO "ipt_NETFLOW: socket error <%d>\n", sk->sk_err); sk->sk_err = 0; NETFLOW_STAT_INC(sock_errors); - write_unlock_bh(&sk->sk_callback_lock); return; } +static struct socket *_usock_alloc(const __be32 ipaddr, const unsigned short port) +{ + struct sockaddr_in sin; + struct socket *sock; + int error; + + if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + printk(KERN_ERR "ipt_NETFLOW: sock_create_kern error %d\n", -error); + return NULL; + } + sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_prot->unhash(sock->sk); /* hidden from input */ + sock->sk->sk_error_report = &sk_error_report; /* clear ECONNREFUSED */ + if (sndbuf) + sock->sk->sk_sndbuf = sndbuf; + else + sndbuf = sock->sk->sk_sndbuf; + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(ipaddr); + sin.sin_port = htons(port); + if ((error = sock->ops->connect(sock, (struct sockaddr *)&sin, + sizeof(sin), 0)) < 0) { + printk(KERN_ERR "ipt_NETFLOW: error connecting UDP socket %d," + " don't worry, will try reconnect later.\n", -error); + /* ENETUNREACH when no interfaces */ + sock_release(sock); + return NULL; + } + return sock; +} + +static void usock_connect(struct ipt_netflow_sock *usock, const int sendmsg) +{ + usock->sock = _usock_alloc(usock->ipaddr, usock->port); + if (usock->sock) { + if (sendmsg || debug) + printk(KERN_INFO "ipt_NETFLOW: connected %u.%u.%u.%u:%u\n", + HIPQUAD(usock->ipaddr), + usock->port); + } else { + atomic_inc(&usock->err_connect); + if (debug) + printk(KERN_INFO "ipt_NETFLOW: connect to %u.%u.%u.%u:%u failed%s.\n", + HIPQUAD(usock->ipaddr), + usock->port, + (sendmsg)? 
" (pdu lost)" : ""); + } + atomic_set(&usock->wmem_peak, 0); + atomic_set(&usock->err_full, 0); + atomic_set(&usock->err_other, 0); +} + // return numbers of sends succeded, 0 if none /* only called in scan worker path */ -static int netflow_send_pdu(void *buffer, int len) +static void netflow_sendmsg(void *buffer, const int len) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL }; struct kvec iov = { buffer, len }; @@ -607,9 +870,16 @@ static int netflow_send_pdu(void *buffer, int len) int snum = 0; struct ipt_netflow_sock *usock; + mutex_lock(&sock_lock); list_for_each_entry(usock, &usock_list, list) { + if (!usock->sock) + usock_connect(usock, 1); + if (!usock->sock) { + NETFLOW_STAT_INC_ATOMIC(send_failed); + continue; + } if (debug) - printk(KERN_INFO "netflow_send_pdu: sendmsg(%d, %d) [%u %u]\n", + printk(KERN_INFO "netflow_sendmsg: sendmsg(%d, %d) [%u %u]\n", snum, len, atomic_read(&usock->sock->sk->sk_wmem_alloc), @@ -624,7 +894,7 @@ static int netflow_send_pdu(void *buffer, int len) suggestion = ": increase sndbuf!"; } else atomic_inc(&usock->err_other); - printk(KERN_ERR "netflow_send_pdu[%d]: sendmsg error %d: data loss %llu pkt, %llu bytes%s\n", + printk(KERN_ERR "ipt_NETFLOW: sendmsg[%d] error %d: data loss %llu pkt, %llu bytes%s\n", snum, ret, pdu_packets, pdu_traf, suggestion); } else { unsigned int wmem = atomic_read(&usock->sock->sk->sk_wmem_alloc); @@ -636,98 +906,67 @@ static int netflow_send_pdu(void *buffer, int len) } snum++; } - return retok; + mutex_unlock(&sock_lock); + if (retok == 0) { + /* not least one send succeded, account stat for dropped packets */ + NETFLOW_STAT_ADD_ATOMIC(pkt_drop, pdu_packets); + NETFLOW_STAT_ADD_ATOMIC(traf_drop, pdu_traf); + } } -static void usock_free(struct ipt_netflow_sock *usock) +static void usock_close_free(struct ipt_netflow_sock *usock) { - printk(KERN_INFO "netflow: remove destination %u.%u.%u.%u:%u (%p)\n", + printk(KERN_INFO "ipt_NETFLOW: removed destination %u.%u.%u.%u:%u\n", HIPQUAD(usock->ipaddr), - usock->port, - usock->sock); + usock->port); if (usock->sock) sock_release(usock->sock); usock->sock = NULL; - vfree(usock); + vfree(usock); } static void destination_removeall(void) { - write_lock(&sock_lock); + mutex_lock(&sock_lock); while (!list_empty(&usock_list)) { struct ipt_netflow_sock *usock; usock = list_entry(usock_list.next, struct ipt_netflow_sock, list); list_del(&usock->list); - write_unlock(&sock_lock); - usock_free(usock); - write_lock(&sock_lock); + mutex_unlock(&sock_lock); + usock_close_free(usock); + mutex_lock(&sock_lock); } - write_unlock(&sock_lock); + mutex_unlock(&sock_lock); } static void add_usock(struct ipt_netflow_sock *usock) { struct ipt_netflow_sock *sk; - /* don't need empty sockets */ - if (!usock->sock) { - usock_free(usock); - return; - } - - write_lock(&sock_lock); + mutex_lock(&sock_lock); /* don't need duplicated sockets */ list_for_each_entry(sk, &usock_list, list) { if (sk->ipaddr == usock->ipaddr && sk->port == usock->port) { - write_unlock(&sock_lock); - usock_free(usock); + mutex_unlock(&sock_lock); + usock_close_free(usock); return; } } list_add_tail(&usock->list, &usock_list); - printk(KERN_INFO "netflow: added destination %u.%u.%u.%u:%u\n", + printk(KERN_INFO "ipt_NETFLOW: added destination %u.%u.%u.%u:%u%s\n", HIPQUAD(usock->ipaddr), - usock->port); - write_unlock(&sock_lock); -} - -static struct socket *usock_alloc(__be32 ipaddr, unsigned short port) -{ - struct sockaddr_in sin; - struct socket *sock; - int error; - - if ((error = sock_create_kern(PF_INET, 
SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { - printk(KERN_ERR "netflow: sock_create_kern error %d\n", error); - return NULL; - } - sock->sk->sk_allocation = GFP_ATOMIC; - sock->sk->sk_prot->unhash(sock->sk); /* hidden from input */ - sock->sk->sk_error_report = &sk_error_report; /* clear ECONNREFUSED */ - if (sndbuf) - sock->sk->sk_sndbuf = sndbuf; - else - sndbuf = sock->sk->sk_sndbuf; - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(ipaddr); - sin.sin_port = htons(port); - if ((error = sock->ops->connect(sock, (struct sockaddr *)&sin, - sizeof(sin), 0)) < 0) { - printk(KERN_ERR "netflow: error connecting UDP socket %d\n", error); - sock_release(sock); - return NULL; - } - return sock; + usock->port, + (!usock->sock)? " (unconnected)" : ""); + mutex_unlock(&sock_lock); } #define SEPARATORS " ,;\t\n" static int add_destinations(char *ptr) { while (ptr) { - unsigned char ip[4]; + unsigned char ip[4]; unsigned short port; ptr += strspn(ptr, SEPARATORS); @@ -737,17 +976,15 @@ static int add_destinations(char *ptr) struct ipt_netflow_sock *usock; if (!(usock = vmalloc(sizeof(*usock)))) { - printk(KERN_ERR "netflow: can't vmalloc socket\n"); + printk(KERN_ERR "ipt_NETFLOW: can't vmalloc socket\n"); return -ENOMEM; } memset(usock, 0, sizeof(*usock)); + atomic_set(&usock->err_connect, 0); usock->ipaddr = ntohl(*(__be32 *)ip); usock->port = port; - usock->sock = usock_alloc(usock->ipaddr, port); - atomic_set(&usock->wmem_peak, 0); - atomic_set(&usock->err_full, 0); - atomic_set(&usock->err_other, 0); + usock_connect(usock, 0); add_usock(usock); } else break; @@ -781,7 +1018,7 @@ static int add_aggregation(char *ptr) LIST_HEAD(old_aggr_list); while (ptr && *ptr) { - unsigned char ip[4]; + unsigned char ip[4]; unsigned int mask; unsigned int port1, port2; unsigned int aggr_to; @@ -792,16 +1029,16 @@ static int add_aggregation(char *ptr) ip, ip + 1, ip + 2, ip + 3, &mask, &aggr_to) == 6) { if (!(aggr_n = vmalloc(sizeof(*aggr_n)))) { - printk(KERN_ERR "netflow: can't vmalloc aggr\n"); + printk(KERN_ERR "ipt_NETFLOW: can't vmalloc aggr\n"); return -ENOMEM; } memset(aggr_n, 0, sizeof(*aggr_n)); - aggr_n->addr = ntohl(*(__be32 *)ip); aggr_n->mask = bits2mask(mask); + aggr_n->addr = ntohl(*(__be32 *)ip) & aggr_n->mask; aggr_n->aggr_mask = bits2mask(aggr_to); aggr_n->prefix = mask; - printk(KERN_INFO "netflow: add aggregation [%u.%u.%u.%u/%u=%u]\n", + printk(KERN_INFO "ipt_NETFLOW: add aggregation [%u.%u.%u.%u/%u=%u]\n", HIPQUAD(aggr_n->addr), mask, aggr_to); list_add_tail(&aggr_n->list, &new_aggr_n_list); @@ -809,7 +1046,7 @@ static int add_aggregation(char *ptr) sscanf(ptr, "%u=%u", &port2, &aggr_to) == 2) { if (!(aggr_p = vmalloc(sizeof(*aggr_p)))) { - printk(KERN_ERR "netflow: can't vmalloc aggr\n"); + printk(KERN_ERR "ipt_NETFLOW: can't vmalloc aggr\n"); return -ENOMEM; } memset(aggr_p, 0, sizeof(*aggr_p)); @@ -817,11 +1054,11 @@ static int add_aggregation(char *ptr) aggr_p->port1 = port1; aggr_p->port2 = port2; aggr_p->aggr_port = aggr_to; - printk(KERN_INFO "netflow: add aggregation [%u-%u=%u]\n", + printk(KERN_INFO "ipt_NETFLOW: add aggregation [%u-%u=%u]\n", port1, port2, aggr_to); list_add_tail(&aggr_p->list, &new_aggr_p_list); } else { - printk(KERN_ERR "netflow: bad aggregation rule: %s (ignoring)\n", ptr); + printk(KERN_ERR "ipt_NETFLOW: bad aggregation rule: %s (ignoring)\n", ptr); break; } @@ -846,17 +1083,23 @@ static int add_aggregation(char *ptr) static inline u_int32_t hash_netflow(const struct ipt_netflow_tuple *tuple) { - /* tuple is rounded to u32s 
*/ - return jhash2((u32 *)tuple, NETFLOW_TUPLE_SIZE, ipt_netflow_hash_rnd) % ipt_netflow_hash_size; + return murmur3(tuple, sizeof(struct ipt_netflow_tuple), ipt_netflow_hash_rnd) % ipt_netflow_hash_size; } static struct ipt_netflow * -ipt_netflow_find(const struct ipt_netflow_tuple *tuple, unsigned int hash) +ipt_netflow_find(const struct ipt_netflow_tuple *tuple, const unsigned int hash) { struct ipt_netflow *nf; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) +#define compat_hlist_for_each_entry hlist_for_each_entry +#define compat_hlist_for_each_entry_safe hlist_for_each_entry_safe struct hlist_node *pos; +#else /* since 3.9.0 */ +#define compat_hlist_for_each_entry(a,pos,c,d) hlist_for_each_entry(a,c,d) +#define compat_hlist_for_each_entry_safe(a,pos,c,d,e) hlist_for_each_entry_safe(a,c,d,e) +#endif - hlist_for_each_entry(nf, pos, &ipt_netflow_hash[hash], hlist) { + compat_hlist_for_each_entry(nf, pos, &ipt_netflow_hash[hash], hlist) { if (ipt_netflow_tuple_equal(tuple, &nf->tuple) && nf->nr_bytes < FLOW_FULL_WATERMARK) { NETFLOW_STAT_INC(found); @@ -868,7 +1111,7 @@ ipt_netflow_find(const struct ipt_netflow_tuple *tuple, unsigned int hash) return NULL; } -static struct hlist_head *alloc_hashtable(int size) +static struct hlist_head *alloc_hashtable(const int size) { struct hlist_head *hash; @@ -879,19 +1122,18 @@ static struct hlist_head *alloc_hashtable(int size) for (i = 0; i < size; i++) INIT_HLIST_HEAD(&hash[i]); } else - printk(KERN_ERR "netflow: unable to vmalloc hash table.\n"); + printk(KERN_ERR "ipt_NETFLOW: unable to vmalloc hash table.\n"); return hash; } -static int set_hashsize(int new_size) +static int set_hashsize(const int new_size) { struct hlist_head *new_hash, *old_hash; - unsigned int hash; struct ipt_netflow *nf; int rnd; - printk(KERN_INFO "netflow: allocating new hash table %u -> %u buckets\n", + printk(KERN_INFO "ipt_NETFLOW: allocating new hash table %u -> %u buckets\n", ipt_netflow_hash_size, new_size); new_hash = alloc_hashtable(new_size); if (!new_hash) @@ -900,19 +1142,24 @@ static int set_hashsize(int new_size) get_random_bytes(&rnd, 4); /* rehash */ - spin_lock_bh(&ipt_netflow_lock); + write_lock_bh(&htable_rwlock); old_hash = ipt_netflow_hash; ipt_netflow_hash = new_hash; ipt_netflow_hash_size = new_size; ipt_netflow_hash_rnd = rnd; /* hash_netflow() is dependent on ipt_netflow_hash_* values */ + spin_lock(&hlist_lock); list_for_each_entry(nf, &ipt_netflow_list, list) { + unsigned int hash; + hash = hash_netflow(&nf->tuple); /* hlist_add_head overwrites hlist pointers for this node * so it's good */ hlist_add_head(&nf->hlist, &new_hash[hash]); + nf->lock = &htable_locks[hash & LOCK_COUNT_MASK]; } - spin_unlock_bh(&ipt_netflow_lock); + spin_unlock(&hlist_lock); + write_unlock_bh(&htable_rwlock); vfree(old_hash); @@ -920,14 +1167,14 @@ static int set_hashsize(int new_size) } static struct ipt_netflow * -ipt_netflow_alloc(struct ipt_netflow_tuple *tuple) +ipt_netflow_alloc(const struct ipt_netflow_tuple *tuple) { struct ipt_netflow *nf; long count; nf = kmem_cache_alloc(ipt_netflow_cachep, GFP_ATOMIC); if (!nf) { - printk(KERN_ERR "Can't allocate netflow.\n"); + printk(KERN_ERR "ipt_NETFLOW: Can't allocate flow.\n"); return NULL; } @@ -945,13 +1192,15 @@ ipt_netflow_alloc(struct ipt_netflow_tuple *tuple) static void ipt_netflow_free(struct ipt_netflow *nf) { + if (IS_DUMMY_FLOW(nf)) + return; atomic_dec(&ipt_netflow_count); kmem_cache_free(ipt_netflow_cachep, nf); } static struct ipt_netflow * -init_netflow(struct ipt_netflow_tuple *tuple, - struct 
sk_buff *skb, unsigned int hash) +init_netflow(const struct ipt_netflow_tuple *tuple, + const struct sk_buff *skb, const unsigned int hash) { struct ipt_netflow *nf; @@ -959,93 +1208,774 @@ init_netflow(struct ipt_netflow_tuple *tuple, if (!nf) return NULL; + nf->lock = &htable_locks[hash & LOCK_COUNT_MASK]; hlist_add_head(&nf->hlist, &ipt_netflow_hash[hash]); + spin_lock(&hlist_lock); list_add(&nf->list, &ipt_netflow_list); + spin_unlock(&hlist_lock); return nf; } /* cook pdu, send, and clean */ /* only called in scan worker path */ -static void netflow_export_pdu(void) +static void netflow_export_pdu_v5(void) { struct timeval tv; int pdusize; - if (!pdu.nr_records) + if (!pdu_data_records) return; if (debug > 1) - printk(KERN_INFO "netflow_export_pdu with %d records\n", pdu.nr_records); - do_gettimeofday(&tv); - - pdu.version = htons(5); - pdu.ts_uptime = htonl(jiffies_to_msecs(jiffies)); - pdu.ts_usecs = htonl(tv.tv_sec); - pdu.ts_unsecs = htonl(tv.tv_usec); - //pdu.eng_type = 0; - //pdu.eng_id = 0; - //pdu.padding = 0; + printk(KERN_INFO "netflow_export_pdu_v5 with %d records\n", pdu_data_records); - pdusize = NETFLOW5_HEADER_SIZE + sizeof(struct netflow5_record) * pdu.nr_records; - - /* especially fix nr_records before export */ - pdu.nr_records = htons(pdu.nr_records); + pdu.v5.version = htons(5); + pdu.v5.nr_records = htons(pdu_data_records); + pdu.v5.ts_uptime = htonl(jiffies_to_msecs(jiffies)); + do_gettimeofday(&tv); + pdu.v5.ts_usecs = htonl(tv.tv_sec); + pdu.v5.ts_unsecs = htonl(tv.tv_usec); + pdu.v5.seq = htonl(pdu_seq); + //pdu.v5.eng_type = 0; + pdu.v5.eng_id = engine_id; + //pdu.v5.padding = 0; - if (netflow_send_pdu(&pdu, pdusize) == 0) { - /* not least one send succeded, account stat for dropped packets */ - NETFLOW_STAT_ADD_ATOMIC(pkt_drop, pdu_packets); - NETFLOW_STAT_ADD_ATOMIC(traf_drop, pdu_traf); - } + pdusize = NETFLOW5_HEADER_SIZE + sizeof(struct netflow5_record) * pdu_data_records; - pdu.seq = htonl(ntohl(pdu.seq) + ntohs(pdu.nr_records)); + netflow_sendmsg(&pdu.v5, pdusize); - pdu.nr_records = 0; pdu_packets = 0; - pdu_traf = 0; + pdu_traf = 0; + + pdu_seq += pdu_data_records; + pdu_count++; + pdu_data_records = 0; } /* only called in scan worker path */ -static void netflow_export_flow(struct ipt_netflow *nf) +static void netflow_export_flow_v5(struct ipt_netflow *nf) { struct netflow5_record *rec; - if (debug > 2) - printk(KERN_INFO "adding flow to export (%d)\n", pdu.nr_records); + if (unlikely(debug > 2)) + printk(KERN_INFO "adding flow to export (%d)\n", pdu_data_records); pdu_packets += nf->nr_packets; pdu_traf += nf->nr_bytes; pdu_ts_mod = jiffies; - rec = &pdu.flow[pdu.nr_records++]; + rec = &pdu.v5.flow[pdu_data_records++]; /* make V5 flow record */ - rec->s_addr = nf->tuple.s_addr; - rec->d_addr = nf->tuple.d_addr; - //rec->nexthop = 0; + rec->s_addr = nf->tuple.src.ip; + rec->d_addr = nf->tuple.dst.ip; + rec->nexthop = nf->nh.ip; rec->i_ifc = htons(nf->tuple.i_ifc); rec->o_ifc = htons(nf->o_ifc); rec->nr_packets = htonl(nf->nr_packets); rec->nr_octets = htonl(nf->nr_bytes); - rec->ts_first = htonl(jiffies_to_msecs(nf->ts_first)); - rec->ts_last = htonl(jiffies_to_msecs(nf->ts_last)); + rec->first_ms = htonl(jiffies_to_msecs(nf->ts_first)); + rec->last_ms = htonl(jiffies_to_msecs(nf->ts_last)); rec->s_port = nf->tuple.s_port; rec->d_port = nf->tuple.d_port; - //rec->reserved = 0; + //rec->reserved = 0; /* pdu is always zeroized for v5 in netflow_switch_version */ rec->tcp_flags = nf->tcp_flags; rec->protocol = nf->tuple.protocol; rec->tos = 
nf->tuple.tos; - //rec->s_as = 0; - //rec->d_as = 0; +#ifdef CONFIG_NF_NAT_NEEDED + rec->s_as = nf->s_as; + rec->d_as = nf->d_as; +#endif rec->s_mask = nf->s_mask; rec->d_mask = nf->d_mask; //rec->padding = 0; ipt_netflow_free(nf); - if (pdu.nr_records == NETFLOW5_RECORDS_MAX) + if (pdu_data_records == NETFLOW5_RECORDS_MAX) + netflow_export_pdu_v5(); +} + +/* pdu is initially blank, export current pdu, and prepare next for filling. */ +static void netflow_export_pdu_v9(void) +{ + struct timeval tv; + int pdusize; + + if (pdu_data_used <= pdu.v9.data) + return; + + if (debug > 1) + printk(KERN_INFO "netflow_export_pdu_v9 with %d records\n", + pdu_data_records + pdu_tpl_records); + + pdu.v9.version = htons(9); + pdu.v9.nr_records = htons(pdu_data_records + pdu_tpl_records); + pdu.v9.sys_uptime_ms = htonl(jiffies_to_msecs(jiffies)); + do_gettimeofday(&tv); + pdu.v9.export_time_s = htonl(tv.tv_sec); + pdu.v9.seq = htonl(pdu_seq); + pdu.v9.source_id = engine_id; + + pdusize = pdu_data_used - (unsigned char *)&pdu.v9; + + netflow_sendmsg(&pdu.v9, pdusize); + + pdu_packets = 0; + pdu_traf = 0; + + pdu_seq++; + pdu_count++; + pdu_data_records = pdu_tpl_records = 0; + pdu_data_used = pdu.v9.data; + pdu_flowset = NULL; +} + +static void netflow_export_pdu_ipfix(void) +{ + struct timeval tv; + int pdusize; + + if (pdu_data_used <= pdu.ipfix.data) + return; + + if (debug > 1) + printk(KERN_INFO "netflow_export_pduX with %d records\n", + pdu_data_records); + + pdu.ipfix.version = htons(10); + do_gettimeofday(&tv); + pdu.ipfix.export_time_s = htonl(tv.tv_sec); + pdu.ipfix.seq = htonl(pdu_seq); + pdu.ipfix.odomain_id = engine_id; + pdusize = pdu_data_used - (unsigned char *)&pdu; + pdu.ipfix.length = htons(pdusize); + + netflow_sendmsg(&pdu.ipfix, pdusize); + + pdu_packets = 0; + pdu_traf = 0; + + pdu_seq += pdu_data_records; + pdu_count++; + pdu_data_records = pdu_tpl_records = 0; + pdu_data_used = pdu.ipfix.data; + pdu_flowset = NULL; +} + +static inline int pdu_have_space(const size_t size) +{ + return ((pdu_data_used + size) <= pdu_high_wm); +} + +static inline unsigned char *pdu_grab_space(const size_t size) +{ + unsigned char *ptr = pdu_data_used; + pdu_data_used += size; + return ptr; +} + +// allocate data space in pdu, or fail if pdu is reallocated. +static inline unsigned char *pdu_alloc_fail(const size_t size) +{ + if (!pdu_have_space(size)) { netflow_export_pdu(); + return NULL; + } + return pdu_grab_space(size); +} + +/* doesn't fail, but can provide empty pdu. 
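The three helpers here implement a "reserve or flush" policy: pdu_alloc_fail() hands out space from the current pdu and, when the high watermark would be crossed, exports the pdu and fails exactly once; pdu_alloc() retries after such a flush and therefore never fails. The following is a self-contained userspace model of that policy; buf, used, high_wm, and export() are illustrative stand-ins, and unlike the module's pdu_alloc(), which grabs space unconditionally after the flush, this model simply re-checks.

    #include <stddef.h>
    #include <stdio.h>

    static unsigned char buf[64];                       /* pdu stand-in */
    static unsigned char *used = buf;                   /* pdu_data_used */
    static unsigned char *high_wm = buf + sizeof(buf);  /* pdu_high_wm */

    static void export(void)  /* netflow_export_pdu stand-in */
    {
            printf("exporting %d bytes\n", (int)(used - buf));
            used = buf;
    }

    /* Reserve size bytes, or export the pdu and fail once. */
    static unsigned char *alloc_fail(size_t size)
    {
            unsigned char *p = used;

            if (used + size > high_wm) {
                    export();
                    return NULL;
            }
            used += size;
            return p;
    }

    /* Never fails: a failed reservation has just emptied the buffer. */
    static unsigned char *alloc(size_t size)
    {
            unsigned char *p = alloc_fail(size);

            return p ? p : alloc_fail(size);
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 10; i++)
                    alloc(20);  /* flushes after every third 20-byte record */
            return 0;
    }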
*/ +static unsigned char *pdu_alloc(const size_t size) +{ + return pdu_alloc_fail(size) ?: pdu_grab_space(size); +} + +/* global table of sizes of template field types */ +static u_int8_t tpl_element_sizes[] = { + [IN_BYTES] = 4, + [IN_PKTS] = 4, + [PROTOCOL] = 1, + [TOS] = 1, + [TCP_FLAGS] = 1, + [L4_SRC_PORT] = 2, + [IPV4_SRC_ADDR] = 4, + [SRC_MASK] = 1, + [INPUT_SNMP] = 2, + [L4_DST_PORT] = 2, + [IPV4_DST_ADDR] = 4, + [DST_MASK] = 1, + [OUTPUT_SNMP] = 2, + [IPV4_NEXT_HOP] = 4, + //[SRC_AS] = 2, + //[DST_AS] = 2, + //[BGP_IPV4_NEXT_HOP] = 4, + //[MUL_DST_PKTS] = 4, + //[MUL_DST_BYTES] = 4, + [LAST_SWITCHED] = 4, + [FIRST_SWITCHED]= 4, + [IPV6_SRC_ADDR] = 16, + [IPV6_DST_ADDR] = 16, + [IPV6_FLOW_LABEL] = 3, + [ICMP_TYPE] = 2, + [MUL_IGMP_TYPE] = 1, + //[TOTAL_BYTES_EXP] = 4, + //[TOTAL_PKTS_EXP] = 4, + //[TOTAL_FLOWS_EXP] = 4, + [IPV6_NEXT_HOP] = 16, + [IPV6_OPTION_HEADERS] = 2, + [commonPropertiesId] = 4, + [ipv4Options] = 4, + [tcpOptions] = 4, + [postNATSourceIPv4Address] = 4, + [postNATDestinationIPv4Address] = 4, + [postNAPTSourceTransportPort] = 2, + [postNAPTDestinationTransportPort] = 2, + [natEvent] = 1, + [postNATSourceIPv6Address] = 16, + [postNATDestinationIPv6Address] = 16, + [IPSecSPI] = 4, + [observationTimeMilliseconds] = 8, + [observationTimeMicroseconds] = 8, + [observationTimeNanoseconds] = 8, +}; + +#define TEMPLATES_HASH_BSIZE 8 +#define TEMPLATES_HASH_SIZE (1<tpl_mask == tmask) + return tpl; + + tnum = 0; + if (tmask & BTPL_IP4) { + tlist[tnum++] = &template_ipv4; + if (tmask & BTPL_OPTIONS4) + tlist[tnum++] = &template_options4; + if (tmask & BTPL_MASK4) + tlist[tnum++] = &template_ipv4_mask; + } else if (tmask & BTPL_IP6) { + tlist[tnum++] = &template_ipv6; + if (tmask & BTPL_LABEL6) + tlist[tnum++] = &template_label6; + if (tmask & BTPL_OPTIONS6) + tlist[tnum++] = &template_options6; + } else if (tmask & BTPL_NAT4) + tlist[tnum++] = &template_nat4; + if (tmask & BTPL_PORTS) + tlist[tnum++] = &template_ports; + if (tmask & BTPL_BASE) + tlist[tnum++] = &template_base; + if (tmask & BTPL_TCPOPTIONS) + tlist[tnum++] = &template_tcpoptions; + if (tmask & BTPL_ICMP) + tlist[tnum++] = &template_icmp; + if (tmask & BTPL_IGMP) + tlist[tnum++] = &template_igmp; + if (tmask & BTPL_IPSEC) + tlist[tnum++] = &template_ipsec; + if (tmask & BTPL_MARK) + tlist[tnum++] = &template_mark; + + /* calc memory size */ + length = 0; + for (i = 0; i < tnum; i++) { + if (!tlist[i]->length) { + for (k = 0; tlist[i]->types[k]; k++); + tlist[i]->length = k; + } + length += tlist[i]->length; + } + /* elements are pairs + one termiantor */ + tpl = kmalloc(sizeof(struct data_template) + (length * 2 + 1) * sizeof(u_int16_t), GFP_KERNEL); + if (!tpl) { + printk(KERN_ERR "ipt_NETFLOW: unable to kmalloc template.\n"); + return NULL; + } + tpl->tpl_mask = tmask; + tpl->length = length; + tpl->tpl_size = sizeof(struct flowset_template); + tpl->rec_size = 0; + tpl->template_id_n = htons(template_ids++); + tpl->exported_cnt = 0; + tpl->exported_ts = 0; + + j = 0; + for (i = 0; i < tnum; i++) { + struct base_template *btpl = tlist[i]; + + for (k = 0; k < btpl->length; k++) { + int size; + int type = btpl->types[k]; + + tpl->fields[j++] = type; + size = tpl_element_sizes[type]; + tpl->fields[j++] = size; + tpl->rec_size += size; + } + tpl->tpl_size += btpl->length * TPL_FIELD_NSIZE; + } + tpl->fields[j++] = 0; + + hlist_add_head(&tpl->hlist, &templates_hash[hash]); + tpl_count++; + + return tpl; +} + +static void pdu_add_template(struct data_template *tpl) +{ + int i; + unsigned char *ptr; + struct 
flowset_template *ntpl; + __be16 *sptr; + + ptr = pdu_alloc(tpl->tpl_size); + ntpl = (struct flowset_template *)ptr; + ntpl->flowset_id = protocol == 9? htons(FLOWSET_TEMPLATE) : htons(IPFIX_TEMPLATE); + ntpl->length = htons(tpl->tpl_size); + ntpl->template_id = tpl->template_id_n; + ntpl->field_count = htons(tpl->length); + ptr += sizeof(struct flowset_template); + sptr = (__be16 *)ptr; + for (i = 0; ; ) { + int type = tpl->fields[i++]; + if (!type) + break; + *sptr++ = htons(type); + *sptr++ = htons(tpl->fields[i++]); + } + + tpl->exported_cnt = pdu_count; + tpl->exported_ts = jiffies; + + pdu_flowset = NULL; + pdu_tpl_records++; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) +static inline s64 portable_ktime_to_ms(const ktime_t kt) +{ + struct timeval tv = ktime_to_timeval(kt); + return (s64) tv.tv_sec * MSEC_PER_SEC + tv.tv_usec / USEC_PER_MSEC; +} +#define ktime_to_ms portable_ktime_to_ms +#endif + +/* encode one field */ +typedef struct in6_addr in6_t; +static inline void add_ipv4_field(__u8 *ptr, const int type, const struct ipt_netflow *nf) +{ + switch (type) { + case IN_BYTES: *(__be32 *)ptr = htonl(nf->nr_bytes); break; + case IN_PKTS: *(__be32 *)ptr = htonl(nf->nr_packets); break; + case FIRST_SWITCHED: *(__be32 *)ptr = htonl(jiffies_to_msecs(nf->ts_first)); break; + case LAST_SWITCHED: *(__be32 *)ptr = htonl(jiffies_to_msecs(nf->ts_last)); break; + case IPV4_SRC_ADDR: *(__be32 *)ptr = nf->tuple.src.ip; break; + case IPV4_DST_ADDR: *(__be32 *)ptr = nf->tuple.dst.ip; break; + case IPV4_NEXT_HOP: *(__be32 *)ptr = nf->nh.ip; break; + case L4_SRC_PORT: *(__be16 *)ptr = nf->tuple.s_port; break; + case L4_DST_PORT: *(__be16 *)ptr = nf->tuple.d_port; break; + case INPUT_SNMP: *(__be16 *)ptr = htons(nf->tuple.i_ifc); break; + case OUTPUT_SNMP: *(__be16 *)ptr = htons(nf->o_ifc); break; + case PROTOCOL: *ptr = nf->tuple.protocol; break; + case TCP_FLAGS: *ptr = nf->tcp_flags; break; + case TOS: *ptr = nf->tuple.tos; break; + case IPV6_SRC_ADDR: *(in6_t *)ptr = nf->tuple.src.in6; break; + case IPV6_DST_ADDR: *(in6_t *)ptr = nf->tuple.dst.in6; break; + case IPV6_NEXT_HOP: *(in6_t *)ptr = nf->nh.in6; break; + case IPV6_FLOW_LABEL: *ptr++ = nf->flow_label >> 16; + *(__be16 *)ptr = nf->flow_label; + break; + case tcpOptions: *(__be32 *)ptr = htonl(nf->tcpoptions); break; + case ipv4Options: *(__be32 *)ptr = htonl(nf->options); break; + case IPV6_OPTION_HEADERS: *(__be16 *)ptr = htons(nf->options); break; +#ifdef CONFIG_NF_CONNTRACK_MARK + case commonPropertiesId: + *(__be32 *)ptr = htonl(nf->mark); break; +#endif + case SRC_MASK: *ptr = nf->s_mask; break; + case DST_MASK: *ptr = nf->d_mask; break; + case ICMP_TYPE: *(__be16 *)ptr = nf->tuple.d_port; break; + case MUL_IGMP_TYPE: *ptr = nf->tuple.d_port; break; +#ifdef CONFIG_NF_NAT_NEEDED + case postNATSourceIPv4Address: *(__be32 *)ptr = nf->nat->post.s_addr; break; + case postNATDestinationIPv4Address: *(__be32 *)ptr = nf->nat->post.d_addr; break; + case postNAPTSourceTransportPort: *(__be16 *)ptr = nf->nat->post.s_port; break; + case postNAPTDestinationTransportPort: *(__be16 *)ptr = nf->nat->post.d_port; break; + case natEvent: *ptr = nf->nat->nat_event; break; +#endif + case IPSecSPI: *(__u32 *)ptr = (nf->tuple.s_port << 16) | nf->tuple.d_port; break; + case observationTimeMilliseconds: + *(__be64 *)ptr = cpu_to_be64(ktime_to_ms(nf->ts_obs)); break; + case observationTimeMicroseconds: + *(__be64 *)ptr = cpu_to_be64(ktime_to_us(nf->ts_obs)); break; + case observationTimeNanoseconds: + *(__be64 *)ptr = 
cpu_to_be64(ktime_to_ns(nf->ts_obs)); break; + default: + memset(ptr, 0, tpl_element_sizes[type]); + } +} + +#define PAD_SIZE 4 /* rfc prescribes flowsets to be padded */ + +/* cache timeout_rate in jiffies */ +static inline unsigned long timeout_rate_j(void) +{ + static unsigned int t_rate = 0; + static unsigned long t_rate_j = 0; + + if (unlikely(timeout_rate != t_rate)) { + struct timeval tv = { .tv_sec = timeout_rate * 60, .tv_usec = 0 }; + + t_rate = timeout_rate; + t_rate_j = timeval_to_jiffies(&tv); + } + return t_rate_j; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) +#define IPPROTO_UDPLITE 136 +#endif + +#ifndef time_is_before_jiffies +#define time_is_before_jiffies(a) time_after(jiffies, a) +#endif + +static void netflow_export_flow_tpl(struct ipt_netflow *nf) +{ + unsigned char *ptr; + int i; + struct data_template *tpl; + int tpl_mask = BTPL_BASE; + + if (unlikely(debug > 2)) + printk(KERN_INFO "adding flow to export (%d)\n", + pdu_data_records + pdu_tpl_records); + + if (likely(nf->tuple.l3proto == AF_INET)) { + tpl_mask |= BTPL_IP4; + if (unlikely(nf->options)) + tpl_mask |= BTPL_OPTIONS4; + } else { + tpl_mask |= BTPL_IP6; + if (unlikely(nf->options)) + tpl_mask |= BTPL_OPTIONS6; + if (unlikely(nf->flow_label)) + tpl_mask |= BTPL_LABEL6; + } + if (unlikely(nf->tcpoptions)) + tpl_mask |= BTPL_TCPOPTIONS; + if (unlikely(nf->s_mask || nf->d_mask)) + tpl_mask |= BTPL_MASK4; + if (likely(nf->tuple.protocol == IPPROTO_TCP || + nf->tuple.protocol == IPPROTO_UDP || + nf->tuple.protocol == IPPROTO_SCTP || + nf->tuple.protocol == IPPROTO_UDPLITE)) + tpl_mask |= BTPL_PORTS; + else if (nf->tuple.protocol == IPPROTO_ICMP) + tpl_mask |= BTPL_ICMP; + else if (nf->tuple.protocol == IPPROTO_IGMP) + tpl_mask |= BTPL_IGMP; +#ifdef CONFIG_NF_CONNTRACK_MARK + if (nf->mark) + tpl_mask |= BTPL_MARK; +#endif +#ifdef CONFIG_NF_NAT_NEEDED + if (nf->nat) + tpl_mask = BTPL_NAT4; +#endif + + tpl = get_template(tpl_mask); + if (unlikely(!tpl)) { + printk(KERN_INFO "ipt_NETFLOW: template allocation failed.\n"); + NETFLOW_STAT_INC(alloc_err); + NETFLOW_STAT_ADD_ATOMIC(pkt_drop, nf->nr_packets); + NETFLOW_STAT_ADD_ATOMIC(traf_drop, nf->nr_bytes); + ipt_netflow_free(nf); + return; + } + + if (unlikely(!pdu_flowset || + pdu_flowset->flowset_id != tpl->template_id_n || + !(ptr = pdu_alloc_fail(tpl->rec_size)))) { + + /* if there was previous data template we should pad it to 4 bytes */ + if (pdu_flowset) { + int padding = (PAD_SIZE - ntohs(pdu_flowset->length) % PAD_SIZE) % PAD_SIZE; + if (padding && (ptr = pdu_alloc_fail(padding))) { + pdu_flowset->length = htons(ntohs(pdu_flowset->length) + padding); + for (; padding; padding--) + *ptr++ = 0; + } + } + + if (!tpl->exported_ts || + pdu_count > (tpl->exported_cnt + refresh_rate) || + time_is_before_jiffies(tpl->exported_ts + timeout_rate_j())) { + pdu_add_template(tpl); + } + + ptr = pdu_alloc(sizeof(struct flowset_data) + tpl->rec_size); + pdu_flowset = (struct flowset_data *)ptr; + pdu_flowset->flowset_id = tpl->template_id_n; + pdu_flowset->length = htons(sizeof(struct flowset_data)); + ptr += sizeof(struct flowset_data); + } + + /* encode all fields */ + for (i = 0; ; ) { + int type = tpl->fields[i++]; + + if (!type) + break; + add_ipv4_field(ptr, type, nf); + ptr += tpl->fields[i++]; + } + + pdu_data_records++; + pdu_flowset->length = htons(ntohs(pdu_flowset->length) + tpl->rec_size); + + pdu_packets += nf->nr_packets; + pdu_traf += nf->nr_bytes; + + ipt_netflow_free(nf); + pdu_ts_mod = jiffies; +} + +static void netflow_switch_version(const int 
ver) +{ + protocol = ver; + if (protocol == 5) { + memset(&pdu, 0, sizeof(pdu)); + netflow_export_flow = &netflow_export_flow_v5; + netflow_export_pdu = &netflow_export_pdu_v5; + } else if (protocol == 9) { + pdu_data_used = pdu.v9.data; + pdu_max_size = sizeof(pdu.v9); + pdu_high_wm = (unsigned char *)&pdu + pdu_max_size; + netflow_export_flow = &netflow_export_flow_tpl; + netflow_export_pdu = &netflow_export_pdu_v9; + } else { /* IPFIX */ + pdu_data_used = pdu.ipfix.data; + pdu_max_size = sizeof(pdu.ipfix); + pdu_high_wm = (unsigned char *)&pdu + pdu_max_size; + netflow_export_flow = &netflow_export_flow_tpl; + netflow_export_pdu = &netflow_export_pdu_ipfix; + } + if (protocol != 5) + free_templates(); + pdu_data_records = pdu_tpl_records = 0; + pdu_flowset = NULL; + printk(KERN_INFO "ipt_NETFLOW protocol version %d (%s) enabled.\n", + protocol, protocol == 10? "IPFIX" : "NetFlow"); +} + +#ifdef CONFIG_NF_NAT_NEEDED +static void export_nat_event(struct nat_event *nel) +{ + static struct ipt_netflow nf = { { NULL } }; + + nf.tuple.l3proto = AF_INET; + nf.tuple.protocol = nel->protocol; + nf.nat = nel; /* this also serves as the dummy-flow flag */ + nf.tcp_flags = (nel->nat_event == NAT_DESTROY)? TCP_FIN_RST : TCP_SYN_ACK; + if (protocol >= 9) { + nf.ts_obs = nel->ts_ktime; + nf.tuple.src.ip = nel->pre.s_addr; + nf.tuple.dst.ip = nel->pre.d_addr; + nf.tuple.s_port = nel->pre.s_port; + nf.tuple.d_port = nel->pre.d_port; + netflow_export_flow(&nf); + } else { /* v5 */ + /* The weird v5 packet(s). + * src and dst will be the same as in the data flow from the FORWARD chain, + * where src is the pre-NAT src ip and dst is the post-NAT dst ip. + * What we are lacking here is the external src ip for SNAT, or the + * pre-NAT dst ip for DNAT. We put this into the Nexthop field, with + * the port in the src/dst AS field. tcp_flags distinguishes whether it + * is a start or stop event. Two flows in case of full NAT. */
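To make that encoding concrete, here is a standalone userspace sketch of the single record an SNAT "create" event would produce when 10.0.0.5:4000 is translated to 198.51.100.4:40000 while talking to 192.0.2.7. The struct is a stand-in for the handful of struct netflow5_record fields involved, the addresses and ports are invented, and the two flag constants merely follow the usual TCP bit layout (the module defines its own).

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative subset of struct netflow5_record (ipt_NETFLOW.h). */
    struct v5rec {
            uint32_t s_addr, d_addr, nexthop;
            uint16_t s_as, d_as;
            uint8_t tcp_flags;
    };

    #define TCP_SYN_ACK 0x12  /* create */
    #define TCP_FIN_RST 0x05  /* destroy */

    int main(void)
    {
            struct v5rec rec = {
                    .s_addr    = 0x0a000005, /* pre-NAT src 10.0.0.5 */
                    .d_addr    = 0xc0000207, /* peer 192.0.2.7, unchanged by SNAT */
                    .nexthop   = 0xc6336404, /* post-NAT src 198.51.100.4, in Nexthop */
                    .s_as      = 40000,      /* post-NAT src port, in SRC_AS */
                    .d_as      = 0,
                    .tcp_flags = TCP_SYN_ACK /* start; FIN|RST would mean stop */
            };

            printf("nexthop=%#x s_as=%u flags=%#x\n",
                   rec.nexthop, (unsigned)rec.s_as, rec.tcp_flags);
            return 0;
    }

Full NAT (both source and destination rewritten) satisfies both conditions checked in the code just below and therefore emits two such records.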
+ nf.tuple.src.ip = nel->pre.s_addr; + nf.tuple.s_port = nel->pre.s_port; + nf.tuple.dst.ip = nel->post.d_addr; + nf.tuple.d_port = nel->post.d_port; + + nf.ts_first = nel->ts_jiffies; + nf.ts_last = nel->ts_jiffies; + if (nel->pre.s_addr != nel->post.s_addr || + nel->pre.s_port != nel->post.s_port) { + nf.nh.ip = nel->post.s_addr; + nf.s_as = nel->post.s_port; + nf.d_as = 0; + netflow_export_flow(&nf); + } + if (nel->pre.d_addr != nel->post.d_addr || + nel->pre.d_port != nel->post.d_port) { + nf.nh.ip = nel->pre.d_addr; + nf.s_as = 0; + nf.d_as = nel->pre.d_port; + netflow_export_flow(&nf); + } + } + kfree(nel); } +#endif /* CONFIG_NF_NAT_NEEDED */ -static inline int active_needs_export(struct ipt_netflow *nf, long a_timeout) +static inline int active_needs_export(const struct ipt_netflow *nf, const long a_timeout) { /* active too long, finishing, or having too much bytes */ return ((jiffies - nf->ts_first) > a_timeout) || @@ -1057,42 +1987,77 @@ static inline int active_needs_export(struct ipt_netflow *nf, long a_timeout) /* could be called with zero to flush cache and pdu */ /* this function is guaranteed to be called non-concurrently */ -static void netflow_scan_and_export(int flush) +/* return -1 if trylock failed, 0 if nothing was exported, >=1 if something was exported */ +static int netflow_scan_and_export(const int flush) { long i_timeout = inactive_timeout * HZ; long a_timeout = active_timeout * HZ; + int trylock_failed = 0; + int pdu_c = pdu_count; if (flush) i_timeout = 0; - spin_lock_bh(&ipt_netflow_lock); - while (!list_empty(&ipt_netflow_list)) { + local_bh_disable(); + spin_lock(&hlist_lock); + /* This is different order of locking than elsewhere, + * so we trylock&break to avoid deadlock. */ + + while (likely(!list_empty(&ipt_netflow_list))) { struct ipt_netflow *nf; - + + /* Last entry, which is usually oldest. */ nf = list_entry(ipt_netflow_list.prev, struct ipt_netflow, list); + if (!spin_trylock(nf->lock)) { + trylock_failed = 1; + break; + } /* Note: i_timeout checked with >= to allow specifying zero timeout * to purge all flows on module unload */ if (((jiffies - nf->ts_last) >= i_timeout) || active_needs_export(nf, a_timeout)) { hlist_del(&nf->hlist); + spin_unlock(nf->lock); + list_del(&nf->list); + spin_unlock(&hlist_lock); + local_bh_enable(); + NETFLOW_STAT_ADD(pkt_out, nf->nr_packets); NETFLOW_STAT_ADD(traf_out, nf->nr_bytes); - spin_unlock_bh(&ipt_netflow_lock); netflow_export_flow(nf); - spin_lock_bh(&ipt_netflow_lock); + + local_bh_disable(); + spin_lock(&hlist_lock); } else { + spin_unlock(nf->lock); /* all flows which need to be exported is always at the tail * so if no more exportable flows we can break */ break; } } - spin_unlock_bh(&ipt_netflow_lock); - + spin_unlock(&hlist_lock); + local_bh_enable(); + +#ifdef CONFIG_NF_NAT_NEEDED + spin_lock_bh(&nat_lock); + while (!list_empty(&nat_list)) { + struct nat_event *nel; + + nel = list_entry(nat_list.next, struct nat_event, list); + list_del(&nel->list); + spin_unlock_bh(&nat_lock); + export_nat_event(nel); + spin_lock_bh(&nat_lock); + } + spin_unlock_bh(&nat_lock); +#endif /* flush flows stored in pdu if there no new flows for too long */ /* Note: using >= to allow flow purge on zero timeout */ if ((jiffies - pdu_ts_mod) >= i_timeout) netflow_export_pdu(); + + return trylock_failed? 
-1 : pdu_count - pdu_c; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) @@ -1101,8 +2066,10 @@ static void netflow_work_fn(void *dummy) static void netflow_work_fn(struct work_struct *dummy) #endif { - netflow_scan_and_export(0); - __start_scan_worker(); + int status; + + status = netflow_scan_and_export(DONT_FLUSH); + _schedule_scan_worker(status); } #define RATESHIFT 2 @@ -1154,7 +2121,7 @@ static void rate_timer_calc(unsigned long dummy) old_found = found; old_notfound = notfound; /* if there is no access to hash keep rate steady */ - metric = (dfnd + dnfnd)? 10 * (dsrch + dfnd + dnfnd) / (dfnd + dnfnd) : metric; + metric = (dfnd + dnfnd)? 100 * (dsrch + dfnd + dnfnd) / (dfnd + dnfnd) : metric; CALC_RATE(min15_metric, (unsigned long long)metric, 15); CALC_RATE(min5_metric, (unsigned long long)metric, 5); CALC_RATE(min_metric, (unsigned long long)metric, 1); @@ -1162,6 +2129,262 @@ static void rate_timer_calc(unsigned long dummy) mod_timer(&rate_timer, jiffies + (HZ * SAMPLERATE)); } +#ifdef CONFIG_NF_NAT_NEEDED +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) +static struct nf_ct_event_notifier *saved_event_cb __read_mostly = NULL; +static int netflow_conntrack_event(const unsigned int events, struct nf_ct_event *item) +#else +static int netflow_conntrack_event(struct notifier_block *this, unsigned long events, void *ptr) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) + struct nf_conn *ct = item->ct; +#else + struct nf_conn *ct = (struct nf_conn *)ptr; +#endif + struct nat_event *nel; + const struct nf_conntrack_tuple *t; + int ret = NOTIFY_DONE; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) + struct nf_ct_event_notifier *notifier; + + /* Call netlink first. */ + notifier = rcu_dereference(saved_event_cb); + if (likely(notifier)) + ret = notifier->fcn(events, item); +#endif + if (unlikely(!natevents)) + return ret; + + if (!(events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED) | (1 << IPCT_DESTROY)))) + return ret; + + if (!(ct->status & IPS_NAT_MASK)) + return ret; + + if (unlikely(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num != AF_INET || + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num != AF_INET)) { + /* Well, there is no linux NAT for IPv6 anyway. 
*/ + return ret; + } + + if (!(nel = kmalloc(sizeof(struct nat_event), GFP_ATOMIC))) { + printk(KERN_ERR "ipt_NETFLOW: can't kmalloc nat event\n"); + return ret; + } + memset(nel, 0, sizeof(struct nat_event)); + nel->ts_ktime = ktime_get_real(); + nel->ts_jiffies = jiffies; + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + nel->protocol = t->dst.protonum; + nel->pre.s_addr = t->src.u3.ip; + nel->pre.d_addr = t->dst.u3.ip; + nel->pre.s_port = t->src.u.all; + nel->pre.d_port = t->dst.u.all; + t = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* reply is reversed */ + nel->post.s_addr = t->dst.u3.ip; + nel->post.d_addr = t->src.u3.ip; + nel->post.s_port = t->dst.u.all; + nel->post.d_port = t->src.u.all; + if (events & (1 << IPCT_DESTROY)) { + nel->nat_event = NAT_DESTROY; + nat_events_stop++; + } else { + nel->nat_event = NAT_CREATE; + nat_events_start++; + } + + spin_lock_bh(&nat_lock); + list_add_tail(&nel->list, &nat_list); + spin_unlock_bh(&nat_lock); + + return ret; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) +static struct notifier_block ctnl_notifier = { + .notifier_call = netflow_conntrack_event +}; +#else +static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = netflow_conntrack_event +}; +#endif /* since 2.6.31 */ +#endif /* CONFIG_NF_NAT_NEEDED */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) && \ + LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) +static bool +#else +static int +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) +netflow_target_check(const char *tablename, const void *entry, const struct xt_target *target, + void *targinfo, +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) + unsigned int targinfosize, +#endif + unsigned int hook_mask) +{ +#else +netflow_target_check(const struct xt_tgchk_param *par) +{ + const char *tablename = par->table; + const struct xt_target *target = par->target; +#endif + if (strcmp("nat", tablename) == 0) { + /* In the nat table we only see single packet per flow, which is useless. */ + printk(KERN_ERR "%s target: is not valid in %s table\n", target->name, tablename); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) +#define CHECK_FAIL 0 +#define CHECK_OK 1 +#else +#define CHECK_FAIL -EINVAL +#define CHECK_OK 0 +#endif + return CHECK_FAIL; + } + if (target->family == AF_INET6 && protocol == 5) { + printk(KERN_ERR "ip6tables NETFLOW target is meaningful for protocol 9 or 10 only.\n"); + return CHECK_FAIL; + } + return CHECK_OK; +} + +#define SetXBit(x) (0x8000 >> (x)) /* Proper bit for htons later. */ +#ifndef IPPROTO_MH +#define IPPROTO_MH 135 +#endif +static inline __u16 observed_hdrs(const __u8 currenthdr) +{ + switch (currenthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + /* For speed, in case switch is not optimized. */ + return 0; + case IPPROTO_DSTOPTS: return SetXBit(0); + case IPPROTO_HOPOPTS: return SetXBit(1); + case IPPROTO_ROUTING: return SetXBit(5); + case IPPROTO_MH: return SetXBit(12); + case IPPROTO_ESP: return SetXBit(13); + case IPPROTO_AH: return SetXBit(14); + case IPPROTO_COMP: return SetXBit(15); + case IPPROTO_FRAGMENT: /* Handled elsewhere. */ + /* Next is known headers. */ + case IPPROTO_ICMPV6: + case IPPROTO_UDPLITE: + case IPPROTO_IPIP: + case IPPROTO_PIM: + case IPPROTO_GRE: + case IPPROTO_SCTP: +#ifdef IPPROTO_L2TP + case IPPROTO_L2TP: +#endif + case IPPROTO_DCCP: + return 0; + } + return SetXBit(3); /* Unknown header. 
*/ } + +/* http://www.iana.org/assignments/ip-parameters/ip-parameters.xhtml */ +static const __u8 ip4_opt_table[] = { + [7] = 0, /* RR */ /* parsed manually because of 0 */ + [134] = 1, /* CIPSO */ + [133] = 2, /* E-SEC */ + [68] = 3, /* TS */ + [131] = 4, /* LSR */ + [130] = 5, /* SEC */ + [1] = 6, /* NOP */ + [0] = 7, /* EOOL */ + [15] = 8, /* ENCODE */ + [142] = 9, /* VISA */ + [205] = 10, /* FINN */ + [12] = 11, /* MTUR */ + [11] = 12, /* MTUP */ + [10] = 13, /* ZSU */ + [137] = 14, /* SSR */ + [136] = 15, /* SID */ + [151] = 16, /* DPS */ + [150] = 17, /* NSAPA */ + [149] = 18, /* SDB */ + [147] = 19, /* ADDEXT */ + [148] = 20, /* RTRALT */ + [82] = 21, /* TR */ + [145] = 22, /* EIP */ + [144] = 23, /* IMITD */ + [30] = 25, /* EXP */ + [94] = 25, /* EXP */ + [158] = 25, /* EXP */ + [222] = 25, /* EXP */ + [25] = 30, /* QS */ + [152] = 31, /* UMP */ +}; +/* Parse the IPv4 options array into the ipv4Options IPFIX value. */ +static inline __u32 ip4_options(const u_int8_t *p, const unsigned int optsize) +{ + __u32 ret = 0; + unsigned int i; + + for (i = 0; likely(i < optsize); ) { + u_int8_t op = p[i++]; + + if (op == 7) /* RR: bit 0 */ + ret |= 1; + else if (likely(op < ARRAY_SIZE(ip4_opt_table))) { + /* Btw, IANA doc is messed up in a crazy way: + * http://www.ietf.org/mail-archive/web/ipfix/current/msg06008.html (2011) + * I decided to follow IANA _text_ description from + * http://www.iana.org/assignments/ipfix/ipfix.xhtml (2013-09-18) + * + * Set proper bit for htonl later. */ + if (ip4_opt_table[op]) + ret |= 1 << (32 - ip4_opt_table[op]); + } + if (likely(i >= optsize || op == 0)) + break; + else if (unlikely(op == 1)) + continue; + else if (unlikely(p[i] < 2)) + break; + else + i += p[i] - 1; + } + return ret; +} + +#define TCPHDR_MAXSIZE (4 * 15) +/* List of options: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml */ +static inline __u32 tcp_options(const struct sk_buff *skb, const unsigned int ptr, const struct tcphdr *th) +{ + const unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); + __u8 _opt[TCPHDR_MAXSIZE]; + const u_int8_t *p; + __u32 ret; + unsigned int i; + + p = skb_header_pointer(skb, ptr + sizeof(struct tcphdr), optsize, _opt); + if (unlikely(!p)) + return 0; + ret = 0; + for (i = 0; likely(i < optsize); ) { + u_int8_t opt = p[i++]; + + if (likely(opt < 32)) { + /* IANA doc is messed up, see above. */
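Both option parsers share one convention: option kind N sets bit (32 - N) so that a later htonl() lands the flags in the exported bit order, and the walk advances by the option's length octet, treating NOP specially and bailing out on EOL or a malformed length. Here is a standalone userspace demo of the tcp_options() walk over an invented SYN option block; unlike the module, it masks out kind 0, since a shift by 32 would be undefined.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Invented option bytes: MSS(2,4), SACK-permitted(4,2),
             * timestamps(8,10), NOP(1), EOL(0). */
            const uint8_t p[] = { 2, 4, 0x05, 0xb4, 4, 2,
                                  8, 10, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0 };
            const unsigned int optsize = sizeof(p);
            uint32_t ret = 0;
            unsigned int i;

            for (i = 0; i < optsize; ) {
                    uint8_t opt = p[i++];

                    if (opt > 0 && opt < 32)       /* kind N -> bit (32 - N) */
                            ret |= 1u << (32 - opt);
                    if (i >= optsize || opt == 0)  /* EOL ends the list */
                            break;
                    else if (opt == 1)             /* NOP has no length octet */
                            continue;
                    else if (p[i] < 2)             /* malformed "silly" length */
                            break;
                    else
                            i += p[i] - 1;         /* length includes the kind octet */
            }
            printf("tcpoptions bitmap: %#010x\n", ret);  /* 0xd1000000 here */
            return 0;
    }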
+ ret |= 1 << (32 - opt); + } + if (likely(i >= optsize || opt == 0)) + break; + else if (unlikely(opt == 1)) + continue; + else if (unlikely(p[i] < 2)) /* "silly options" */ + break; + else + i += p[i] - 1; + } + return ret; +} /* packet receiver */ static unsigned int netflow_target( #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) @@ -1192,27 +2415,38 @@ static unsigned int netflow_target( ) { #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) - struct sk_buff *skb = *pskb; + const struct sk_buff *skb = *pskb; +#endif + union { + struct iphdr ip; + struct ipv6hdr ip6; + } _iph, *iph; + unsigned int hash; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) + const int family = target->family; +#else + const int family = par->family; #endif - struct iphdr _iph, *iph; struct ipt_netflow_tuple tuple; struct ipt_netflow *nf; __u8 tcp_flags; struct netflow_aggr_n *aggr_n; struct netflow_aggr_p *aggr_p; __u8 s_mask, d_mask; - unsigned int hash; - - iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); //iph = ip_hdr(skb); - - if (iph == NULL) { + unsigned int ptr; + int fragment; + size_t pkt_len; + int options = 0; + int tcpoptions = 0; + + iph = skb_header_pointer(skb, 0, (likely(family == AF_INET))? sizeof(_iph.ip) : sizeof(_iph.ip6), &_iph); + if (unlikely(iph == NULL)) { NETFLOW_STAT_INC(truncated); NETFLOW_STAT_INC(pkt_drop); return IPT_CONTINUE; } - tuple.s_addr = iph->saddr; - tuple.d_addr = iph->daddr; + tuple.l3proto = family; tuple.s_port = 0; tuple.d_port = 0; #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) @@ -1220,30 +2454,118 @@ static unsigned int netflow_target( #else tuple.i_ifc = par->in? par->in->ifindex : -1; #endif - tuple.protocol = iph->protocol; - tuple.tos = iph->tos; tcp_flags = 0; /* Cisco sometimes have TCP ACK for non TCP packets, don't get it */ s_mask = 0; d_mask = 0; - if (iph->frag_off & htons(IP_OFFSET)) + if (likely(family == AF_INET)) { + tuple.src = (union nf_inet_addr){ .ip = iph->ip.saddr }; + tuple.dst = (union nf_inet_addr){ .ip = iph->ip.daddr }; + tuple.tos = iph->ip.tos; + tuple.protocol = iph->ip.protocol; + fragment = unlikely(iph->ip.frag_off & htons(IP_OFFSET)); + ptr = iph->ip.ihl * 4; + pkt_len = ntohs(iph->ip.tot_len); + +#define IPHDR_MAXSIZE (4 * 15) + if (unlikely(iph->ip.ihl * 4 > sizeof(struct iphdr))) { + u_int8_t _opt[IPHDR_MAXSIZE - sizeof(struct iphdr)]; + const u_int8_t *op; + unsigned int optsize = iph->ip.ihl * 4 - sizeof(struct iphdr); + + op = skb_header_pointer(skb, sizeof(struct iphdr), optsize, _opt); + if (likely(op)) + options = ip4_options(op, optsize); + } + } else { + __u8 currenthdr; + + tuple.src.in6 = iph->ip6.saddr; + tuple.dst.in6 = iph->ip6.daddr; + tuple.tos = iph->ip6.priority; + fragment = 0; + ptr = sizeof(struct ipv6hdr); + pkt_len = ntohs(iph->ip6.payload_len) + sizeof(struct ipv6hdr); + + currenthdr = iph->ip6.nexthdr; + while (currenthdr != NEXTHDR_NONE && ipv6_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + unsigned int hdrlen = 0; + + options |= observed_hdrs(currenthdr); + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + /* We have src/dst, so must account something. 
*/ + tuple.protocol = currenthdr; + fragment = 3; + goto do_protocols; + } + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + tuple.protocol = currenthdr; + fragment = 2; + goto do_protocols; + } + fragment = 1; +#define FRA0 SetXBit(4) /* Fragment header - first fragment */ +#define FRA1 SetXBit(6) /* Fragmentation header - not first fragment */ + options |= (ntohs(fh->frag_off) & 0xFFF8)? FRA1 : FRA0; + hdrlen = 8; + break; + } + case IPPROTO_AH: { + struct ip_auth_hdr _hdr, *hp; + + if (likely(hp = skb_header_pointer(skb, ptr, 8, &_hdr))) { + tuple.s_port = hp->spi >> 16; + tuple.d_port = hp->spi; + } + hdrlen = (hp->hdrlen + 2) << 2; + break; + } + default: + hdrlen = ipv6_optlen(hp); + } + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + tuple.protocol = currenthdr; + options |= observed_hdrs(currenthdr); + } + +do_protocols: + if (fragment) { + /* if conntrack is enabled it should defrag on pre-routing and local-out */ NETFLOW_STAT_INC(frags); - else { + } else { switch (tuple.protocol) { case IPPROTO_TCP: { struct tcphdr _hdr, *hp; - if ((hp = skb_header_pointer(skb, iph->ihl * 4, 14, &_hdr))) { + if (likely(hp = skb_header_pointer(skb, ptr, 14, &_hdr))) { tuple.s_port = hp->source; tuple.d_port = hp->dest; tcp_flags = (u_int8_t)(ntohl(tcp_flag_word(hp)) >> 16); + + if (unlikely(hp->doff * 4 > sizeof(struct tcphdr))) + tcpoptions = tcp_options(skb, ptr, hp); } break; } - case IPPROTO_UDP: { + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_SCTP: { struct udphdr _hdr, *hp; - if ((hp = skb_header_pointer(skb, iph->ihl * 4, 4, &_hdr))) { + if (likely(hp = skb_header_pointer(skb, ptr, 4, &_hdr))) { tuple.s_port = hp->source; tuple.d_port = hp->dest; } @@ -1252,72 +2574,111 @@ static unsigned int netflow_target( case IPPROTO_ICMP: { struct icmphdr _hdr, *hp; - if ((hp = skb_header_pointer(skb, iph->ihl * 4, 2, &_hdr))) - tuple.d_port = (hp->type << 8) | hp->code; + if (likely(family == AF_INET && + (hp = skb_header_pointer(skb, ptr, 2, &_hdr)))) + tuple.d_port = htons((hp->type << 8) | hp->code); break; } + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h, *ic; + + if (likely(family == AF_INET6 && + (ic = skb_header_pointer(skb, ptr, 2, &_icmp6h)))) + tuple.d_port = htons((ic->icmp6_type << 8) | ic->icmp6_code); + break; + } case IPPROTO_IGMP: { - struct igmphdr *_hdr, *hp; + struct igmphdr _hdr, *hp; - if ((hp = skb_header_pointer(skb, iph->ihl * 4, 1, &_hdr))) + if (likely(hp = skb_header_pointer(skb, ptr, 1, &_hdr))) tuple.d_port = hp->type; } break; + case IPPROTO_AH: { /* IPSEC */ + struct ip_auth_hdr _hdr, *hp; + + if (likely(family == AF_INET && /* For IPv6 it's parsed above. 
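A side note on the AH handling here and the AH/ESP cases further below: the 32-bit IPsec SPI is smuggled through the tuple's two 16-bit port fields and recombined by the IPSecSPI case of add_ipv4_field(). A byte-order-simplified userspace sketch follows; the module actually shifts network-order values, and the spi value here is arbitrary.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t spi = 0xdeadbeef;
            uint16_t s_port = spi >> 16;     /* high half, as in the AH case */
            uint16_t d_port = spi & 0xffff;  /* low half */

            /* Recombination done for the IPSecSPI template element. */
            uint32_t exported = ((uint32_t)s_port << 16) | d_port;

            printf("spi %#x -> ports %#x/%#x -> exported %#x\n",
                   spi, s_port, d_port, exported);
            return 0;
    }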
*/ + (hp = skb_header_pointer(skb, ptr, 8, &_hdr)))) { + tuple.s_port = hp->spi >> 16; + tuple.d_port = hp->spi; + } + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _hdr, *hp; + + if (likely(hp = skb_header_pointer(skb, ptr, 4, &_hdr))) + tuple.s_port = hp->spi >> 16; + tuple.d_port = hp->spi; + } + break; } } /* not fragmented */ /* aggregate networks */ read_lock_bh(&aggr_lock); - list_for_each_entry(aggr_n, &aggr_n_list, list) - if ((ntohl(tuple.s_addr) & aggr_n->mask) == aggr_n->addr) { - tuple.s_addr &= htonl(aggr_n->aggr_mask); - s_mask = aggr_n->prefix; - break; - } - list_for_each_entry(aggr_n, &aggr_n_list, list) - if ((ntohl(tuple.d_addr) & aggr_n->mask) == aggr_n->addr) { - tuple.d_addr &= htonl(aggr_n->aggr_mask); - d_mask = aggr_n->prefix; - break; - } + if (family == AF_INET) { + list_for_each_entry(aggr_n, &aggr_n_list, list) + if (unlikely((ntohl(tuple.src.ip) & aggr_n->mask) == aggr_n->addr)) { + tuple.src.ip &= htonl(aggr_n->aggr_mask); + s_mask = aggr_n->prefix; + atomic_inc(&aggr_n->usage); + break; + } + list_for_each_entry(aggr_n, &aggr_n_list, list) + if (unlikely((ntohl(tuple.dst.ip) & aggr_n->mask) == aggr_n->addr)) { + tuple.dst.ip &= htonl(aggr_n->aggr_mask); + d_mask = aggr_n->prefix; + atomic_inc(&aggr_n->usage); + break; + } + } - /* aggregate ports */ - list_for_each_entry(aggr_p, &aggr_p_list, list) - if (ntohs(tuple.s_port) >= aggr_p->port1 && - ntohs(tuple.s_port) <= aggr_p->port2) { - tuple.s_port = htons(aggr_p->aggr_port); - break; - } + if (tuple.protocol == IPPROTO_TCP || + tuple.protocol == IPPROTO_UDP || + tuple.protocol == IPPROTO_SCTP || + tuple.protocol == IPPROTO_UDPLITE) { + /* aggregate ports */ + list_for_each_entry(aggr_p, &aggr_p_list, list) + if (unlikely(ntohs(tuple.s_port) >= aggr_p->port1 && + ntohs(tuple.s_port) <= aggr_p->port2)) { + tuple.s_port = htons(aggr_p->aggr_port); + atomic_inc(&aggr_p->usage); + break; + } - list_for_each_entry(aggr_p, &aggr_p_list, list) - if (ntohs(tuple.d_port) >= aggr_p->port1 && - ntohs(tuple.d_port) <= aggr_p->port2) { - tuple.d_port = htons(aggr_p->aggr_port); - break; - } + list_for_each_entry(aggr_p, &aggr_p_list, list) + if (unlikely(ntohs(tuple.d_port) >= aggr_p->port1 && + ntohs(tuple.d_port) <= aggr_p->port2)) { + tuple.d_port = htons(aggr_p->aggr_port); + atomic_inc(&aggr_p->usage); + break; + } + } read_unlock_bh(&aggr_lock); hash = hash_netflow(&tuple); - spin_lock_bh(&ipt_netflow_lock); + read_lock_bh(&htable_rwlock); + spin_lock(&htable_locks[hash & LOCK_COUNT_MASK]); /* record */ nf = ipt_netflow_find(&tuple, hash); - if (!nf) { - if (maxflows > 0 && atomic_read(&ipt_netflow_count) >= maxflows) { + if (unlikely(!nf)) { + struct rtable *rt; + + if (unlikely(maxflows > 0 && atomic_read(&ipt_netflow_count) >= maxflows)) { /* This is DOS attack prevention */ NETFLOW_STAT_INC(maxflows_err); NETFLOW_STAT_INC(pkt_drop); - NETFLOW_STAT_ADD(traf_drop, ntohs(iph->tot_len)); - spin_unlock_bh(&ipt_netflow_lock); - return IPT_CONTINUE; + NETFLOW_STAT_ADD(traf_drop, pkt_len); + goto unlock_return; } nf = init_netflow(&tuple, skb, hash); - if (!nf || IS_ERR(nf)) { + if (unlikely(!nf || IS_ERR(nf))) { NETFLOW_STAT_INC(alloc_err); NETFLOW_STAT_INC(pkt_drop); - NETFLOW_STAT_ADD(traf_drop, ntohs(iph->tot_len)); - spin_unlock_bh(&ipt_netflow_lock); - return IPT_CONTINUE; + NETFLOW_STAT_ADD(traf_drop, pkt_len); + goto unlock_return; } nf->ts_first = jiffies; @@ -1330,31 +2691,68 @@ static unsigned int netflow_target( nf->s_mask = s_mask; nf->d_mask = d_mask; - if (debug > 2) - printk(KERN_INFO 
"ipt_netflow: new (%u) %hd:%hd SRC=%u.%u.%u.%u:%u DST=%u.%u.%u.%u:%u\n", +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) + rt = (struct rtable *)skb->dst; +#else /* since 2.6.26 */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,31) + rt = skb->rtable; +#else /* since 2.6.31 */ + rt = skb_rtable(skb); +#endif +#endif + if (likely(family == AF_INET)) { + if (rt) + nf->nh.ip = rt->rt_gateway; + } else { + if (rt) + nf->nh.in6 = ((struct rt6_info *)rt)->rt6i_gateway; + nf->flow_label = (iph->ip6.flow_lbl[0] << 16) | + (iph->ip6.flow_lbl[1] << 8) | (iph->ip6.flow_lbl[2]); + } +#if 0 + if (unlikely(debug > 2)) + printk(KERN_INFO "ipt_NETFLOW: new (%u) %hd:%hd SRC=%u.%u.%u.%u:%u DST=%u.%u.%u.%u:%u\n", atomic_read(&ipt_netflow_count), tuple.i_ifc, nf->o_ifc, NIPQUAD(tuple.s_addr), ntohs(tuple.s_port), NIPQUAD(tuple.d_addr), ntohs(tuple.d_port)); +#endif } else { /* ipt_netflow_list is sorted by access time: * most recently accessed flows are at head, old flows remain at tail * this function bubble up flow to the head */ + spin_lock(&hlist_lock); list_move(&nf->list, &ipt_netflow_list); + spin_unlock(&hlist_lock); } +#ifdef CONFIG_NF_CONNTRACK_MARK + { + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + ct = nf_ct_get(skb, &ctinfo); + if (ct) + nf->mark = ct->mark; + } +#endif + nf->nr_packets++; - nf->nr_bytes += ntohs(iph->tot_len); + nf->nr_bytes += pkt_len; nf->ts_last = jiffies; nf->tcp_flags |= tcp_flags; + nf->options |= options; + if (tuple.protocol == IPPROTO_TCP) + nf->tcpoptions |= tcpoptions; NETFLOW_STAT_INC(pkt_total); - NETFLOW_STAT_ADD(traf_total, ntohs(iph->tot_len)); + NETFLOW_STAT_ADD(traf_total, pkt_len); - if (active_needs_export(nf, active_timeout * HZ)) { + if (likely(active_needs_export(nf, active_timeout * HZ))) { /* ok, if this active flow to be exported * bubble it to the tail */ + spin_lock(&hlist_lock); list_move_tail(&nf->list, &ipt_netflow_list); + spin_unlock(&hlist_lock); /* Blog: I thought about forcing timer to wake up sooner if we have * enough exportable flows, but in fact this doesn't have much sense, @@ -1363,35 +2761,194 @@ static unsigned int netflow_target( * limited size). But yes, this is disputable. */ } - spin_unlock_bh(&ipt_netflow_lock); +unlock_return: + spin_unlock(&htable_locks[hash & LOCK_COUNT_MASK]); + read_unlock_bh(&htable_rwlock); return IPT_CONTINUE; } -static struct ipt_target ipt_netflow_reg = { - .name = "NETFLOW", - .target = netflow_target, - .family = AF_INET, -#ifndef RAW_PROMISC_HACK - .table = "filter", -#ifndef NF_IP_LOCAL_IN /* 2.6.25 */ - .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT), -#else - .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT), -#endif /* NF_IP_LOCAL_IN */ +#ifdef CONFIG_NF_NAT_NEEDED +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) + /* Below 2.6.31 we don't need to handle callback chain manually. */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) +#define NET_STRUCT struct net *net +#define NET_ARG net, +#define nf_conntrack_event_cb net->ct.nf_conntrack_event_cb #else - .table = "raw", -#ifndef NF_IP_LOCAL_IN - .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING), +#define NET_STRUCT void +#define NET_ARG +#endif +static int set_notifier_cb(NET_STRUCT) +{ + struct nf_ct_event_notifier *notifier; + + notifier = rcu_dereference(nf_conntrack_event_cb); + if (notifier == NULL) { + /* Polite mode. 
*/ + nf_conntrack_register_notifier(NET_ARG &ctnl_notifier); + } else if (notifier != &ctnl_notifier) { + if (!saved_event_cb) + saved_event_cb = notifier; + else if (saved_event_cb != notifier) + printk(KERN_ERR "natevents_net_init: %p != %p (report error.)\n", + saved_event_cb, notifier); + rcu_assign_pointer(nf_conntrack_event_cb, &ctnl_notifier); + } else + printk(KERN_ERR "ipt_NETFLOW: natevents already enabled.\n"); + return 0; +} +static void unset_notifier_cb(NET_STRUCT) +{ + struct nf_ct_event_notifier *notifier; + + notifier = rcu_dereference(nf_conntrack_event_cb); + if (notifier == &ctnl_notifier) { + if (saved_event_cb == NULL) + nf_conntrack_unregister_notifier(NET_ARG &ctnl_notifier); + else + rcu_assign_pointer(nf_conntrack_event_cb, saved_event_cb); + } else + printk(KERN_ERR "ipt_NETFLOW: natevents already disabled.\n"); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) +#undef nf_conntrack_event_cb +static struct pernet_operations natevents_net_ops = { + .init = set_notifier_cb, + .exit = unset_notifier_cb +}; +#endif +#endif /* since 2.6.31 */ + +static DEFINE_MUTEX(events_lock); +/* Both functions may be called multiple times. */ +static void register_ct_events(void) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) +#define NETLINK_M "nf_conntrack_netlink" + struct module *netlink_m; + static int referenced = 0; +#endif + + printk(KERN_INFO "ipt_NETFLOW: enable natevents.\n"); + mutex_lock(&events_lock); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) + /* Pre-load netlink module who will be first notifier + * user, and then hijack nf_conntrack_event_cb from it. */ + if ( +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) + !rcu_dereference(nf_conntrack_event_cb) || +#endif + !(netlink_m = find_module(NETLINK_M))) { + printk("Loading " NETLINK_M "\n"); + request_module(NETLINK_M); + } + /* Reference netlink module to prevent it's unsafe unload before us. */ + if (!referenced && (netlink_m = find_module(NETLINK_M))) { + referenced++; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35) +#define use_module ref_module +#endif + use_module(THIS_MODULE, netlink_m); + } + + /* Register ct events callback. 
*/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) + register_pernet_subsys(&natevents_net_ops); #else - .hooks = (1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | - (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_PRE_ROUTING), -#endif /* NF_IP_LOCAL_IN */ -#endif /* !RAW_PROMISC_HACK */ - .me = THIS_MODULE + set_notifier_cb(); +#endif +#else /* below v2.6.31 */ + if (!natevents && nf_conntrack_register_notifier(&ctnl_notifier) < 0) + printk(KERN_ERR "Can't register conntrack notifier, natevents disabled.\n"); + else +#endif + natevents = 1; + mutex_unlock(&events_lock); +} + +static void unregister_ct_events(void) +{ + printk(KERN_INFO "ipt_NETFLOW: disable natevents.\n"); + mutex_lock(&events_lock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0) + unregister_pernet_subsys(&natevents_net_ops); +#else /* < v3.2 */ + unset_notifier_cb(); +#endif /* v3.2 */ + rcu_assign_pointer(saved_event_cb, NULL); +#else /* < v2.6.31 */ + nf_conntrack_unregister_notifier(&ctnl_notifier); +#endif + natevents = 0; + mutex_unlock(&events_lock); +} +#endif /* CONFIG_NF_NAT_NEEDED */ + +#ifndef NF_IP_LOCAL_IN /* 2.6.25 */ +#define NF_IP_PRE_ROUTING NF_INET_PRE_ROUTING +#define NF_IP_LOCAL_IN NF_INET_LOCAL_IN +#define NF_IP_FORWARD NF_INET_FORWARD +#define NF_IP_LOCAL_OUT NF_INET_LOCAL_OUT +#define NF_IP_POST_ROUTING NF_INET_POST_ROUTING +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +/* net/netfilter/x_tables.c */ +static void xt_unregister_targets(struct xt_target *target, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; i++) + xt_unregister_target(&target[i]); +} +static int xt_register_targets(struct xt_target *target, unsigned int n) +{ + unsigned int i; + + int err = 0; + for (i = 0; i < n; i++) + if ((err = xt_register_target(&target[i]))) + goto err; + return err; +err: + if (i > 0) + xt_unregister_targets(target, i); + return err; +} +#endif + +static struct ipt_target ipt_netflow_reg[] __read_mostly = { + { + .name = "NETFLOW", + .target = netflow_target, + .checkentry = netflow_target_check, + .family = AF_INET, + .hooks = + (1 << NF_IP_PRE_ROUTING) | + (1 << NF_IP_LOCAL_IN) | + (1 << NF_IP_FORWARD) | + (1 << NF_IP_LOCAL_OUT) | + (1 << NF_IP_POST_ROUTING), + .me = THIS_MODULE + }, + { + .name = "NETFLOW", + .target = netflow_target, + .checkentry = netflow_target_check, + .family = AF_INET6, + .hooks = + (1 << NF_IP_PRE_ROUTING) | + (1 << NF_IP_LOCAL_IN) | + (1 << NF_IP_FORWARD) | + (1 << NF_IP_LOCAL_OUT) | + (1 << NF_IP_POST_ROUTING), + .me = THIS_MODULE + }, }; static int __init ipt_netflow_init(void) @@ -1399,11 +2956,16 @@ static int __init ipt_netflow_init(void) #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_stat; #endif + printk(KERN_INFO "ipt_NETFLOW version %s, srcversion %s\n", + IPT_NETFLOW_VERSION, THIS_MODULE->srcversion); get_random_bytes(&ipt_netflow_hash_rnd, 4); /* determine hash size (idea from nf_conntrack_core.c) */ if (!hashsize) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) +#define num_physpages totalram_pages +#endif hashsize = (((num_physpages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) @@ -1411,8 +2973,7 @@ static int __init ipt_netflow_init(void) } if (hashsize < 16) hashsize = 16; - printk(KERN_INFO "ipt_netflow version %s (%u buckets)\n", - IPT_NETFLOW_VERSION, hashsize); + printk(KERN_INFO "ipt_NETFLOW: hashsize %u\n", hashsize); ipt_netflow_hash_size = hashsize; ipt_netflow_hash = alloc_hashtable(ipt_netflow_hash_size); @@ -1434,12 
+2995,18 @@ static int __init ipt_netflow_init(void) } #ifdef CONFIG_PROC_FS +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) proc_stat = create_proc_entry("ipt_netflow", S_IRUGO, INIT_NET(proc_net_stat)); +#else + proc_stat = proc_create("ipt_netflow", S_IRUGO, INIT_NET(proc_net_stat), &nf_seq_fops); +#endif if (!proc_stat) { printk(KERN_ERR "Unable to create /proc/net/stat/ipt_netflow entry\n"); goto err_free_netflow_slab; } +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) proc_stat->proc_fops = &nf_seq_fops; +#endif #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,30) proc_stat->owner = THIS_MODULE; #endif @@ -1480,21 +3047,28 @@ static int __init ipt_netflow_init(void) } add_aggregation(aggregation); - __start_scan_worker(); + netflow_switch_version(protocol); + _schedule_scan_worker(0); setup_timer(&rate_timer, rate_timer_calc, 0); mod_timer(&rate_timer, jiffies + (HZ * SAMPLERATE)); - if (xt_register_target(&ipt_netflow_reg)) + peakflows_at = jiffies; + if (xt_register_targets(ipt_netflow_reg, ARRAY_SIZE(ipt_netflow_reg))) goto err_stop_timer; - peakflows_at = jiffies; +#ifdef CONFIG_NF_NAT_NEEDED + if (natevents) + register_ct_events(); +#endif - printk(KERN_INFO "ipt_netflow loaded.\n"); + printk(KERN_INFO "ipt_NETFLOW is loaded.\n"); return 0; err_stop_timer: - __stop_scan_worker(); + _unschedule_scan_worker(); + netflow_scan_and_export(AND_FLUSH); del_timer_sync(&rate_timer); + free_templates(); destination_removeall(); aggregation_remove(&aggr_n_list); aggregation_remove(&aggr_p_list); @@ -1506,17 +3080,18 @@ err_free_proc_stat: #ifdef CONFIG_PROC_FS remove_proc_entry("ipt_netflow", INIT_NET(proc_net_stat)); err_free_netflow_slab: -#endif +#endif kmem_cache_destroy(ipt_netflow_cachep); err_free_hash: vfree(ipt_netflow_hash); err: + printk(KERN_INFO "ipt_NETFLOW is not loaded.\n"); return -ENOMEM; } static void __exit ipt_netflow_fini(void) { - printk(KERN_INFO "ipt_netflow unloading..\n"); + printk(KERN_INFO "ipt_NETFLOW unloading..\n"); #ifdef CONFIG_SYSCTL unregister_sysctl_table(netflow_sysctl_header); @@ -1524,14 +3099,18 @@ static void __exit ipt_netflow_fini(void) #ifdef CONFIG_PROC_FS remove_proc_entry("ipt_netflow", INIT_NET(proc_net_stat)); #endif - - xt_unregister_target(&ipt_netflow_reg); - __stop_scan_worker(); - netflow_scan_and_export(1); + xt_unregister_targets(ipt_netflow_reg, ARRAY_SIZE(ipt_netflow_reg)); +#ifdef CONFIG_NF_NAT_NEEDED + if (natevents) + unregister_ct_events(); +#endif + _unschedule_scan_worker(); + netflow_scan_and_export(AND_FLUSH); del_timer_sync(&rate_timer); synchronize_sched(); + free_templates(); destination_removeall(); aggregation_remove(&aggr_n_list); aggregation_remove(&aggr_p_list); @@ -1539,7 +3118,7 @@ static void __exit ipt_netflow_fini(void) kmem_cache_destroy(ipt_netflow_cachep); vfree(ipt_netflow_hash); - printk(KERN_INFO "ipt_netflow unloaded.\n"); + printk(KERN_INFO "ipt_NETFLOW unloaded.\n"); } module_init(ipt_netflow_init); diff --git a/ipt_NETFLOW.h b/ipt_NETFLOW.h index 4a7b645..749f985 100644 --- a/ipt_NETFLOW.h +++ b/ipt_NETFLOW.h @@ -35,8 +35,8 @@ struct netflow5_record { __be16 o_ifc; __be32 nr_packets; __be32 nr_octets; - __be32 ts_first; - __be32 ts_last; + __be32 first_ms; + __be32 last_ms; __be16 s_port; __be16 d_port; __u8 reserved; @@ -54,9 +54,9 @@ struct netflow5_record { struct netflow5_pdu { __be16 version; __be16 nr_records; - __be32 ts_uptime; - __be32 ts_usecs; - __be32 ts_unsecs; + __be32 ts_uptime; /* ms */ + __be32 ts_usecs; /* s */ + __be32 ts_unsecs; /* ns */ __be32 seq; __u8 eng_type; __u8 eng_id; @@ 
-65,42 +65,185 @@ struct netflow5_pdu { } __attribute__ ((packed)); #define NETFLOW5_HEADER_SIZE (sizeof(struct netflow5_pdu) - NETFLOW5_RECORDS_MAX * sizeof(struct netflow5_record)) +/* NetFlow v9 RFC http://www.ietf.org/rfc/rfc3954.txt */ +enum { + IN_BYTES = 1, + IN_PKTS = 2, + PROTOCOL = 4, + TOS = 5, + TCP_FLAGS = 6, + L4_SRC_PORT = 7, + IPV4_SRC_ADDR = 8, + SRC_MASK = 9, + INPUT_SNMP = 10, + L4_DST_PORT = 11, + IPV4_DST_ADDR = 12, + DST_MASK = 13, + OUTPUT_SNMP = 14, + IPV4_NEXT_HOP = 15, + //SRC_AS = 16, + //DST_AS = 17, + //BGP_IPV4_NEXT_HOP = 18, + //MUL_DST_PKTS = 19, + //MUL_DST_BYTES = 20, + LAST_SWITCHED = 21, + FIRST_SWITCHED = 22, + IPV6_SRC_ADDR = 27, + IPV6_DST_ADDR = 28, + IPV6_FLOW_LABEL = 31, + ICMP_TYPE = 32, + MUL_IGMP_TYPE = 33, + //TOTAL_BYTES_EXP = 40, + //TOTAL_PKTS_EXP = 41, + //TOTAL_FLOWS_EXP = 42, + IPV6_NEXT_HOP = 62, + IPV6_OPTION_HEADERS = 64, + commonPropertiesId = 137, /* for MARK */ + ipv4Options = 208, + tcpOptions = 209, + postNATSourceIPv4Address = 225, + postNATDestinationIPv4Address = 226, + postNAPTSourceTransportPort = 227, + postNAPTDestinationTransportPort = 228, + natEvent = 230, + postNATSourceIPv6Address = 281, + postNATDestinationIPv6Address = 282, + IPSecSPI = 295, + observationTimeMilliseconds = 323, + observationTimeMicroseconds = 324, + observationTimeNanoseconds = 325, +}; + +enum { + FLOWSET_TEMPLATE = 0, + FLOWSET_OPTIONS = 1, + IPFIX_TEMPLATE = 2, + IPFIX_OPTIONS = 3, + FLOWSET_DATA_FIRST = 256, +}; + +struct flowset_template { + __be16 flowset_id; + __be16 length; + __be16 template_id; + __be16 field_count; +} __attribute__ ((packed)); + +struct flowset_data { + __be16 flowset_id; + __be16 length; +} __attribute__ ((packed)); + +/* NetFlow v9 packet. */ +struct netflow9_pdu { + __be16 version; + __be16 nr_records; + __be32 sys_uptime_ms; + __be32 export_time_s; + __be32 seq; + __be32 source_id; /* Exporter Observation Domain */ + __u8 data[1400]; +} __attribute__ ((packed)); + +/* IPFIX packet. */ +struct ipfix_pdu { + __be16 version; + __be16 length; + __be32 export_time_s; + __be32 seq; + __be32 odomain_id; /* Observation Domain ID */ + __u8 data[1400]; +} __attribute__ ((packed)); + +/* Maximum bytes flow can have, after it's reached flow will become + * not searchable and will be exported soon. 
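The size comments in the tuple and flow structures just below can be verified with a line of arithmetic; a throwaway sanity check assuming the stated field widths (two 16-byte addresses, two 2-byte ports, a 2-byte ifindex, three 1-byte fields, and two 8-byte hlist pointers on a 64-bit kernel):

    #include <stdio.h>

    int main(void)
    {
            int tuple = 16 + 16 + 2 + 2 + 2 + 1 + 1 + 1; /* = 41 */
            int hot = 8 + 8 + tuple;                     /* hlist + tuple = 57 */

            printf("tuple %d bytes, hot part %d (fits a 64-byte cache line)\n",
                   tuple, hot);
            return 0;
    }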
+/* Maximum bytes a flow can have; after it is reached the flow becomes
+ * not searchable and will be exported soon. */
+#define FLOW_FULL_WATERMARK 0xffefffff
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+union nf_inet_addr {
+	__be32		ip;
+	__be32		ip6[4];
+	struct in_addr	in;
+	struct in6_addr	in6;
+};
+#endif
+
 /* hashed data which identify unique flow */
+/* 16+16 + 2+2 + 2+1+1+1 = 41 */
 struct ipt_netflow_tuple {
-	__be32	s_addr;	// Network byte order
-	__be32	d_addr;	// -"-
-	__be16	s_port;	// -"-
+	union nf_inet_addr src;
+	union nf_inet_addr dst;
+	__be16	s_port; // Network byte order
 	__be16	d_port; // -"-
-	__be16	i_ifc;	// Local byte order
+	__u16	i_ifc;	// Host byte order
 	__u8	protocol;
 	__u8	tos;
+	__u8	l3proto;
 };
 
-/* tuple size is rounded to u32s */
-#define NETFLOW_TUPLE_SIZE (sizeof(struct ipt_netflow_tuple) / 4)
-
-/* maximum bytes flow can have, after it reached flow become not searchable and will be exported soon */
-#define FLOW_FULL_WATERMARK 0xffefffff
-
-/* flow entry */
+/* hlist[2] + tuple[]: 8+8 + 41 = 57 (less than usual cache line, 64) */
 struct ipt_netflow {
 	struct hlist_node hlist; // hashtable search chain
-	struct list_head list; // all flows chain
 
 	/* unique per flow data (hashed, NETFLOW_TUPLE_SIZE) */
 	struct ipt_netflow_tuple tuple;
 
 	/* volatile data */
-	__be16	o_ifc;
+	union nf_inet_addr nh;
+	__u16	o_ifc;
 	__u8	s_mask;
 	__u8	d_mask;
+	__u8	tcp_flags; /* `OR' of all tcp flags */
 
 	/* flow statistics */
 	u_int32_t nr_packets;
 	u_int32_t nr_bytes;
-	unsigned long ts_first;
-	unsigned long ts_last;
-	__u8	tcp_flags; /* `OR' of all tcp flags */
+	union {
+		struct {
+			unsigned long first;
+			unsigned long last;
+		} ts;
+		ktime_t	ts_obs;
+	} _ts_un;
+#define ts_first _ts_un.ts.first
+#define ts_last  _ts_un.ts.last
+#define ts_obs   _ts_un.ts_obs
+	u_int32_t flow_label; /* IPv6 */
+	u_int32_t options; /* IPv4(16) & IPv6(32) Options */
+	u_int32_t tcpoptions;
+#ifdef CONFIG_NF_CONNTRACK_MARK
+	u_int32_t mark; /* Exported as commonPropertiesId */
+#endif
+#ifdef CONFIG_NF_NAT_NEEDED
+	__be32	s_as;
+	__be32	d_as;
+	struct nat_event *nat;
+#endif
+	struct list_head list; // all flows chain
+	spinlock_t *lock;
 };
 
+#ifdef CONFIG_NF_NAT_NEEDED
+enum {
+	NAT_CREATE, NAT_DESTROY, NAT_POOLEXHAUSTED
+};
+struct nat_event {
+	struct list_head list;
+	struct {
+		__be32	s_addr;
+		__be32	d_addr;
+		__be16	s_port;
+		__be16	d_port;
+	} pre, post;
+	ktime_t	ts_ktime;
+	unsigned long ts_jiffies;
+	__u8	protocol;
+	__u8	nat_event;
+};
+#define IS_DUMMY_FLOW(nf) (nf->nat)
+#else
+#define IS_DUMMY_FLOW(nf) 0
+#endif
+
 static inline int ipt_netflow_tuple_equal(const struct ipt_netflow_tuple *t1,
 					  const struct ipt_netflow_tuple *t2)
@@ -115,11 +258,13 @@ struct ipt_netflow_sock {
 	unsigned short port;
 	atomic_t wmem_peak;	// sk_wmem_alloc peak value
 	atomic_t err_full;	// socket filled error
+	atomic_t err_connect;	// connect errors
 	atomic_t err_other;	// other socket errors
 };
 
 struct netflow_aggr_n {
 	struct list_head list;
+	atomic_t usage;
 	__u32 mask;
 	__u32 addr;
 	__u32 aggr_mask;
@@ -128,6 +273,7 @@ struct netflow_aggr_n {
 
 struct netflow_aggr_p {
 	struct list_head list;
+	atomic_t usage;
 	__u16 port1;
 	__u16 port2;
 	__u16 aggr_port;
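The nat_event structure above is the unit of the NAT translation-event (NEL) export. As a rough sketch of how such an event could be filled from a conntrack entry: a hypothetical helper, for illustration only; the patch's actual capture path goes through register_ct_events() and may differ in detail:

	#include <net/netfilter/nf_conntrack.h>
	#include <linux/slab.h>
	#include <linux/jiffies.h>
	#include <linux/ktime.h>

	static struct nat_event *nat_event_capture(const struct nf_conn *ct, __u8 what)
	{
		const struct nf_conntrack_tuple *orig =
		    &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		const struct nf_conntrack_tuple *repl =
		    &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
		struct nat_event *nel = kzalloc(sizeof(*nel), GFP_ATOMIC);

		if (!nel)
			return NULL;
		nel->pre.s_addr = orig->src.u3.ip;
		nel->pre.d_addr = orig->dst.u3.ip;
		nel->pre.s_port = orig->src.u.all;
		nel->pre.d_port = orig->dst.u.all;
		/* the reply direction is inverted: post-NAT source is reply dst */
		nel->post.s_addr = repl->dst.u3.ip;
		nel->post.d_addr = repl->src.u3.ip;
		nel->post.s_port = repl->dst.u.all;
		nel->post.d_port = repl->src.u.all;
		nel->protocol	 = orig->dst.protonum;
		nel->nat_event	 = what;	/* NAT_CREATE, NAT_DESTROY, ... */
		nel->ts_ktime	 = ktime_get_real();
		nel->ts_jiffies	 = jiffies;
		return nel;
	}

Flows carrying such an event are the "dummy" flows tested by IS_DUMMY_FLOW() above.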
diff --git a/libipt_NETFLOW.c b/libipt_NETFLOW.c
index d85b6d9..a0f9e5d 100644
--- a/libipt_NETFLOW.c
+++ b/libipt_NETFLOW.c
@@ -58,24 +58,24 @@
 #define _IPT_IP struct ipt_ip
 #endif
 
+#ifndef IPTABLES_VERSION
+#define IPTABLES_VERSION XTABLES_VERSION
+#endif
+
 static struct option opts[] = {
-	{0}
+	{ 0 }
 };
 
 static void help(void)
 {
-	printf( "NETFLOW target\n");
+	printf("NETFLOW target\n");
 }
 
-//static int parse(int c, char **argv, int invert, unsigned int *flags,
-//		const _IPT_ENTRY *entry,
-//		struct ipt_entry_target **target)
 static int parse(int c, char **argv, int invert, unsigned int *flags,
 	const _IPT_ENTRY *entry,
 	struct ipt_entry_target **targetinfo)
 {
-
 	return 1;
 }
 
@@ -95,16 +95,9 @@ static void print(const _IPT_IP *ip,
 }
 
 static struct iptables_target netflow = {
-#ifdef MOD140
-	.family		= AF_INET,
-#endif
 	.next		= NULL,
 	.name		= "NETFLOW",
-#ifdef XTABLES_VERSION
-	.version	= XTABLES_VERSION,
-#else
 	.version	= IPTABLES_VERSION,
-#endif
 	.size		= IPT_ALIGN(0),
 	.userspacesize	= IPT_ALIGN(0),
 	.help		= &help,
diff --git a/murmur3.h b/murmur3.h
new file mode 100644
index 0000000..57a6006
--- /dev/null
+++ b/murmur3.h
@@ -0,0 +1,42 @@
+/* MurmurHash3, based on https://code.google.com/p/smhasher of Austin Appleby. */
+
+static __always_inline uint32_t rotl32(const uint32_t x, const int8_t r)
+{
+	return (x << r) | (x >> (32 - r));
+}
+
+static __always_inline uint32_t fmix32(register uint32_t h)
+{
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+static inline uint32_t murmur3(const void *key, const uint32_t len, const uint32_t seed)
+{
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t *blocks;
+	const uint8_t *tail;
+	register uint32_t h1 = seed;
+	uint32_t k1 = 0;
+	uint32_t i;
+
+	blocks = (const uint32_t *)key;
+	for (i = len / 4; i; --i) {
+		h1 ^= rotl32(*blocks++ * c1, 15) * c2;
+		h1 = rotl32(h1, 13) * 5 + 0xe6546b64;
+	}
+	tail = (const uint8_t *)blocks;
+	switch (len & 3) {
+	case 3: k1 ^= tail[2] << 16;	/* fall through */
+	case 2: k1 ^= tail[1] << 8;	/* fall through */
+	case 1: k1 ^= tail[0];
+		h1 ^= rotl32(k1 * c1, 15) * c2;
+	}
+	return fmix32(h1 ^ len);
+}
+
diff --git a/raw_promisc_debian_squeeze6.patch b/raw_promisc_debian_squeeze6.patch
new file mode 100644
index 0000000..69d0d35
--- /dev/null
+++ b/raw_promisc_debian_squeeze6.patch
@@ -0,0 +1,37 @@
+
+   Short manual and patch for Debian Squeeze
+   suggested by Pavel Odintsov:
+
+On Thu, Dec 27, 2012 at 07:46:30PM +0400, Pavel Odintsov wrote:
+>
+> A short manual for patching a Debian Squeeze kernel with the promisc patch:
+>
+> cd /usr/src
+> apt-get install -y dpkg-dev
+> apt-get build-dep linux-image-2.6.32-5-amd64
+> cd linux-2.6-2.6.32/
+> apt-get source linux-image-2.6.32-5-amd64
+>
+> wget .... /root/raw_promisc_debian_squeeze6.patch
+> patch -p1 < raw_promisc_debian_squeeze6.patch
+> Apply the Debian patches:
+> debian/rules source
+>
+> Start the build:
+> debian/rules binary
+>
+
+diff -rupN linux-2.6-2.6.32/net/ipv4/ip_input.c linux-2.6-2.6.32_promisc_raw//net/ipv4/ip_input.c
+--- linux-2.6-2.6.32/net/ipv4/ip_input.c	2009-12-03 04:51:21.000000000 +0100
++++ linux-2.6-2.6.32_promisc_raw//net/ipv4/ip_input.c	2012-06-25 19:13:49.000000000 +0200
+@@ -383,8 +383,8 @@ int ip_rcv(struct sk_buff *skb, struct n
+ 	/* When the interface is in promisc. mode, drop all the crap
+ 	 * that it receives, do not try to analyse it.
+ 	 */
+-	if (skb->pkt_type == PACKET_OTHERHOST)
+-		goto drop;
++	//if (skb->pkt_type == PACKET_OTHERHOST)
++	//	goto drop;
+
+
+ 	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
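A usage note for murmur3.h above: the module hashes the flow tuple as raw bytes to pick a hash-table bucket. A minimal sketch of that lookup step; htable_size and hash_seed stand in for whatever the module actually names its table-size and random-seed variables:

	/* Illustration only: bucket index for a flow tuple.  Because the
	 * tuple is hashed as raw bytes, any padding in it must be zeroed
	 * before hashing, or equal flows may hash differently. */
	static inline unsigned int hash_netflow(const struct ipt_netflow_tuple *tuple,
						unsigned int htable_size,
						uint32_t hash_seed)
	{
		return murmur3(tuple, sizeof(*tuple), hash_seed) % htable_size;
	}

Seeding the hash with a boot-time random value makes bucket placement unpredictable to outside senders, which hampers algorithmic-complexity attacks on the flow table.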