diff --git a/.gitignore b/.gitignore index e37a6903c0e..c96ceed0556 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,4 @@ _debian odp-netlink.h OvsDpInterface.h /.vagrant/ +testsuite.tmp.orig diff --git a/.travis/build.sh b/.travis/build.sh index b6b701c3adb..c7796e13b19 100755 --- a/.travis/build.sh +++ b/.travis/build.sh @@ -38,9 +38,9 @@ function install_kernel() function install_dpdk() { if [ -n "$DPDK_GIT" ]; then - git clone $DPDK_GIT dpdk-$1 - cd dpdk-$1 - git checkout v$1 + git clone $DPDK_GIT dpdk-$1 + cd dpdk-$1 + git checkout v$1 else wget http://www.dpdk.org/browse/dpdk/snapshot/dpdk-$1.tar.gz tar xzvf dpdk-$1.tar.gz > /dev/null @@ -49,6 +49,7 @@ function install_dpdk() find ./ -type f | xargs sed -i 's/max-inline-insns-single=100/max-inline-insns-single=400/' sed -ri 's,(CONFIG_RTE_BUILD_COMBINE_LIBS=).*,\1y,' config/common_linuxapp sed -ri 's,(CONFIG_RTE_LIBRTE_VHOST=).*,\1y,' config/common_linuxapp + sed -ri 's,(CONFIG_RTE_LIBRTE_VHOST_USER=).*,\1n,' config/common_linuxapp sed -ri '/CONFIG_RTE_LIBNAME/a CONFIG_RTE_BUILD_FPIC=y' config/common_linuxapp sed -ri '/EXECENV_CFLAGS = -pthread -fPIC/{s/$/\nelse ifeq ($(CONFIG_RTE_BUILD_FPIC),y)/;s/$/\nEXECENV_CFLAGS = -pthread -fPIC/}' mk/exec-env/linuxapp/rte.vars.mk make config CC=gcc T=x86_64-native-linuxapp-gcc @@ -68,13 +69,15 @@ fi if [ "$DPDK" ]; then if [ -z "$DPDK_VER" ]; then - DPDK_VER="1.8.0" + DPDK_VER="2.0.0" fi install_dpdk $DPDK_VER - # Disregard bad function casts until DPDK is fixed - CFLAGS="$CFLAGS -Wno-error=bad-function-cast -Wno-error=cast-align" - EXTRA_OPTS+="--with-dpdk=./dpdk-$DPDK_VER/build" -elif [ $CC != "clang" ]; then + if [ "$CC" = "clang" ]; then + # Disregard cast alignment errors until DPDK is fixed + EXTRA_OPTS="$EXTRA_OPTS -Wno-cast-align" + fi + EXTRA_OPTS="$EXTRA_OPTS --with-dpdk=./dpdk-$DPDK_VER/build" +elif [ "$CC" != "clang" ]; then # DPDK headers currently trigger sparse errors SPARSE_FLAGS="$SPARSE_FLAGS -Wsparse-error" fi @@ -82,11 +85,11 @@ fi configure_ovs 
$EXTRA_OPTS $* # Only build datapath if we are testing kernel w/o running testsuite -if [ $KERNEL ] && [ ! "$TESTSUITE" ] && [ ! "$DPDK" ]; then +if [ "$KERNEL" ] && [ ! "$TESTSUITE" ] && [ ! "$DPDK" ]; then cd datapath fi -if [ $CC = "clang" ]; then +if [ "$CC" = "clang" ]; then make CFLAGS="$CFLAGS -Wno-error=unused-command-line-argument" elif [[ $BUILD_ENV =~ "-m32" ]]; then # Disable sparse for 32bit builds on 64bit machine @@ -95,7 +98,7 @@ else make CFLAGS="$CFLAGS $BUILD_ENV $SPARSE_FLAGS" C=1 fi -if [ $TESTSUITE ] && [ $CC != "clang" ]; then +if [ "$TESTSUITE" ] && [ "$CC" != "clang" ]; then if ! make distcheck; then # testsuite.log is necessary for debugging. cat */_build/tests/testsuite.log diff --git a/.travis/prepare.sh b/.travis/prepare.sh index cda80c22b66..0fd6c28784a 100755 --- a/.travis/prepare.sh +++ b/.travis/prepare.sh @@ -1,11 +1,11 @@ #!/bin/bash -sudo apt-get update -qq -sudo apt-get install -qq libssl-dev llvm-dev -sudo apt-get install -qq gcc-multilib +sudo -E apt-get update -qq +sudo -E apt-get install -qq libssl-dev llvm-dev +sudo -E apt-get install -qq gcc-multilib if [ "$DPDK" ]; then - sudo apt-get install -qq libfuse-dev + sudo -E apt-get install -qq libfuse-dev fi git clone git://git.kernel.org/pub/scm/devel/sparse/chrisl/sparse.git -cd sparse && make && sudo make install PREFIX=/usr && cd .. +cd sparse && make && sudo -E make install PREFIX=/usr && cd .. 
diff --git a/AUTHORS b/AUTHORS index cff99e6d93d..5ad3c1aae6d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -24,6 +24,7 @@ Arun Sharma arun.sharma@calsoftinc.com Aryan TaheriMonfared aryan.taherimonfared@uis.no Ashwin Swaminathan ashwinds@arista.com Ben Pfaff blp@nicira.com +Billy O'Mahony billy.o.mahony@intel.com Brian Kruger bkruger+ovsdev@gmail.com Bruce Davie bsd@nicira.com Bryan Phillippe bp@toroki.com @@ -171,6 +172,7 @@ Thomas Lacroix thomas.lacroix@citrix.com Todd Deshane deshantm@gmail.com Tom Everman teverman@google.com Tsvi Slonim tsvi@toroki.com +Tuan Nguyen tuan.nguyen@veriksystems.com Tyler Coumbes coumbes@gmail.com Valient Gough vgough@pobox.com Vivien Bernet-Rollande vbr@soprive.net @@ -229,6 +231,7 @@ Chunhe Li lichunhe@huawei.com Ciara Loftus ciara.loftus@intel.com Daniel Badea daniel.badea@windriver.com Dave Walker DaveWalker@ubuntu.com +David Evans davidjoshuaevans@gmail.com David Palma palma@onesource.pt Derek Cormier derek.cormier@lab.ntt.co.jp Dhaval Badiani dbadiani@vmware.com @@ -255,6 +258,7 @@ Henrik Amren henrik@nicira.com Hiroshi Tanaka htanaka@nicira.com Hiroshi Miyata miyahiro.dazu@gmail.com Hyojoon Kim joonk@gatech.edu +Ian Stokes ian.stokes@intel.com Igor Ganichev iganichev@nicira.com Igor Sever igor@xorops.com Jacob Cherkas jcherkas@nicira.com @@ -360,6 +364,7 @@ likunyun kunyunli@hotmail.com rahim entezari rahim.entezari@gmail.com 冯全树(Crab) fqs888@126.com 胡靖飞 hujingfei914@msn.com +张伟 zhangwqh@126.com Thanks to all Open vSwitch contributors. If you are not listed above but believe that you should be, please write to dev@openvswitch.org. diff --git a/FAQ.md b/FAQ.md index 21d4e7a6701..d2286788941 100644 --- a/FAQ.md +++ b/FAQ.md @@ -173,14 +173,23 @@ A: The following table lists the Linux kernel versions against which the What should I do? -A: If there is a newer version of Open vSwitch, consider building that - one, because it may support the kernel that you are building - against. (To find out, consult the table in the previous answer.) 
+A: You have the following options: - Otherwise, use the Linux kernel module supplied with the kernel - that you are using. All versions of Open vSwitch userspace are - compatible with all versions of the Open vSwitch kernel module, so - this will also work. See also the following question. + - Use the Linux kernel module supplied with the kernel that you are + using. (See also the following FAQ.) + + - If there is a newer released version of Open vSwitch, consider + building that one, because it may support the kernel that you are + building against. (To find out, consult the table in the + previous FAQ.) + + - The Open vSwitch "master" branch may support the kernel that you + are using, so consider building the kernel module from "master". + + All versions of Open vSwitch userspace are compatible with all + versions of the Open vSwitch kernel module, so you do not have to + use the kernel module from one source along with the userspace + programs from the same source. ### Q: What features are not available in the Open vSwitch kernel datapath that ships as part of the upstream Linux kernel? @@ -209,6 +218,7 @@ A: Support for tunnels was added to the upstream Linux kernel module | VXLAN | 3.12 | Geneve | 3.18 | LISP | +| STT | If you are using a version of the kernel that is older than the one listed above, it is still possible to use that tunnel protocol. However, @@ -350,6 +360,25 @@ A: Yes. How you configure it depends on what you mean by "promiscuous SPAN, see "How do I configure a port as a SPAN port, that is, enable mirroring of all traffic to that port?" +### Q: How do I configure a DPDK port as an access port? + +A: Firstly, you must have a DPDK-enabled version of Open vSwitch. + + If your version is DPDK-enabled it will support the --dpdk + argument on the command line and will display lines with + "EAL:..." during startup when --dpdk is supplied. + + Secondly, when adding a DPDK port, unlike a system port, the + type for the interface must be specified. 
For example; + + ovs-vsctl add-br br0 + ovs-vsctl add-port br0 dpdk0 -- set Interface dpdk0 type=dpdk + + Finally, it is required that DPDK port names begin with 'dpdk'. + + See [INSTALL.DPDK.md] for more information on enabling and using DPDK with + Open vSwitch. + ### Q: How do I configure a VLAN as an RSPAN VLAN, that is, enable mirroring of all traffic to that VLAN? A: The following commands configure br0 with eth0 as a trunk port and @@ -639,6 +668,9 @@ A: More than likely, you've looped your network. Probably, eth0 and documentation on the Port table in ovs-vswitchd.conf.db(5) for all the details. + Configuration for DPDK-enabled interfaces is slightly less + straightforward: see [INSTALL.DPDK.md]. + - Perhaps you don't actually need eth0 and eth1 to be on the same bridge. For example, if you simply want to be able to connect each of them to virtual machines, then you can put @@ -823,6 +855,92 @@ A: Open vSwitch wasn't able to create the port. Check the ovs-vsctl will immediately report when there is an issue creating a port. +### Q: I created a tap device tap0, configured an IP address on it, and + added it to a bridge, like this: + + tunctl -t tap0 + ifconfig tap0 192.168.0.123 + ovs-vsctl add-br br0 + ovs-vsctl add-port br0 tap0 + + I expected that I could then use this IP address to contact other + hosts on the network, but it doesn't work. Why not? + +A: The short answer is that this is a misuse of a "tap" device. Use + an "internal" device implemented by Open vSwitch, which works + differently and is designed for this use. 
To solve this problem + with an internal device, instead run: + + ovs-vsctl add-br br0 + ovs-vsctl add-port br0 int0 -- set Interface int0 type=internal + ifconfig int0 192.168.0.123 + + Even more simply, you can take advantage of the internal port that + every bridge has under the name of the bridge: + + ovs-vsctl add-br br0 + ifconfig br0 192.168.0.123 + + In more detail, a "tap" device is an interface between the Linux + (or *BSD) network stack and a user program that opens it as a + socket. When the "tap" device transmits a packet, it appears in + the socket opened by the userspace program. Conversely, when the + userspace program writes to the "tap" socket, the kernel TCP/IP + stack processes the packet as if it had been received by the "tap" + device. + + Consider the configuration above. Given this configuration, if you + "ping" an IP address in the 192.168.0.x subnet, the Linux kernel + routing stack will transmit an ARP on the tap0 device. Open + vSwitch userspace treats "tap" devices just like any other network + device; that is, it doesn't open them as "tap" sockets. That means + that the ARP packet will simply get dropped. + + You might wonder why the Open vSwitch kernel module doesn't + intercept the ARP packet and bridge it. After all, Open vSwitch + intercepts packets on other devices. The answer is that Open + vSwitch only intercepts *received* packets, but this is a packet + being transmitted. The same thing happens for all other types of + network devices, except for Open vSwitch "internal" ports. If you, + for example, add a physical Ethernet port to an OVS bridge, + configure an IP address on a physical Ethernet port, and then issue + a "ping" to an address in that subnet, the same thing happens: an + ARP gets transmitted on the physical Ethernet port and Open vSwitch + never sees it. (You should not do that, as documented at the + beginning of this section.) 
+ + It can make sense to add a "tap" device to an Open vSwitch bridge, + if some userspace program (other than Open vSwitch) has opened the + tap socket. This is the case, for example, if the "tap" device was + created by KVM (or QEMU) to simulate a virtual NIC. In such a + case, when OVS bridges a packet to the "tap" device, the kernel + forwards that packet to KVM in userspace, which passes it along to + the VM, and in the other direction, when the VM sends a packet, KVM + writes it to the "tap" socket, which causes OVS to receive it and + bridge it to the other OVS ports. Please note that in such a case + no IP address is configured on the "tap" device (there is normally + an IP address configured in the virtual NIC inside the VM, but this + is not visible to the host Linux kernel or to Open vSwitch). + + There is one special case in which Open vSwitch does directly read + and write "tap" sockets. This is an implementation detail of the + Open vSwitch userspace switch, which implements its "internal" + ports as Linux (or *BSD) "tap" sockets. In such a userspace + switch, OVS receives packets sent on the "tap" device used to + implement an "internal" port by reading the associated "tap" + socket, and bridges them to the rest of the switch. In the other + direction, OVS transmits packets bridged to the "internal" port by + writing them to the "tap" socket, causing them to be processed by + the kernel TCP/IP stack as if they had been received on the "tap" + device. Users should not need to be concerned with this + implementation detail. + + Open vSwitch has a network device type called "tap". This is + intended only for implementing "internal" ports in the OVS + userspace switch and should not be used otherwise. In particular, + users should not configure KVM "tap" devices as type "tap" (use + type "system", the default, instead). 
+ Quality of Service (QoS) ------------------------ @@ -1801,3 +1919,4 @@ http://openvswitch.org/ [WHY-OVS.md]:WHY-OVS.md [INSTALL.md]:INSTALL.md [OPENFLOW-1.1+.md]:OPENFLOW-1.1+.md +[INSTALL.DPDK.md]:INSTALL.DPDK.md diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md index 60889d01d51..a05367a9729 100644 --- a/INSTALL.DPDK.md +++ b/INSTALL.DPDK.md @@ -16,13 +16,13 @@ OVS needs a system with 1GB hugepages support. Building and Installing: ------------------------ -Required DPDK 1.8.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu) +Required DPDK 2.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu) 1. Configure build & install DPDK: 1. Set `$DPDK_DIR` ``` - export DPDK_DIR=/usr/src/dpdk-1.8.0 + export DPDK_DIR=/usr/src/dpdk-2.0 cd $DPDK_DIR ``` @@ -32,9 +32,12 @@ Required DPDK 1.8.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu) `CONFIG_RTE_BUILD_COMBINE_LIBS=y` Update `config/common_linuxapp` so that DPDK is built with vhost - libraries: + libraries; currently, OVS only supports vhost-cuse, so DPDK vhost-user + libraries should be explicitly turned off (they are enabled by default + in DPDK 2.0). `CONFIG_RTE_LIBRTE_VHOST=y` + `CONFIG_RTE_LIBRTE_VHOST_USER=n` Then run `make install` to build and install the library. For default install without IVSHMEM: @@ -65,10 +68,12 @@ Required DPDK 1.8.0, `fuse`, `fuse-devel` (`libfuse-dev` on Debian/Ubuntu) ``` cd $(OVS_DIR)/openvswitch ./boot.sh - ./configure --with-dpdk=$DPDK_BUILD + ./configure --with-dpdk=$DPDK_BUILD [CFLAGS="-g -O2 -Wno-cast-align"] make ``` + Note: 'clang' users may specify the '-Wno-cast-align' flag to suppress DPDK cast-align warnings. + To have better performance one can enable aggressive compiler optimizations and use the special instructions(popcnt, crc32) that may not be available on all machines. Instead of typing `make`, type: @@ -95,7 +100,7 @@ Using the DPDK with ovs-vswitchd: 1. insert uio.ko: `modprobe uio` 2. insert igb_uio.ko: `insmod $DPDK_BUILD/kmod/igb_uio.ko` 3. 
Bind network device to igb_uio: - `$DPDK_DIR/tools/dpdk_nic_bind.py --bind=igb_uio eth1` + `$DPDK_DIR/tools/dpdk_nic_bind.py --bind=igb_uio eth1` * VFIO: @@ -106,7 +111,7 @@ Using the DPDK with ovs-vswitchd: 2. Set correct permissions on vfio device: `sudo /usr/bin/chmod a+x /dev/vfio` and: `sudo /usr/bin/chmod 0666 /dev/vfio/*` 3. Bind network device to vfio-pci: - `$DPDK_DIR/tools/dpdk_nic_bind.py --bind=vfio-pci eth1` + `$DPDK_DIR/tools/dpdk_nic_bind.py --bind=vfio-pci eth1` 3. Mount the hugetable filsystem @@ -182,6 +187,14 @@ Using the DPDK with ovs-vswitchd: polls dpdk device in continuous loop. Therefore CPU utilization for that thread is always 100%. + Note: creating bonds of DPDK interfaces is slightly different to creating + bonds of system interfaces. For DPDK, the interface type must be explicitly + set, for example: + + ``` + ovs-vsctl add-bond br0 dpdkbond dpdk0 dpdk1 -- set Interface dpdk0 type=dpdk -- set Interface dpdk1 type=dpdk + ``` + 7. Add test flows Test flow script across NICs (assuming ovs in /usr/src/ovs): @@ -250,6 +263,14 @@ Using the DPDK with ovs-vswitchd: Note, core 0 is always reserved from non-pmd threads and should never be set in the cpu mask. + To understand where most of the time is spent and whether the caches are + effective, these commands can be used: + + ``` + ovs-appctl dpif-netdev/pmd-stats-clear #To reset statistics + ovs-appctl dpif-netdev/pmd-stats-show + ``` + DPDK Rings : ------------ @@ -547,10 +568,16 @@ Restrictions: - DPDK-vHost support works with 1G huge pages. ivshmem: - - The shared memory is currently restricted to the use of a 1GB - huge pages. - - All huge pages are shared amongst the host, clients, virtual - machines etc. + - If you run Open vSwitch with smaller page sizes (e.g. 2MB), you may be + unable to share any rings or mempools with a virtual machine. 
+ This is because the current implementation of ivshmem works by sharing + a single 1GB huge page from the host operating system to any guest + operating system through the Qemu ivshmem device. When using smaller + page sizes, multiple pages may be required to hold the ring descriptors + and buffer pools. The Qemu ivshmem device does not allow you to share + multiple file descriptors to the guest operating system. However, if you + want to share dpdkr rings with other processes on the host, you can do + this with smaller page sizes. Bug Reporting: -------------- diff --git a/INSTALL.Windows.md b/INSTALL.Windows.md index 00db4a37400..78af0a173b1 100644 --- a/INSTALL.Windows.md +++ b/INSTALL.Windows.md @@ -162,7 +162,12 @@ Steps to install the module 02> Run ./install.cmd to insert the new one. For this to work you will have to turn on TESTSIGNING boot option or 'Disable Driver Signature Enforcement' -during boot. +during boot. The following commands can be used: + % bcdedit /set LOADOPTIONS DISABLE_INTEGRITY_CHECKS + % bcdedit /set TESTSIGNING ON + % bcdedit /set nointegritychecks ON + +Note: you may have to restart the machine for the settings to take effect. 03> In the Virtual Switch Manager configuration you can enable the Open vSwitch Extension on an existing switch or create a new switch. If you are using an @@ -294,9 +299,9 @@ as a special name to refer to that adapter. Port br-pif Interface br-pif type: internal - Bridge br-int Port "external.1" Interface "external.1" + Bridge br-int Port br-int Interface br-int type: internal @@ -430,7 +435,7 @@ MSYS bash or Windows command prompt. * Create the ovsdb-server service and start it. 
- % sc create ovsdb-server binpath="C:/Shares/openvswitch/ovsdb/ovsdb-server.exe C:/openvswitch/etc/openvswitch/conf.db -vfile:info --log-file --pidfile --remote=punix:db.sock --service --service-monitor" + % sc create ovsdb-server binpath="C:/openvswitch/usr/sbin/ovsdb-server.exe C:/openvswitch/etc/openvswitch/conf.db -vfile:info --log-file --pidfile --remote=punix:db.sock --service --service-monitor" One of the common issues with creating a Windows service is with mungled paths. You can make sure that the correct path has been registered with @@ -452,7 +457,7 @@ MSYS bash or Windows command prompt. * Create the ovs-vswitchd service and start it. - % sc create ovs-vswitchd binpath="C:/Shares/openvswitch/vswitchd/ovs-vswitchd.exe --pidfile -vfile:info --log-file --service --service-monitor" + % sc create ovs-vswitchd binpath="C:/openvswitch/usr/sbin/ovs-vswitchd.exe --pidfile -vfile:info --log-file --service --service-monitor" % sc start ovs-vswitchd diff --git a/INSTALL.XenServer.md b/INSTALL.XenServer.md index 072a9f2ffff..4be974b5316 100644 --- a/INSTALL.XenServer.md +++ b/INSTALL.XenServer.md @@ -81,26 +81,16 @@ where: the crashdump kernel flavor. Commonly, one would specify "xen" here. For XenServer 6.5 or above, the kernel version naming no longer contains -KERNEL_FLAVOR. Correspondingly, the the final "rpmbuild" step changes to: +KERNEL_FLAVOR. In fact, only providing the `uname -r` output is enough. +So, the final "rpmbuild" step changes to: ``` - VERSION= - KERNEL_NAME= - KERNEL_VERSION= - XEN_VERSION= + KERNEL_UNAME=<`uname -r` output> rpmbuild \ - -D "openvswitch_version $VERSION" \ - -D "kernel_name $KERNEL_NAME" \ - -D "kernel_version $KERNEL_VERSION" \ - -D "xen_version $XEN_VERSION" \ + -D "kernel_uname $KERNEL_UNAME" \ -bb xenserver/openvswitch-xen.spec ``` -where: - - `` is the output of `uname -r`. Since XenServer 6.5, the - directory name in 'lib/modules/' becomes a shortened expression of the - KERNEL_VERSION. 
Installing Open vSwitch for XenServer ------------------------------------- diff --git a/INSTALL.md b/INSTALL.md index 81568987d19..9f8c57eb681 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -45,8 +45,6 @@ you will need the following software: - Python 2.x, for x >= 4. - - patch (The utility that is used to patch files). - On Linux, you may choose to compile the kernel module that comes with the Open vSwitch distribution or to use the kernel module built into the Linux kernel (version 3.3 or later). See the [FAQ.md] question diff --git a/NEWS b/NEWS index 87460a71259..882a3814c7c 100644 --- a/NEWS +++ b/NEWS @@ -77,6 +77,10 @@ Post-v2.3.0 numbers. OpenFlow is 6653 and OVSDB is 6640. - Support for DPDK vHost. - Support for outer UDP checksums in Geneve and VXLAN. + - The kernel vports with dependencies are no longer part of the overall + openvswitch.ko but built and loaded automatically as individual kernel + modules (vport-*.ko). + - Support for STT tunneling. v2.3.0 - 14 Aug 2014 diff --git a/acinclude.m4 b/acinclude.m4 index b09f2f25855..e9d0ed96814 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -205,7 +205,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [ CFLAGS="$ovs_save_CFLAGS" LDFLAGS="$ovs_save_LDFLAGS" OVS_LDFLAGS="$OVS_LDFLAGS -L$DPDK_LIB_DIR" - OVS_CFLAGS="$OVS_CFLAGS -I$DPDK_INCLUDE" + OVS_CFLAGS="$OVS_CFLAGS -I$DPDK_INCLUDE -mssse3" # DPDK pmd drivers are not linked unless --whole-archive is used. 
# @@ -292,16 +292,6 @@ AC_DEFUN([OVS_DEFINE], [ echo '#define $1 1' >> datapath/linux/kcompat.h.new ]) -AC_DEFUN([OVS_CHECK_LOG2_H], [ - AC_MSG_CHECKING([for $KSRC/include/linux/log2.h]) - if test -e $KSRC/include/linux/log2.h; then - AC_MSG_RESULT([yes]) - OVS_DEFINE([HAVE_LOG2_H]) - else - AC_MSG_RESULT([no]) - fi -]) - dnl OVS_CHECK_LINUX_COMPAT dnl dnl Runs various Autoconf checks on the Linux 2.6 kernel source in @@ -335,6 +325,8 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [can_checksum_protocol]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_features_t]) OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [pcpu_sw_netstats]) + OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_hookfn.*nf_hook_ops], + [OVS_DEFINE([HAVE_NF_HOOKFN_ARG_OPS])]) OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32]) @@ -442,8 +434,6 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/uapi/linux/netdevice.h], [NET_NAME_UNKNOWN], [OVS_DEFINE([HAVE_NET_NAME_UNKNOWN])]) - OVS_CHECK_LOG2_H - if cmp -s datapath/linux/kcompat.h.new \ datapath/linux/kcompat.h >/dev/null 2>&1; then rm datapath/linux/kcompat.h.new @@ -534,8 +524,16 @@ AC_DEFUN([_OVS_CHECK_CC_OPTION], [dnl dnl clang's GCC-compatible compiler driver does not return a failure dnl exit status even though it complains about options it does not dnl understand. + dnl + dnl Also, check stderr as gcc exits with status 0 for options + dnl rejected at getopt level. + dnl % touch /tmp/a.c + dnl % gcc -g -c -Werror -Qunused-arguments /tmp/a.c; echo $? 
+ dnl gcc: unrecognized option '-Qunused-arguments' + dnl 0 + dnl % CFLAGS="$CFLAGS $WERROR $1" - AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,)], [ovs_cv_name[]=yes], [ovs_cv_name[]=no]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,)], [if test -s conftest.err && grep "unrecognized option" conftest.err; then ovs_cv_name[]=no; else ovs_cv_name[]=yes; fi], [ovs_cv_name[]=no]) CFLAGS="$ovs_save_CFLAGS"]) if test $ovs_cv_name = yes; then m4_if([$2], [], [:], [$2]) diff --git a/build-aux/cccl b/build-aux/cccl index 456292e5ff7..afa0a6b8b73 100644 --- a/build-aux/cccl +++ b/build-aux/cccl @@ -148,6 +148,10 @@ EOF #ignore warnings ;; + -Q*) + #ignore link warnings + ;; + -fno-strict-aliasing*) #ignore aliasing ;; diff --git a/configure.ac b/configure.ac index 6b28a526220..70e3ab82931 100644 --- a/configure.ac +++ b/configure.ac @@ -60,7 +60,7 @@ m4_pattern_forbid([LT_INIT]) dnl Make autoconf fail if libtool is missing. # replacement, but programs using the new version may use APIs not # present in the previous one. In other words, new symbols have been # added and a program linking against the new version may fail with -# “unresolved symbols.” If linking against the old version at runtime: +# "unresolved symbols." If linking against the old version at runtime: # set revision to 0, bump current and age. # # 3. 
Programs may need to be changed, recompiled, relinked in order to use @@ -81,7 +81,7 @@ AC_SUBST([LT_AGE]) AC_SEARCH_LIBS([pow], [m]) AC_SEARCH_LIBS([clock_gettime], [rt]) AC_SEARCH_LIBS([timer_create], [rt]) -AC_SEARCH_LIBS([pthread_sigmask], [pthread]) +AC_SEARCH_LIBS([pthread_create], [pthread]) AC_FUNC_STRERROR_R OVS_CHECK_ESX @@ -111,6 +111,7 @@ OVS_CHECK_PKIDIR OVS_CHECK_RUNDIR OVS_CHECK_DBDIR OVS_CHECK_BACKTRACE +OVS_CHECK_PERF_EVENT OVS_CHECK_VALGRIND OVS_CHECK_SOCKET_LIBS OVS_CHECK_XENSERVER_VERSION @@ -155,6 +156,7 @@ OVS_ENABLE_OPTION([-Wmissing-prototypes]) OVS_ENABLE_OPTION([-Wmissing-field-initializers]) OVS_ENABLE_OPTION([-Wthread-safety]) OVS_ENABLE_OPTION([-fno-strict-aliasing]) +OVS_ENABLE_OPTION([-Qunused-arguments]) OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED]) OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER]) OVS_ENABLE_WERROR diff --git a/datapath-windows/ovsext/Datapath.c b/datapath-windows/ovsext/Datapath.c index fea7d3a7a82..7646f0a9112 100644 --- a/datapath-windows/ovsext/Datapath.c +++ b/datapath-windows/ovsext/Datapath.c @@ -78,10 +78,11 @@ typedef struct _NETLINK_CMD { /* A netlink family is a group of commands. */ typedef struct _NETLINK_FAMILY { CHAR *name; - UINT32 id; + UINT16 id; UINT8 version; - UINT8 pad; + UINT8 pad1; UINT16 maxAttr; + UINT16 pad2; NETLINK_CMD *cmds; /* Array of netlink commands and handlers. 
*/ UINT16 opsCount; } NETLINK_FAMILY, *PNETLINK_FAMILY; @@ -143,12 +144,12 @@ NETLINK_CMD nlControlFamilyCmdOps[] = { }, { .cmd = OVS_CTRL_CMD_EVENT_NOTIFY, .handler = OvsReadEventCmdHandler, - .supportedDevOp = OVS_READ_EVENT_DEV_OP, + .supportedDevOp = OVS_READ_DEV_OP, .validateDpIndex = FALSE, }, { .cmd = OVS_CTRL_CMD_READ_NOTIFY, .handler = OvsReadPacketCmdHandler, - .supportedDevOp = OVS_READ_PACKET_DEV_OP, + .supportedDevOp = OVS_READ_DEV_OP, .validateDpIndex = FALSE, } }; @@ -799,12 +800,17 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject, inputBufferLen = 0; ovsMsg = &ovsMsgReadOp; - ovsMsg->nlMsg.nlmsgType = OVS_WIN_NL_CTRL_FAMILY_ID; + RtlZeroMemory(ovsMsg, sizeof *ovsMsg); + ovsMsg->nlMsg.nlmsgLen = sizeof *ovsMsg; + ovsMsg->nlMsg.nlmsgType = nlControlFamilyOps.id; ovsMsg->nlMsg.nlmsgPid = instance->pid; + /* An "artificial" command so we can use NL family function table*/ ovsMsg->genlMsg.cmd = (code == OVS_IOCTL_READ_EVENT) ? OVS_CTRL_CMD_EVENT_NOTIFY : OVS_CTRL_CMD_READ_NOTIFY; + ovsMsg->genlMsg.version = nlControlFamilyOps.version; + devOp = OVS_READ_DEV_OP; break; @@ -895,8 +901,8 @@ OvsDeviceControl(PDEVICE_OBJECT deviceObject, } /* - * For read operation, the netlink command has already been validated - * previously. + * For read operation, avoid duplicate validation since 'ovsMsg' is either + * "artificial" or was copied from a previously validated 'ovsMsg'. */ if (devOp != OVS_READ_DEV_OP) { status = ValidateNetlinkCmd(devOp, instance, ovsMsg, nlFamilyOps); @@ -958,14 +964,11 @@ ValidateNetlinkCmd(UINT32 devOp, /* Validate the DP for commands that require a DP. */ if (nlFamilyOps->cmds[i].validateDpIndex == TRUE) { - OvsAcquireCtrlLock(); if (ovsMsg->ovsHdr.dp_ifindex != (INT)gOvsSwitchContext->dpNo) { status = STATUS_INVALID_PARAMETER; - OvsReleaseCtrlLock(); goto done; } - OvsReleaseCtrlLock(); } /* Validate the PID. 
*/ @@ -985,7 +988,9 @@ ValidateNetlinkCmd(UINT32 devOp, /* * -------------------------------------------------------------------------- - * Function to invoke the netlink command handler. + * Function to invoke the netlink command handler. The function also stores + * the return value of the handler function to construct a 'NL_ERROR' message, + * and in turn returns success to the caller. * -------------------------------------------------------------------------- */ static NTSTATUS @@ -1007,6 +1012,43 @@ InvokeNetlinkCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, } } + /* + * Netlink socket semantics dictate that the return value of the netlink + * function should be an error ONLY under fatal conditions. If the message + * made it all the way to the handler function, it is not a fatal condition. + * Absorb the error returned by the handler function into a 'struct + * NL_ERROR' and populate the 'output buffer' to return to userspace. + * + * This behavior is obviously applicable only to netlink commands that + * specify an 'output buffer'. For other commands, we return the error as + * is. + * + * 'STATUS_PENDING' is a special return value and userspace is equipped to + * handle it. 
+ */ + if (status != STATUS_SUCCESS && status != STATUS_PENDING) { + if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP && *replyLen == 0) { + NL_ERROR nlError = NlMapStatusToNlErr(status); + POVS_MESSAGE msgIn = (POVS_MESSAGE)usrParamsCtx->inputBuffer; + POVS_MESSAGE_ERROR msgError = (POVS_MESSAGE_ERROR) + usrParamsCtx->outputBuffer; + + ASSERT(msgError); + NlBuildErrorMsg(msgIn, msgError, nlError); + *replyLen = msgError->nlMsg.nlmsgLen; + } + + if (*replyLen != 0) { + status = STATUS_SUCCESS; + } + } + +#ifdef DBG + if (usrParamsCtx->devOp != OVS_WRITE_DEV_OP) { + ASSERT(status == STATUS_PENDING || *replyLen != 0 || status == STATUS_SUCCESS); + } +#endif + return status; } @@ -1045,7 +1087,6 @@ OvsGetPidHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, * -------------------------------------------------------------------------- * Utility function to fill up information about the datapath in a reply to * userspace. - * Assumes that 'gOvsCtrlLock' lock is acquired. * -------------------------------------------------------------------------- */ static NTSTATUS @@ -1245,9 +1286,7 @@ HandleGetDpDump(POVS_USER_PARAMS_CONTEXT usrParamsCtx, NlBufInit(&nlBuf, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength); - OvsAcquireCtrlLock(); status = OvsDpFillInfo(gOvsSwitchContext, msgIn, &nlBuf); - OvsReleaseCtrlLock(); if (status != STATUS_SUCCESS) { *replyLen = 0; @@ -1334,11 +1373,9 @@ HandleDpTransactionCommon(POVS_USER_PARAMS_CONTEXT usrParamsCtx, NlBufInit(&nlBuf, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength); - OvsAcquireCtrlLock(); if (dpAttrs[OVS_DP_ATTR_NAME] != NULL) { if (!OvsCompareString(NlAttrGet(dpAttrs[OVS_DP_ATTR_NAME]), OVS_SYSTEM_DP_NAME)) { - OvsReleaseCtrlLock(); /* Creation of new datapaths is not supported. 
*/ if (usrParamsCtx->ovsMsg->genlMsg.cmd == OVS_DP_CMD_SET) { @@ -1350,19 +1387,16 @@ HandleDpTransactionCommon(POVS_USER_PARAMS_CONTEXT usrParamsCtx, goto cleanup; } } else if ((UINT32)msgIn->ovsHdr.dp_ifindex != gOvsSwitchContext->dpNo) { - OvsReleaseCtrlLock(); nlError = NL_ERROR_NODEV; goto cleanup; } if (usrParamsCtx->ovsMsg->genlMsg.cmd == OVS_DP_CMD_NEW) { - OvsReleaseCtrlLock(); nlError = NL_ERROR_EXIST; goto cleanup; } status = OvsDpFillInfo(gOvsSwitchContext, msgIn, &nlBuf); - OvsReleaseCtrlLock(); *replyLen = NlBufSize(&nlBuf); @@ -1444,7 +1478,6 @@ MapIrpOutputBuffer(PIRP irp, * -------------------------------------------------------------------------- * Utility function to fill up information about the state of a port in a reply * to* userspace. - * Assumes that 'gOvsCtrlLock' lock is acquired. * -------------------------------------------------------------------------- */ static NTSTATUS @@ -1548,8 +1581,6 @@ OvsReadEventCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, NlBufInit(&nlBuf, usrParamsCtx->outputBuffer, usrParamsCtx->outputLength); - OvsAcquireCtrlLock(); - /* remove an event entry from the event queue */ status = OvsRemoveEventEntry(usrParamsCtx->ovsInstance, &eventEntry); if (status != STATUS_SUCCESS) { @@ -1565,7 +1596,6 @@ OvsReadEventCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, } cleanup: - OvsReleaseCtrlLock(); return status; } diff --git a/datapath-windows/ovsext/Datapath.h b/datapath-windows/ovsext/Datapath.h index 863afc4caae..dbc9dea58f8 100644 --- a/datapath-windows/ovsext/Datapath.h +++ b/datapath-windows/ovsext/Datapath.h @@ -32,8 +32,6 @@ #define OVS_READ_DEV_OP (1 << 0) #define OVS_WRITE_DEV_OP (1 << 1) #define OVS_TRANSACTION_DEV_OP (1 << 2) -#define OVS_READ_EVENT_DEV_OP (1 << 3) -#define OVS_READ_PACKET_DEV_OP (1 << 4) typedef struct _OVS_DEVICE_EXTENSION { INT numberOpenInstance; diff --git a/datapath-windows/ovsext/Flow.c b/datapath-windows/ovsext/Flow.c index f25fe9a4f49..6fa10a33fd8 100644 --- 
a/datapath-windows/ovsext/Flow.c +++ b/datapath-windows/ovsext/Flow.c @@ -31,7 +31,6 @@ #pragma warning( push ) #pragma warning( disable:4127 ) -extern PNDIS_SPIN_LOCK gOvsCtrlLock; extern POVS_SWITCH_CONTEXT gOvsSwitchContext; extern UINT64 ovsTimeIncrementPerTick; @@ -1995,25 +1994,23 @@ OvsDoDumpFlows(OvsFlowDumpInput *dumpInput, BOOLEAN findNextNonEmpty = FALSE; dpNo = dumpInput->dpNo; - NdisAcquireSpinLock(gOvsCtrlLock); if (gOvsSwitchContext->dpNo != dpNo) { status = STATUS_INVALID_PARAMETER; - goto unlock; + goto exit; } rowIndex = dumpInput->position[0]; if (rowIndex >= OVS_FLOW_TABLE_SIZE) { dumpOutput->n = 0; *replyLen = sizeof(*dumpOutput); - goto unlock; + goto exit; } columnIndex = dumpInput->position[1]; datapath = &gOvsSwitchContext->datapath; ASSERT(datapath); - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - OvsAcquireDatapathRead(datapath, &dpLockState, TRUE); + OvsAcquireDatapathRead(datapath, &dpLockState, FALSE); head = &datapath->flowTable[rowIndex]; node = head->Flink; @@ -2062,8 +2059,7 @@ OvsDoDumpFlows(OvsFlowDumpInput *dumpInput, dp_unlock: OvsReleaseDatapath(datapath, &dpLockState); -unlock: - NdisReleaseSpinLock(gOvsCtrlLock); +exit: return status; } @@ -2124,21 +2120,18 @@ OvsPutFlowIoctl(PVOID inputBuffer, } dpNo = put->dpNo; - NdisAcquireSpinLock(gOvsCtrlLock); if (gOvsSwitchContext->dpNo != dpNo) { status = STATUS_INVALID_PARAMETER; - goto unlock; + goto exit; } datapath = &gOvsSwitchContext->datapath; ASSERT(datapath); - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - OvsAcquireDatapathWrite(datapath, &dpLockState, TRUE); + OvsAcquireDatapathWrite(datapath, &dpLockState, FALSE); status = HandleFlowPut(put, datapath, stats); OvsReleaseDatapath(datapath, &dpLockState); -unlock: - NdisReleaseSpinLock(gOvsCtrlLock); +exit: return status; } @@ -2306,16 +2299,14 @@ OvsGetFlowIoctl(PVOID inputBuffer, } dpNo = getInput->dpNo; - NdisAcquireSpinLock(gOvsCtrlLock); if (gOvsSwitchContext->dpNo != dpNo) { status = STATUS_INVALID_PARAMETER; - 
goto unlock; + goto exit; } datapath = &gOvsSwitchContext->datapath; ASSERT(datapath); - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - OvsAcquireDatapathRead(datapath, &dpLockState, TRUE); + OvsAcquireDatapathRead(datapath, &dpLockState, FALSE); flow = OvsLookupFlow(datapath, &getInput->key, &hash, FALSE); if (!flow) { status = STATUS_INVALID_PARAMETER; @@ -2327,8 +2318,7 @@ OvsGetFlowIoctl(PVOID inputBuffer, dp_unlock: OvsReleaseDatapath(datapath, &dpLockState); -unlock: - NdisReleaseSpinLock(gOvsCtrlLock); +exit: return status; } @@ -2339,21 +2329,18 @@ OvsFlushFlowIoctl(UINT32 dpNo) OVS_DATAPATH *datapath = NULL; LOCK_STATE_EX dpLockState; - NdisAcquireSpinLock(gOvsCtrlLock); if (gOvsSwitchContext->dpNo != dpNo) { status = STATUS_INVALID_PARAMETER; - goto unlock; + goto exit; } datapath = &gOvsSwitchContext->datapath; ASSERT(datapath); - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - OvsAcquireDatapathWrite(datapath, &dpLockState, TRUE); + OvsAcquireDatapathWrite(datapath, &dpLockState, FALSE); DeleteAllFlows(datapath); OvsReleaseDatapath(datapath, &dpLockState); -unlock: - NdisReleaseSpinLock(gOvsCtrlLock); +exit: return status; } diff --git a/datapath-windows/ovsext/Switch.c b/datapath-windows/ovsext/Switch.c index 4f4591fcab9..032153d3992 100644 --- a/datapath-windows/ovsext/Switch.c +++ b/datapath-windows/ovsext/Switch.c @@ -35,12 +35,12 @@ #include "Debug.h" POVS_SWITCH_CONTEXT gOvsSwitchContext; -BOOLEAN gOvsInAttach; +LONG volatile gOvsInAttach; UINT64 ovsTimeIncrementPerTick; -extern PNDIS_SPIN_LOCK gOvsCtrlLock; extern NDIS_HANDLE gOvsExtDriverHandle; extern NDIS_HANDLE gOvsExtDriverObject; +extern PDEVICE_OBJECT gOvsDeviceObject; /* * Reference count used to prevent premature deallocation of the global switch @@ -89,22 +89,18 @@ OvsExtAttach(NDIS_HANDLE ndisFilterHandle, goto cleanup; } - NdisAcquireSpinLock(gOvsCtrlLock); if (gOvsSwitchContext) { - NdisReleaseSpinLock(gOvsCtrlLock); OVS_LOG_TRACE("Exit: Failed to create OVS Switch, only one 
datapath is" "supported, %p.", gOvsSwitchContext); goto cleanup; } - if (gOvsInAttach) { - NdisReleaseSpinLock(gOvsCtrlLock); + + if (InterlockedCompareExchange(&gOvsInAttach, 1, 0)) { /* Just fail the request. */ OVS_LOG_TRACE("Exit: Failed to create OVS Switch, since another attach" "instance is in attach process."); goto cleanup; } - gOvsInAttach = TRUE; - NdisReleaseSpinLock(gOvsCtrlLock); status = OvsInitIpHelper(ndisFilterHandle); if (status != STATUS_SUCCESS) { @@ -121,7 +117,7 @@ OvsExtAttach(NDIS_HANDLE ndisFilterHandle, /* * Register the switch context with NDIS so NDIS can pass it back to the - * Filterxxx callback functions as the 'FilterModuleContext' parameter. + * FilterXXX callback functions as the 'FilterModuleContext' parameter. */ RtlZeroMemory(&ovsExtAttributes, sizeof(NDIS_FILTER_ATTRIBUTES)); ovsExtAttributes.Header.Revision = NDIS_FILTER_ATTRIBUTES_REVISION_1; @@ -208,12 +204,12 @@ OvsCreateSwitch(NDIS_HANDLE ndisFilterHandle, goto create_switch_done; } - status = OvsTunnelFilterInitialize(gOvsExtDriverObject); + status = OvsInitTunnelFilter(gOvsExtDriverObject, gOvsDeviceObject); if (status != NDIS_STATUS_SUCCESS) { OvsUninitSwitchContext(switchContext); - OvsFreeMemoryWithTag(switchContext, OVS_SWITCH_POOL_TAG); goto create_switch_done; } + *switchContextOut = switchContext; create_switch_done: @@ -267,10 +263,9 @@ OvsDeleteSwitch(POVS_SWITCH_CONTEXT switchContext) if (switchContext) { dpNo = switchContext->dpNo; - OvsTunnelFilterUninitialize(gOvsExtDriverObject); + OvsUninitTunnelFilter(gOvsExtDriverObject); OvsClearAllSwitchVports(switchContext); OvsUninitSwitchContext(switchContext); - OvsFreeMemoryWithTag(switchContext, OVS_SWITCH_POOL_TAG); } OVS_LOG_TRACE("Exit: deleted switch %p dpNo: %d", switchContext, dpNo); } @@ -441,7 +436,12 @@ OvsUninitSwitchContext(POVS_SWITCH_CONTEXT switchContext) OvsReleaseSwitchContext(switchContext); } -VOID +/* + * -------------------------------------------------------------------------- + * Frees up 
the contents of and also the switch context. + * -------------------------------------------------------------------------- + */ +static VOID OvsDeleteSwitchContext(POVS_SWITCH_CONTEXT switchContext) { OVS_LOG_TRACE("Enter: Delete switchContext:%p", switchContext); @@ -467,6 +467,8 @@ OvsDeleteSwitchContext(POVS_SWITCH_CONTEXT switchContext) switchContext->pidHashArray = NULL; OvsDeleteFlowTable(&switchContext->datapath); OvsCleanupBufferPool(switchContext); + + OvsFreeMemoryWithTag(switchContext, OVS_SWITCH_POOL_TAG); OVS_LOG_TRACE("Exit: Delete switchContext: %p", switchContext); } @@ -582,7 +584,6 @@ OvsExtNetPnPEvent(NDIS_HANDLE filterModuleContext, switchContext->isActivateFailed = TRUE; } else { ASSERT(switchContext->isActivated == FALSE); - ASSERT(switchActive == TRUE); if (switchContext->isActivated == FALSE && switchActive == TRUE) { status = OvsActivateSwitch(switchContext); OVS_LOG_TRACE("OvsExtNetPnPEvent: activated switch: %p " diff --git a/datapath-windows/ovsext/TunnelFilter.c b/datapath-windows/ovsext/TunnelFilter.c index 4b879c059d1..c2186eb427c 100644 --- a/datapath-windows/ovsext/TunnelFilter.c +++ b/datapath-windows/ovsext/TunnelFilter.c @@ -111,7 +111,8 @@ DEFINE_GUID( PDEVICE_OBJECT gDeviceObject; HANDLE gEngineHandle = NULL; -HANDLE gBfeSubscriptionHandle = NULL; +HANDLE gTunnelProviderBfeHandle = NULL; +HANDLE gTunnelInitBfeHandle = NULL; UINT32 gCalloutIdV4; @@ -448,11 +449,6 @@ OvsTunnelRegisterCallouts(VOID *deviceObject) L"Sub-Layer for use by Datagram-Data OVS callouts"; OvsTunnelSubLayer.flags = 0; OvsTunnelSubLayer.weight = FWP_EMPTY; /* auto-weight */ - /* - * Link all objects to the tunnel provider. When multiple providers are - * installed on a computer, this makes it easy to determine who added what. 
- */ - OvsTunnelSubLayer.providerKey = (GUID*) &OVS_TUNNEL_PROVIDER_KEY; status = FwpmSubLayerAdd(gEngineHandle, &OvsTunnelSubLayer, NULL); if (!NT_SUCCESS(status)) { @@ -547,8 +543,8 @@ OvsTunnelFilterInitialize(PDRIVER_OBJECT driverObject) } VOID NTAPI -OvsBfeStateChangeCallback(PVOID context, - FWPM_SERVICE_STATE bfeState) +OvsTunnelProviderBfeCallback(PVOID context, + FWPM_SERVICE_STATE bfeState) { HANDLE handle = NULL; @@ -564,18 +560,18 @@ OvsBfeStateChangeCallback(PVOID context, } NTSTATUS -OvsSubscribeBfeStateChanges(PVOID deviceObject) +OvsSubscribeTunnelProviderBfeStateChanges(PVOID deviceObject) { NTSTATUS status = STATUS_SUCCESS; - if (!gBfeSubscriptionHandle) { + if (!gTunnelProviderBfeHandle) { status = FwpmBfeStateSubscribeChanges(deviceObject, - OvsBfeStateChangeCallback, + OvsTunnelProviderBfeCallback, NULL, - &gBfeSubscriptionHandle); + &gTunnelProviderBfeHandle); if (!NT_SUCCESS(status)) { OVS_LOG_ERROR( - "Failed to open subscribe BFE state change callback, status: %x.", + "Failed to subscribe BFE tunnel provider callback, status: %x.", status); } } @@ -584,27 +580,28 @@ OvsSubscribeBfeStateChanges(PVOID deviceObject) } VOID -OvsUnsubscribeBfeStateChanges() +OvsUnsubscribeTunnelProviderBfeStateChanges() { NTSTATUS status = STATUS_SUCCESS; - if (gBfeSubscriptionHandle) { - status = FwpmBfeStateUnsubscribeChanges(gBfeSubscriptionHandle); + if (gTunnelProviderBfeHandle) { + status = FwpmBfeStateUnsubscribeChanges(gTunnelProviderBfeHandle); if (!NT_SUCCESS(status)) { OVS_LOG_ERROR( - "Failed to open unsubscribe BFE state change callback, status: %x.", + "Failed to unsubscribe BFE tunnel provider callback, status: %x.", status); } - gBfeSubscriptionHandle = NULL; + gTunnelProviderBfeHandle = NULL; } } -VOID OvsRegisterSystemProvider(PVOID deviceObject) +VOID +OvsRegisterSystemProvider(PVOID deviceObject) { NTSTATUS status = STATUS_SUCCESS; HANDLE handle = NULL; - status = OvsSubscribeBfeStateChanges(deviceObject); + status = 
OvsSubscribeTunnelProviderBfeStateChanges(deviceObject); if (NT_SUCCESS(status)) { if (FWPM_SERVICE_RUNNING == FwpmBfeStateGet()) { OvsTunnelEngineOpen(&handle); @@ -613,7 +610,7 @@ VOID OvsRegisterSystemProvider(PVOID deviceObject) } OvsTunnelEngineClose(&handle); - OvsUnsubscribeBfeStateChanges(); + OvsUnsubscribeTunnelProviderBfeStateChanges(); } } } @@ -628,5 +625,89 @@ VOID OvsUnregisterSystemProvider() } OvsTunnelEngineClose(&handle); - OvsUnsubscribeBfeStateChanges(); + OvsUnsubscribeTunnelProviderBfeStateChanges(); +} + +VOID NTAPI +OvsTunnelInitBfeCallback(PVOID context, + FWPM_SERVICE_STATE bfeState) +{ + NTSTATUS status = STATUS_SUCCESS; + PDRIVER_OBJECT driverObject = (PDRIVER_OBJECT) context; + + if (FWPM_SERVICE_RUNNING == bfeState) { + status = OvsTunnelFilterInitialize(driverObject); + if (!NT_SUCCESS(status)) { + OVS_LOG_ERROR( + "Failed to initialize tunnel filter, status: %x.", + status); + } + } +} + +NTSTATUS +OvsSubscribeTunnelInitBfeStateChanges(PDRIVER_OBJECT driverObject, + PVOID deviceObject) +{ + NTSTATUS status = STATUS_SUCCESS; + + if (!gTunnelInitBfeHandle) { + status = FwpmBfeStateSubscribeChanges(deviceObject, + OvsTunnelInitBfeCallback, + driverObject, + &gTunnelInitBfeHandle); + if (!NT_SUCCESS(status)) { + OVS_LOG_ERROR( + "Failed to subscribe BFE tunnel init callback, status: %x.", + status); + } + } + + return status; +} + +VOID +OvsUnsubscribeTunnelInitBfeStateChanges() +{ + NTSTATUS status = STATUS_SUCCESS; + + if (gTunnelInitBfeHandle) { + status = FwpmBfeStateUnsubscribeChanges(gTunnelInitBfeHandle); + if (!NT_SUCCESS(status)) { + OVS_LOG_ERROR( + "Failed to unsubscribe BFE tunnel init callback, status: %x.", + status); + } + gTunnelInitBfeHandle = NULL; + } +} + +NTSTATUS +OvsInitTunnelFilter(PDRIVER_OBJECT driverObject, PVOID deviceObject) +{ + NTSTATUS status = STATUS_SUCCESS; + + status = OvsSubscribeTunnelInitBfeStateChanges(driverObject, deviceObject); + if (NT_SUCCESS(status)) { + if (FWPM_SERVICE_RUNNING == 
FwpmBfeStateGet()) { + status = OvsTunnelFilterInitialize(driverObject); + if (!NT_SUCCESS(status)) { + /* XXX: We need to decide what actions to take in case of + * failure to initialize tunnel filter. */ + ASSERT(status == NDIS_STATUS_SUCCESS); + OVS_LOG_ERROR( + "Failed to initialize tunnel filter, status: %x.", + status); + } + OvsUnsubscribeTunnelInitBfeStateChanges(); + } + } + + return status; +} + +VOID OvsUninitTunnelFilter(PDRIVER_OBJECT driverObject) +{ + OvsTunnelFilterUninitialize(driverObject); + OvsUnsubscribeTunnelInitBfeStateChanges(); } diff --git a/datapath-windows/ovsext/TunnelIntf.h b/datapath-windows/ovsext/TunnelIntf.h index 728a53f7f67..82a51459ad3 100644 --- a/datapath-windows/ovsext/TunnelIntf.h +++ b/datapath-windows/ovsext/TunnelIntf.h @@ -18,9 +18,9 @@ #define __TUNNEL_INTF_H_ 1 /* Tunnel callout driver load/unload functions */ -NTSTATUS OvsTunnelFilterInitialize(PDRIVER_OBJECT driverObject); +NTSTATUS OvsInitTunnelFilter(PDRIVER_OBJECT driverObject, PVOID deviceObject); -VOID OvsTunnelFilterUninitialize(PDRIVER_OBJECT driverObject); +VOID OvsUninitTunnelFilter(PDRIVER_OBJECT driverObject); VOID OvsRegisterSystemProvider(PVOID deviceObject); diff --git a/datapath-windows/ovsext/User.c b/datapath-windows/ovsext/User.c index 03f0377d992..9f462cf0055 100644 --- a/datapath-windows/ovsext/User.c +++ b/datapath-windows/ovsext/User.c @@ -142,14 +142,12 @@ OvsCleanupPacketQueue(POVS_OPEN_INSTANCE instance) } /* Verify if gOvsSwitchContext exists. 
*/ - OvsAcquireCtrlLock(); if (gOvsSwitchContext) { /* Remove the instance from pidHashArray */ OvsAcquirePidHashLock(); OvsDelPidInstance(gOvsSwitchContext, instance->pid); OvsReleasePidHashLock(); } - OvsReleaseCtrlLock(); } NTSTATUS @@ -163,12 +161,6 @@ OvsSubscribeDpIoctl(PVOID instanceP, if (instance->packetQueue && !join) { /* unsubscribe */ OvsCleanupPacketQueue(instance); - - OvsAcquirePidHashLock(); - /* Remove the instance from pidHashArray */ - OvsDelPidInstance(gOvsSwitchContext, pid); - OvsReleasePidHashLock(); - } else if (instance->packetQueue == NULL && join) { queue = (POVS_USER_PACKET_QUEUE) OvsAllocateMemoryWithTag( sizeof *queue, OVS_USER_POOL_TAG); @@ -447,11 +439,9 @@ OvsExecuteDpIoctl(OvsPacketExecute *execute) OVS_PACKET_HDR_INFO layers; POVS_VPORT_ENTRY vport; - NdisAcquireSpinLock(gOvsCtrlLock); - if (execute->packetLen == 0) { status = STATUS_INVALID_PARAMETER; - goto unlock; + goto exit; } actions = execute->actions; @@ -466,7 +456,7 @@ OvsExecuteDpIoctl(OvsPacketExecute *execute) execute->packetLen); if (pNbl == NULL) { status = STATUS_NO_MEMORY; - goto unlock; + goto exit; } fwdDetail = NET_BUFFER_LIST_SWITCH_FORWARDING_DETAIL(pNbl); @@ -481,11 +471,9 @@ OvsExecuteDpIoctl(OvsPacketExecute *execute) // XXX: Figure out if any of the other members of fwdDetail need to be set. ndisStatus = OvsExtractFlow(pNbl, fwdDetail->SourcePortId, &key, &layers, - NULL); + NULL); if (ndisStatus == NDIS_STATUS_SUCCESS) { - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, - NDIS_RWL_AT_DISPATCH_LEVEL); + NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, 0); ndisStatus = OvsActionsExecute(gOvsSwitchContext, NULL, pNbl, vport ? 
vport->portNo : OVS_DEFAULT_PORT_NO, @@ -506,8 +494,7 @@ OvsExecuteDpIoctl(OvsPacketExecute *execute) if (pNbl) { OvsCompleteNBL(gOvsSwitchContext, pNbl, TRUE); } -unlock: - NdisReleaseSpinLock(gOvsCtrlLock); +exit: return status; } @@ -630,7 +617,6 @@ OvsGetNextPacket(POVS_OPEN_INSTANCE instance) /* * --------------------------------------------------------------------------- * Given a pid, returns the corresponding USER_PACKET_QUEUE. - * gOvsCtrlLock must be acquired before calling this API. * --------------------------------------------------------------------------- */ POVS_USER_PACKET_QUEUE diff --git a/datapath-windows/ovsext/Vport.c b/datapath-windows/ovsext/Vport.c index f46a0ac0462..1423ace6521 100644 --- a/datapath-windows/ovsext/Vport.c +++ b/datapath-windows/ovsext/Vport.c @@ -48,7 +48,6 @@ #define OVS_VPORT_DEFAULT_WAIT_TIME_MICROSEC 100 extern POVS_SWITCH_CONTEXT gOvsSwitchContext; -extern PNDIS_SPIN_LOCK gOvsCtrlLock; static VOID OvsInitVportWithPortParam(POVS_VPORT_ENTRY vport, PNDIS_SWITCH_PORT_PARAMETERS portParam); @@ -1366,8 +1365,6 @@ OvsConvertIfCountedStrToAnsiStr(PIF_COUNTED_STRING wStr, * -------------------------------------------------------------------------- * Utility function that populates a 'OVS_VPORT_EXT_INFO' structure for the * specified vport. - * - * Assumes that 'gOvsCtrlLock' is held. 
* -------------------------------------------------------------------------- */ NTSTATUS @@ -1381,9 +1378,7 @@ OvsGetExtInfoIoctl(POVS_VPORT_GET vportGet, BOOLEAN doConvert = FALSE; RtlZeroMemory(extInfo, sizeof (POVS_VPORT_EXT_INFO)); - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, - NDIS_RWL_AT_DISPATCH_LEVEL); + NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, 0); if (vportGet->portNo == 0) { StringCbLengthA(vportGet->name, OVS_MAX_PORT_NAME_LENGTH - 1, &len); vport = OvsFindVportByHvNameA(gOvsSwitchContext, vportGet->name); @@ -1511,8 +1506,6 @@ OvsGetNetdevCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, return STATUS_INVALID_PARAMETER; } - OvsAcquireCtrlLock(); - vportGet.portNo = 0; RtlCopyMemory(&vportGet.name, NlAttrGet(netdevAttrs[OVS_VPORT_ATTR_NAME]), NlAttrGetSize(netdevAttrs[OVS_VPORT_ATTR_NAME])); @@ -1520,7 +1513,6 @@ OvsGetNetdevCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, status = OvsGetExtInfoIoctl(&vportGet, &info); if (status == STATUS_DEVICE_DOES_NOT_EXIST) { nlError = NL_ERROR_NODEV; - OvsReleaseCtrlLock(); goto cleanup; } @@ -1530,7 +1522,6 @@ OvsGetNetdevCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, if (status == STATUS_SUCCESS) { *replyLen = msgOut->nlMsg.nlmsgLen; } - OvsReleaseCtrlLock(); cleanup: if (nlError != NL_ERROR_SUCCESS) { @@ -1737,17 +1728,13 @@ OvsGetVportDumpNext(POVS_USER_PARAMS_CONTEXT usrParamsCtx, msgIn = instance->dumpState.ovsMsg; - OvsAcquireCtrlLock(); - /* * XXX: when we implement OVS_DP_ATTR_USER_FEATURES in datapath, * we'll need to check the OVS_DP_F_VPORT_PIDS flag: if it is set, * it means we have an array of pids, instead of a single pid. * ATM we assume we have one pid only. 
*/ - ASSERT(KeGetCurrentIrql() == DISPATCH_LEVEL); - NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, - NDIS_RWL_AT_DISPATCH_LEVEL); + NdisAcquireRWLockRead(gOvsSwitchContext->dispatchLock, &lockState, 0); if (gOvsSwitchContext->numHvVports > 0 || gOvsSwitchContext->numNonHvVports > 0) { @@ -1808,8 +1795,6 @@ OvsGetVportDumpNext(POVS_USER_PARAMS_CONTEXT usrParamsCtx, NdisReleaseRWLock(gOvsSwitchContext->dispatchLock, &lockState); - OvsReleaseCtrlLock(); - /* if i < OVS_MAX_VPORT_ARRAY_SIZE => vport was found */ if (i < OVS_MAX_VPORT_ARRAY_SIZE) { POVS_MESSAGE msgOut = (POVS_MESSAGE)usrParamsCtx->outputBuffer; @@ -2186,8 +2171,6 @@ OvsSetVportCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, /* Output buffer has been validated while validating transact dev op. */ ASSERT(msgOut != NULL && usrParamsCtx->outputLength >= sizeof *msgOut); - OvsAcquireCtrlLock(); - NdisAcquireRWLockWrite(gOvsSwitchContext->dispatchLock, &lockState, 0); if (vportAttrs[OVS_VPORT_ATTR_NAME] != NULL) { PSTR portName = NlAttrGet(vportAttrs[OVS_VPORT_ATTR_NAME]); @@ -2240,7 +2223,6 @@ OvsSetVportCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx, Cleanup: NdisReleaseRWLock(gOvsSwitchContext->dispatchLock, &lockState); - OvsReleaseCtrlLock(); if (nlError != NL_ERROR_SUCCESS) { POVS_MESSAGE_ERROR msgError = (POVS_MESSAGE_ERROR) diff --git a/datapath/Makefile.am b/datapath/Makefile.am index 1c9e53b9d9f..458fa5bb61b 100644 --- a/datapath/Makefile.am +++ b/datapath/Makefile.am @@ -20,3 +20,39 @@ distfiles: Makefile sed -e "s|^$$srcdirstrip/||;t" \ -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | sort -u > $@ CLEANFILES = distfiles + +# Print name of all modules. 
+print-build-modules: + @if test -z "$(build_modules)"; \ + then \ + echo "Could not find any kernel module."; \ + exit 1; \ + fi + @echo "$(build_modules)" | tr '_' '-'; + +COMPAT_GET_FUNCTIONS := find $(top_srcdir)/datapath/linux/compat -name "*.h" \ + -exec sed -n '/^[a-z][a-z]* \*\?[A-Za-z0-9_][A-Za-z0-9_]*([a-z]/p; /^struct [a-z0-9_][a-z0-9_]* \*\?[A-Za-z0-9_][A-Za-z0-9_]*([a-z]/p' {} \; | tr -d '*' | cut -d '(' -f1 | rev | cut -d ' ' -f1 | rev +COMPAT_GET_EXPORTS := find $(top_srcdir)/datapath/linux/compat -name "*.c" \ + -exec sed -n 's/^EXPORT_SYMBOL[A-Z_]*(\([a-z_][a-z_]*\));$$/\1/p' {} \; +COMPAT_FUNCTIONS := $(shell $(COMPAT_GET_FUNCTIONS)) +COMPAT_EXPORTS := $(shell $(COMPAT_GET_EXPORTS)) + +# Checks that all public functions are 'rpl_' or 'ovs_' prefixed. +# Checks that all EXPORT_SYMBOL_GPL() export 'rpl_' or 'ovs_' prefixed functions. +check-export-symbol: + @for fun_ in $(COMPAT_FUNCTIONS); do \ + if ! grep -- $${fun_} $(top_srcdir)/datapath/linux/compat/build-aux/export-check-whitelist > /dev/null; then \ + if ! echo $${fun_} | grep -q -E '^(rpl|ovs)_'; then \ + echo "error: $${fun_}() needs to be prefixed with 'rpl_' or 'ovs_'."; \ + exit 1; \ + fi; \ + fi; \ + done + @for fun_ in $(COMPAT_EXPORTS); do \ + if ! echo $${fun_} | grep -q -E '^(rpl|ovs)_'; then \ + echo "error: $${fun_}() needs to be prefixed with 'rpl_' or 'ovs_'."; \ + exit 1; \ + fi; \ + done + +all-local: check-export-symbol diff --git a/datapath/Modules.mk b/datapath/Modules.mk index e9f40797eec..8dc3415c6b2 100644 --- a/datapath/Modules.mk +++ b/datapath/Modules.mk @@ -9,7 +9,10 @@ both_modules = \ vport_geneve \ vport_gre \ vport_lisp \ + vport_stt \ vport_vxlan +# When changing the name of 'build_modules', please also update the +# print-build-modules in Makefile.am. 
build_modules = $(both_modules) # Modules to build dist_modules = $(both_modules) # Modules to distribute @@ -28,6 +31,7 @@ vport_geneve_sources = vport-geneve.c vport_vxlan_sources = vport-vxlan.c vport_gre_sources = vport-gre.c vport_lisp_sources = vport-lisp.c +vport_stt_sources = vport-stt.c openvswitch_headers = \ compat.h \ diff --git a/datapath/README.md b/datapath/README.md index 9c03a2b99f5..1a4d8e1e1c4 100644 --- a/datapath/README.md +++ b/datapath/README.md @@ -246,3 +246,19 @@ The other rules for flow keys are much less subtle: composes it the same way. This allows userspace to hash and compare entire flow keys that it may not be able to fully interpret. + + +Coding rules +============ + +Compatibility +------------- + +Please implement the headers and codes for compatibility with older kernel +in linux/compat/ directory. All public functions should be exported using +EXPORT_SYMBOL macro. Public function replacing the same-named kernel +function should be prefixed with 'rpl_'. Otherwise, the function should be +prefixed with 'ovs_'. For special case when it is not possible to follow +this rule (e.g., the pskb_expand_head() function), the function name must +be added to linux/compat/build-aux/export-check-whitelist, otherwise, the +compilation check 'check-export-symbol' will fail. 
diff --git a/datapath/datapath.c b/datapath/datapath.c index c0af9ad0978..3c97b86dfec 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -2270,8 +2270,7 @@ static int __init dp_init(void) BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - pr_info("Open vSwitch switching datapath %s, built "__DATE__" "__TIME__"\n", - VERSION); + pr_info("Open vSwitch switching datapath %s\n", VERSION); err = action_fifos_init(); if (err) diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore index 69d66582868..a4a930db782 100644 --- a/datapath/linux/.gitignore +++ b/datapath/linux/.gitignore @@ -35,6 +35,7 @@ /random32.c /reciprocal_div.c /skbuff-openvswitch.c +/stt.c /table.c /time.c /tmp @@ -50,6 +51,7 @@ /vport-lisp.c /vport-netdev.c /vport-patch.c +/vport-stt.c /vport-vxlan.c /vport.c /vxlan.c diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk index 7d9710d7ec4..be3a8d894de 100644 --- a/datapath/linux/Modules.mk +++ b/datapath/linux/Modules.mk @@ -12,6 +12,7 @@ openvswitch_sources += \ linux/compat/net_namespace.c \ linux/compat/reciprocal_div.c \ linux/compat/skbuff-openvswitch.c \ + linux/compat/stt.c \ linux/compat/udp.c \ linux/compat/udp_tunnel.c \ linux/compat/vxlan.c \ @@ -39,7 +40,6 @@ openvswitch_headers += \ linux/compat/include/linux/kconfig.h \ linux/compat/include/linux/kernel.h \ linux/compat/include/linux/list.h \ - linux/compat/include/linux/log2.h \ linux/compat/include/linux/mpls.h \ linux/compat/include/linux/net.h \ linux/compat/include/linux/random.h \ @@ -76,5 +76,7 @@ openvswitch_headers += \ linux/compat/include/net/udp.h \ linux/compat/include/net/udp_tunnel.h \ linux/compat/include/net/sock.h \ + linux/compat/include/net/stt.h \ linux/compat/include/net/vxlan.h \ linux/compat/include/net/sctp/checksum.h +EXTRA_DIST += linux/compat/build-aux/export-check-whitelist diff --git a/datapath/linux/compat/build-aux/export-check-whitelist b/datapath/linux/compat/build-aux/export-check-whitelist 
new file mode 100644 index 00000000000..1178f46eea1 --- /dev/null +++ b/datapath/linux/compat/build-aux/export-check-whitelist @@ -0,0 +1 @@ +pskb_expand_head \ No newline at end of file diff --git a/datapath/linux/compat/dev-openvswitch.c b/datapath/linux/compat/dev-openvswitch.c index 1035fe83dfb..256d5817511 100644 --- a/datapath/linux/compat/dev-openvswitch.c +++ b/datapath/linux/compat/dev-openvswitch.c @@ -39,9 +39,9 @@ void dev_disable_lro(struct net_device *dev) { } static int nr_bridges; #ifdef HAVE_RHEL_OVS_HOOK -int netdev_rx_handler_register(struct net_device *dev, - openvswitch_handle_frame_hook_t *hook, - void *rx_handler_data) +int rpl_netdev_rx_handler_register(struct net_device *dev, + openvswitch_handle_frame_hook_t *hook, + void *rx_handler_data) { nr_bridges++; rcu_assign_pointer(dev->ax25_ptr, rx_handler_data); @@ -50,12 +50,13 @@ int netdev_rx_handler_register(struct net_device *dev, rcu_assign_pointer(openvswitch_handle_frame_hook, hook); return 0; } +EXPORT_SYMBOL_GPL(rpl_netdev_rx_handler_register); #else -int netdev_rx_handler_register(struct net_device *dev, - struct sk_buff *(*hook)(struct net_bridge_port *p, - struct sk_buff *skb), - void *rx_handler_data) +int rpl_netdev_rx_handler_register(struct net_device *dev, + struct sk_buff *(*hook)(struct net_bridge_port *p, + struct sk_buff *skb), + void *rx_handler_data) { nr_bridges++; if (dev->br_port) @@ -67,9 +68,10 @@ int netdev_rx_handler_register(struct net_device *dev, br_handle_frame_hook = hook; return 0; } +EXPORT_SYMBOL_GPL(rpl_netdev_rx_handler_register); #endif -void netdev_rx_handler_unregister(struct net_device *dev) +void rpl_netdev_rx_handler_unregister(struct net_device *dev) { nr_bridges--; #ifdef HAVE_RHEL_OVS_HOOK @@ -88,4 +90,6 @@ void netdev_rx_handler_unregister(struct net_device *dev) br_handle_frame_hook = NULL; #endif } +EXPORT_SYMBOL_GPL(rpl_netdev_rx_handler_unregister); + #endif diff --git a/datapath/linux/compat/exthdrs_core.c 
b/datapath/linux/compat/exthdrs_core.c index 56a17daaf61..6692ce3f4b4 100644 --- a/datapath/linux/compat/exthdrs_core.c +++ b/datapath/linux/compat/exthdrs_core.c @@ -45,6 +45,7 @@ int rpl_ipv6_skip_exthdr(const struct sk_buff *skb, int start, *nexthdrp = nexthdr; return start; } +EXPORT_SYMBOL_GPL(rpl_ipv6_skip_exthdr); #endif /* Kernel version < 3.3 */ #ifndef HAVE_IP6_FH_F_SKIP_RH @@ -169,5 +170,6 @@ int rpl_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, *offset = start; return nexthdr; } +EXPORT_SYMBOL_GPL(rpl_ipv6_find_hdr); #endif diff --git a/datapath/linux/compat/flex_array.c b/datapath/linux/compat/flex_array.c index c39dd1b5f4d..ce73198ff5d 100644 --- a/datapath/linux/compat/flex_array.c +++ b/datapath/linux/compat/flex_array.c @@ -89,7 +89,7 @@ static inline int elements_fit_in_base(struct flex_array *fa) * capacity in the base structure. Also note that no effort is made * to efficiently pack objects across page boundaries. */ -struct flex_array *flex_array_alloc(int element_size, unsigned int total, +struct flex_array *rpl_flex_array_alloc(int element_size, unsigned int total, gfp_t flags) { struct flex_array *ret; @@ -118,6 +118,7 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total, FLEX_ARRAY_BASE_BYTES_LEFT); return ret; } +EXPORT_SYMBOL_GPL(rpl_flex_array_alloc); static int fa_element_to_part_nr(struct flex_array *fa, unsigned int element_nr) @@ -132,7 +133,7 @@ static int fa_element_to_part_nr(struct flex_array *fa, * This is to be used in cases where the base 'struct flex_array' * has been statically allocated and should not be free. 
*/ -void flex_array_free_parts(struct flex_array *fa) +void rpl_flex_array_free_parts(struct flex_array *fa) { int part_nr; @@ -141,12 +142,14 @@ void flex_array_free_parts(struct flex_array *fa) for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) kfree(fa->parts[part_nr]); } +EXPORT_SYMBOL_GPL(rpl_flex_array_free_parts); -void flex_array_free(struct flex_array *fa) +void rpl_flex_array_free(struct flex_array *fa) { flex_array_free_parts(fa); kfree(fa); } +EXPORT_SYMBOL_GPL(rpl_flex_array_free); static unsigned int index_inside_part(struct flex_array *fa, unsigned int element_nr, @@ -191,7 +194,7 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) * * Locking must be provided by the caller. */ -int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, +int rpl_flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, gfp_t flags) { int part_nr = 0; @@ -214,6 +217,7 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, memcpy(dst, src, fa->element_size); return 0; } +EXPORT_SYMBOL_GPL(rpl_flex_array_put); /** * flex_array_clear - clear element in array at @element_nr @@ -222,7 +226,7 @@ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, * * Locking must be provided by the caller. */ -int flex_array_clear(struct flex_array *fa, unsigned int element_nr) +int rpl_flex_array_clear(struct flex_array *fa, unsigned int element_nr) { int part_nr = 0; struct flex_array_part *part; @@ -244,6 +248,7 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr) memset(dst, FLEX_ARRAY_FREE, fa->element_size); return 0; } +EXPORT_SYMBOL_GPL(rpl_flex_array_clear); /** * flex_array_prealloc - guarantee that array space exists @@ -260,7 +265,7 @@ int flex_array_clear(struct flex_array *fa, unsigned int element_nr) * * Locking must be provided by the caller. 
*/ -int flex_array_prealloc(struct flex_array *fa, unsigned int start, +int rpl_flex_array_prealloc(struct flex_array *fa, unsigned int start, unsigned int nr_elements, gfp_t flags) { int start_part; @@ -293,6 +298,7 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, } return 0; } +EXPORT_SYMBOL_GPL(rpl_flex_array_prealloc); /** * flex_array_get - pull data back out of the array @@ -306,7 +312,7 @@ int flex_array_prealloc(struct flex_array *fa, unsigned int start, * * Locking must be provided by the caller. */ -void *flex_array_get(struct flex_array *fa, unsigned int element_nr) +void *rpl_flex_array_get(struct flex_array *fa, unsigned int element_nr) { int part_nr = 0; struct flex_array_part *part; @@ -325,6 +331,7 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) } return &part->elements[index_inside_part(fa, element_nr, part_nr)]; } +EXPORT_SYMBOL_GPL(rpl_flex_array_get); /** * flex_array_get_ptr - pull a ptr back out of the array @@ -335,7 +342,7 @@ void *flex_array_get(struct flex_array *fa, unsigned int element_nr) * flex_array_put_ptr(). This function should not be called if the * element in question was not set using the _put_ptr() helper. */ -void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) +void *rpl_flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) { void **tmp; @@ -345,6 +352,7 @@ void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) return *tmp; } +EXPORT_SYMBOL_GPL(rpl_flex_array_get_ptr); static int part_is_free(struct flex_array_part *part) { @@ -365,7 +373,7 @@ static int part_is_free(struct flex_array_part *part) * * Locking must be provided by the caller. 
*/ -int flex_array_shrink(struct flex_array *fa) +int rpl_flex_array_shrink(struct flex_array *fa) { struct flex_array_part *part; int part_nr; @@ -387,5 +395,6 @@ int flex_array_shrink(struct flex_array *fa) } return ret; } +EXPORT_SYMBOL_GPL(rpl_flex_array_shrink); #endif /* Linux version < 3.0.0 */ diff --git a/datapath/linux/compat/flow_dissector.c b/datapath/linux/compat/flow_dissector.c index a68f84f5259..3f42dba6c8c 100644 --- a/datapath/linux/compat/flow_dissector.c +++ b/datapath/linux/compat/flow_dissector.c @@ -204,7 +204,7 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) return jhash_3words(a, b, c, hashrnd); } -u32 __skb_get_hash(struct sk_buff *skb) +u32 rpl__skb_get_rxhash(struct sk_buff *skb) { struct flow_keys keys; u32 hash; @@ -231,5 +231,6 @@ u32 __skb_get_hash(struct sk_buff *skb) #endif return hash; } -EXPORT_SYMBOL_GPL(__skb_get_hash); +EXPORT_SYMBOL_GPL(rpl__skb_get_rxhash); + #endif diff --git a/datapath/linux/compat/genetlink-openvswitch.c b/datapath/linux/compat/genetlink-openvswitch.c index ab149c35d25..9aea997778e 100644 --- a/datapath/linux/compat/genetlink-openvswitch.c +++ b/datapath/linux/compat/genetlink-openvswitch.c @@ -21,6 +21,7 @@ void rpl_genl_notify(struct rpl_genl_family *family, struct sk_buff *skb, genl_notify(skb, net, portid, group, nlh, flags); #endif } +EXPORT_SYMBOL_GPL(rpl_genl_notify); int rpl___genl_register_family(struct rpl_genl_family *f) { @@ -52,4 +53,6 @@ int rpl___genl_register_family(struct rpl_genl_family *f) return err; } +EXPORT_SYMBOL_GPL(rpl___genl_register_family); + #endif /* kernel version < 3.13.0 */ diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c index 48a306e93a1..78d33954163 100644 --- a/datapath/linux/compat/geneve.c +++ b/datapath/linux/compat/geneve.c @@ -11,6 +11,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) + #include #include #include @@ -81,11 +84,11 @@ static void 
geneve_build_header(struct genevehdr *geneveh, * * This function will add other UDP tunnel headers. */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) +int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum, bool xnet) { struct genevehdr *gnvh; int min_headroom; @@ -118,7 +121,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, tos, ttl, df, src_port, dst_port, xnet, !csum); } -EXPORT_SYMBOL_GPL(geneve_xmit_skb); +EXPORT_SYMBOL_GPL(rpl_geneve_xmit_skb); /* Callback from net/ipv4/udp.c to receive packets */ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) @@ -221,13 +224,13 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, return gs; } -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) +struct geneve_sock *rpl_geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6) { return geneve_socket_create(net, port, rcv, data, ipv6); } -EXPORT_SYMBOL_GPL(geneve_sock_add); +EXPORT_SYMBOL_GPL(rpl_geneve_sock_add); static void rcu_free_gs(struct rcu_head *rcu) { @@ -236,9 +239,11 @@ static void rcu_free_gs(struct rcu_head *rcu) kfree(gs); } -void geneve_sock_release(struct geneve_sock *gs) +void rpl_geneve_sock_release(struct geneve_sock *gs) { udp_tunnel_sock_release(gs->sock); call_rcu(&gs->rcu, rcu_free_gs); } -EXPORT_SYMBOL_GPL(geneve_sock_release); +EXPORT_SYMBOL_GPL(rpl_geneve_sock_release); + +#endif /* kernel < 4.0 */ diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c 
index 06956f049ae..fe8138014d2 100644 --- a/datapath/linux/compat/gre.c +++ b/datapath/linux/compat/gre.c @@ -17,11 +17,7 @@ */ #include -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) - #include -#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) - #include #include #include @@ -42,6 +38,10 @@ #include "gso.h" +#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0) + #ifndef HAVE_GRE_CISCO_REGISTER #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) @@ -236,7 +236,7 @@ static const struct gre_protocol ipgre_protocol = { .handler = gre_cisco_rcv, }; -int gre_cisco_register(struct gre_cisco_protocol *newp) +int rpl_gre_cisco_register(struct gre_cisco_protocol *newp) { int err; @@ -250,9 +250,9 @@ int gre_cisco_register(struct gre_cisco_protocol *newp) return (cmpxchg((struct gre_cisco_protocol **)&gre_cisco_proto, NULL, newp) == NULL) ? 0 : -EBUSY; } -EXPORT_SYMBOL_GPL(gre_cisco_register); +EXPORT_SYMBOL_GPL(rpl_gre_cisco_register); -int gre_cisco_unregister(struct gre_cisco_protocol *proto) +int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto) { int ret; @@ -266,7 +266,7 @@ int gre_cisco_unregister(struct gre_cisco_protocol *proto) ret = gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); return ret; } -EXPORT_SYMBOL_GPL(gre_cisco_unregister); +EXPORT_SYMBOL_GPL(rpl_gre_cisco_unregister); #endif /* !HAVE_GRE_CISCO_REGISTER */ @@ -287,26 +287,12 @@ static void gre_csum_fix(struct sk_buff *skb) skb->len - gre_offset, 0)); } -struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - int type = gre_csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE; - gso_fix_segment_t fix_segment; - - if (gre_csum) - fix_segment = gre_csum_fix; - else - fix_segment = gre_nop_fix; - - return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment); -} -EXPORT_SYMBOL_GPL(gre_handle_offloads); - static bool is_gre_gso(struct sk_buff *skb) { return skb_is_gso(skb); } -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, +void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, int hdr_len) { struct gre_base_hdr *greh; @@ -337,8 +323,32 @@ void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, ovs_skb_set_inner_protocol(skb, tpi->proto); } -EXPORT_SYMBOL_GPL(gre_build_header); +EXPORT_SYMBOL_GPL(rpl_gre_build_header); -#endif /* CONFIG_NET_IPGRE_DEMUX */ +struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE; + gso_fix_segment_t fix_segment; + + if (gre_csum) + fix_segment = gre_csum_fix; + else + fix_segment = gre_nop_fix; -#endif /* 3.12 */ + return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment); +} +#else +struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) +{ + if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { + kfree_skb(skb); + return ERR_PTR(-ENOSYS); + } + skb_clear_ovs_gso_cb(skb); +#undef gre_handle_offloads + return gre_handle_offloads(skb, gre_csum); +} +#endif +EXPORT_SYMBOL_GPL(rpl_gre_handle_offloads); + +#endif /* CONFIG_NET_IPGRE_DEMUX */ diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c index cad9b182d5c..552e7485ccb 100644 --- a/datapath/linux/compat/gso.c +++ b/datapath/linux/compat/gso.c @@ -167,6 +167,7 @@ int rpl_dev_queue_xmit(struct sk_buff *skb) kfree_skb(skb); return err; } +EXPORT_SYMBOL_GPL(rpl_dev_queue_xmit); #endif /* 3.16 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) @@ -296,4 +297,6 @@ int rpl_ip_local_out(struct sk_buff *skb) } return ret; } 
+EXPORT_SYMBOL_GPL(rpl_ip_local_out); + + #endif /* 3.18 */ diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h index 337d13a7714..6fcaff8d623 100644 --- a/datapath/linux/compat/gso.h +++ b/datapath/linux/compat/gso.h @@ -26,6 +26,15 @@ struct ovs_gso_cb { }; #define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb) +static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) +{ + OVS_GSO_CB(skb)->fix_segment = NULL; +} +#else +static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) +{ + +} #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) @@ -52,6 +61,15 @@ static inline int skb_inner_network_offset(const struct sk_buff *skb) return skb_inner_network_header(skb) - skb->data; } +/* We don't actually store the transport offset on backports because + * we don't use it anywhere. Slightly rename this version to keep + * future users from picking it up accidentally. + */ +static inline int ovs_skb_inner_transport_offset(const struct sk_buff *skb) +{ + return 0; +} + static inline void skb_set_inner_network_header(const struct sk_buff *skb, int offset) { @@ -62,6 +80,14 @@ static inline void skb_set_inner_network_header(const struct sk_buff *skb, static inline void skb_set_inner_transport_header(const struct sk_buff *skb, int offset) { } + +#else + +static inline int ovs_skb_inner_transport_offset(const struct sk_buff *skb) +{ + return skb_inner_transport_header(skb) - skb->data; +} + #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) @@ -110,7 +136,7 @@ static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) #if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) #define ip_local_out rpl_ip_local_out -int ip_local_out(struct sk_buff *skb); +int rpl_ip_local_out(struct sk_buff *skb); static inline int skb_inner_mac_offset(const struct sk_buff *skb) { diff --git a/datapath/linux/compat/include/linux/flex_array.h b/datapath/linux/compat/include/linux/flex_array.h index 443c4af3bdf..d1e1b831c31 100644 ---
a/datapath/linux/compat/include/linux/flex_array.h +++ b/datapath/linux/compat/include/linux/flex_array.h @@ -66,22 +66,39 @@ struct flex_array { FLEX_ARRAY_ELEMENTS_PER_PART(__element_size)); \ } -struct flex_array *flex_array_alloc(int element_size, unsigned int total, +#define flex_array_alloc rpl_flex_array_alloc +struct flex_array *rpl_flex_array_alloc(int element_size, unsigned int total, gfp_t flags); -int flex_array_prealloc(struct flex_array *fa, unsigned int start, + +#define flex_array_prealloc rpl_flex_array_prealloc +int rpl_flex_array_prealloc(struct flex_array *fa, unsigned int start, unsigned int nr_elements, gfp_t flags); -void flex_array_free(struct flex_array *fa); -void flex_array_free_parts(struct flex_array *fa); -int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, + +#define flex_array_free rpl_flex_array_free +void rpl_flex_array_free(struct flex_array *fa); + +#define flex_array_free_parts rpl_flex_array_free_parts +void rpl_flex_array_free_parts(struct flex_array *fa); + +#define flex_array_put rpl_flex_array_put +int rpl_flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, gfp_t flags); -int flex_array_clear(struct flex_array *fa, unsigned int element_nr); -void *flex_array_get(struct flex_array *fa, unsigned int element_nr); -int flex_array_shrink(struct flex_array *fa); -#define flex_array_put_ptr(fa, nr, src, gfp) \ +#define flex_array_clear rpl_flex_array_clear +int rpl_flex_array_clear(struct flex_array *fa, unsigned int element_nr); + +#define flex_array_get rpl_flex_array_get +void *rpl_flex_array_get(struct flex_array *fa, unsigned int element_nr); + +#define flex_array_shrink rpl_flex_array_shrink +int rpl_flex_array_shrink(struct flex_array *fa); + +#define flex_array_put_ptr rpl_flex_array_put_ptr +#define rpl_flex_array_put_ptr(fa, nr, src, gfp) \ flex_array_put(fa, nr, (void *)&(src), gfp) -void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr); +#define 
flex_array_get_ptr rpl_flex_array_get_ptr +void *rpl_flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr); #endif /* Linux version < 3.0.0 */ #endif /* __LINUX_FLEX_ARRAY_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/ip.h b/datapath/linux/compat/include/linux/ip.h index 6ff71524023..c64306e0808 100644 --- a/datapath/linux/compat/include/linux/ip.h +++ b/datapath/linux/compat/include/linux/ip.h @@ -5,6 +5,7 @@ #ifndef HAVE_SKBUFF_HEADER_HELPERS #include + static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); diff --git a/datapath/linux/compat/include/linux/log2.h b/datapath/linux/compat/include/linux/log2.h deleted file mode 100644 index 69abae5e80a..00000000000 --- a/datapath/linux/compat/include/linux/log2.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __LINUX_LOG2_WRAPPER -#define __LINUX_LOG2_WRAPPER - -#ifdef HAVE_LOG2_H -#include_next -#else -/* This is very stripped down because log2.h has far too many dependencies. */ - -extern __attribute__((const, noreturn)) -int ____ilog2_NaN(void); - -#define ilog2(n) ((n) == 4 ? 2 : \ - (n) == 8 ? 
3 : \ - ____ilog2_NaN()) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/net.h b/datapath/linux/compat/include/linux/net.h index d8bf621cee4..9c947454efc 100644 --- a/datapath/linux/compat/include/linux/net.h +++ b/datapath/linux/compat/include/linux/net.h @@ -30,7 +30,8 @@ do { \ #endif #ifndef net_get_random_once -bool __net_get_random_once(void *buf, int nbytes, bool *done, +#define __net_get_random_once rpl___net_get_random_once +bool rpl___net_get_random_once(void *buf, int nbytes, bool *done, atomic_t *done_key); #define ___NET_RANDOM_STATIC_KEY_INIT ATOMIC_INIT(0) diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h index 43a04a43286..38315c251cb 100644 --- a/datapath/linux/compat/include/linux/netdevice.h +++ b/datapath/linux/compat/include/linux/netdevice.h @@ -50,18 +50,21 @@ extern void dev_disable_lro(struct net_device *dev); typedef struct sk_buff *(openvswitch_handle_frame_hook_t)(struct sk_buff *skb); extern openvswitch_handle_frame_hook_t *openvswitch_handle_frame_hook; -int netdev_rx_handler_register(struct net_device *dev, - openvswitch_handle_frame_hook_t *hook, - void *rx_handler_data); +#define netdev_rx_handler_register rpl_netdev_rx_handler_register +int rpl_netdev_rx_handler_register(struct net_device *dev, + openvswitch_handle_frame_hook_t *hook, + void *rx_handler_data); #else -int netdev_rx_handler_register(struct net_device *dev, - struct sk_buff *(*netdev_hook)(struct net_bridge_port *p, - struct sk_buff *skb), - void *rx_handler_data); +#define netdev_rx_handler_register rpl_netdev_rx_handler_register +int rpl_netdev_rx_handler_register(struct net_device *dev, + struct sk_buff *(*netdev_hook)(struct net_bridge_port *p, + struct sk_buff *skb), + void *rx_handler_data); #endif -void netdev_rx_handler_unregister(struct net_device *dev); +#define netdev_rx_handler_unregister rpl_netdev_rx_handler_unregister +void rpl_netdev_rx_handler_unregister(struct net_device 
*dev); #endif #ifndef HAVE_DEV_GET_BY_INDEX_RCU @@ -138,7 +141,7 @@ static inline struct net_device *netdev_master_upper_dev_get(struct net_device * #if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) #define dev_queue_xmit rpl_dev_queue_xmit -int dev_queue_xmit(struct sk_buff *skb); +int rpl_dev_queue_xmit(struct sk_buff *skb); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index 61c59a221c8..f53bc81c75a 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -229,6 +229,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ OVS_VPORT_TYPE_GRE64 = 104, /* GRE tunnel with 64-bit keys */ OVS_VPORT_TYPE_LISP = 105, /* LISP tunnel */ + OVS_VPORT_TYPE_STT = 106, /* STT tunnel */ __OVS_VPORT_TYPE_MAX }; diff --git a/datapath/linux/compat/include/linux/reciprocal_div.h b/datapath/linux/compat/include/linux/reciprocal_div.h index 2def5c6ca78..f50d8e4ee27 100644 --- a/datapath/linux/compat/include/linux/reciprocal_div.h +++ b/datapath/linux/compat/include/linux/reciprocal_div.h @@ -25,10 +25,10 @@ struct reciprocal_value { u8 sh1, sh2; }; -struct reciprocal_value reciprocal_value(u32 d); +struct reciprocal_value rpl_reciprocal_value(u32 d); #define reciprocal_divide rpl_reciprocal_divide -static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) +static inline u32 rpl_reciprocal_divide(u32 a, struct reciprocal_value R) { u32 t = (u32)(((u64)a * R.m) >> 32); return (t + ((a - t) >> R.sh1)) >> R.sh2; diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h index d147192328a..0ae6c133f2a 100644 --- a/datapath/linux/compat/include/linux/skbuff.h +++ b/datapath/linux/compat/include/linux/skbuff.h @@ -1,10 +1,22 @@ #ifndef __LINUX_SKBUFF_WRAPPER_H #define __LINUX_SKBUFF_WRAPPER_H 1 +#include +#include + +#if 
LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) +/* This should be before skbuff.h to make sure that we rewrite + * the calls there. */ +struct sk_buff; + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask); +#define pskb_expand_head rpl_pskb_expand_head +#endif + #include_next #include -#include #if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) #define SKB_GSO_GRE 0 @@ -303,13 +315,14 @@ static inline void skb_tx_error(struct sk_buff *skb) #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) */ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) -unsigned int skb_zerocopy_headlen(const struct sk_buff *from); +#define skb_zerocopy_headlen rpl_skb_zerocopy_headlen +unsigned int rpl_skb_zerocopy_headlen(const struct sk_buff *from); #endif #ifndef HAVE_SKB_ZEROCOPY #define skb_zerocopy rpl_skb_zerocopy -int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, - int hlen); +int rpl_skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, + int hlen); #endif #ifndef HAVE_SKB_CLEAR_HASH @@ -342,17 +355,17 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, #ifndef HAVE_SKB_ENSURE_WRITABLE #define skb_ensure_writable rpl_skb_ensure_writable -int skb_ensure_writable(struct sk_buff *skb, int write_len); +int rpl_skb_ensure_writable(struct sk_buff *skb, int write_len); #endif #ifndef HAVE_SKB_VLAN_POP #define skb_vlan_pop rpl_skb_vlan_pop -int skb_vlan_pop(struct sk_buff *skb); +int rpl_skb_vlan_pop(struct sk_buff *skb); #endif #ifndef HAVE_SKB_VLAN_PUSH #define skb_vlan_push rpl_skb_vlan_push -int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); #endif #endif diff --git a/datapath/linux/compat/include/net/checksum.h b/datapath/linux/compat/include/net/checksum.h index a40de4019c2..398df9324fc 100644 --- a/datapath/linux/compat/include/net/checksum.h +++ b/datapath/linux/compat/include/net/checksum.h @@ -37,8 +37,9 @@ 
static inline void csum_replace2(__sum16 *sum, __be16 from, __be16 to) #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) -void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, - const __be32 *from, const __be32 *to, - int pseudohdr); +#define inet_proto_csum_replace16 rpl_inet_proto_csum_replace16 +void rpl_inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, + const __be32 *from, const __be32 *to, + int pseudohdr); #endif #endif /* checksum.h */ diff --git a/datapath/linux/compat/include/net/genetlink.h b/datapath/linux/compat/include/net/genetlink.h index edf68156264..cf89d4c239e 100644 --- a/datapath/linux/compat/include/net/genetlink.h +++ b/datapath/linux/compat/include/net/genetlink.h @@ -45,9 +45,9 @@ struct rpl_genl_family { #define genl_family rpl_genl_family #define genl_notify rpl_genl_notify -void genl_notify(struct genl_family *family, - struct sk_buff *skb, struct net *net, u32 portid, u32 group, - struct nlmsghdr *nlh, gfp_t flags); +void rpl_genl_notify(struct genl_family *family, + struct sk_buff *skb, struct net *net, u32 portid, u32 group, + struct nlmsghdr *nlh, gfp_t flags); static inline void *rpl_genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd) diff --git a/datapath/linux/compat/include/net/geneve.h b/datapath/linux/compat/include/net/geneve.h index 402ef38a2fe..58f5def99f3 100644 --- a/datapath/linux/compat/include/net/geneve.h +++ b/datapath/linux/compat/include/net/geneve.h @@ -1,6 +1,12 @@ #ifndef __NET_GENEVE_WRAPPER_H #define __NET_GENEVE_WRAPPER_H 1 +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) +#include_next +#else + #ifdef CONFIG_INET #include #endif @@ -77,17 +83,22 @@ struct geneve_sock { #define GENEVE_VER 0 #define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6); +#define geneve_sock_add 
rpl_geneve_sock_add +struct geneve_sock *rpl_geneve_sock_add(struct net *net, __be16 port, + geneve_rcv_t *rcv, void *data, + bool no_share, bool ipv6); -void geneve_sock_release(struct geneve_sock *vs); +#define geneve_sock_release rpl_geneve_sock_release +void rpl_geneve_sock_release(struct geneve_sock *vs); -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet); +#define geneve_xmit_skb rpl_geneve_xmit_skb +int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, + struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum, bool xnet); #endif /*ifdef CONFIG_INET */ +#endif /* kernel < 4.0 */ + #endif /*ifdef__NET_GENEVE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h index 08f6ee1b1fd..6e0df0fd8e1 100644 --- a/datapath/linux/compat/include/net/gre.h +++ b/datapath/linux/compat/include/net/gre.h @@ -23,10 +23,10 @@ struct gre_cisco_protocol { }; #define gre_cisco_register rpl_gre_cisco_register -int gre_cisco_register(struct gre_cisco_protocol *proto); +int rpl_gre_cisco_register(struct gre_cisco_protocol *proto); #define gre_cisco_unregister rpl_gre_cisco_unregister -int gre_cisco_unregister(struct gre_cisco_protocol *proto); +int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) struct gre_base_hdr { @@ -81,14 +81,14 @@ static inline __be16 tnl_flags_to_gre_flags(__be16 tflags) #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */ #endif /* HAVE_GRE_CISCO_REGISTER */ +#define gre_handle_offloads rpl_gre_handle_offloads +struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum); + #if LINUX_VERSION_CODE < 
KERNEL_VERSION(3,12,0) #define gre_build_header rpl_gre_build_header -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); - -#define gre_handle_offloads rpl_gre_handle_offloads -struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum); +void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len); #define ip_gre_calc_hlen rpl_ip_gre_calc_hlen static inline int ip_gre_calc_hlen(__be16 o_flags) @@ -103,18 +103,6 @@ static inline int ip_gre_calc_hlen(__be16 o_flags) addend += 4; return addend; } -#else - -static inline struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, - bool gre_csum) -{ - if (skb_is_gso(skb) && skb_is_encapsulated(skb)) { - kfree_skb(skb); - return ERR_PTR(-ENOSYS); - } - return gre_handle_offloads(skb, gre_csum); -} -#define gre_handle_offloads rpl_gre_handle_offloads #endif #endif diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h index bb96ec3181a..3ed6f9193b0 100644 --- a/datapath/linux/compat/include/net/ip_tunnels.h +++ b/datapath/linux/compat/include/net/ip_tunnels.h @@ -24,12 +24,12 @@ struct sk_buff *ovs_iptunnel_handle_offloads(struct sk_buff *skb, void (*fix_segment)(struct sk_buff *)); #define iptunnel_xmit rpl_iptunnel_xmit -int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, - __be16 df, bool xnet); +int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, + __be16 df, bool xnet); #define iptunnel_pull_header rpl_iptunnel_pull_header -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); +int rpl_iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); #else @@ -80,6 +80,7 @@ struct tnl_ptk_info { #undef TUNNEL_OPTIONS_PRESENT #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | 
TUNNEL_VXLAN_OPT) -bool skb_is_encapsulated(struct sk_buff *skb); +#define skb_is_encapsulated ovs_skb_is_encapsulated +bool ovs_skb_is_encapsulated(struct sk_buff *skb); #endif /* __NET_IP_TUNNELS_H */ diff --git a/datapath/linux/compat/include/net/ipv6.h b/datapath/linux/compat/include/net/ipv6.h index 450e3079acf..18c7d301edc 100644 --- a/datapath/linux/compat/include/net/ipv6.h +++ b/datapath/linux/compat/include/net/ipv6.h @@ -11,8 +11,8 @@ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) #define ipv6_skip_exthdr rpl_ipv6_skip_exthdr -extern int ipv6_skip_exthdr(const struct sk_buff *skb, int start, - u8 *nexthdrp, __be16 *frag_offp); +extern int rpl_ipv6_skip_exthdr(const struct sk_buff *skb, int start, + u8 *nexthdrp, __be16 *frag_offp); #endif #ifndef HAVE_IP6_FH_F_SKIP_RH @@ -28,8 +28,8 @@ enum { * IP6_FH_F_SKIP_RH. */ #define ipv6_find_hdr rpl_ipv6_find_hdr -extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, - int target, unsigned short *fragoff, int *fragflg); +extern int rpl_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff, int *fragflg); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) diff --git a/datapath/linux/compat/include/net/net_namespace.h b/datapath/linux/compat/include/net/net_namespace.h index be64093ddc1..b7dbfe3fbd1 100644 --- a/datapath/linux/compat/include/net/net_namespace.h +++ b/datapath/linux/compat/include/net/net_namespace.h @@ -17,8 +17,10 @@ struct rpl_pernet_operations { #define register_pernet_device rpl_register_pernet_gen_device #define unregister_pernet_device rpl_unregister_pernet_gen_device -int compat_init_net(struct net *net, struct rpl_pernet_operations *pnet); -void compat_exit_net(struct net *net, struct rpl_pernet_operations *pnet); +#define compat_init_net ovs_compat_init_net +int ovs_compat_init_net(struct net *net, struct rpl_pernet_operations *pnet); +#define compat_exit_net ovs_compat_exit_net +void ovs_compat_exit_net(struct net *net, 
struct rpl_pernet_operations *pnet); #define DEFINE_COMPAT_PNET_REG_FUNC(TYPE) \ \ diff --git a/datapath/linux/compat/include/net/stt.h b/datapath/linux/compat/include/net/stt.h new file mode 100644 index 00000000000..13812b1f218 --- /dev/null +++ b/datapath/linux/compat/include/net/stt.h @@ -0,0 +1,71 @@ +#ifndef __NET_STT_H +#define __NET_STT_H 1 + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) && IS_ENABLED(CONFIG_NETFILTER) +#include +#define OVS_STT + +struct stthdr { + __u8 version; + __u8 flags; + __u8 l4_offset; + __u8 reserved; + __be16 mss; + __be16 vlan_tci; + __be64 key; +}; + +/* Padding after the end of the tunnel headers to provide alignment + * for inner packet IP header after 14 byte Ethernet header. + */ +#define STT_ETH_PAD 2 + +#define STT_BASE_HLEN (sizeof(struct stthdr) + STT_ETH_PAD) +#define STT_HEADER_LEN (sizeof(struct tcphdr) + STT_BASE_HLEN) + +static inline struct stthdr *stt_hdr(const struct sk_buff *skb) +{ + return (struct stthdr *)(skb_transport_header(skb) + + sizeof(struct tcphdr)); +} + +struct stt_sock; +typedef void (stt_rcv_t)(struct stt_sock *stt_sock, struct sk_buff *skb); + +/* @list: Per-net list of STT ports. + * @rcv: The callback is called on STT packet recv, STT reassembly can generate + * multiple packets, in this case first packet has tunnel outer header, rest + * of the packets are inner packet segments with no stt header. + * @rcv_data: user data. + * @sock: Fake TCP socket for the STT port. 
+ */ +struct stt_sock { + struct list_head list; + stt_rcv_t *rcv; + void *rcv_data; + struct socket *sock; + struct rcu_head rcu; +}; + +#define stt_sock_add rpl_stt_sock_add +struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data); + +#define stt_sock_release rpl_stt_sock_release +void rpl_stt_sock_release(struct stt_sock *stt_sock); + +#define stt_xmit_skb rpl_stt_xmit_skb +int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, + __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be64 tun_id); + +#define stt_init_module ovs_stt_init_module +int ovs_stt_init_module(void); + +#define stt_cleanup_module ovs_stt_cleanup_module +void ovs_stt_cleanup_module(void); + +#endif +#endif /*ifdef__NET_STT_H */ diff --git a/datapath/linux/compat/include/net/udp.h b/datapath/linux/compat/include/net/udp.h index 02eb688781d..fcb8f6a8e22 100644 --- a/datapath/linux/compat/include/net/udp.h +++ b/datapath/linux/compat/include/net/udp.h @@ -55,8 +55,9 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr, #endif #ifndef HAVE_UDP_SET_CSUM -void udp_set_csum(bool nocheck, struct sk_buff *skb, - __be32 saddr, __be32 daddr, int len); +#define udp_set_csum rpl_udp_set_csum +void rpl_udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len); #endif #endif diff --git a/datapath/linux/compat/include/net/udp_tunnel.h b/datapath/linux/compat/include/net/udp_tunnel.h index 6c25ca54bc9..81cb3df12e5 100644 --- a/datapath/linux/compat/include/net/udp_tunnel.h +++ b/datapath/linux/compat/include/net/udp_tunnel.h @@ -4,7 +4,7 @@ #include #include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,20,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) #include_next static inline struct sk_buff * @@ -50,8 +50,8 @@ struct udp_port_cfg { }; #define udp_sock_create rpl_udp_sock_create -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp); +int 
rpl_udp_sock_create(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp); typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); @@ -65,17 +65,20 @@ struct udp_tunnel_sock_cfg { }; /* Setup the given (UDP) sock to receive UDP encapsulated packets */ -void setup_udp_tunnel_sock(struct net *net, struct socket *sock, - struct udp_tunnel_sock_cfg *sock_cfg); +#define setup_udp_tunnel_sock rpl_setup_udp_tunnel_sock +void rpl_setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *sock_cfg); /* Transmit the skb using UDP encapsulation. */ -int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); +#define udp_tunnel_xmit_skb rpl_udp_tunnel_xmit_skb +int rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, + __be16 df, __be16 src_port, __be16 dst_port, + bool xnet, bool nocheck); -void udp_tunnel_sock_release(struct socket *sock); +#define udp_tunnel_sock_release rpl_udp_tunnel_sock_release +void rpl_udp_tunnel_sock_release(struct socket *sock); void ovs_udp_gso(struct sk_buff *skb); void ovs_udp_csum_gso(struct sk_buff *skb); @@ -102,5 +105,5 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, #define udp_tunnel_encap_enable(sock) udp_encap_enable() -#endif /* Linux version < 3.20 */ +#endif /* Linux version < 4.0 */ #endif diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h index 7511c2e8560..0d60c189cc2 100644 --- a/datapath/linux/compat/include/net/vxlan.h +++ b/datapath/linux/compat/include/net/vxlan.h @@ -130,19 +130,19 @@ struct vxlan_sock { }; #define vxlan_sock_add rpl_vxlan_sock_add -struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, 
- bool no_share, u32 flags); +struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, + bool no_share, u32 flags); #define vxlan_sock_release rpl_vxlan_sock_release -void vxlan_sock_release(struct vxlan_sock *vs); +void rpl_vxlan_sock_release(struct vxlan_sock *vs); #define vxlan_xmit_skb rpl_vxlan_xmit_skb -int vxlan_xmit_skb(struct vxlan_sock *vs, - struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags); +int rpl_vxlan_xmit_skb(struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags); #endif /* !HAVE_VXLAN_METADATA */ #endif diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c index f2c4ffd6724..8ff7cd79f7a 100644 --- a/datapath/linux/compat/ip_tunnels_core.c +++ b/datapath/linux/compat/ip_tunnels_core.c @@ -135,7 +135,7 @@ struct sk_buff *ovs_iptunnel_handle_offloads(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(ovs_iptunnel_handle_offloads); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int rpl_iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -168,15 +168,15 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->pkt_type = PACKET_HOST; return 0; } -EXPORT_SYMBOL_GPL(iptunnel_pull_header); +EXPORT_SYMBOL_GPL(rpl_iptunnel_pull_header); #endif -bool skb_is_encapsulated(struct sk_buff *skb) +bool ovs_skb_is_encapsulated(struct sk_buff *skb) { /* checking for inner protocol should be sufficient on newer kernel, but * old kernel just set encapsulation bit. 
*/ return ovs_skb_get_inner_protocol(skb) || skb_encapsulation(skb); } -EXPORT_SYMBOL_GPL(skb_is_encapsulated); +EXPORT_SYMBOL_GPL(ovs_skb_is_encapsulated); diff --git a/datapath/linux/compat/net_namespace.c b/datapath/linux/compat/net_namespace.c index 1fba3b124b9..4a9b1d64837 100644 --- a/datapath/linux/compat/net_namespace.c +++ b/datapath/linux/compat/net_namespace.c @@ -5,7 +5,7 @@ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) -int compat_init_net(struct net *net, struct rpl_pernet_operations *pnet) +int ovs_compat_init_net(struct net *net, struct rpl_pernet_operations *pnet) { int err; void *ovs_net = kzalloc(pnet->size, GFP_KERNEL); @@ -28,8 +28,9 @@ int compat_init_net(struct net *net, struct rpl_pernet_operations *pnet) kfree(ovs_net); return err; } +EXPORT_SYMBOL_GPL(ovs_compat_init_net); -void compat_exit_net(struct net *net, struct rpl_pernet_operations *pnet) +void ovs_compat_exit_net(struct net *net, struct rpl_pernet_operations *pnet) { void *ovs_net = net_generic(net, *pnet->id); @@ -37,4 +38,6 @@ void compat_exit_net(struct net *net, struct rpl_pernet_operations *pnet) pnet->exit(net); kfree(ovs_net); } +EXPORT_SYMBOL_GPL(ovs_compat_exit_net); + #endif diff --git a/datapath/linux/compat/netdevice.c b/datapath/linux/compat/netdevice.c index 2932f5cb6f3..7bb8f77938d 100644 --- a/datapath/linux/compat/netdevice.c +++ b/datapath/linux/compat/netdevice.c @@ -72,6 +72,7 @@ netdev_features_t rpl_netif_skb_features(struct sk_buff *skb) return harmonize_features(skb, protocol, features); } } +EXPORT_SYMBOL_GPL(rpl_netif_skb_features); #endif /* kernel version < 2.6.38 */ #if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) @@ -106,4 +107,6 @@ struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, skb->protocol = skb_proto; return skb_gso; } +EXPORT_SYMBOL_GPL(rpl_skb_gso_segment); + #endif /* kernel version < 3.16.0 */ diff --git a/datapath/linux/compat/reciprocal_div.c b/datapath/linux/compat/reciprocal_div.c index 90ce7b1ca64..818502a0f1e 100644 --- 
a/datapath/linux/compat/reciprocal_div.c +++ b/datapath/linux/compat/reciprocal_div.c @@ -1,5 +1,6 @@ #include #include +#include #include /* @@ -7,7 +8,7 @@ * include/linux/reciprocal_div.h */ -struct reciprocal_value reciprocal_value(u32 d) +struct reciprocal_value rpl_reciprocal_value(u32 d) { struct reciprocal_value R; u64 m; @@ -23,3 +24,4 @@ struct reciprocal_value reciprocal_value(u32 d) return R; } +EXPORT_SYMBOL_GPL(rpl_reciprocal_value); diff --git a/datapath/linux/compat/skbuff-openvswitch.c b/datapath/linux/compat/skbuff-openvswitch.c index 5de43b36fd2..3ecf1fe2930 100644 --- a/datapath/linux/compat/skbuff-openvswitch.c +++ b/datapath/linux/compat/skbuff-openvswitch.c @@ -3,6 +3,8 @@ #include #include +#include "gso.h" + #if !defined(HAVE_SKB_WARN_LRO) && defined(NETIF_F_LRO) #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -35,7 +37,7 @@ static inline bool head_frag(const struct sk_buff *skb) * into skb_zerocopy(). */ unsigned int -skb_zerocopy_headlen(const struct sk_buff *from) +rpl_skb_zerocopy_headlen(const struct sk_buff *from) { unsigned int hlen = 0; @@ -49,6 +51,7 @@ skb_zerocopy_headlen(const struct sk_buff *from) return hlen; } +EXPORT_SYMBOL_GPL(rpl_skb_zerocopy_headlen); #ifndef HAVE_SKB_ZEROCOPY /** @@ -70,7 +73,7 @@ skb_zerocopy_headlen(const struct sk_buff *from) * -EFAULT: skb_copy_bits() found some problem with skb geometry */ int -skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) +rpl_skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) { int i, j = 0; int plen = 0; /* length of skb->head fragment */ @@ -123,11 +126,12 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) return 0; } +EXPORT_SYMBOL_GPL(rpl_skb_zerocopy); #endif #endif #ifndef HAVE_SKB_ENSURE_WRITABLE -int skb_ensure_writable(struct sk_buff *skb, int write_len) +int rpl_skb_ensure_writable(struct sk_buff *skb, int write_len) { if (!pskb_may_pull(skb, write_len)) return -ENOMEM; @@ -137,6 +141,7 @@ int 
skb_ensure_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +EXPORT_SYMBOL_GPL(rpl_skb_ensure_writable); #endif #ifndef HAVE_SKB_VLAN_POP @@ -173,7 +178,7 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) return err; } -int skb_vlan_pop(struct sk_buff *skb) +int rpl_skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; __be16 vlan_proto; @@ -205,10 +210,11 @@ int skb_vlan_pop(struct sk_buff *skb) __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } +EXPORT_SYMBOL_GPL(rpl_skb_vlan_pop); #endif #ifndef HAVE_SKB_VLAN_PUSH -int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { unsigned int offset = skb->data - skb_mac_header(skb); @@ -233,4 +239,31 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; } +EXPORT_SYMBOL_GPL(rpl_skb_vlan_push); +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) +int rpl_pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask) +{ + int err; + int inner_mac_offset, inner_nw_offset, inner_transport_offset; + + inner_mac_offset = skb_inner_mac_offset(skb); + inner_nw_offset = skb_inner_network_offset(skb); + inner_transport_offset = ovs_skb_inner_transport_offset(skb); + +#undef pskb_expand_head + err = pskb_expand_head(skb, nhead, ntail, gfp_mask); + if (err) + return err; + + skb_set_inner_mac_header(skb, inner_mac_offset); + skb_set_inner_network_header(skb, inner_nw_offset); + skb_set_inner_transport_header(skb, inner_transport_offset); + + return 0; +} +EXPORT_SYMBOL(rpl_pskb_expand_head); + #endif diff --git a/datapath/linux/compat/stt.c b/datapath/linux/compat/stt.c new file mode 100644 index 00000000000..b44f4708f4e --- /dev/null +++ b/datapath/linux/compat/stt.c @@ -0,0 +1,1550 @@ +/* + * Stateless TCP Tunnel (STT) vport. 
+ * + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gso.h" + +#ifdef OVS_STT +#define STT_VER 0 + +#define STT_CSUM_VERIFIED BIT(0) +#define STT_CSUM_PARTIAL BIT(1) +#define STT_PROTO_IPV4 BIT(2) +#define STT_PROTO_TCP BIT(3) +#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP) + +#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \ + SKB_GSO_TCPV6) + +/* The length and offset of a fragment are encoded in the sequence number. + * STT_SEQ_LEN_SHIFT is the left shift needed to store the length. + * STT_SEQ_OFFSET_MASK is the mask to extract the offset. + */ +#define STT_SEQ_LEN_SHIFT 16 +#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1) + +/* The maximum amount of memory used to store packets waiting to be reassembled + * on a given CPU. Once this threshold is exceeded we will begin freeing the + * least recently used fragments. + */ +#define REASM_HI_THRESH (4 * 1024 * 1024) +/* The target for the high memory evictor. Once we have exceeded + * REASM_HI_THRESH, we will continue freeing fragments until we hit + * this limit. + */ +#define REASM_LO_THRESH (3 * 1024 * 1024) +/* The length of time a given packet has to be reassembled from the time the + * first fragment arrives. Once this limit is exceeded it becomes available + * for cleaning. + */ +#define FRAG_EXP_TIME (30 * HZ) +/* Number of hash entries. Each entry has only a single slot to hold a packet + * so if there are collisions, we will drop packets. 
This is allocated + * per-cpu and each entry consists of struct pkt_frag. + */ +#define FRAG_HASH_SHIFT 8 +#define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT) +#define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT) + +#define CLEAN_PERCPU_INTERVAL (30 * HZ) + +struct pkt_key { + __be32 saddr; + __be32 daddr; + __be32 pkt_seq; + u32 mark; +}; + +struct pkt_frag { + struct sk_buff *skbs; + unsigned long timestamp; + struct list_head lru_node; + struct pkt_key key; +}; + +struct stt_percpu { + struct flex_array *frag_hash; + struct list_head frag_lru; + unsigned int frag_mem_used; + + /* Protect frags table. */ + spinlock_t lock; +}; + +struct first_frag { + struct sk_buff *last_skb; + unsigned int mem_used; + u16 tot_len; + u16 rcvd_len; + bool set_ecn_ce; +}; + +struct frag_skb_cb { + u16 offset; + + /* Only valid for the first skb in the chain. */ + struct first_frag first; +}; + +#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb) + +/* per-network namespace private data for this module */ +struct stt_net { + struct list_head sock_list; +}; + +static int stt_net_id; + +static struct stt_percpu __percpu *stt_percpu_data __read_mostly; +static u32 frag_hash_seed __read_mostly; + +/* Protects sock-hash and refcounts. 
*/ +static DEFINE_MUTEX(stt_mutex); + +static int n_tunnels; +static DEFINE_PER_CPU(u32, pkt_seq_counter); + +static void clean_percpu(struct work_struct *work); +static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu); + +static struct stt_sock *stt_find_sock(struct net *net, __be16 port) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_sock *stt_sock; + + list_for_each_entry_rcu(stt_sock, &sn->sock_list, list) { + if (inet_sk(stt_sock->sock->sk)->inet_sport == port) + return stt_sock; + } + return NULL; +} + +static __be32 ack_seq(void) +{ +#if NR_CPUS <= 65536 + u32 pkt_seq, ack; + + pkt_seq = this_cpu_read(pkt_seq_counter); + ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id(); + this_cpu_inc(pkt_seq_counter); + + return (__force __be32)ack; +#else +#error "Support for greater than 64k CPUs not implemented" +#endif +} + +static int clear_gso(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int err; + + if (shinfo->gso_type == 0 && shinfo->gso_size == 0 && + shinfo->gso_segs == 0) + return 0; + + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + return err; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->gso_segs = 0; + return 0; +} + +static struct sk_buff *normalize_frag_list(struct sk_buff *head, + struct sk_buff **skbp) +{ + struct sk_buff *skb = *skbp; + struct sk_buff *last; + + do { + struct sk_buff *frags; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + return ERR_PTR(-ENOMEM); + + nskb->next = skb->next; + consume_skb(skb); + skb = nskb; + *skbp = skb; + } + + if (head) { + head->len -= skb->len; + head->data_len -= skb->len; + head->truesize -= skb->truesize; + } + + frags = skb_shinfo(skb)->frag_list; + if (frags) { + int err; + + err = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(err)) + return ERR_PTR(err); + + last = normalize_frag_list(skb, &frags); + if (IS_ERR(last)) + return last; + + 
skb_shinfo(skb)->frag_list = NULL; + last->next = skb->next; + skb->next = frags; + } else { + last = skb; + } + + skbp = &skb->next; + } while ((skb = skb->next)); + + return last; +} + +/* Takes a linked list of skbs, which potentially contain frag_list + * (whose members in turn potentially contain frag_lists, etc.) and + * converts them into a single linear linked list. + */ +static int straighten_frag_list(struct sk_buff **skbp) +{ + struct sk_buff *err_skb; + + err_skb = normalize_frag_list(NULL, skbp); + if (IS_ERR(err_skb)) + return PTR_ERR(err_skb); + + return 0; +} + +static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->protocol = from->protocol; + to->tstamp = from->tstamp; + to->priority = from->priority; + to->mark = from->mark; + to->vlan_tci = from->vlan_tci; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + to->vlan_proto = from->vlan_proto; +#endif + skb_copy_secmark(to, from); +} + +static void update_headers(struct sk_buff *skb, bool head, + unsigned int l4_offset, unsigned int hdr_len, + bool ipv4, u32 tcp_seq) +{ + u16 old_len, new_len; + __be32 delta; + struct tcphdr *tcph; + int gso_size; + + if (ipv4) { + struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN); + + old_len = ntohs(iph->tot_len); + new_len = skb->len - ETH_HLEN; + iph->tot_len = htons(new_len); + + ip_send_check(iph); + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN); + + old_len = ntohs(ip6h->payload_len); + new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr); + ip6h->payload_len = htons(new_len); + } + + tcph = (struct tcphdr *)(skb->data + l4_offset); + if (!head) { + tcph->seq = htonl(tcp_seq); + tcph->cwr = 0; + } + + if (skb->next) { + tcph->fin = 0; + tcph->psh = 0; + } + + delta = htonl(~old_len + new_len); + tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check + + (__force u32)delta)); + + gso_size = skb_shinfo(skb)->gso_size; + if (gso_size && skb->len - hdr_len <= gso_size) + 
BUG_ON(clear_gso(skb)); +} + +static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial) +{ + /* If no offloading is in use then we don't have enough information + * to process the headers. + */ + if (!csum_partial) + goto linearize; + + /* Handling UDP packets requires IP fragmentation, which means that + * the L4 checksum can no longer be calculated by hardware (since the + * fragments are in different packets. If we have to compute the + * checksum it's faster just to linearize and large UDP packets are + * pretty uncommon anyways, so it's not worth dealing with for now. + */ + if (!tcp) + goto linearize; + + if (ipv4) { + struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN); + + /* It's difficult to get the IP IDs exactly right here due to + * varying segment sizes and potentially multiple layers of + * segmentation. IP ID isn't important when DF is set and DF + * is generally set for TCP packets, so just linearize if it's + * not. + */ + if (!(iph->frag_off & htons(IP_DF))) + goto linearize; + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN); + + /* Jumbograms require more processing to update and we'll + * probably never see them, so just linearize. + */ + if (ip6h->payload_len == 0) + goto linearize; + } + return true; + +linearize: + return false; +} + +static int copy_headers(struct sk_buff *head, struct sk_buff *frag, + int hdr_len) +{ + u16 csum_start; + + if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) { + int extra_head = hdr_len - skb_headroom(frag); + + extra_head = extra_head > 0 ? 
extra_head : 0; + if (unlikely(pskb_expand_head(frag, extra_head, 0, + GFP_ATOMIC))) + return -ENOMEM; + } + + memcpy(__skb_push(frag, hdr_len), head->data, hdr_len); + + csum_start = head->csum_start - skb_headroom(head); + frag->csum_start = skb_headroom(frag) + csum_start; + frag->csum_offset = head->csum_offset; + frag->ip_summed = head->ip_summed; + + skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size; + skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type; + skb_shinfo(frag)->gso_segs = 0; + + copy_skb_metadata(frag, head); + return 0; +} + +static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset) +{ + struct sk_buff *skb; + struct tcphdr *tcph; + int seg_len; + int hdr_len; + int tcp_len; + u32 seq; + + if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph)))) + return -ENOMEM; + + tcph = (struct tcphdr *)(head->data + l4_offset); + tcp_len = tcph->doff * 4; + hdr_len = l4_offset + tcp_len; + + if (unlikely((tcp_len < sizeof(struct tcphdr)) || + (head->len < hdr_len))) + return -EINVAL; + + if (unlikely(!pskb_may_pull(head, hdr_len))) + return -ENOMEM; + + tcph = (struct tcphdr *)(head->data + l4_offset); + /* Update header of each segment. */ + seq = ntohl(tcph->seq); + seg_len = skb_pagelen(head) - hdr_len; + + skb = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + head->next = skb; + for (; skb; skb = skb->next) { + int err; + + head->len -= skb->len; + head->data_len -= skb->len; + head->truesize -= skb->truesize; + + seq += seg_len; + seg_len = skb->len; + err = copy_headers(head, skb, hdr_len); + if (err) + return err; + update_headers(skb, false, l4_offset, hdr_len, ipv4, seq); + } + update_headers(head, true, l4_offset, hdr_len, ipv4, 0); + return 0; +} + +static int coalesce_skb(struct sk_buff **headp) +{ + struct sk_buff *frag, *head, *prev; + int err; + + err = straighten_frag_list(headp); + if (unlikely(err)) + return err; + head = *headp; + + /* Coalesce frag list. 
*/ + prev = head; + for (frag = head->next; frag; frag = frag->next) { + bool headstolen; + int delta; + + if (unlikely(skb_unclone(prev, GFP_ATOMIC))) + return -ENOMEM; + + if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) { + prev = frag; + continue; + } + + prev->next = frag->next; + frag->len = 0; + frag->data_len = 0; + frag->truesize -= delta; + kfree_skb_partial(frag, headstolen); + frag = prev; + } + + if (!head->next) + return 0; + + for (frag = head->next; frag; frag = frag->next) { + head->len += frag->len; + head->data_len += frag->len; + head->truesize += frag->truesize; + } + + skb_shinfo(head)->frag_list = head->next; + head->next = NULL; + return 0; +} + +static int __try_to_segment(struct sk_buff *skb, bool csum_partial, + bool ipv4, bool tcp, int l4_offset) +{ + if (can_segment(skb, ipv4, tcp, csum_partial)) + return skb_list_segment(skb, ipv4, l4_offset); + else + return skb_linearize(skb); +} + +static int try_to_segment(struct sk_buff *skb) +{ + struct stthdr *stth = stt_hdr(skb); + bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL); + bool ipv4 = !!(stth->flags & STT_PROTO_IPV4); + bool tcp = !!(stth->flags & STT_PROTO_TCP); + int l4_offset = stth->l4_offset; + + return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset); +} + +static int segment_skb(struct sk_buff **headp, bool csum_partial, + bool ipv4, bool tcp, int l4_offset) +{ + int err; + + err = coalesce_skb(headp); + if (err) + return err; + + if (skb_shinfo(*headp)->frag_list) + return __try_to_segment(*headp, csum_partial, + ipv4, tcp, l4_offset); + return 0; +} + +static int __push_stt_header(struct sk_buff *skb, __be64 tun_id, + __be16 s_port, __be16 d_port, + __be32 saddr, __be32 dst, + __be16 l3_proto, u8 l4_proto, + int dst_mtu) +{ + int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD; + unsigned short encap_mss; + struct tcphdr *tcph; + struct stthdr *stth; + + skb_push(skb, STT_HEADER_LEN); + skb_reset_transport_header(skb); + tcph = tcp_hdr(skb); + 
memset(tcph, 0, STT_HEADER_LEN); + stth = stt_hdr(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + stth->flags |= STT_CSUM_PARTIAL; + + stth->l4_offset = skb->csum_start - + (skb_headroom(skb) + + STT_HEADER_LEN); + + if (l3_proto == htons(ETH_P_IP)) + stth->flags |= STT_PROTO_IPV4; + + if (l4_proto == IPPROTO_TCP) + stth->flags |= STT_PROTO_TCP; + + stth->mss = htons(skb_shinfo(skb)->gso_size); + } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + stth->flags |= STT_CSUM_VERIFIED; + } + + stth->vlan_tci = htons(skb->vlan_tci); + skb->vlan_tci = 0; + put_unaligned(tun_id, &stth->key); + + tcph->source = s_port; + tcph->dest = d_port; + tcph->doff = sizeof(struct tcphdr) / 4; + tcph->ack = 1; + tcph->psh = 1; + tcph->window = htons(USHRT_MAX); + tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT); + tcph->ack_seq = ack_seq(); + tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0); + + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); + if (data_len > encap_mss) { + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) + return -EINVAL; + + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_size = encap_mss; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss); + } else { + if (unlikely(clear_gso(skb))) + return -EINVAL; + } + return 0; +} + +static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id, + __be16 s_port, __be16 d_port, + __be32 saddr, __be32 dst, + __be16 l3_proto, u8 l4_proto, + int dst_mtu) +{ + struct sk_buff *skb; + + if (skb_shinfo(head)->frag_list) { + bool ipv4 = (l3_proto == htons(ETH_P_IP)); + bool tcp = (l4_proto == IPPROTO_TCP); + bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL); + int l4_offset = skb_transport_offset(head); + + /* Need to call skb_orphan() to report currect true-size. 
+ * calling skb_orphan() in this layer is odd but SKB with + * frag-list should not be associated with any socket, so + * skb-orphan should be no-op. */ + skb_orphan(head); + if (unlikely(segment_skb(&head, csum_partial, + ipv4, tcp, l4_offset))) + goto error; + } + + for (skb = head; skb; skb = skb->next) { + if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst, + l3_proto, l4_proto, dst_mtu)) + goto error; + } + + return head; +error: + kfree_skb_list(head); + return NULL; +} + +static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto) +{ + if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) { + int csum_offset; + __sum16 *csum; + int len; + + if (l4_proto == IPPROTO_TCP) + csum_offset = offsetof(struct tcphdr, check); + else if (l4_proto == IPPROTO_UDP) + csum_offset = offsetof(struct udphdr, check); + else + return 0; + + len = skb->len - skb_transport_offset(skb); + csum = (__sum16 *)(skb_transport_header(skb) + csum_offset); + + if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) + + csum_offset + sizeof(*csum)))) + return -EINVAL; + + if (l3_proto == htons(ETH_P_IP)) { + struct iphdr *iph = ip_hdr(skb); + + *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + len, l4_proto, 0); + } else if (l3_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = ipv6_hdr(skb); + + *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, l4_proto, 0); + } else { + return 0; + } + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + } + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Assume receiver can only offload TCP/UDP over IPv4/6, + * and require 802.1Q VLANs to be accelerated. + */ + if (l3_proto != htons(ETH_P_IP) && + l3_proto != htons(ETH_P_IPV6)) + return 0; + + if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) + return 0; + + /* L4 offset must fit in a 1-byte field. 
*/ + if (skb->csum_start - skb_headroom(skb) > 255) + return 0; + + if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES) + return 0; + } + /* Total size of encapsulated packet must fit in 16 bits. */ + if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535) + return 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) + return 0; +#endif + return 1; +} + +static bool need_linearize(const struct sk_buff *skb) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int i; + + if (unlikely(shinfo->frag_list)) + return true; + + /* Generally speaking we should linearize if there are paged frags. + * However, if all of the refcounts are 1 we know nobody else can + * change them from underneath us and we can skip the linearization. + */ + for (i = 0; i < shinfo->nr_frags; i++) + if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1)) + return true; + + return false; +} + +static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom) +{ + int err; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) { + + min_headroom += VLAN_HLEN; + if (skb_headroom(skb) < min_headroom) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + 16); + + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto error; + } + + skb = __vlan_hwaccel_push_inside(skb); + if (!skb) { + err = -ENOMEM; + goto error; + } + } +#endif + + if (skb_is_gso(skb)) { + struct sk_buff *nskb; + char cb[sizeof(skb->cb)]; + + memcpy(cb, skb->cb, sizeof(cb)); + + nskb = __skb_gso_segment(skb, 0, false); + if (IS_ERR(nskb)) { + err = PTR_ERR(nskb); + goto error; + } + + consume_skb(skb); + skb = nskb; + while (nskb) { + memcpy(nskb->cb, cb, sizeof(cb)); + nskb = nskb->next; + } + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Pages aren't locked and could change at any 
time. + * If this happens after we compute the checksum, the + * checksum will be wrong. We linearize now to avoid + * this problem. + */ + if (unlikely(need_linearize(skb))) { + err = __skb_linearize(skb); + if (unlikely(err)) + goto error; + } + + err = skb_checksum_help(skb); + if (unlikely(err)) + goto error; + } + skb->ip_summed = CHECKSUM_NONE; + + return skb; +error: + kfree_skb(skb); + return ERR_PTR(err); +} + +static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, + __be32 dst, __u8 tos, __u8 ttl, __be16 df) +{ + int len = 0; + + while (skb) { + struct sk_buff *next = skb->next; + + if (next) + dst_clone(&rt->dst); + + skb_clear_ovs_gso_cb(skb); + skb->next = NULL; + len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP, + tos, ttl, df, false); + + skb = next; + } + return len; +} + +static u8 parse_ipv6_l4_proto(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + + if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr)))) + return 0; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + if (unlikely(payload_ofs < 0)) + return 0; + + return nexthdr; +} + +static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto) +{ + if (l3_proto == htons(ETH_P_IP)) { + unsigned int nh_ofs = skb_network_offset(skb); + + if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr)))) + return 0; + + return ip_hdr(skb)->protocol; + } else if (l3_proto == htons(ETH_P_IPV6)) { + return parse_ipv6_l4_proto(skb); + } + return 0; +} + +int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, + __be32 src, __be32 dst, __u8 tos, + __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, + __be64 tun_id) +{ + struct ethhdr *eh = eth_hdr(skb); + int ret = 0, min_headroom; + __be16 inner_l3_proto; + u8 inner_l4_proto; + + 
inner_l3_proto = eh->h_proto; + inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + STT_HEADER_LEN + sizeof(struct iphdr); + + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + + ret = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(ret)) + goto err_free_rt; + } + + ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto); + if (ret < 0) + goto err_free_rt; + if (!ret) { + skb = handle_offloads(skb, min_headroom); + if (IS_ERR(skb)) { + ret = PTR_ERR(skb); + skb = NULL; + goto err_free_rt; + } + } + + ret = 0; + while (skb) { + struct sk_buff *next_skb = skb->next; + + skb->next = NULL; + + if (next_skb) + dst_clone(&rt->dst); + + /* Push STT and TCP header. */ + skb = push_stt_header(skb, tun_id, src_port, dst_port, src, + dst, inner_l3_proto, inner_l4_proto, + dst_mtu(&rt->dst)); + if (unlikely(!skb)) { + ip_rt_put(rt); + goto next; + } + + /* Push IP header. 
*/ + ret += skb_list_xmit(rt, skb, src, dst, tos, ttl, df); + +next: + skb = next_skb; + } + + return ret; + +err_free_rt: + ip_rt_put(rt); + kfree_skb(skb); + return ret; +} +EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb); + +static void free_frag(struct stt_percpu *stt_percpu, + struct pkt_frag *frag) +{ + stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used; + kfree_skb_list(frag->skbs); + list_del(&frag->lru_node); + frag->skbs = NULL; +} + +static void evict_frags(struct stt_percpu *stt_percpu) +{ + while (!list_empty(&stt_percpu->frag_lru) && + stt_percpu->frag_mem_used > REASM_LO_THRESH) { + struct pkt_frag *frag; + + frag = list_first_entry(&stt_percpu->frag_lru, + struct pkt_frag, + lru_node); + free_frag(stt_percpu, frag); + } +} + +static bool pkt_key_match(struct net *net, + const struct pkt_frag *a, const struct pkt_key *b) +{ + return a->key.saddr == b->saddr && a->key.daddr == b->daddr && + a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark && + net_eq(dev_net(a->skbs->dev), net); +} + +static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key) +{ + u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark; + + return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr, + (__force u32)key->pkt_seq, initval); +} + +static struct pkt_frag *lookup_frag(struct net *net, + struct stt_percpu *stt_percpu, + const struct pkt_key *key, u32 hash) +{ + struct pkt_frag *frag, *victim_frag = NULL; + int i; + + for (i = 0; i < FRAG_HASH_SEGS; i++) { + frag = flex_array_get(stt_percpu->frag_hash, + hash & (FRAG_HASH_ENTRIES - 1)); + + if (frag->skbs && + time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) && + pkt_key_match(net, frag, key)) + return frag; + + if (!victim_frag || + (victim_frag->skbs && + (!frag->skbs || + time_before(frag->timestamp, victim_frag->timestamp)))) + victim_frag = frag; + + hash >>= FRAG_HASH_SHIFT; + } + + if (victim_frag->skbs) + free_frag(stt_percpu, victim_frag); + + return victim_frag; +} + 
+static struct sk_buff *reassemble(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + struct tcphdr *tcph = tcp_hdr(skb); + u32 seq = ntohl(tcph->seq); + struct stt_percpu *stt_percpu; + struct sk_buff *last_skb; + struct pkt_frag *frag; + struct pkt_key key; + int tot_len; + u32 hash; + + tot_len = seq >> STT_SEQ_LEN_SHIFT; + FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK; + + if (unlikely(skb->len == 0)) + goto out_free; + + if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len)) + goto out_free; + + if (tot_len == skb->len) + goto out; + + key.saddr = iph->saddr; + key.daddr = iph->daddr; + key.pkt_seq = tcph->ack_seq; + key.mark = skb->mark; + hash = pkt_key_hash(dev_net(skb->dev), &key); + + stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id()); + + spin_lock(&stt_percpu->lock); + + if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH)) + evict_frags(stt_percpu); + + frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash); + if (!frag->skbs) { + frag->skbs = skb; + frag->key = key; + frag->timestamp = jiffies; + FRAG_CB(skb)->first.last_skb = skb; + FRAG_CB(skb)->first.mem_used = skb->truesize; + FRAG_CB(skb)->first.tot_len = tot_len; + FRAG_CB(skb)->first.rcvd_len = skb->len; + FRAG_CB(skb)->first.set_ecn_ce = false; + list_add_tail(&frag->lru_node, &stt_percpu->frag_lru); + stt_percpu->frag_mem_used += skb->truesize; + + skb = NULL; + goto unlock; + } + + /* Optimize for the common case where fragments are received in-order + * and not overlapping. + */ + last_skb = FRAG_CB(frag->skbs)->first.last_skb; + if (likely(FRAG_CB(last_skb)->offset + last_skb->len == + FRAG_CB(skb)->offset)) { + last_skb->next = skb; + FRAG_CB(frag->skbs)->first.last_skb = skb; + } else { + struct sk_buff *prev = NULL, *next; + + for (next = frag->skbs; next; next = next->next) { + if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset) + break; + prev = next; + } + + /* Overlapping fragments aren't allowed. 
We shouldn't start + * before the end of the previous fragment. + */ + if (prev && + FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset) + goto unlock_free; + + /* We also shouldn't end after the beginning of the next + * fragment. + */ + if (next && + FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset) + goto unlock_free; + + if (prev) { + prev->next = skb; + } else { + FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first; + frag->skbs = skb; + } + + if (next) + skb->next = next; + else + FRAG_CB(frag->skbs)->first.last_skb = skb; + } + + FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos); + FRAG_CB(frag->skbs)->first.rcvd_len += skb->len; + FRAG_CB(frag->skbs)->first.mem_used += skb->truesize; + stt_percpu->frag_mem_used += skb->truesize; + + if (FRAG_CB(frag->skbs)->first.tot_len == + FRAG_CB(frag->skbs)->first.rcvd_len) { + struct sk_buff *frag_head = frag->skbs; + + frag_head->tstamp = skb->tstamp; + if (FRAG_CB(frag_head)->first.set_ecn_ce) + INET_ECN_set_ce(frag_head); + + list_del(&frag->lru_node); + stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used; + frag->skbs = NULL; + skb = frag_head; + } else { + list_move_tail(&frag->lru_node, &stt_percpu->frag_lru); + skb = NULL; + } + + goto unlock; + +unlock_free: + kfree_skb(skb); + skb = NULL; +unlock: + spin_unlock(&stt_percpu->lock); + return skb; +out_free: + kfree_skb(skb); + skb = NULL; +out: + return skb; +} + +static bool validate_checksum(struct sk_buff *skb) +{ + struct iphdr *iph = ip_hdr(skb); + + if (skb_csum_unnecessary(skb)) + return true; + + if (skb->ip_summed == CHECKSUM_COMPLETE && + !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum)) + return true; + + skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, + IPPROTO_TCP, 0); + + return __tcp_checksum_complete(skb) == 0; +} + +static bool set_offloads(struct sk_buff *skb) +{ + struct stthdr *stth = stt_hdr(skb); + unsigned short gso_type; + int l3_header_size; + int l4_header_size; + u16 
csum_offset; + u8 proto_type; + + if (stth->vlan_tci) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), + ntohs(stth->vlan_tci)); + + if (!(stth->flags & STT_CSUM_PARTIAL)) { + if (stth->flags & STT_CSUM_VERIFIED) + skb->ip_summed = CHECKSUM_UNNECESSARY; + else + skb->ip_summed = CHECKSUM_NONE; + + return clear_gso(skb) == 0; + } + + proto_type = stth->flags & STT_PROTO_TYPES; + + switch (proto_type) { + case (STT_PROTO_IPV4 | STT_PROTO_TCP): + /* TCP/IPv4 */ + csum_offset = offsetof(struct tcphdr, check); + gso_type = SKB_GSO_TCPV4; + l3_header_size = sizeof(struct iphdr); + l4_header_size = sizeof(struct tcphdr); + skb->protocol = htons(ETH_P_IP); + break; + case STT_PROTO_TCP: + /* TCP/IPv6 */ + csum_offset = offsetof(struct tcphdr, check); + gso_type = SKB_GSO_TCPV6; + l3_header_size = sizeof(struct ipv6hdr); + l4_header_size = sizeof(struct tcphdr); + skb->protocol = htons(ETH_P_IPV6); + break; + case STT_PROTO_IPV4: + /* UDP/IPv4 */ + csum_offset = offsetof(struct udphdr, check); + gso_type = SKB_GSO_UDP; + l3_header_size = sizeof(struct iphdr); + l4_header_size = sizeof(struct udphdr); + skb->protocol = htons(ETH_P_IP); + break; + default: + /* UDP/IPv6 */ + csum_offset = offsetof(struct udphdr, check); + gso_type = SKB_GSO_UDP; + l3_header_size = sizeof(struct ipv6hdr); + l4_header_size = sizeof(struct udphdr); + skb->protocol = htons(ETH_P_IPV6); + } + + if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size)) + return false; + + if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size))) + return false; + + stth = stt_hdr(skb); + + skb->csum_start = skb_headroom(skb) + stth->l4_offset; + skb->csum_offset = csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (stth->mss) { + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) + return false; + + skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY; + skb_shinfo(skb)->gso_size = ntohs(stth->mss); + skb_shinfo(skb)->gso_segs = 0; + } else { + if (unlikely(clear_gso(skb))) + return false; + } + + 
return true; +} +static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) +{ + int err; + + if (unlikely(!validate_checksum(skb))) + goto drop; + + skb = reassemble(skb); + if (!skb) + return; + + if (skb->next && coalesce_skb(&skb)) + goto drop; + + err = iptunnel_pull_header(skb, + sizeof(struct stthdr) + STT_ETH_PAD, + htons(ETH_P_TEB)); + if (unlikely(err)) + goto drop; + + if (unlikely(stt_hdr(skb)->version != 0)) + goto drop; + + if (unlikely(!set_offloads(skb))) + goto drop; + + if (skb_shinfo(skb)->frag_list && try_to_segment(skb)) + goto drop; + + stt_sock->rcv(stt_sock, skb); + return; +drop: + /* Consume bad packet */ + kfree_skb_list(skb); +} + +static void tcp_sock_release(struct socket *sock) +{ + kernel_sock_shutdown(sock, SHUT_RDWR); + sk_release_kernel(sock->sk); +} + +static int tcp_sock_create4(struct net *net, __be16 port, + struct socket **sockp) +{ + struct sockaddr_in tcp_addr; + struct socket *sock = NULL; + int err; + + err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) + goto error; + + sk_change_net(sock->sk, net); + + memset(&tcp_addr, 0, sizeof(tcp_addr)); + tcp_addr.sin_family = AF_INET; + tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY); + tcp_addr.sin_port = port; + err = kernel_bind(sock, (struct sockaddr *)&tcp_addr, + sizeof(tcp_addr)); + if (err < 0) + goto error; + + *sockp = sock; + return 0; + +error: + if (sock) + tcp_sock_release(sock); + *sockp = NULL; + return err; +} + +static void schedule_clean_percpu(void) +{ + schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL); +} + +static void clean_percpu(struct work_struct *work) +{ + int i; + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + int j; + + for (j = 0; j < FRAG_HASH_ENTRIES; j++) { + struct pkt_frag *frag; + + frag = flex_array_get(stt_percpu->frag_hash, j); + if (!frag->skbs || + time_before(jiffies, frag->timestamp + FRAG_EXP_TIME)) + continue; + + 
spin_lock_bh(&stt_percpu->lock); + + if (frag->skbs && + time_after(jiffies, frag->timestamp + FRAG_EXP_TIME)) + free_frag(stt_percpu, frag); + + spin_unlock_bh(&stt_percpu->lock); + } + } + schedule_clean_percpu(); +} + +#ifdef HAVE_NF_HOOKFN_ARG_OPS +#define FIRST_PARAM const struct nf_hook_ops *ops, +#else +#define FIRST_PARAM unsigned int hooknum, +#endif + +static unsigned int nf_ip_hook(FIRST_PARAM + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct stt_sock *stt_sock; + int ip_hdr_len; + + if (ip_hdr(skb)->protocol != IPPROTO_TCP) + return NF_ACCEPT; + + ip_hdr_len = ip_hdrlen(skb); + if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr)))) + return NF_ACCEPT; + + skb_set_transport_header(skb, ip_hdr_len); + + stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest); + if (!stt_sock) + return NF_ACCEPT; + + __skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr)); + stt_rcv(stt_sock, skb); + return NF_STOLEN; +} + +static struct nf_hook_ops nf_hook_ops __read_mostly = { + .hook = nf_ip_hook, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = INT_MAX, +}; + +static int stt_start(void) +{ + int err; + int i; + + if (n_tunnels) { + n_tunnels++; + return 0; + } + get_random_bytes(&frag_hash_seed, sizeof(u32)); + + stt_percpu_data = alloc_percpu(struct stt_percpu); + if (!stt_percpu_data) { + err = -ENOMEM; + goto error; + } + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + struct flex_array *frag_hash; + + spin_lock_init(&stt_percpu->lock); + INIT_LIST_HEAD(&stt_percpu->frag_lru); + get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32)); + + frag_hash = flex_array_alloc(sizeof(struct pkt_frag), + FRAG_HASH_ENTRIES, + GFP_KERNEL | __GFP_ZERO); + if (!frag_hash) { + err = -ENOMEM; + goto free_percpu; + } + stt_percpu->frag_hash = frag_hash; + + err = 
flex_array_prealloc(stt_percpu->frag_hash, 0, + FRAG_HASH_ENTRIES, + GFP_KERNEL | __GFP_ZERO); + if (err) + goto free_percpu; + } + err = nf_register_hook(&nf_hook_ops); + if (err) + goto free_percpu; + + schedule_clean_percpu(); + n_tunnels++; + return 0; + +free_percpu: + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + + if (stt_percpu->frag_hash) + flex_array_free(stt_percpu->frag_hash); + } + + free_percpu(stt_percpu_data); + +error: + return err; +} + +static void stt_cleanup(void) +{ + int i; + + n_tunnels--; + if (n_tunnels) + return; + + cancel_delayed_work_sync(&clean_percpu_wq); + nf_unregister_hook(&nf_hook_ops); + + for_each_possible_cpu(i) { + struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); + int j; + + for (j = 0; j < FRAG_HASH_ENTRIES; j++) { + struct pkt_frag *frag; + + frag = flex_array_get(stt_percpu->frag_hash, j); + kfree_skb_list(frag->skbs); + } + + flex_array_free(stt_percpu->frag_hash); + } + + free_percpu(stt_percpu_data); +} + +static struct stt_sock *stt_socket_create(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + struct stt_sock *stt_sock; + struct socket *sock; + int err; + + stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL); + if (!stt_sock) + return ERR_PTR(-ENOMEM); + + err = tcp_sock_create4(net, port, &sock); + if (err) { + kfree(stt_sock); + return ERR_PTR(err); + } + + stt_sock->sock = sock; + stt_sock->rcv = rcv; + stt_sock->rcv_data = data; + + list_add_rcu(&stt_sock->list, &sn->sock_list); + + return stt_sock; +} + +static void __stt_sock_release(struct stt_sock *stt_sock) +{ + list_del_rcu(&stt_sock->list); + tcp_sock_release(stt_sock->sock); + kfree_rcu(stt_sock, rcu); +} + +struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port, + stt_rcv_t *rcv, void *data) +{ + struct stt_sock *stt_sock; + int err; + + err = stt_start(); + if (err) + return ERR_PTR(err); + + 
mutex_lock(&stt_mutex); + rcu_read_lock(); + stt_sock = stt_find_sock(net, port); + rcu_read_unlock(); + if (stt_sock) + stt_sock = ERR_PTR(-EBUSY); + else + stt_sock = stt_socket_create(net, port, rcv, data); + + mutex_unlock(&stt_mutex); + + if (IS_ERR(stt_sock)) + stt_cleanup(); + + return stt_sock; +} +EXPORT_SYMBOL_GPL(rpl_stt_sock_add); + +void rpl_stt_sock_release(struct stt_sock *stt_sock) +{ + mutex_lock(&stt_mutex); + if (stt_sock) { + __stt_sock_release(stt_sock); + stt_cleanup(); + } + mutex_unlock(&stt_mutex); +} +EXPORT_SYMBOL_GPL(rpl_stt_sock_release); + +static int stt_init_net(struct net *net) +{ + struct stt_net *sn = net_generic(net, stt_net_id); + + INIT_LIST_HEAD(&sn->sock_list); + return 0; +} + +static struct pernet_operations stt_net_ops = { + .init = stt_init_net, + .id = &stt_net_id, + .size = sizeof(struct stt_net), +}; + +int ovs_stt_init_module(void) +{ + return register_pernet_subsys(&stt_net_ops); +} +EXPORT_SYMBOL_GPL(ovs_stt_init_module); + +void ovs_stt_cleanup_module(void) +{ + unregister_pernet_subsys(&stt_net_ops); +} +EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module); +#endif diff --git a/datapath/linux/compat/udp.c b/datapath/linux/compat/udp.c index 834a86b717b..487d317bb22 100644 --- a/datapath/linux/compat/udp.c +++ b/datapath/linux/compat/udp.c @@ -7,8 +7,8 @@ /* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. 
*/ -void udp_set_csum(bool nocheck, struct sk_buff *skb, - __be32 saddr, __be32 daddr, int len) +void rpl_udp_set_csum(bool nocheck, struct sk_buff *skb, + __be32 saddr, __be32 daddr, int len) { struct udphdr *uh = udp_hdr(skb); @@ -39,5 +39,6 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, skb->ip_summed = CHECKSUM_UNNECESSARY; } } +EXPORT_SYMBOL_GPL(rpl_udp_set_csum); #endif /* Linux version < 3.16 */ diff --git a/datapath/linux/compat/udp_tunnel.c b/datapath/linux/compat/udp_tunnel.c index f64011387a5..c84113b5b61 100644 --- a/datapath/linux/compat/udp_tunnel.c +++ b/datapath/linux/compat/udp_tunnel.c @@ -1,6 +1,6 @@ #include -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,20,0) +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) #include #include @@ -13,8 +13,8 @@ #include #include -int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) +int rpl_udp_sock_create(struct net *net, struct udp_port_cfg *cfg, + struct socket **sockp) { int err; struct socket *sock = NULL; @@ -95,10 +95,10 @@ int udp_sock_create(struct net *net, struct udp_port_cfg *cfg, *sockp = NULL; return err; } -EXPORT_SYMBOL_GPL(udp_sock_create); +EXPORT_SYMBOL_GPL(rpl_udp_sock_create); -void setup_udp_tunnel_sock(struct net *net, struct socket *sock, - struct udp_tunnel_sock_cfg *cfg) +void rpl_setup_udp_tunnel_sock(struct net *net, struct socket *sock, + struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; @@ -115,7 +115,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_tunnel_encap_enable(sock); } -EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); +EXPORT_SYMBOL_GPL(rpl_setup_udp_tunnel_sock); void ovs_udp_gso(struct sk_buff *skb) { @@ -142,10 +142,10 @@ void ovs_udp_csum_gso(struct sk_buff *skb) } EXPORT_SYMBOL_GPL(ovs_udp_csum_gso); -int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) +int 
rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, + __be16 df, __be16 src_port, __be16 dst_port, + bool xnet, bool nocheck) { struct udphdr *uh; @@ -162,14 +162,14 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sk_buff *skb, return iptunnel_xmit(skb->sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); } -EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); +EXPORT_SYMBOL_GPL(rpl_udp_tunnel_xmit_skb); -void udp_tunnel_sock_release(struct socket *sock) +void rpl_udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); sk_release_kernel(sock->sk); } -EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); +EXPORT_SYMBOL_GPL(rpl_udp_tunnel_sock_release); -#endif /* Linux version < 3.20 */ +#endif /* Linux version < 4.0 */ diff --git a/datapath/linux/compat/utils.c b/datapath/linux/compat/utils.c index 9404e20f4c8..0ee6e803a02 100644 --- a/datapath/linux/compat/utils.c +++ b/datapath/linux/compat/utils.c @@ -19,9 +19,9 @@ #include #if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) -void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, - const __be32 *from, const __be32 *to, - int pseudohdr) +void rpl_inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, + const __be32 *from, const __be32 *to, + int pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], @@ -37,10 +37,12 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, *sum = ~csum_fold(csum_partial(diff, sizeof(diff), csum_unfold(*sum))); } +EXPORT_SYMBOL_GPL(rpl_inet_proto_csum_replace16); #endif #if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) -bool __net_get_random_once(void *buf, int nbytes, bool *done, + +bool rpl___net_get_random_once(void *buf, int nbytes, bool *done, atomic_t *done_key) { static DEFINE_SPINLOCK(lock); @@ -60,4 +62,6 @@ bool __net_get_random_once(void *buf, int nbytes, bool *done, return true; } 
+EXPORT_SYMBOL_GPL(rpl___net_get_random_once); + #endif diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c index 51135fac715..129f1717eaa 100644 --- a/datapath/linux/compat/vxlan.c +++ b/datapath/linux/compat/vxlan.c @@ -180,11 +180,11 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } -int vxlan_xmit_skb(struct vxlan_sock *vs, - struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +int rpl_vxlan_xmit_skb(struct vxlan_sock *vs, + struct rtable *rt, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; @@ -225,7 +225,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, ttl, df, src_port, dst_port, xnet, !udp_sum); } -EXPORT_SYMBOL_GPL(vxlan_xmit_skb); +EXPORT_SYMBOL_GPL(rpl_vxlan_xmit_skb); static void rcu_free_vs(struct rcu_head *rcu) { @@ -308,20 +308,20 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags) +struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port, + vxlan_rcv_t *rcv, void *data, + bool no_share, u32 flags) { return vxlan_socket_create(net, port, rcv, data, flags); } -EXPORT_SYMBOL_GPL(vxlan_sock_add); +EXPORT_SYMBOL_GPL(rpl_vxlan_sock_add); -void vxlan_sock_release(struct vxlan_sock *vs) +void rpl_vxlan_sock_release(struct vxlan_sock *vs) { ASSERT_OVSL(); queue_work(system_wq, &vs->del_work); } -EXPORT_SYMBOL_GPL(vxlan_sock_release); +EXPORT_SYMBOL_GPL(rpl_vxlan_sock_release); #endif /* !USE_UPSTREAM_VXLAN */ diff --git a/datapath/vport-stt.c b/datapath/vport-stt.c new file mode 100644 index 
00000000000..9a1c8a65cab --- /dev/null +++ b/datapath/vport-stt.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +#ifdef OVS_STT +static struct vport_ops ovs_stt_vport_ops; + +/** + * struct stt_port + * @stt_sock: The socket created for this port number. + * @name: vport name. + */ +struct stt_port { + struct stt_sock *stt_sock; + char name[IFNAMSIZ]; +}; + +static inline struct stt_port *stt_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb) +{ + struct vport *vport = stt_sock->rcv_data; + struct stthdr *stth = stt_hdr(skb); + struct ovs_tunnel_info tun_info; + struct sk_buff *next; + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), + tcp_hdr(skb)->source, tcp_hdr(skb)->dest, + get_unaligned(&stth->key), + TUNNEL_KEY | TUNNEL_CSUM, + NULL, 0); + do { + next = skb->next; + skb->next = NULL; + ovs_vport_receive(vport, skb, &tun_info); + } while ((skb = next)); +} + +static int stt_tnl_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct stt_port *stt_port = stt_vport(vport); + struct inet_sock *sk = inet_sk(stt_port->stt_sock->sock->sk); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + return -EMSGSIZE; + return 0; +} + +static void stt_tnl_destroy(struct vport *vport) +{ + struct stt_port *stt_port = stt_vport(vport); + + stt_sock_release(stt_port->stt_sock); + ovs_vport_deferred_free(vport); +} + +static struct vport *stt_tnl_create(const struct 
vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct stt_port *stt_port; + struct stt_sock *stt_sock; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct stt_port), + &ovs_stt_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + stt_port = stt_vport(vport); + strncpy(stt_port->name, parms->name, IFNAMSIZ); + + stt_sock = stt_sock_add(net, htons(dst_port), stt_rcv, vport); + if (IS_ERR(stt_sock)) { + ovs_vport_free(vport); + return ERR_CAST(stt_sock); + } + stt_port->stt_sock = stt_sock; + + return vport; +error: + return ERR_PTR(err); +} + +static int stt_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct stt_port *stt_port = stt_vport(vport); + __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; + const struct ovs_key_ipv4_tunnel *tun_key; + const struct ovs_tunnel_info *tun_info; + struct rtable *rt; + __be16 sport; + __be32 saddr; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + /* Route lookup */ + saddr = tun_key->ipv4_src; + rt = find_route(ovs_dp_get_net(vport->dp), + &saddr, tun_key->ipv4_dst, + IPPROTO_TCP, tun_key->ipv4_tos, + skb->mark); + + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + skb->ignore_df = 1; + + return stt_xmit_skb(skb, rt, saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, tun_key->ipv4_ttl, + df, sport, dport, tun_key->tun_id); +error: + kfree_skb(skb); + return err; +} + +static const char *stt_tnl_get_name(const struct vport *vport) +{ + return stt_vport(vport)->name; +} + +static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct stt_port *stt_port = stt_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport; + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + + /* Get tp_src and tp_dst, refert to stt_build_header(). + */ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); +} + +static struct vport_ops ovs_stt_vport_ops = { + .type = OVS_VPORT_TYPE_STT, + .create = stt_tnl_create, + .destroy = stt_tnl_destroy, + .get_name = stt_tnl_get_name, + .get_options = stt_tnl_get_options, + .send = stt_tnl_send, + .get_egress_tun_info = stt_get_egress_tun_info, + .owner = THIS_MODULE, +}; + +static int __init ovs_stt_tnl_init(void) +{ + int err; + + err = stt_init_module(); + if (err) + return err; + err = ovs_vport_ops_register(&ovs_stt_vport_ops); + if (err) + stt_cleanup_module(); + return err; +} + +static void __exit ovs_stt_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_stt_vport_ops); + stt_cleanup_module(); +} + +module_init(ovs_stt_tnl_init); +module_exit(ovs_stt_tnl_exit); + +MODULE_DESCRIPTION("OVS: STT switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-106"); +#endif diff --git a/debian/dkms.conf.in b/debian/dkms.conf.in index a477761fabf..2c90b4defe8 100644 --- a/debian/dkms.conf.in +++ b/debian/dkms.conf.in @@ -1,7 +1,11 @@ +MODULES=( __MODULES__ ) + 
PACKAGE_NAME="openvswitch" PACKAGE_VERSION="__VERSION__" MAKE="./configure --with-linux='${kernel_source_dir}' && make -C datapath/linux" -BUILT_MODULE_NAME[0]=openvswitch -BUILT_MODULE_LOCATION[0]=datapath/linux/ -DEST_MODULE_LOCATION[0]=/kernel/drivers/net/openvswitch/ +for __idx in ${!MODULES[@]}; do + BUILT_MODULE_NAME[__idx]=${MODULES[__idx]} + BUILT_MODULE_LOCATION[__idx]=datapath/linux/ + DEST_MODULE_LOCATION[__idx]=/kernel/drivers/net/openvswitch/ +done AUTOINSTALL=yes diff --git a/debian/rules b/debian/rules index a5dccac41a8..fc6ce57eb11 100755 --- a/debian/rules +++ b/debian/rules @@ -105,8 +105,11 @@ install-indep: build-indep # copy the source cd debian/$(pdkms)/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION) && tar xvzf $(CURDIR)/_debian/openvswitch.tar.gz && mv openvswitch/* openvswitch/.[a-z]* . && rmdir openvswitch + # check we can get kernel module names + $(MAKE) -C _debian/datapath print-build-modules + # Prepare dkms.conf from the dkms.conf.in template - sed "s/__VERSION__/$(DEB_UPSTREAM_VERSION)/g" debian/dkms.conf.in > debian/$(pdkms)/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/dkms.conf + sed "s/__VERSION__/$(DEB_UPSTREAM_VERSION)/g; s/__MODULES__/$(shell $(MAKE) -C _debian/datapath print-build-modules | grep -v make)/" debian/dkms.conf.in > debian/$(pdkms)/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/dkms.conf # We don't need the debian folder in there, just upstream sources... 
rm -rf debian/$(pdkms)/usr/src/$(PACKAGE)-$(DEB_UPSTREAM_VERSION)/debian diff --git a/lib/automake.mk b/lib/automake.mk index 3629079198e..7a34c1a0b8f 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -185,6 +185,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/packets.h \ lib/pcap-file.c \ lib/pcap-file.h \ + lib/perf-counter.h \ + lib/perf-counter.c \ lib/poll-loop.c \ lib/poll-loop.h \ lib/process.c \ diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 5d0fee75ec8..fd23d11536a 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -63,7 +63,7 @@ struct dp_packet { void *base_; /* First byte of allocated space. */ uint16_t data_ofs; /* First byte actually in use. */ uint32_t size_; /* Number of bytes in use. */ - uint32_t dp_hash; /* Packet hash. */ + uint32_t rss_hash; /* Packet hash. */ #endif uint32_t allocated; /* Number of bytes allocated. */ @@ -484,22 +484,22 @@ static inline void dp_packet_reset_packet(struct dp_packet *b, int off) b->l2_5_ofs = b->l3_ofs = b->l4_ofs = UINT16_MAX; } -static inline uint32_t dp_packet_get_dp_hash(struct dp_packet *p) +static inline uint32_t dp_packet_get_rss_hash(struct dp_packet *p) { #ifdef DPDK_NETDEV return p->mbuf.hash.rss; #else - return p->dp_hash; + return p->rss_hash; #endif } -static inline void dp_packet_set_dp_hash(struct dp_packet *p, +static inline void dp_packet_set_rss_hash(struct dp_packet *p, uint32_t hash) { #ifdef DPDK_NETDEV p->mbuf.hash.rss = hash; #else - p->dp_hash = hash; + p->rss_hash = hash; #endif } diff --git a/lib/dpctl.c b/lib/dpctl.c index 4c4d1c32cac..05c28d177ff 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -60,6 +60,11 @@ struct dpctl_command { dpctl_command_handler *handler; }; static const struct dpctl_command *get_all_dpctl_commands(void); +static void dpctl_print(struct dpctl_params *dpctl_p, const char *fmt, ...) + OVS_PRINTF_FORMAT(2, 3); +static void dpctl_error(struct dpctl_params* dpctl_p, int err_no, + const char *fmt, ...) + OVS_PRINTF_FORMAT(3, 4); static void dpctl_puts(struct dpctl_params *dpctl_p, bool error, const char *string) @@ -382,12 +387,14 @@ dpctl_set_if(int argc, const char *argv[], struct dpctl_params *dpctl_p) "%s: can't change type from %s to %s", name, type, value); error = EINVAL; + goto next_destroy_args; } } else if (!strcmp(key, "port_no")) { if (port_no != u32_to_odp(atoi(value))) { dpctl_error(dpctl_p, 0, "%s: can't change port number from" " %"PRIu32" to %d", name, port_no, atoi(value)); error = EINVAL; + goto next_destroy_args; } } else if (value[0] == '\0') { smap_remove(&args, key); @@ -397,7 +404,13 @@ dpctl_set_if(int argc, const char *argv[], struct dpctl_params *dpctl_p) } /* Update configuration. */ - error = netdev_set_config(netdev, &args, NULL); + char *err_s = NULL; + error = netdev_set_config(netdev, &args, &err_s); + if (err_s || error) { + dpctl_error(dpctl_p, error, "%s", + err_s ? 
err_s : "Error updating configuration"); + free(err_s); + } if (error) { goto next_destroy_args; } @@ -599,7 +612,57 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) } } } - dpif_close(dpif); +} + +typedef void (*dps_for_each_cb)(struct dpif *, struct dpctl_params *); + +static int +dps_for_each(struct dpctl_params *dpctl_p, dps_for_each_cb cb) +{ + struct sset dpif_names = SSET_INITIALIZER(&dpif_names), + dpif_types = SSET_INITIALIZER(&dpif_types); + int error, openerror = 0, enumerror = 0; + const char *type, *name; + bool at_least_one = false; + + dp_enumerate_types(&dpif_types); + + SSET_FOR_EACH (type, &dpif_types) { + error = dp_enumerate_names(type, &dpif_names); + if (error) { + enumerror = error; + } + + SSET_FOR_EACH (name, &dpif_names) { + struct dpif *dpif; + + at_least_one = true; + error = dpif_open(name, type, &dpif); + if (!error) { + cb(dpif, dpctl_p); + dpif_close(dpif); + } else { + openerror = error; + dpctl_error(dpctl_p, error, "opening datapath %s failed", + name); + } + } + } + + sset_destroy(&dpif_names); + sset_destroy(&dpif_types); + + /* If there has been an error while opening a datapath it should be + * reported. Otherwise, we want to ignore the errors generated by + * dp_enumerate_names() if at least one datapath has been discovered, + * because they're not interesting for the user. This happens, for + * example, if OVS is using a userspace datapath and the kernel module + * is not loaded. */ + if (openerror) { + return openerror; + } else { + return at_least_one ? 
0 : enumerror; + } } static int @@ -615,6 +678,7 @@ dpctl_show(int argc, const char *argv[], struct dpctl_params *dpctl_p) error = parsed_dpif_open(name, false, &dpif); if (!error) { show_dpif(dpif, dpctl_p); + dpif_close(dpif); } else { dpctl_error(dpctl_p, error, "opening datapath %s failed", name); @@ -622,73 +686,23 @@ dpctl_show(int argc, const char *argv[], struct dpctl_params *dpctl_p) } } } else { - struct sset types; - const char *type; - - sset_init(&types); - dp_enumerate_types(&types); - SSET_FOR_EACH (type, &types) { - struct sset names; - const char *name; - - sset_init(&names); - error = dp_enumerate_names(type, &names); - if (error) { - lasterror = error; - goto next; - } - SSET_FOR_EACH (name, &names) { - struct dpif *dpif; - - error = dpif_open(name, type, &dpif); - if (!error) { - show_dpif(dpif, dpctl_p); - } else { - dpctl_error(dpctl_p, error, "opening datapath %s failed", - name); - lasterror = error; - } - } -next: - sset_destroy(&names); - } - sset_destroy(&types); + lasterror = dps_for_each(dpctl_p, show_dpif); } + return lasterror; } +static void +dump_cb(struct dpif *dpif, struct dpctl_params *dpctl_p) +{ + dpctl_print(dpctl_p, "%s\n", dpif_name(dpif)); +} + static int dpctl_dump_dps(int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, struct dpctl_params *dpctl_p) { - struct sset dpif_names, dpif_types; - const char *type; - int error, lasterror = 0; - - sset_init(&dpif_names); - sset_init(&dpif_types); - dp_enumerate_types(&dpif_types); - - SSET_FOR_EACH (type, &dpif_types) { - const char *name; - - error = dp_enumerate_names(type, &dpif_names); - if (error) { - lasterror = error; - } - - SSET_FOR_EACH (name, &dpif_names) { - struct dpif *dpif; - if (!dpif_open(name, type, &dpif)) { - dpctl_print(dpctl_p, "%s\n", dpif_name(dpif)); - dpif_close(dpif); - } - } - } - - sset_destroy(&dpif_names); - sset_destroy(&dpif_types); - return lasterror; + return dps_for_each(dpctl_p, dump_cb); } static void @@ -1469,7 +1483,7 @@ 
dpctl_normalize_actions(int argc, const char *argv[], ds_clear(&s); format_odp_actions(&s, af->actions.data, af->actions.size); - dpctl_print(dpctl_p, ds_cstr(&s)); + dpctl_puts(dpctl_p, false, ds_cstr(&s)); ofpbuf_uninit(&af->actions); free(af); @@ -1567,7 +1581,7 @@ dpctl_unixctl_handler(struct unixctl_conn *conn, int argc, const char *argv[], { struct ds ds = DS_EMPTY_INITIALIZER; struct dpctl_params dpctl_p; - bool opt_parse_err = false; + bool error = false; dpctl_command_handler *handler = (dpctl_command_handler *) aux; @@ -1579,7 +1593,7 @@ dpctl_unixctl_handler(struct unixctl_conn *conn, int argc, const char *argv[], /* Parse options (like getopt). Unfortunately it does * not seem a good idea to call getopt_long() here, since it uses global * variables */ - while (argc > 1 && !opt_parse_err) { + while (argc > 1 && !error) { const char *arg = argv[1]; if (!strncmp(arg, "--", 2)) { /* Long option */ @@ -1593,13 +1607,13 @@ dpctl_unixctl_handler(struct unixctl_conn *conn, int argc, const char *argv[], dpctl_p.verbosity++; } else { ds_put_format(&ds, "Unrecognized option %s", argv[1]); - opt_parse_err = true; + error = true; } } else if (arg[0] == '-' && arg[1] != '\0') { /* Short option[s] */ const char *opt = &arg[1]; - while (*opt && !opt_parse_err) { + while (*opt && !error) { switch (*opt) { case 'm': dpctl_p.verbosity++; @@ -1609,7 +1623,7 @@ dpctl_unixctl_handler(struct unixctl_conn *conn, int argc, const char *argv[], break; default: ds_put_format(&ds, "Unrecognized option -%c", *opt); - opt_parse_err = true; + error = true; break; } opt++; @@ -1619,22 +1633,26 @@ dpctl_unixctl_handler(struct unixctl_conn *conn, int argc, const char *argv[], break; } - if (opt_parse_err) { + if (error) { break; } argv++; argc--; } - if (!opt_parse_err) { + if (!error) { dpctl_p.is_appctl = true; dpctl_p.output = dpctl_unixctl_print; dpctl_p.aux = &ds; - handler(argc, argv, &dpctl_p); + error = handler(argc, argv, &dpctl_p) != 0; } - unixctl_command_reply(conn, 
ds_cstr(&ds)); + if (error) { + unixctl_command_reply_error(conn, ds_cstr(&ds)); + } else { + unixctl_command_reply(conn, ds_cstr(&ds)); + } ds_destroy(&ds); } diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 61d9211c2d2..f1d65f57d4a 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -227,6 +227,12 @@ enum dp_stat_type { DP_N_STATS }; +enum pmd_cycles_counter_type { + PMD_CYCLES_POLLING, /* Cycles spent polling NICs. */ + PMD_CYCLES_PROCESSING, /* Cycles spent processing packets */ + PMD_N_CYCLES +}; + /* A port in a netdev-based datapath. */ struct dp_netdev_port { struct cmap_node node; /* Node in dp_netdev's 'ports'. */ @@ -326,8 +332,8 @@ static int dpif_netdev_flow_from_nlattrs(const struct nlattr *, uint32_t, struct dp_netdev_actions { /* These members are immutable: they do not change during the struct's * lifetime. */ - struct nlattr *actions; /* Sequence of OVS_ACTION_ATTR_* attributes. */ unsigned int size; /* Size of 'actions', in bytes. */ + struct nlattr actions[]; /* Sequence of OVS_ACTION_ATTR_* attributes. */ }; struct dp_netdev_actions *dp_netdev_actions_create(const struct nlattr *, @@ -342,6 +348,12 @@ struct dp_netdev_pmd_stats { atomic_ullong n[DP_N_STATS]; }; +/* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */ +struct dp_netdev_pmd_cycles { + /* Indexed by PMD_CYCLES_*. */ + atomic_ullong n[PMD_N_CYCLES]; +}; + /* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate * the performance overhead of interrupt processing. Therefore netdev can * not implement rx-wait for these devices. dpif-netdev needs to poll @@ -384,6 +396,12 @@ struct dp_netdev_pmd_thread { /* Statistics. */ struct dp_netdev_pmd_stats stats; + /* Cycles counters */ + struct dp_netdev_pmd_cycles cycles; + + /* Used to count cicles. See 'cycles_counter_end()' */ + unsigned long long last_cycles; + struct latch exit_latch; /* For terminating the pmd thread. */ atomic_uint change_seq; /* For reloading pmd ports. 
*/ pthread_t thread; @@ -391,6 +409,13 @@ struct dp_netdev_pmd_thread { /* threads on same numa node. */ int core_id; /* CPU core id of this pmd thread. */ int numa_id; /* numa node id of this pmd thread. */ + + /* Only a pmd thread can write on its own 'cycles' and 'stats'. + * The main thread keeps 'stats_zero' and 'cycles_zero' as base + * values and subtracts them from 'stats' and 'cycles' before + * reporting to the user */ + unsigned long long stats_zero[DP_N_STATS]; + uint64_t cycles_zero[PMD_N_CYCLES]; }; #define PMD_INITIAL_SEQ 1 @@ -498,6 +523,182 @@ get_dp_netdev(const struct dpif *dpif) { return dpif_netdev_cast(dpif)->dp; } + +enum pmd_info_type { + PMD_INFO_SHOW_STATS, /* show how cpu cycles are spent */ + PMD_INFO_CLEAR_STATS /* set the cycles count to 0 */ +}; + +static void +pmd_info_show_stats(struct ds *reply, + struct dp_netdev_pmd_thread *pmd, + unsigned long long stats[DP_N_STATS], + uint64_t cycles[PMD_N_CYCLES]) +{ + unsigned long long total_packets = 0; + uint64_t total_cycles = 0; + int i; + + /* These loops subtracts reference values ('*_zero') from the counters. + * Since loads and stores are relaxed, it might be possible for a '*_zero' + * value to be more recent than the current value we're reading from the + * counter. This is not a big problem, since these numbers are not + * supposed to be too accurate, but we should at least make sure that + * the result is not negative. */ + for (i = 0; i < DP_N_STATS; i++) { + if (stats[i] > pmd->stats_zero[i]) { + stats[i] -= pmd->stats_zero[i]; + } else { + stats[i] = 0; + } + + if (i != DP_STAT_LOST) { + /* Lost packets are already included in DP_STAT_MISS */ + total_packets += stats[i]; + } + } + + for (i = 0; i < PMD_N_CYCLES; i++) { + if (cycles[i] > pmd->cycles_zero[i]) { + cycles[i] -= pmd->cycles_zero[i]; + } else { + cycles[i] = 0; + } + + total_cycles += cycles[i]; + } + + ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID) + ? 
"main thread" : "pmd thread"); + + if (pmd->numa_id != OVS_NUMA_UNSPEC) { + ds_put_format(reply, " numa_id %d", pmd->numa_id); + } + if (pmd->core_id != OVS_CORE_UNSPEC) { + ds_put_format(reply, " core_id %d", pmd->core_id); + } + ds_put_cstr(reply, ":\n"); + + ds_put_format(reply, + "\temc hits:%llu\n\tmegaflow hits:%llu\n" + "\tmiss:%llu\n\tlost:%llu\n", + stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT], + stats[DP_STAT_MISS], stats[DP_STAT_LOST]); + + if (total_cycles == 0) { + return; + } + + ds_put_format(reply, + "\tpolling cycles:%"PRIu64" (%.02f%%)\n" + "\tprocessing cycles:%"PRIu64" (%.02f%%)\n", + cycles[PMD_CYCLES_POLLING], + cycles[PMD_CYCLES_POLLING] / (double)total_cycles * 100, + cycles[PMD_CYCLES_PROCESSING], + cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100); + + if (total_packets == 0) { + return; + } + + ds_put_format(reply, + "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n", + total_cycles / (double)total_packets, + total_cycles, total_packets); + + ds_put_format(reply, + "\tavg processing cycles per packet: " + "%.02f (%"PRIu64"/%llu)\n", + cycles[PMD_CYCLES_PROCESSING] / (double)total_packets, + cycles[PMD_CYCLES_PROCESSING], total_packets); +} + +static void +pmd_info_clear_stats(struct ds *reply OVS_UNUSED, + struct dp_netdev_pmd_thread *pmd, + unsigned long long stats[DP_N_STATS], + uint64_t cycles[PMD_N_CYCLES]) +{ + int i; + + /* We cannot write 'stats' and 'cycles' (because they're written by other + * threads) and we shouldn't change 'stats' (because they're used to count + * datapath stats, which must not be cleared here). 
Instead, we save the + * current values and subtract them from the values to be displayed in the + * future */ + for (i = 0; i < DP_N_STATS; i++) { + pmd->stats_zero[i] = stats[i]; + } + for (i = 0; i < PMD_N_CYCLES; i++) { + pmd->cycles_zero[i] = cycles[i]; + } +} + +static void +dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], + void *aux) +{ + struct ds reply = DS_EMPTY_INITIALIZER; + struct dp_netdev_pmd_thread *pmd; + struct dp_netdev *dp = NULL; + enum pmd_info_type type = *(enum pmd_info_type *) aux; + + ovs_mutex_lock(&dp_netdev_mutex); + + if (argc == 2) { + dp = shash_find_data(&dp_netdevs, argv[1]); + } else if (shash_count(&dp_netdevs) == 1) { + /* There's only one datapath */ + dp = shash_first(&dp_netdevs)->data; + } + + if (!dp) { + ovs_mutex_unlock(&dp_netdev_mutex); + unixctl_command_reply_error(conn, + "please specify an existing datapath"); + return; + } + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + unsigned long long stats[DP_N_STATS]; + uint64_t cycles[PMD_N_CYCLES]; + int i; + + /* Read current stats and cycle counters */ + for (i = 0; i < ARRAY_SIZE(stats); i++) { + atomic_read_relaxed(&pmd->stats.n[i], &stats[i]); + } + for (i = 0; i < ARRAY_SIZE(cycles); i++) { + atomic_read_relaxed(&pmd->cycles.n[i], &cycles[i]); + } + + if (type == PMD_INFO_CLEAR_STATS) { + pmd_info_clear_stats(&reply, pmd, stats, cycles); + } else if (type == PMD_INFO_SHOW_STATS) { + pmd_info_show_stats(&reply, pmd, stats, cycles); + } + } + + ovs_mutex_unlock(&dp_netdev_mutex); + + unixctl_command_reply(conn, ds_cstr(&reply)); + ds_destroy(&reply); +} + +static int +dpif_netdev_init(void) +{ + static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, + clear_aux = PMD_INFO_CLEAR_STATS; + + unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&show_aux); + unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&clear_aux); + return 0; +} 
static int dpif_netdev_enumerate(struct sset *all_dps, @@ -834,7 +1035,10 @@ do_add_port(struct dp_netdev *dp, const char *devname, const char *type, int error; int i; - /* XXX reject devices already in some dp_netdev. */ + /* Reject devices already in 'dp'. */ + if (!get_port_by_name(dp, devname, &port)) { + return EEXIST; + } /* Open and validate network device. */ open_type = dpif_netdev_port_open_type(dp->class, type); @@ -2231,16 +2435,15 @@ dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED, } -/* Creates and returns a new 'struct dp_netdev_actions', with a reference count - * of 1, whose actions are a copy of from the 'ofpacts_len' bytes of - * 'ofpacts'. */ +/* Creates and returns a new 'struct dp_netdev_actions', whose actions are + * a copy of the 'ofpacts_len' bytes of 'ofpacts'. */ struct dp_netdev_actions * dp_netdev_actions_create(const struct nlattr *actions, size_t size) { struct dp_netdev_actions *netdev_actions; - netdev_actions = xmalloc(sizeof *netdev_actions); - netdev_actions->actions = xmemdup(actions, size); + netdev_actions = xmalloc(sizeof *netdev_actions + size); + memcpy(netdev_actions->actions, actions, size); netdev_actions->size = size; return netdev_actions; @@ -2255,10 +2458,42 @@ dp_netdev_flow_get_actions(const struct dp_netdev_flow *flow) static void dp_netdev_actions_free(struct dp_netdev_actions *actions) { - free(actions->actions); free(actions); } +static inline unsigned long long +cycles_counter(void) +{ +#ifdef DPDK_NETDEV + return rte_get_tsc_cycles(); +#else + return 0; +#endif +} + +/* Fake mutex to make sure that the calls to cycles_count_* are balanced */ +extern struct ovs_mutex cycles_counter_fake_mutex; + +/* Start counting cycles. 
Must be followed by 'cycles_count_end()' */ +static inline void +cycles_count_start(struct dp_netdev_pmd_thread *pmd) + OVS_ACQUIRES(&cycles_counter_fake_mutex) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + pmd->last_cycles = cycles_counter(); +} + +/* Stop counting cycles and add them to the counter 'type' */ +static inline void +cycles_count_end(struct dp_netdev_pmd_thread *pmd, + enum pmd_cycles_counter_type type) + OVS_RELEASES(&cycles_counter_fake_mutex) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + unsigned long long interval = cycles_counter() - pmd->last_cycles; + + non_atomic_ullong_add(&pmd->cycles.n[type], interval); +} static void dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, @@ -2268,7 +2503,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packets[NETDEV_MAX_RX_BATCH]; int error, cnt; + cycles_count_start(pmd); error = netdev_rxq_recv(rxq, packets, &cnt); + cycles_count_end(pmd, PMD_CYCLES_POLLING); if (!error) { int i; @@ -2278,7 +2515,9 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd, for (i = 0; i < cnt; i++) { packets[i]->md = PKT_METADATA_INITIALIZER(port->port_no); } + cycles_count_start(pmd); dp_netdev_input(pmd, packets, cnt); + cycles_count_end(pmd, PMD_CYCLES_PROCESSING); } else if (error != EAGAIN && error != EOPNOTSUPP) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); @@ -2710,9 +2949,8 @@ dpif_netdev_get_datapath_version(void) static void dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size, - uint16_t tcp_flags) + uint16_t tcp_flags, long long now) { - long long now = time_msec(); uint16_t flags; atomic_store_relaxed(&netdev_flow->stats.used, now); @@ -2774,10 +3012,10 @@ dpif_netdev_packet_get_dp_hash(struct dp_packet *packet, { uint32_t hash; - hash = dp_packet_get_dp_hash(packet); + hash = dp_packet_get_rss_hash(packet); if (OVS_UNLIKELY(!hash)) { hash = miniflow_hash_5tuple(mf, 0); - dp_packet_set_dp_hash(packet, hash); + dp_packet_set_rss_hash(packet, 
hash); } return hash; } @@ -2814,13 +3052,14 @@ packet_batch_init(struct packet_batch *batch, struct dp_netdev_flow *flow) static inline void packet_batch_execute(struct packet_batch *batch, struct dp_netdev_pmd_thread *pmd, - enum dp_stat_type hit_type) + enum dp_stat_type hit_type, + long long now) { struct dp_netdev_actions *actions; struct dp_netdev_flow *flow = batch->flow; dp_netdev_flow_used(batch->flow, batch->packet_count, batch->byte_count, - batch->tcp_flags); + batch->tcp_flags, now); actions = dp_netdev_flow_get_actions(flow); @@ -2883,7 +3122,7 @@ dp_packet_swap(struct dp_packet **a, struct dp_packet **b) */ static inline size_t emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets, - size_t cnt, struct netdev_flow_key *keys) + size_t cnt, struct netdev_flow_key *keys, long long now) { struct netdev_flow_key key; struct packet_batch batches[4]; @@ -2918,7 +3157,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets, } for (i = 0; i < n_batches; i++) { - packet_batch_execute(&batches[i], pmd, DP_STAT_EXACT_HIT); + packet_batch_execute(&batches[i], pmd, DP_STAT_EXACT_HIT, now); } return notfound_cnt; @@ -2927,7 +3166,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets, static inline void fast_path_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets, size_t cnt, - struct netdev_flow_key *keys) + struct netdev_flow_key *keys, long long now) { #if !defined(__CHECKER__) && !defined(_WIN32) const size_t PKT_ARRAY_SIZE = cnt; @@ -3055,7 +3294,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd, } for (i = 0; i < n_batches; i++) { - packet_batch_execute(&batches[i], pmd, DP_STAT_MASKED_HIT); + packet_batch_execute(&batches[i], pmd, DP_STAT_MASKED_HIT, now); } } @@ -3070,11 +3309,12 @@ dp_netdev_input(struct dp_netdev_pmd_thread *pmd, enum { PKT_ARRAY_SIZE = NETDEV_MAX_RX_BATCH }; #endif struct netdev_flow_key keys[PKT_ARRAY_SIZE]; + long long now = 
time_msec(); size_t newcnt; - newcnt = emc_processing(pmd, packets, cnt, keys); + newcnt = emc_processing(pmd, packets, cnt, keys, now); if (OVS_UNLIKELY(newcnt)) { - fast_path_processing(pmd, packets, newcnt, keys); + fast_path_processing(pmd, packets, newcnt, keys, now); } } @@ -3123,13 +3363,13 @@ push_tnl_action(const struct dp_netdev *dp, } static void -dp_netdev_clone_pkt_batch(struct dp_packet **tnl_pkt, - struct dp_packet **packets, int cnt) +dp_netdev_clone_pkt_batch(struct dp_packet **dst_pkts, + struct dp_packet **src_pkts, int cnt) { int i; for (i = 0; i < cnt; i++) { - tnl_pkt[i] = dp_packet_clone(packets[i]); + dst_pkts[i] = dp_packet_clone(src_pkts[i]); } } @@ -3140,8 +3380,8 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, { struct dp_netdev_execute_aux *aux = aux_; uint32_t *depth = recirc_depth_get(); - struct dp_netdev_pmd_thread *pmd= aux->pmd; - struct dp_netdev *dp= pmd->dp; + struct dp_netdev_pmd_thread *pmd = aux->pmd; + struct dp_netdev *dp = pmd->dp; int type = nl_attr_type(a); struct dp_netdev_port *p; int i; @@ -3245,21 +3485,19 @@ dp_execute_cb(void *aux_, struct dp_packet **packets, int cnt, case OVS_ACTION_ATTR_RECIRC: if (*depth < MAX_RECIRC_DEPTH) { + struct dp_packet *recirc_pkts[NETDEV_MAX_RX_BATCH]; - (*depth)++; - for (i = 0; i < cnt; i++) { - struct dp_packet *recirc_pkt; - - recirc_pkt = (may_steal) ? 
packets[i] - : dp_packet_clone(packets[i]); - - recirc_pkt->md.recirc_id = nl_attr_get_u32(a); - - /* Hash is private to each packet */ - recirc_pkt->md.dp_hash = dp_packet_get_dp_hash(packets[i]); + if (!may_steal) { + dp_netdev_clone_pkt_batch(recirc_pkts, packets, cnt); + packets = recirc_pkts; + } - dp_netdev_input(pmd, &recirc_pkt, 1); + for (i = 0; i < cnt; i++) { + packets[i]->md.recirc_id = nl_attr_get_u32(a); } + + (*depth)++; + dp_netdev_input(pmd, packets, cnt); (*depth)--; return; @@ -3298,6 +3536,7 @@ dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd, const struct dpif_class dpif_netdev_class = { "netdev", + dpif_netdev_init, dpif_netdev_enumerate, dpif_netdev_port_open_type, dpif_netdev_open, diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 93fd8a4512f..6838def7d9a 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -768,6 +768,9 @@ get_vport_type(const struct dpif_netlink_vport *vport) case OVS_VPORT_TYPE_LISP: return "lisp"; + case OVS_VPORT_TYPE_STT: + return "stt"; + case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: break; @@ -787,6 +790,8 @@ netdev_to_ovs_vport_type(const struct netdev *netdev) return OVS_VPORT_TYPE_NETDEV; } else if (!strcmp(type, "internal")) { return OVS_VPORT_TYPE_INTERNAL; + } else if (strstr(type, "stt")) { + return OVS_VPORT_TYPE_STT; } else if (!strcmp(type, "geneve")) { return OVS_VPORT_TYPE_GENEVE; } else if (strstr(type, "gre64")) { @@ -2274,6 +2279,7 @@ dpif_netlink_get_datapath_version(void) const struct dpif_class dpif_netlink_class = { "system", + NULL, /* init */ dpif_netlink_enumerate, NULL, dpif_netlink_open, diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 7b4878eb5fc..28ea86f4805 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -90,6 +90,14 @@ struct dpif_class { * the type assumed if no type is specified when opening a dpif. */ const char *type; + /* Called when the dpif provider is registered, typically at program + * startup. 
Returning an error from this function will prevent any + * datapath with this class from being created. + * + * This function may be set to null if a datapath class needs no + * initialization at registration time. */ + int (*init)(void); + /* Enumerates the names of all known created datapaths (of class * 'dpif_class'), if possible, into 'all_dps'. The caller has already * initialized 'all_dps' and other dpif classes might already have added diff --git a/lib/dpif.c b/lib/dpif.c index ee71774a7fa..b8f30a50349 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -135,6 +135,7 @@ static int dp_register_provider__(const struct dpif_class *new_class) { struct registered_dpif_class *registered_class; + int error; if (sset_contains(&dpif_blacklist, new_class->type)) { VLOG_DBG("attempted to register blacklisted provider: %s", @@ -148,6 +149,13 @@ dp_register_provider__(const struct dpif_class *new_class) return EEXIST; } + error = new_class->init ? new_class->init() : 0; + if (error) { + VLOG_WARN("failed to initialize %s datapath class: %s", + new_class->type, ovs_strerror(error)); + return error; + } + registered_class = xmalloc(sizeof *registered_class); registered_class->dpif_class = new_class; registered_class->refcount = 0; diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index 9ed2823b892..b3075dc829c 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -42,7 +42,6 @@ #include #if defined(__NetBSD__) #include -#include #include #endif @@ -643,7 +642,7 @@ netdev_bsd_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets, dp_packet_delete(packet); } else { dp_packet_pad(packet); - dp_packet_set_dp_hash(packet, 0); + dp_packet_set_rss_hash(packet, 0); packets[0] = packet; *c = 1; } @@ -844,7 +843,7 @@ netdev_bsd_get_mtu(const struct netdev *netdev_, int *mtup) } ovs_mutex_unlock(&netdev->mutex); - return 0; + return error; } static int diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index f69154b0740..5af15d42156 100644 --- a/lib/netdev-dpdk.c +++ 
b/lib/netdev-dpdk.c @@ -117,8 +117,7 @@ static const struct rte_eth_conf port_conf = { .rx_adv_conf = { .rss_conf = { .rss_key = NULL, - .rss_hf = ETH_RSS_IPV4_TCP | ETH_RSS_IPV4 | ETH_RSS_IPV6 - | ETH_RSS_IPV4_UDP | ETH_RSS_IPV6_TCP | ETH_RSS_IPV6_UDP, + .rss_hf = ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP, }, }, .txmode = { @@ -558,11 +557,11 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no, netdev_->n_rxq = NR_QUEUE; if (type == DPDK_DEV_ETH) { - netdev_dpdk_alloc_txq(netdev, NR_QUEUE); - err = dpdk_eth_dev_init(netdev); - if (err) { - goto unlock; - } + netdev_dpdk_alloc_txq(netdev, NR_QUEUE); + err = dpdk_eth_dev_init(netdev); + if (err) { + goto unlock; + } } list_push_back(&dpdk_list, &netdev->list_node); @@ -906,10 +905,10 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, struct dp_packet **pkts, int tx_pkts, i; if (OVS_UNLIKELY(!is_vhost_running(virtio_dev))) { - ovs_mutex_lock(&vhost_dev->mutex); - vhost_dev->stats.tx_dropped+= cnt; - ovs_mutex_unlock(&vhost_dev->mutex); - goto out; + ovs_mutex_lock(&vhost_dev->mutex); + vhost_dev->stats.tx_dropped+= cnt; + ovs_mutex_unlock(&vhost_dev->mutex); + goto out; } /* There is vHost TX single queue, So we need to lock it for TX. 
*/ @@ -923,9 +922,9 @@ __netdev_dpdk_vhost_send(struct netdev *netdev, struct dp_packet **pkts, out: if (may_steal) { - for (i = 0; i < cnt; i++) { - dp_packet_delete(pkts[i]); - } + for (i = 0; i < cnt; i++) { + dp_packet_delete(pkts[i]); + } } } @@ -1064,6 +1063,7 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid, for (i = 0; i < cnt; i++) { int size = dp_packet_size(pkts[i]); + if (OVS_UNLIKELY(size > dev->max_packet_len)) { if (next_tx_idx != i) { dpdk_queue_pkts(dev, qid, @@ -1745,6 +1745,15 @@ netdev_dpdk_ring_send(struct netdev *netdev, int qid OVS_UNUSED, struct dp_packet **pkts, int cnt, bool may_steal) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + unsigned i; + + /* When using 'dpdkr' and sending to a DPDK ring, we want to ensure that the + * rss hash field is clear. This is because the same mbuf may be modified by + * the consumer of the ring and return into the datapath without recalculating + * the RSS hash. */ + for (i = 0; i < cnt; i++) { + dp_packet_set_rss_hash(pkts[i], 0); + } /* DPDK Rings have a single TX queue, Therefore needs locking. 
*/ rte_spinlock_lock(&dev->txq_lock); diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 24c91c20351..64f8f66b9a0 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -342,13 +342,15 @@ dummy_packet_conn_set_config(struct dummy_packet_conn *conn, switch (conn->type) { case PASSIVE: - if (!strcmp(pstream_get_name(conn->u.pconn.pstream), pstream)) { + if (pstream && + !strcmp(pstream_get_name(conn->u.pconn.pstream), pstream)) { return; } dummy_packet_conn_close(conn); break; case ACTIVE: - if (!strcmp(stream_get_name(conn->u.rconn.rstream->stream), stream)) { + if (stream && + !strcmp(stream_get_name(conn->u.rconn.rstream->stream), stream)) { return; } dummy_packet_conn_close(conn); @@ -834,7 +836,7 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **arr, ovs_mutex_unlock(&netdev->mutex); dp_packet_pad(packet); - dp_packet_set_dp_hash(packet, 0); + dp_packet_set_rss_hash(packet, 0); arr[0] = packet; *c = 1; diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 23f98566dcf..36e27e098f7 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -1058,7 +1058,7 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet **packets, dp_packet_delete(buffer); } else { dp_packet_pad(buffer); - dp_packet_set_dp_hash(buffer, 0); + dp_packet_set_rss_hash(buffer, 0); packets[0] = buffer; *c = 1; } diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index f228ac219a4..ea9abf9e7a7 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -55,6 +55,7 @@ static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5); #define GENEVE_DST_PORT 6081 #define VXLAN_DST_PORT 4789 #define LISP_DST_PORT 4341 +#define STT_DST_PORT 7471 #define VXLAN_HLEN (sizeof(struct eth_header) + \ sizeof(struct ip_header) + \ @@ -158,7 +159,7 @@ netdev_vport_needs_dst_port(const struct netdev *dev) return (class->get_config == get_tunnel_config && (!strcmp("geneve", type) || !strcmp("vxlan", type) || - !strcmp("lisp", type))); + !strcmp("lisp", type) || 
!strcmp("stt", type)) ); } const char * @@ -257,8 +258,12 @@ netdev_vport_construct(struct netdev *netdev_) dev->tnl_cfg.dst_port = htons(VXLAN_DST_PORT); } else if (!strcmp(type, "lisp")) { dev->tnl_cfg.dst_port = htons(LISP_DST_PORT); + } else if (!strcmp(type, "stt")) { + dev->tnl_cfg.dst_port = htons(STT_DST_PORT); } + dev->tnl_cfg.dont_fragment = true; + dev->tnl_cfg.ttl = DEFAULT_TTL; return 0; } @@ -432,7 +437,7 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) struct smap_node *node; has_csum = strstr(type, "gre") || strstr(type, "geneve") || - strstr(type, "vxlan"); + strstr(type, "stt") || strstr(type, "vxlan"); ipsec_mech_set = false; memset(&tnl_cfg, 0, sizeof tnl_cfg); @@ -449,6 +454,10 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args) tnl_cfg.dst_port = htons(LISP_DST_PORT); } + if (!strcmp(type, "stt")) { + tnl_cfg.dst_port = htons(STT_DST_PORT); + } + needs_dst_port = netdev_vport_needs_dst_port(dev_); tnl_cfg.ipsec = strstr(type, "ipsec"); tnl_cfg.dont_fragment = true; @@ -688,7 +697,8 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) || (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || - (!strcmp("lisp", type) && dst_port != LISP_DST_PORT)) { + (!strcmp("lisp", type) && dst_port != LISP_DST_PORT) || + (!strcmp("stt", type) && dst_port != STT_DST_PORT)) { smap_add_format(args, "dst_port", "%d", dst_port); } } @@ -914,7 +924,7 @@ get_src_port(struct dp_packet *packet) { uint32_t hash; - hash = dp_packet_get_dp_hash(packet); + hash = dp_packet_get_rss_hash(packet); return htons((((uint64_t) hash * (tnl_udp_port_max - tnl_udp_port_min)) >> 32) + tnl_udp_port_min); @@ -1401,7 +1411,8 @@ netdev_vport_tunnel_register(void) TUNNEL_CLASS("vxlan", "vxlan_sys", netdev_vxlan_build_header, push_udp_header, netdev_vxlan_pop_header), - TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL) + TUNNEL_CLASS("lisp", "lisp_sys", NULL, NULL, NULL), + 
TUNNEL_CLASS("stt", "stt_sys", NULL, NULL, NULL), }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index fab2a6681a9..42eb232e5c0 100644 --- a/lib/netlink-socket.c +++ b/lib/netlink-socket.c @@ -475,6 +475,8 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg, retval = -1; /* XXX: Map to a more appropriate error based on GetLastError(). */ errno = EINVAL; + VLOG_DBG_RL(&rl, "fatal driver failure in write: %s", + ovs_lasterror_to_string()); } else { retval = msg->size; } @@ -564,7 +566,10 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) DWORD bytes; if (!DeviceIoControl(sock->handle, sock->read_ioctl, NULL, 0, tail, sizeof tail, &bytes, NULL)) { + VLOG_DBG_RL(&rl, "fatal driver failure in transact: %s", + ovs_lasterror_to_string()); retval = -1; + /* XXX: Map to a more appropriate error. */ errno = EINVAL; } else { retval = bytes; @@ -789,61 +794,78 @@ nl_sock_transact_multiple__(struct nl_sock *sock, uint8_t reply_buf[65536]; for (i = 0; i < n; i++) { DWORD reply_len; + bool ret; struct nl_transaction *txn = transactions[i]; struct nlmsghdr *request_nlmsg, *reply_nlmsg; - if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT, - txn->request->data, - txn->request->size, - reply_buf, sizeof reply_buf, - &reply_len, NULL)) { + ret = DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT, + txn->request->data, + txn->request->size, + reply_buf, sizeof reply_buf, + &reply_len, NULL); + + if (ret && reply_len == 0) { + /* + * The current transaction did not produce any data to read and that + * is not an error as such. Continue with the remainder of the + * transactions. + */ + txn->error = 0; + if (txn->reply) { + ofpbuf_clear(txn->reply); + } + } else if (!ret) { /* XXX: Map to a more appropriate error. 
*/ error = EINVAL; + VLOG_DBG_RL(&rl, "fatal driver failure: %s", + ovs_lasterror_to_string()); break; } - if (reply_len < sizeof *reply_nlmsg) { - nl_sock_record_errors__(transactions, n, 0); - VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32 - " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq); - break; - } - - /* Validate the sequence number in the reply. */ - request_nlmsg = nl_msg_nlmsghdr(txn->request); - reply_nlmsg = (struct nlmsghdr *)reply_buf; + if (reply_len != 0) { + if (reply_len < sizeof *reply_nlmsg) { + nl_sock_record_errors__(transactions, n, 0); + VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32 + " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq); + break; + } - if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) { - ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq); - VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32 - ", reply %#"PRIx32, request_nlmsg->nlmsg_seq, - reply_nlmsg->nlmsg_seq); - break; - } + /* Validate the sequence number in the reply. */ + request_nlmsg = nl_msg_nlmsghdr(txn->request); + reply_nlmsg = (struct nlmsghdr *)reply_buf; - /* Handle errors embedded within the netlink message. */ - ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf); - tmp_reply.size = sizeof reply_buf; - if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) { - if (txn->reply) { - ofpbuf_clear(txn->reply); - } - if (txn->error) { - VLOG_DBG_RL(&rl, "received NAK error=%d (%s)", - error, ovs_strerror(txn->error)); + if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) { + ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq); + VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32 + ", reply %#"PRIx32, request_nlmsg->nlmsg_seq, + reply_nlmsg->nlmsg_seq); + break; } - } else { - txn->error = 0; - if (txn->reply) { - /* Copy the reply to the buffer specified by the caller. 
*/ - if (reply_len > txn->reply->allocated) { - ofpbuf_reinit(txn->reply, reply_len); + + /* Handle errors embedded within the netlink message. */ + ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf); + tmp_reply.size = sizeof reply_buf; + if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) { + if (txn->reply) { + ofpbuf_clear(txn->reply); + } + if (txn->error) { + VLOG_DBG_RL(&rl, "received NAK error=%d (%s)", + error, ovs_strerror(txn->error)); + } + } else { + txn->error = 0; + if (txn->reply) { + /* Copy the reply to the buffer specified by the caller. */ + if (reply_len > txn->reply->allocated) { + ofpbuf_reinit(txn->reply, reply_len); + } + memcpy(txn->reply->data, reply_buf, reply_len); + txn->reply->size = reply_len; } - memcpy(txn->reply->data, reply_buf, reply_len); - txn->reply->size = reply_len; } + ofpbuf_uninit(&tmp_reply); } - ofpbuf_uninit(&tmp_reply); /* Count the number of successful transactions. */ (*done)++; @@ -909,6 +931,11 @@ nl_sock_transact_multiple(struct nl_sock *sock, } else if (error) { VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error)); nl_sock_record_errors__(transactions, n, error); + if (error != EAGAIN) { + /* A fatal error has occurred. Abort the rest of + * transactions. 
*/ + break; + } } } } diff --git a/lib/nx-match.c b/lib/nx-match.c index 4b724604f63..21f291c6d9e 100644 --- a/lib/nx-match.c +++ b/lib/nx-match.c @@ -319,7 +319,9 @@ nx_pull_header__(struct ofpbuf *b, bool allow_cookie, uint64_t *header, b->size); error: *header = 0; - *field = NULL; + if (field) { + *field = NULL; + } return OFPERR_OFPBMC_BAD_LEN; } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index ccd29d7bca7..f83fe60585d 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -312,7 +312,6 @@ odp_execute_set_action(struct dp_packet *packet, const struct nlattr *a) case OVS_KEY_ATTR_DP_HASH: md->dp_hash = nl_attr_get_u32(a); - dp_packet_set_dp_hash(packet, md->dp_hash); break; case OVS_KEY_ATTR_RECIRC_ID: @@ -405,8 +404,7 @@ odp_execute_masked_set_action(struct dp_packet *packet, case OVS_KEY_ATTR_DP_HASH: md->dp_hash = nl_attr_get_u32(a) - | (dp_packet_get_dp_hash(packet) & ~*get_mask(a, uint32_t)); - dp_packet_set_dp_hash(packet, md->dp_hash); + | (md->dp_hash & ~*get_mask(a, uint32_t)); break; case OVS_KEY_ATTR_RECIRC_ID: @@ -516,8 +514,7 @@ odp_execute_actions(void *dp, struct dp_packet **packets, int cnt, bool steal, flow_extract(packets[i], &flow); hash = flow_hash_5tuple(&flow, hash_act->hash_basis); - /* We also store the hash value with each packet */ - dp_packet_set_dp_hash(packets[i], hash ? hash : 1); + packets[i]->md.dp_hash = hash; } } else { /* Assert on unknown hash algorithm. */ diff --git a/lib/odp-util.c b/lib/odp-util.c index b82edb700c6..962b84b2cf5 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -231,21 +231,22 @@ parse_flags(const char *s, const char *(*bit_to_string)(uint32_t), uint32_t flags = 0, mask = 0; /* Parse masked flags. 
*/ - while (s[n] != ')') { + while (s[0] != ')') { bool set; uint32_t bit; int name_len; - if (s[n] == '+') { + if (s[0] == '+') { set = true; - } else if (s[n] == '-') { + } else if (s[0] == '-') { set = false; } else { return -EINVAL; } + s++; n++; - name_len = strcspn(s + n, "+-)"); + name_len = strcspn(s, "+-)"); for (bit = 1; bit; bit <<= 1) { const char *fname = bit_to_string(bit); @@ -259,7 +260,7 @@ parse_flags(const char *s, const char *(*bit_to_string)(uint32_t), if (len != name_len) { continue; } - if (!strncmp(s + n, fname, len)) { + if (!strncmp(s, fname, len)) { if (mask & bit) { /* bit already set. */ return -EINVAL; @@ -279,6 +280,7 @@ parse_flags(const char *s, const char *(*bit_to_string)(uint32_t), return -EINVAL; /* Unknown flag name */ } s += name_len; + n += name_len; } *res_flags = flags; diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c index 65fa64d08d2..2240b86af66 100644 --- a/lib/ofp-actions.c +++ b/lib/ofp-actions.c @@ -5308,8 +5308,8 @@ ofpacts_pull_openflow_instructions(struct ofpbuf *openflow, ofpact_pad(ofpacts); start = ofpacts->size; - on = ofpact_put(ofpacts, OFPACT_WRITE_ACTIONS, - offsetof(struct ofpact_nest, actions)); + ofpact_put(ofpacts, OFPACT_WRITE_ACTIONS, + offsetof(struct ofpact_nest, actions)); get_actions_from_instruction(insts[OVSINST_OFPIT11_WRITE_ACTIONS], &actions, &actions_len); error = ofpacts_decode_for_action_set(actions, actions_len, diff --git a/lib/ofp-parse.c b/lib/ofp-parse.c index 8fce546f4a2..856044dbb3d 100644 --- a/lib/ofp-parse.c +++ b/lib/ofp-parse.c @@ -1333,7 +1333,7 @@ parse_ofp_group_mod_str__(struct ofputil_group_mod *gm, uint16_t command, } else if (!strcmp(value, "last")) { gm->command_bucket_id = OFPG15_BUCKET_LAST; } else { - char *error = str_to_u32(value, &gm->command_bucket_id); + error = str_to_u32(value, &gm->command_bucket_id); if (error) { goto out; } @@ -1356,7 +1356,7 @@ parse_ofp_group_mod_str__(struct ofputil_group_mod *gm, uint16_t command, if(!strcmp(value, "all")) { 
gm->group_id = OFPG_ALL; } else { - char *error = str_to_u32(value, &gm->group_id); + error = str_to_u32(value, &gm->group_id); if (error) { goto out; } @@ -1409,6 +1409,9 @@ parse_ofp_group_mod_str__(struct ofputil_group_mod *gm, uint16_t command, goto out; } error = str_to_u64(value, &gm->props.selection_method_param); + if (error) { + goto out; + } *usable_protocols &= OFPUTIL_P_OF15_UP; } else if (!strcmp(name, "fields")) { if (!(fields & F_GROUP_TYPE)) { diff --git a/lib/ofp-util.c b/lib/ofp-util.c index 277fdfeb2db..60cc67432e0 100644 --- a/lib/ofp-util.c +++ b/lib/ofp-util.c @@ -8170,7 +8170,8 @@ ofputil_encode_ofp15_group_mod(enum ofp_version ofp_version, } static void -bad_group_cmd(enum ofp15_group_mod_command cmd) { +bad_group_cmd(enum ofp15_group_mod_command cmd) +{ const char *opt_version; const char *version; const char *cmd_str; @@ -8187,6 +8188,7 @@ bad_group_cmd(enum ofp15_group_mod_command cmd) { case OFPGC15_REMOVE_BUCKET: version = "1.5"; opt_version = "15"; + break; default: OVS_NOT_REACHED(); @@ -8210,7 +8212,7 @@ bad_group_cmd(enum ofp15_group_mod_command cmd) { break; case OFPGC15_REMOVE_BUCKET: - cmd_str = "insert-bucket"; + cmd_str = "remove-bucket"; break; default: diff --git a/lib/ovs-lldp.c b/lib/ovs-lldp.c index 3edaf429a99..54c70c53c69 100644 --- a/lib/ovs-lldp.c +++ b/lib/ovs-lldp.c @@ -202,8 +202,10 @@ aa_print_element_status_port(struct ds *ds, struct lldpd_hardware *hw) if (memcmp(&port->p_element.system_id, &system_id_null, sizeof port->p_element.system_id)) { - static char *none_str = ""; - char *id = none_str, *descr = none_str, *system = none_str; + const char *none_str = ""; + const char *descr = NULL; + char *id = NULL; + char *system; if (port->p_chassis) { if (port->p_chassis->c_id_len > 0) { @@ -211,16 +213,16 @@ aa_print_element_status_port(struct ds *ds, struct lldpd_hardware *hw) port->p_chassis->c_id_len, &id); } - descr = port->p_chassis->c_descr - ? 
port->p_chassis->c_descr : none_str; + descr = port->p_chassis->c_descr; } chassisid_to_string((uint8_t *) &port->p_element.system_id, sizeof port->p_element.system_id, &system); - ds_put_format(ds, "\tAuto Attach Primary Server Id: %s\n", id); + ds_put_format(ds, "\tAuto Attach Primary Server Id: %s\n", + id ? id : none_str); ds_put_format(ds, "\tAuto Attach Primary Server Descr: %s\n", - descr); + descr ? descr : none_str); ds_put_format(ds, "\tAuto Attach Primary Server System Id: %s\n", system); @@ -389,9 +391,7 @@ update_mapping_on_lldp(struct lldp *lldp, struct lldpd_hardware *hardware, { struct lldpd_aa_isid_vlan_maps_tlv *lm = xzalloc(sizeof *lm); - if (hardware->h_ifname) { - VLOG_INFO("\t\t hardware->h_ifname=%s", hardware->h_ifname); - } + VLOG_INFO("\t\t hardware->h_ifname=%s", hardware->h_ifname); lm->isid_vlan_data.isid = m->isid; lm->isid_vlan_data.vlan = m->vlan; @@ -617,16 +617,13 @@ aa_mapping_unregister(void *aux) } hmap_remove(&lldp->mappings_by_aux, &m->hmap_node_aux); - free(m); /* Remove from all the lldp instances */ LIST_FOR_EACH (hw, h_entries, &lldp->lldpd->g_hardware) { - if (hw->h_ifname) { - VLOG_INFO("\t\t hardware->h_ifname=%s", hw->h_ifname); - } - + VLOG_INFO("\t\t hardware->h_ifname=%s", hw->h_ifname); aa_mapping_unregister_mapping(lldp, hw, m); } + free(m); /* Remove from the all_mappings */ HMAP_FOR_EACH (m, hmap_node_isid, all_mappings) { @@ -658,9 +655,9 @@ lldp_init(void) * fields in 'wc' that were used to make the determination. 
*/ bool -lldp_should_process_flow(const struct flow *flow) +lldp_should_process_flow(struct lldp *lldp, const struct flow *flow) { - return (flow->dl_type == htons(ETH_TYPE_LLDP)); + return (flow->dl_type == htons(ETH_TYPE_LLDP) && lldp->enabled); } @@ -687,6 +684,9 @@ lldp_should_send_packet(struct lldp *cfg) OVS_EXCLUDED(mutex) ret = timer_expired(&cfg->tx_timer); ovs_mutex_unlock(&mutex); + /* LLDP must be enabled */ + ret &= cfg->enabled; + return ret; } @@ -697,7 +697,7 @@ lldp_wake_time(const struct lldp *lldp) OVS_EXCLUDED(mutex) { long long int retval; - if (!lldp) { + if (!lldp || !lldp->enabled) { return LLONG_MAX; } @@ -726,7 +726,6 @@ lldp_put_packet(struct lldp *lldp, struct dp_packet *packet, { struct lldpd *mylldpd = lldp->lldpd; struct lldpd_hardware *hw = lldpd_first_hardware(mylldpd); - uint32_t lldp_size = 0; static const uint8_t eth_addr_lldp[6] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x0e}; @@ -734,10 +733,7 @@ lldp_put_packet(struct lldp *lldp, struct dp_packet *packet, eth_compose(packet, eth_addr_lldp, eth_src, ETH_TYPE_LLDP, 0); - lldp_size = lldpd_send(hw, packet); - if (lldp_size + ETH_HEADER_LEN < MINIMUM_ETH_PACKET_SIZE) { - lldp_size = MINIMUM_ETH_PACKET_SIZE; - } + lldpd_send(hw, packet); timer_set_duration(&lldp->tx_timer, lldp->lldpd->g_config.c_tx_interval); ovs_mutex_unlock(&mutex); @@ -746,9 +742,15 @@ lldp_put_packet(struct lldp *lldp, struct dp_packet *packet, /* Configures the LLDP stack. 
*/ bool -lldp_configure(struct lldp *lldp) OVS_EXCLUDED(mutex) +lldp_configure(struct lldp *lldp, const struct smap *cfg) OVS_EXCLUDED(mutex) { if (lldp) { + if (cfg && smap_get_bool(cfg, "enable", false)) { + lldp->enabled = true; + } else { + lldp->enabled = false; + } + ovs_mutex_lock(&mutex); timer_set_expired(&lldp->tx_timer); timer_set_duration(&lldp->tx_timer, LLDP_DEFAULT_TRANSMIT_INTERVAL_MS); diff --git a/lib/ovs-lldp.h b/lib/ovs-lldp.h index 66288a57732..807590aaae6 100644 --- a/lib/ovs-lldp.h +++ b/lib/ovs-lldp.h @@ -46,6 +46,7 @@ struct lldp { struct hmap mappings_by_aux; /* "struct" indexed by aux */ struct ovs_list active_mapping_queue; struct ovs_refcount ref_cnt; + bool enabled; /* LLDP enabled on port */ }; /* Configuration specific to Auto Attach. @@ -83,8 +84,8 @@ long long int lldp_wait(struct lldp *lldp); long long int lldp_wake_time(const struct lldp *lldp); void lldp_run(struct lldpd *cfg); bool lldp_should_send_packet(struct lldp *cfg); -bool lldp_should_process_flow(const struct flow *flow); -bool lldp_configure(struct lldp *lldp); +bool lldp_should_process_flow(struct lldp *lldp, const struct flow *flow); +bool lldp_configure(struct lldp *lldp, const struct smap *cfg); void lldp_process_packet(struct lldp *cfg, const struct dp_packet *); void lldp_put_packet(struct lldp *lldp, struct dp_packet *packet, uint8_t eth_src[ETH_ADDR_LEN]); diff --git a/lib/perf-counter.c b/lib/perf-counter.c new file mode 100644 index 00000000000..8c859cc7646 --- /dev/null +++ b/lib/perf-counter.c @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* This implementation only applies to the Linux platform. */ + +#include +#if defined(__linux__) && defined(HAVE_LINUX_PERF_EVENT_H) + +#include +#include +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "perf-counter.h" +#include "shash.h" +#include "util.h" + +static struct shash perf_counters; +static int fd__ = 0; + +uint64_t +perf_counter_read(uint64_t *counter) +{ + int size = sizeof *counter; + + if (fd__ <= 0 || read(fd__, counter, size) < size) { + *counter = 0; + } + + return *counter; +} + +static long +perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + int ret; + + ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} + +/* Set up perf event counters to read user space instruction counters + * only for this process, on all cpus. 
*/ +static void +perf_event_setup(void) +{ + struct perf_event_attr pe; + + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(struct perf_event_attr); + pe.config = PERF_COUNT_HW_INSTRUCTIONS; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + + fd__ = perf_event_open(&pe, 0, -1, -1, 0); + if (fd__ > 0) { + ioctl(fd__, PERF_EVENT_IOC_RESET, 0); + ioctl(fd__, PERF_EVENT_IOC_ENABLE, 0); + } +} + +static void +perf_counter_init(struct perf_counter *counter) +{ + counter->once = true; + shash_add_assert(&perf_counters, counter->name, counter); +} + +void +perf_counter_accumulate(struct perf_counter *counter, uint64_t start_count) +{ + uint64_t end_count; + + if (!counter->once) { + perf_counter_init(counter); + } + + counter->n_events++; + perf_counter_read(&end_count); + counter->total_count += end_count - start_count; +} + +static void +perf_counter_to_ds(struct ds *ds, struct perf_counter *pfc) +{ + double ratio; + + if (pfc->n_events) { + ratio = (double)pfc->total_count / (double)pfc->n_events; + } else { + ratio = 0.0; + } + + ds_put_format(ds, "%-40s%12"PRIu64"%12"PRIu64"%12.1f\n", + pfc->name, pfc->n_events, pfc->total_count, ratio); +} + +static void +perf_counters_to_ds(struct ds *ds) +{ + const char *err_str; + const struct shash_node **sorted; + int i; + + err_str = NULL; + if (fd__ == -1) { + err_str = "performance counter is not supported on this platfrom"; + } else if (!shash_count(&perf_counters)) { + err_str = "performance counter has never been hit"; + } + + if (err_str) { + ds_put_format(ds, "%s\n", err_str); + return; + } + + /* Display counters in alphabetical order. */ + sorted = shash_sort(&perf_counters); + for (i = 0; i < shash_count(&perf_counters); i++) { + perf_counter_to_ds(ds, sorted[i]->data); + } + free(sorted); +} + +/* + * Caller is responsible for free memory. 
+ */ +char * +perf_counters_to_string() +{ + struct ds ds; + + ds_init(&ds); + perf_counters_to_ds(&ds); + return ds_steal_cstr(&ds); +} + +void +perf_counters_init(void) +{ + shash_init(&perf_counters); + perf_event_setup(); +} + +void +perf_counters_clear(void) +{ + struct shash_node *node; + + SHASH_FOR_EACH (node, &perf_counters) { + struct perf_counter *perf = node->data; + + perf->n_events = 0; + perf->total_count = 0; + } +} + +void +perf_counters_destroy() +{ + struct shash_node *node, *next; + + if (fd__ != -1) { + ioctl(fd__, PERF_EVENT_IOC_DISABLE, 0); + close(fd__); + } + + SHASH_FOR_EACH_SAFE (node, next, &perf_counters) { + shash_delete(&perf_counters, node); + } + + shash_destroy(&perf_counters); +} +#endif diff --git a/lib/perf-counter.h b/lib/perf-counter.h new file mode 100644 index 00000000000..23365de177d --- /dev/null +++ b/lib/perf-counter.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __PERF_COUNTER_H +#define __PERF_COUNTER_H 1 + +/* Motivation + * ========== + * + * It is sometimes desirable to gain performance insights of a program + * by using hardware counters. Recent Linux kernels started to support + * a set of portable API for configuring and access those counter across + * multiple platforms. + * + * APIs provided by perf-counter.h provides a set of APIs that are + * semi-integrated into OVS user spaces. 
The infrastructure that initializes, + * cleanup, display and clear them at run time is provided. However the + * sample points are not. A programmer needs insert sample points when needed. + * + * Since there is no pre configured sample points, there is no run time + * over head for the released product. + * + * Limitations + * =========== + * - Hard coded to sample CPU cycle count in user space only. + * - Only one counter is sampled. + * - Useful macros are only provided for function profiling. + * - show and clear command applies to all counters, there is no way + * to select a sub-set of counter. + * + * Those are not fundamental limits, but only limited by current + * implementation. + * + * Function instruction counter sample point Usage + * ================================================ + * + * There are two macros provided: + * + * Macro 'PERF_FUNCTON_COUNT_BEGIN' needs to be inserted towards the + * beginning of the function where local variables are declared. + * + * Macro 'PERF_FUNCTON_COUNT_END' needs to appear in the same function, + * some where below 'PERF_FUNCTION_COUNT_BEGIN', usually towards of + * a function. + * + * For example: + * + * void my_func() { + * int some_local_variable; + * + * PERF_FUNCTION_COUNT_BEGIN; + * + * < implementation > + * + * PERF_FUNCTION_COUNT_END + * } + * + * This will maintain the number of times 'my_func()' is called, total + * number of instructions '' executed during all those calls. + * + * Currently there are two limitation: + * 1). At most one pair can appear in the same variable scope. + * 2). The Macros use function name as the counter name for display. + * Thus, all functions in one annotation session are required to + * have unique names. + * + * Note, there is no requirement for those macros to be balanced. 
+ * For example: + * + * void my_func(int i){ + * + * PERF_FUNCTION_COUNT_BEGIN; + * + * if (i == 300) { + * PERF_FUNCTION_COUNT_END; + * return; + * } else { + * + * } + * } + * will work just fine. + */ + +#if defined(__linux__) && defined(HAVE_LINUX_PERF_EVENT_H) +struct perf_counter { + const char *name; + bool once; + uint64_t n_events; + uint64_t total_count; +}; + +#define PERF_COUNTER_ONCE_INITIALIZER(name) \ + { \ + name, \ + false, \ + 0, \ + 0, \ + } + +void perf_counters_init(void); +void perf_counters_destroy(void); +void perf_counters_clear(void); + +uint64_t perf_counter_read(uint64_t *counter); +void perf_counter_accumulate(struct perf_counter *counter, + uint64_t start_count); +char *perf_counters_to_string(void); + +/* User access macros. */ +#define PERF_FUNCTION_BEGIN \ + static struct perf_counter x__ = PERF_COUNTER_ONCE_INITIALIZER(__func__); \ + uint64_t start_count__ = perf_counter_read(&start_count__); \ + +#define PERF_FUNCTION_END \ + perf_counter_accumulate(&x__, start_count__); + +#else + +#define PERF_FUNCTON_BEGIN +#define PERF_FUNCTON_END + +static inline void perf_counters_init(void) {} +static inline void perf_counters_destroy(void) {} +static inline void perf_counters_clear(void) {} +static inline char * +perf_counters_to_string(void) +{ + return xstrdup("Not Supported on this platform. 
Only available on Linux (version >= 2.6.32)"); +} + +#endif + +#endif diff --git a/lib/rstp-state-machines.c b/lib/rstp-state-machines.c index d254ca3436f..7e2378977ce 100644 --- a/lib/rstp-state-machines.c +++ b/lib/rstp-state-machines.c @@ -1858,7 +1858,6 @@ port_role_transition_sm(struct rstp_port *p) p->port_role_transition_sm_state); } if (last_role != p->role) { - last_role = p->role; VLOG_DBG("%s, port %u, port role ["RSTP_PORT_ID_FMT"] = %s", p->rstp->name, p->port_number, p->port_id, rstp_port_role_name(p->role)); diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4 index ba3ed9b3116..8ace9ce11c3 100644 --- a/m4/openvswitch.m4 +++ b/m4/openvswitch.m4 @@ -243,6 +243,10 @@ AC_DEFUN([OVS_CHECK_BACKTRACE], [AC_DEFINE([HAVE_BACKTRACE], [1], [Define to 1 if you have backtrace(3).])])]) +dnl Defines HAVE_PERF_EVENT if linux/perf_event.h is found. +AC_DEFUN([OVS_CHECK_PERF_EVENT], + [AC_CHECK_HEADERS([linux/perf_event.h])]) + dnl Checks for valgrind/valgrind.h. AC_DEFUN([OVS_CHECK_VALGRIND], [AC_CHECK_HEADERS([valgrind/valgrind.h])]) diff --git a/ofproto/in-band.c b/ofproto/in-band.c index 902a86ca109..3608c13bb54 100644 --- a/ofproto/in-band.c +++ b/ofproto/in-band.c @@ -119,8 +119,8 @@ refresh_remote(struct in_band *ib, struct in_band_remote *r) retval = netdev_get_next_hop(ib->local_netdev, &r->remote_addr.sin_addr, &next_hop_inaddr, &next_hop_dev); if (retval) { - VLOG_WARN("cannot find route for controller ("IP_FMT"): %s", - IP_ARGS(r->remote_addr.sin_addr.s_addr), + VLOG_WARN("%s: cannot find route for controller ("IP_FMT"): %s", + ib->ofproto->name, IP_ARGS(r->remote_addr.sin_addr.s_addr), ovs_strerror(retval)); return 1; } @@ -136,9 +136,10 @@ refresh_remote(struct in_band *ib, struct in_band_remote *r) retval = netdev_open(next_hop_dev, "system", &r->remote_netdev); if (retval) { - VLOG_WARN_RL(&rl, "cannot open netdev %s (next hop " + VLOG_WARN_RL(&rl, "%s: cannot open netdev %s (next hop " "to controller "IP_FMT"): %s", - next_hop_dev, 
IP_ARGS(r->remote_addr.sin_addr.s_addr), + ib->ofproto->name, next_hop_dev, + IP_ARGS(r->remote_addr.sin_addr.s_addr), ovs_strerror(retval)); free(next_hop_dev); return 1; @@ -150,8 +151,9 @@ refresh_remote(struct in_band *ib, struct in_band_remote *r) retval = netdev_arp_lookup(r->remote_netdev, next_hop_inaddr.s_addr, r->remote_mac); if (retval) { - VLOG_DBG_RL(&rl, "cannot look up remote MAC address ("IP_FMT"): %s", - IP_ARGS(next_hop_inaddr.s_addr), ovs_strerror(retval)); + VLOG_DBG_RL(&rl, "%s: cannot look up remote MAC address ("IP_FMT"): %s", + ib->ofproto->name, IP_ARGS(next_hop_inaddr.s_addr), + ovs_strerror(retval)); } /* If we don't have a MAC address, then refresh quickly, since we probably @@ -188,8 +190,9 @@ refresh_remotes(struct in_band *ib) any_changes = true; if (!eth_addr_is_zero(r->remote_mac) && !eth_addr_equals(r->last_remote_mac, r->remote_mac)) { - VLOG_DBG("remote MAC address changed from "ETH_ADDR_FMT + VLOG_DBG("%s: remote MAC address changed from "ETH_ADDR_FMT " to "ETH_ADDR_FMT, + ib->ofproto->name, ETH_ADDR_ARGS(r->last_remote_mac), ETH_ADDR_ARGS(r->remote_mac)); memcpy(r->last_remote_mac, r->remote_mac, ETH_ADDR_LEN); @@ -425,8 +428,8 @@ in_band_create(struct ofproto *ofproto, const char *local_name, *in_bandp = NULL; error = netdev_open(local_name, "internal", &local_netdev); if (error) { - VLOG_ERR("failed to initialize in-band control: cannot open " - "datapath local port %s (%s)", + VLOG_ERR("%s: failed to initialize in-band control: cannot open " + "datapath local port %s (%s)", ofproto->name, local_name, ovs_strerror(error)); return error; } diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index f73d8b4560c..8a931d68e75 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -48,8 +48,8 @@ static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER; * used to indicate the type of tunnel (0x01 = VxLAN, 0x02 = GRE) and the three * least significant bytes hold the value of the layer 2 
overlay network * segment identifier: a 24-bit VxLAN tunnel's VNI or a 24-bit GRE tunnel's - * TNI. This is not compatible with GRE-64, as implemented in OVS, as its - * tunnel IDs are 64-bit. + * TNI. This is not compatible with GRE-64 or STT, as implemented in OVS, as + * their tunnel IDs are 64-bit. * * Two new enterprise information elements are defined which are similar to * laryerSegmentId but support 64-bit IDs: @@ -64,6 +64,7 @@ enum dpif_ipfix_tunnel_type { DPIF_IPFIX_TUNNEL_VXLAN = 0x01, DPIF_IPFIX_TUNNEL_GRE = 0x02, DPIF_IPFIX_TUNNEL_LISP = 0x03, + DPIF_IPFIX_TUNNEL_STT = 0x04, DPIF_IPFIX_TUNNEL_IPSEC_GRE = 0x05, DPIF_IPFIX_TUNNEL_GENEVE = 0x07, NUM_DPIF_IPFIX_TUNNEL @@ -299,7 +300,7 @@ static uint8_t tunnel_protocol[NUM_DPIF_IPFIX_TUNNEL] = { IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_VXLAN */ IPPROTO_GRE, /* DPIF_IPFIX_TUNNEL_GRE */ IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_LISP*/ - 0 , /* reserved */ + IPPROTO_TCP, /* DPIF_IPFIX_TUNNEL_STT*/ IPPROTO_GRE, /* DPIF_IPFIX_TUNNEL_IPSEC_GRE */ 0 , /* reserved */ IPPROTO_UDP, /* DPIF_IPFIX_TUNNEL_GENEVE*/ @@ -353,6 +354,7 @@ BUILD_ASSERT_DECL(sizeof(struct ipfix_data_record_aggregated_ip) == 32); * VxLAN: 24-bit VIN, * GRE: 32- or 64-bit key, * LISP: 24-bit instance ID + * STT: 64-bit key */ #define MAX_TUNNEL_KEY_LEN 8 @@ -607,6 +609,9 @@ dpif_ipfix_add_tunnel_port(struct dpif_ipfix *di, struct ofport *ofport, } else if (strcmp(type, "geneve") == 0) { dip->tunnel_type = DPIF_IPFIX_TUNNEL_GENEVE; dip->tunnel_key_length = 3; + } else if (strcmp(type, "stt") == 0) { + dip->tunnel_type = DPIF_IPFIX_TUNNEL_STT; + dip->tunnel_key_length = 8; } else { free(dip); goto out; diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index 55ae6831a33..f73787744db 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -2610,7 +2610,7 @@ process_special(struct xlate_ctx *ctx, const struct flow *flow, : rstp_process_packet(xport, packet); } return SLOW_STP; - } else if (xport->lldp && 
lldp_should_process_flow(flow)) { + } else if (xport->lldp && lldp_should_process_flow(xport->lldp, flow)) { if (packet) { lldp_process_packet(xport->lldp, packet); } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 01d99c54aae..bf893214b52 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -2020,16 +2020,15 @@ set_lldp(struct ofport *ofport_, ofport->lldp = lldp_create(ofport->up.netdev, ofport_->mtu, cfg); } - if (lldp_configure(ofport->lldp)) { - error = 0; - goto out; + if (!lldp_configure(ofport->lldp, cfg)) { + error = EINVAL; } - - error = EINVAL; } - lldp_unref(ofport->lldp); - ofport->lldp = NULL; -out: + if (error) { + lldp_unref(ofport->lldp); + ofport->lldp = NULL; + } + ofproto_dpif_monitor_port_update(ofport, ofport->bfd, ofport->cfm, diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index deb2b8bb3de..cd13b0de1c2 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -53,6 +53,7 @@ #include "trigger.h" #include "util.h" #include "unixctl.h" +#include "perf-counter.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(ovsdb_server); @@ -76,6 +77,8 @@ static bool bootstrap_ca_cert; static unixctl_cb_func ovsdb_server_exit; static unixctl_cb_func ovsdb_server_compact; static unixctl_cb_func ovsdb_server_reconnect; +static unixctl_cb_func ovsdb_server_perf_counters_clear; +static unixctl_cb_func ovsdb_server_perf_counters_show; struct server_config { struct sset *remotes; @@ -292,6 +295,8 @@ main(int argc, char *argv[]) daemonize_complete(); + perf_counters_init(); + if (!run_command) { /* ovsdb-server is usually a long-running process, in which case it * makes plenty of sense to log the version, but --run makes @@ -318,6 +323,10 @@ main(int argc, char *argv[]) ovsdb_server_remove_database, &server_config); unixctl_command_register("ovsdb-server/list-dbs", "", 0, 0, ovsdb_server_list_databases, &all_dbs); + unixctl_command_register("ovsdb-server/perf-counters-show", "", 0, 0, + 
ovsdb_server_perf_counters_show, NULL); + unixctl_command_register("ovsdb-server/perf-counters-clear", "", 0, 0, + ovsdb_server_perf_counters_clear, NULL); main_loop(jsonrpc, &all_dbs, unixctl, &remotes, run_process, &exiting); @@ -338,7 +347,7 @@ main(int argc, char *argv[]) run_command, process_status_msg(status)); } } - + perf_counters_destroy(); service_stop(); return 0; } @@ -1021,6 +1030,26 @@ ovsdb_server_exit(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, NULL); } +static void +ovsdb_server_perf_counters_show(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *arg_ OVS_UNUSED) +{ + char *s = perf_counters_to_string(); + + unixctl_command_reply(conn, s); + free(s); +} + +static void +ovsdb_server_perf_counters_clear(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *arg_ OVS_UNUSED) +{ + perf_counters_clear(); + unixctl_command_reply(conn, NULL); +} + static void ovsdb_server_compact(struct unixctl_conn *conn, int argc, const char *argv[], void *dbs_) diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 00000000000..60ace6f05b5 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,2 @@ +dist/ +*.egg-info diff --git a/python/README.rst b/python/README.rst new file mode 100644 index 00000000000..4f4742c534a --- /dev/null +++ b/python/README.rst @@ -0,0 +1 @@ +Python library for working with Open vSwitch diff --git a/python/automake.mk b/python/automake.mk index f6ab60666d2..3a7f04016b9 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -41,6 +41,11 @@ EXTRA_DIST += \ python/build/__init__.py \ python/build/nroff.py +# PyPI support. 
+EXTRA_DIST += \ + python/README.rst \ + python/setup.py + PYFILES = $(ovs_pyfiles) python/ovs/dirs.py $(ovstest_pyfiles) EXTRA_DIST += $(PYFILES) PYCOV_CLEAN_FILES += $(PYFILES:.py=.py,cover) @@ -62,6 +67,12 @@ ovs-install-data-local: $(MKDIR_P) $(DESTDIR)$(pkgdatadir)/python/ovs $(INSTALL_DATA) python/ovs/dirs.py.tmp $(DESTDIR)$(pkgdatadir)/python/ovs/dirs.py rm python/ovs/dirs.py.tmp + +python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py + (cd python/ && $(PYTHON) setup.py sdist) + +pypi-upload: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py + (cd python/ && $(PYTHON) setup.py sdist upload) else ovs-install-data-local: @: diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index a01701a0aa3..45a5a23b141 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -26,8 +26,12 @@ __pychecker__ = 'no-classattr no-objattrs' +ROW_CREATE = "create" +ROW_UPDATE = "update" +ROW_DELETE = "delete" -class Idl: + +class Idl(object): """Open vSwitch Database Interface Definition Language (OVSDB IDL). The OVSDB IDL maintains an in-memory replica of a database. 
It issues RPC @@ -264,6 +268,17 @@ def set_lock(self, lock_name): self.lock_name = lock_name self.__send_lock_request() + def notify(self, event, row, updates=None): + """Hook for implementing create/update/delete notifications + + :param event: The event that was triggered + :type event: ROW_CREATE, ROW_UPDATE, or ROW_DELETE + :param row: The row as it is after the operation has occurred + :type row: Row + :param updates: For updates, a Row object with just the changed columns + :type updates: Row + """ + def __clear(self): changed = False @@ -386,6 +401,7 @@ def __process_update(self, table, uuid, old, new): if row: del table.rows[uuid] changed = True + self.notify(ROW_DELETE, row) else: # XXX rate-limit vlog.warn("cannot delete missing row %s from table %s" @@ -401,15 +417,19 @@ % (uuid, table.name)) if self.__row_update(table, row, new): changed = True + self.notify(ROW_CREATE, row) else: + op = ROW_UPDATE if not row: row = self.__create_row(table, uuid) changed = True + op = ROW_CREATE # XXX rate-limit vlog.warn("cannot modify missing row %s in table %s" % (uuid, table.name)) if self.__row_update(table, row, new): changed = True + self.notify(op, row, Row.from_json(self, table, uuid, old)) return changed def __row_update(self, table, row, row_json): @@ -570,6 +590,26 @@ def __setattr__(self, column_name, value): return self._idl.txn._write(self, column, datum) + @classmethod + def from_json(cls, idl, table, uuid, row_json): + data = {} + for column_name, datum_json in row_json.iteritems(): + column = table.columns.get(column_name) + if not column: + # XXX rate-limit + vlog.warn("unknown column %s in table %s" + % (column_name, table.name)) + continue + try: + datum = ovs.db.data.Datum.from_json(column.type, datum_json) + except error.Error, e: + # XXX rate-limit + vlog.warn("error parsing column %s in table %s: %s" + % (column_name, table.name, e)) + continue + data[column_name] = datum + return cls(idl, table,
uuid, data) + def verify(self, column_name): """Causes the original contents of column 'column_name' in this row to be verified as a prerequisite to completing the transaction. That is, diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 00000000000..889d2159420 --- /dev/null +++ b/python/setup.py @@ -0,0 +1,46 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +import setuptools + +VERSION = "unknown" + +try: + # Try to set the version from the generated ovs/version.py + execfile("ovs/version.py") +except IOError: + print("Ensure version.py is created by running make python/ovs/version.py", + file=sys.stderr) + sys.exit(-1) + + +setuptools.setup( + name='ovs', + description='Open vSwitch library', + version=VERSION, + url='http://www.openvswitch.org/', + author='Open vSwitch', + author_email='dev@openvswitch.org', + packages=['ovs', 'ovs.db', 'ovs.unixctl'], + keywords=['openvswitch', 'ovs', 'OVSDB'], + license='Apache 2.0', + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Topic :: Database :: Front-Ends', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: System :: Networking', + 'License :: OSI Approved :: Apache Software License' + ] +) diff --git a/tests/automake.mk b/tests/automake.mk index 4c2817b78b4..3f57114e618 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -51,6 +51,7 @@ TESTSUITE_AT = \ tests/reconnect.at \ 
tests/ovs-vswitchd.at \ tests/dpif-netdev.at \ + tests/dpctl.at \ tests/ofproto-dpif.at \ tests/bridge.at \ tests/vlan-splinters.at \ @@ -200,10 +201,17 @@ clean-local: test ! -f '$(TESTSUITE)' || $(SHELL) '$(TESTSUITE)' -C tests --clean AUTOTEST = $(AUTOM4TE) --language=autotest + +if WIN32 $(TESTSUITE): package.m4 $(TESTSUITE_AT) $(COMMON_MACROS_AT) $(TESTSUITE_PATCH) $(AM_V_GEN)$(AUTOTEST) -I '$(srcdir)' -o testsuite.tmp $@.at patch -p0 testsuite.tmp $(TESTSUITE_PATCH) $(AM_V_at)mv testsuite.tmp $@ +else +$(TESTSUITE): package.m4 $(TESTSUITE_AT) $(COMMON_MACROS_AT) + $(AM_V_GEN)$(AUTOTEST) -I '$(srcdir)' -o $@.tmp $@.at + $(AM_V_at)mv $@.tmp $@ +endif $(KMOD_TESTSUITE): package.m4 $(KMOD_TESTSUITE_AT) $(COMMON_MACROS_AT) $(AM_V_GEN)$(AUTOTEST) -I '$(srcdir)' -o $@.tmp $@.at diff --git a/tests/dpctl.at b/tests/dpctl.at new file mode 100644 index 00000000000..ab7c89ceac4 --- /dev/null +++ b/tests/dpctl.at @@ -0,0 +1,85 @@ +AT_BANNER([dpctl]) + +AT_SETUP([dpctl - add-dp del-dp]) +OVS_VSWITCHD_START +AT_CHECK([ovs-appctl dpctl/add-dp dummy@br0]) +AT_CHECK([ovs-appctl dpctl/add-dp dummy@br0], [2], [], + [ovs-vswitchd: add_dp (File exists) +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) +AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0], [2], [], [stderr]) +AT_CHECK([sed 's/(.*)/(...)/' stderr], [0], [dnl +ovs-vswitchd: opening datapath (...) 
+ovs-appctl: ovs-vswitchd: server returned an error +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([dpctl - add-if set-if del-if]) +OVS_VSWITCHD_START([], [], [=override]) +AT_CHECK([ovs-appctl dpctl/add-dp dummy@br0]) +AT_CHECK([ovs-appctl dpctl/show dummy@br0], [0], [dnl +dummy@br0: + lookups: hit:0 missed:0 lost:0 + flows: 0 + port 0: br0 (internal) +]) +AT_CHECK([ovs-appctl dpctl/add-if dummy@br0 vif1.0,type=dummy,port_no=5]) +AT_CHECK([ovs-appctl dpctl/show dummy@br0], [0], [dnl +dummy@br0: + lookups: hit:0 missed:0 lost:0 + flows: 0 + port 0: br0 (internal) + port 5: vif1.0 (dummy) +]) +AT_CHECK([ovs-appctl dpctl/add-if dummy@br0 vif1.0,type=dummy], [2], [], + [stderr]) +AT_CHECK([sed 's/(.*)/(...)/' stderr], [0], + [ovs-vswitchd: adding vif1.0 to dummy@br0 failed (...) +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/set-if dummy@br0 vif1.0,port_no=5]) +AT_CHECK([ovs-appctl dpctl/set-if dummy@br0 vif1.0,type=system], [2], [], + [ovs-vswitchd: vif1.0: can't change type from dummy to system +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/set-if dummy@br0 br0,type=dummy], [2], [], + [ovs-vswitchd: br0: can't change type from internal to dummy +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/del-if dummy@br0 vif1.0]) +AT_CHECK([ovs-appctl dpctl/show dummy@br0], [0], [dnl +dummy@br0: + lookups: hit:0 missed:0 lost:0 + flows: 0 + port 0: br0 (internal) +]) +AT_CHECK([ovs-appctl dpctl/del-if dummy@br0 vif1.0], [2], [], + [ovs-vswitchd: no port named vif1.0 +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/show dummy@br0], [0], [dnl +dummy@br0: + lookups: hit:0 missed:0 lost:0 + flows: 0 + port 0: br0 (internal) +]) +AT_CHECK([ovs-appctl dpctl/del-if dummy@br0 nonexistent], [2], [], + [ovs-vswitchd: no port named nonexistent +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/del-if dummy@br0 
br0], [2], [], [stderr]) +AT_CHECK([sed 's/(.*)/(...)/' stderr], [0], + [ovs-vswitchd: deleting port br0 from dummy@br0 failed (...) +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) +AT_CHECK([ovs-appctl dpctl/del-if dummy@br0 br0], [2], [], [stderr]) +AT_CHECK([sed 's/(.*)/(...)/' stderr], [0], + [ovs-vswitchd: opening datapath (...) +ovs-appctl: ovs-vswitchd: server returned an error +]) +OVS_VSWITCHD_STOP(["/dummy@br0: port_del failed/d +/dummy@br0: failed to add vif1.0 as port/d"]) +AT_CLEANUP diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index a267366e03b..067f9000cff 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -127,11 +127,3 @@ skb_priority(0/0),skb_mark(0/0),recirc_id(0),dp_hash(0/0),in_port(1),eth(src=50: OVS_VSWITCHD_STOP AT_CLEANUP - -AT_SETUP([dpif-netdev - Datapath removal]) -OVS_VSWITCHD_START() -AT_CHECK([ovs-appctl dpctl/add-dp dummy@br0]) -AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) - -OVS_VSWITCHD_STOP -AT_CLEANUP diff --git a/tests/odp.at b/tests/odp.at index c3564871105..16a58e7d9fa 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -119,6 +119,10 @@ skb_mark(0x1234/0xfff0),in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14: echo '# Valid forms with tunnel header.' sed 's/^/tunnel(tun_id=0x7f10354\/0xff,src=10.10.10.10\/255.255.255.0,dst=20.20.20.20\/255.255.255.0,tos=0,ttl=64,tp_src=0,tp_dst=0,gbp_id=0,gbp_flags=0,flags(csum,key)),/' odp-base.txt + echo + echo '# Valid forms with tunnel header (wildcard flag).' + sed 's/^/tunnel(tun_id=0x7f10354\/0xff,src=10.10.10.10\/255.255.255.0,dst=20.20.20.20\/255.255.255.0,tos=0,ttl=64,tp_src=0,tp_dst=0,gbp_id=0,gbp_flags=0,flags(-df+csum+key)),/' odp-base.txt + echo echo '# Valid forms with VLAN header.' 
sed 's/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index 93d8a77516b..fd915ef2811 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -49,7 +49,6 @@ m4_define([_OVS_VSWITCHD_START], OVS_LOGDIR=`pwd`; export OVS_LOGDIR OVS_DBDIR=`pwd`; export OVS_DBDIR OVS_SYSCONFDIR=`pwd`; export OVS_SYSCONFDIR - ON_EXIT([kill `cat ovsdb-server.pid ovs-vswitchd.pid`]) dnl Create database. touch .conf.db.~lock~ @@ -57,6 +56,7 @@ m4_define([_OVS_VSWITCHD_START], dnl Start ovsdb-server. AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --log-file --remote=punix:$OVS_RUNDIR/db.sock], [0], [], [stderr]) + ON_EXIT_UNQUOTED([kill `cat ovsdb-server.pid`]) AT_CHECK([[sed < stderr ' /vlog|INFO|opened log file/d /ovsdb_server|INFO|ovsdb-server (Open vSwitch)/d']]) @@ -68,6 +68,7 @@ m4_define([_OVS_VSWITCHD_START], dnl Start ovs-vswitchd. AT_CHECK([ovs-vswitchd $1 --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif], [0], [], [stderr]) AT_CAPTURE_FILE([ovs-vswitchd.log]) + ON_EXIT_UNQUOTED([kill `cat ovs-vswitchd.pid`]) AT_CHECK([[sed < stderr ' /ovs_numa|INFO|Discovered /d /vlog|INFO|opened log file/d diff --git a/tests/ofproto.at b/tests/ofproto.at index 5ae313998ad..f4e5321d032 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -313,6 +313,11 @@ AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn dump-groups br0], [0], [stdout]) AT_CHECK([STRIP_XIDS stdout], [0], [dnl OFPST_GROUP_DESC reply (OF1.1): ]) + +# Negative test. 
+AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn del-groups br0 group_id=0xfffffff0], + [1], [], [ovs-ofctl: invalid group id 4294967280 +]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -454,6 +459,14 @@ AT_CHECK([STRIP_XIDS stdout], [0], [dnl OFPST_GROUP_DESC reply (OF1.5): group_id=1234,type=all,bucket=bucket_id:0,actions=output:0,bucket=bucket_id:1,actions=output:1,bucket=bucket_id:10,actions=output:10,bucket=bucket_id:11,actions=output:11,bucket=bucket_id:12,actions=output:12,bucket=bucket_id:13,actions=output:13,bucket=bucket_id:14,actions=output:14,bucket=bucket_id:15,actions=output:15,bucket=bucket_id:20,actions=output:20,bucket=bucket_id:21,actions=output:21 ]) + +# Negative tests. +AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn insert-buckets br0 group_id=1234,command_bucket_id=0xffffff01,bucket=bucket_id:0,actions=output:0,bucket=bucket_id:1,actions=output:1], [1], [], + [ovs-ofctl: invalid command bucket id 4294967041 +]) +AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn insert-buckets br0 group_id=1234,command_bucket_id=first,bucket=bucket_id:0,actions=output:0,bucket=bucket_id:1,actions=output:1], [1], [], + [ovs-ofctl: insert-bucket needs OpenFlow 1.5 or later ('-O OpenFlow15') +]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -507,6 +520,10 @@ OFPT_ERROR (OF1.5): OFPGMFC_UNKNOWN_BUCKET OFPT_GROUP_MOD (OF1.5): REMOVE_BUCKET command_bucket_id:1,group_id=1234 ]) +# Negative test. 
+AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn remove-buckets br0 group_id=1234,command_bucket_id=last], [1], [], + [ovs-ofctl: remove-bucket needs OpenFlow 1.5 or later ('-O OpenFlow15') +]) OVS_VSWITCHD_STOP AT_CLEANUP diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 14edba37339..c583c3d4077 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -55,12 +55,16 @@ if test "$IS_WIN32" = "yes"; then -[1-9]*) shift for i in $*; do - taskkill //F //PID $i >/dev/null + if tasklist //fi "PID eq $i" | grep $i >/dev/null; then + tskill $i + fi done ;; [1-9][0-9]*) for i in $*; do - taskkill //F //PID $i >/dev/null + if tasklist //fi "PID eq $i" | grep $i >/dev/null; then + tskill $i + fi done ;; esac @@ -86,15 +90,28 @@ m4_define([OVS_APP_EXIT_AND_WAIT], [ovs-appctl -t $1 exit OVS_WAIT_WHILE([test -e $1.pid])]) +m4_define([ON_EXIT__], [trap '. ./cleanup' 0; cat - cleanup << $2 > __cleanup +$1 +EOF +mv __cleanup cleanup +]) + dnl ON_EXIT([COMMANDS]) +dnl ON_EXIT_UNQUOTED([COMMANDS]) dnl -dnl Adds the shell COMMANDS to a collection executed when the current test +dnl Add the shell COMMANDS to a collection executed when the current test dnl completes, as a cleanup action. (The most common use is to kill a dnl daemon started by the test. This is important to prevent tests that dnl start daemons from hanging at exit.) -dnl The commands will be added will be tht first one to excute. -m4_define([ON_EXIT], [trap '. ./cleanup' 0; cat - cleanup << 'EOF' > __cleanup -$1 -EOF -mv __cleanup cleanup -]) +dnl +dnl The only difference between ON_EXIT and ON_EXIT_UNQUOTED is that only the +dnl latter performs shell variable (e.g. $var) substitution, command +dnl substitution (e.g. `command`), and backslash escaping (e.g. \\ becomes \) +dnl in COMMANDS at the time that ON_EXIT_UNQUOTED is encountered. ON_EXIT, +dnl in contrast, copies the literal COMMANDS and only executes shell expansion +dnl at cleanup time. 
+dnl +dnl Cleanup commands are executed in the reverse order of execution of +dnl these macros. +m4_define([ON_EXIT], [ON_EXIT__([$1], ['EOF'])]) +m4_define([ON_EXIT_UNQUOTED], [ON_EXIT__([$1], [EOF])]) diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 57642be68d5..4ec342d3346 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -497,6 +497,23 @@ OVSDB_CHECK_IDL_PY([getattr idl, insert ops], 003: done ]]) +OVSDB_CHECK_IDL_PY([row-from-json idl, whats this], + [['["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1}}, + {"op": "insert", + "table": "simple", + "row": {}}]']], + [['notifytest insert 2, notifytest set 1 b 1, notifytest delete 0']], + [[000: i=0 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +000: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +001: commit, status=success, events=create|2|None, delete|0|None, update|1|b +002: i=1 r=0 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +002: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +003: done +]]) + AT_SETUP([idl handling of missing tables and columns - C]) AT_KEYWORDS([ovsdb server idl positive]) OVS_RUNDIR=`pwd`; export OVS_RUNDIR diff --git a/tests/run-ryu b/tests/run-ryu index c14d91e1bf2..58ee781e4d9 100755 --- a/tests/run-ryu +++ b/tests/run-ryu @@ -100,7 +100,7 @@ EOF logfile=$sandbox/`echo $app | sed 's,/,.,g'`.log logs="$logs $logfile" - ryu-manager "$app" --log-file="$logfile" & pid=$! + ryu-manager --ofp-tcp-listen-port=6653 "$app" --log-file="$logfile" & pid=$! 
echo $pid > "$sandbox/ryu.pid" i=0 while sleep 1; do @@ -228,11 +228,27 @@ def idltest_find_simple(idl, i): def idl_set(idl, commands, step): txn = ovs.db.idl.Transaction(idl) increment = False + events = [] for command in commands.split(','): words = command.split() name = words[0] args = words[1:] + if name == "notifytest": + name = args[0] + args = args[1:] + old_notify = idl.notify + + def notify(event, row, updates=None): + if updates: + upcol = updates._data.keys()[0] + else: + upcol = None + events.append("%s|%s|%s" % (event, row.i, upcol)) + idl.notify = old_notify + + idl.notify = notify + if name == "set": if len(args) != 3: sys.stderr.write('"set" command requires 3 arguments\n') @@ -338,6 +354,10 @@ def idl_set(idl, commands, step): % (step, ovs.db.idl.Transaction.status_to_string(status))) if increment and status == ovs.db.idl.Transaction.SUCCESS: sys.stdout.write(", increment=%d" % txn.get_increment_new_value()) + if events: + # Event notifications from operations in a single transaction are + # not in a guaranteed order due to update messages being dicts + sys.stdout.write(", events=" + ", ".join(sorted(events))) sys.stdout.write("\n") sys.stdout.flush() diff --git a/tests/testsuite.at b/tests/testsuite.at index 80bbcd59b70..92b788b9fda 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -53,6 +53,7 @@ m4_include([tests/reconnect.at]) m4_include([tests/ovs-vswitchd.at]) m4_include([tests/ofproto.at]) m4_include([tests/dpif-netdev.at]) +m4_include([tests/dpctl.at]) m4_include([tests/ofproto-dpif.at]) m4_include([tests/bridge.at]) m4_include([tests/vlan-splinters.at]) diff --git a/tutorial/Tutorial.md b/tutorial/Tutorial.md index 1d8bbf5eca1..1a38e4fed81 100644 --- a/tutorial/Tutorial.md +++ b/tutorial/Tutorial.md @@ -118,6 +118,11 @@ This option can be handy for setting break points before
ovs-vswitchd runs, or for catching early segfaults. Similarly, a '-d' option can be used to run ovsdb-server under GDB. Both options can be specified at the same time. +In addition, a '-e' option also launches ovs-vswitchd under GDB. However, +instead of displaying a 'gdb>' prompt and waiting for user input, ovs-vswitchd +will start to execute immediately. '-r' option is the corresponding option +for running ovsdb-server under gdb with immediate execution. + To avoid GDB mangling with the sandbox sub shell terminal, 'ovs-sandbox' starts a new xterm to run each GDB session. For systems that do not support X windows, GDB support is effectively disabled. diff --git a/tutorial/ovs-sandbox b/tutorial/ovs-sandbox index e8339f264bf..02145de4456 100755 --- a/tutorial/ovs-sandbox +++ b/tutorial/ovs-sandbox @@ -28,7 +28,10 @@ run_xterm() { rungdb() { under_gdb=$1 + gdb_run=$2 shift + shift + # Remove the --detach and to put the process under gdb control. # Also remove --vconsole:off to allow error message to show up # on the console. 
@@ -36,7 +39,13 @@ rungdb() { if $under_gdb && [ "$DISPLAY" ]; then args=`echo $@ |sed s/--detach//g | sed s/--vconsole:off//g` xterm_title=$1 - run_xterm $xterm_title gdb -ex run --args $args + + gdb_cmd="" + if $gdb_run; then + gdb_cmd="-ex run" + fi + + run_xterm $xterm_title gdb $gdb_cmd --args $args else run $@ fi @@ -44,6 +53,8 @@ rungdb() { gdb_vswitchd=false gdb_ovsdb=false +gdb_vswitchd_ex=false +gdb_ovsdb_ex=false gdb_ovn_northd=false gdb_ovn_controller=false builddir= @@ -134,9 +145,19 @@ EOF ;; -g|--gdb-v*) gdb_vswitchd=true + gdb_vswitchd_ex=false + ;; + -e|--gdb-ex-v*) + gdb_vswitchd=true + gdb_vswitchd_ex=true ;; -d|--gdb-ovsdb) gdb_ovsdb=true + gdb_ovsdb_ex=false + ;; + -r|--gdb-ex-o*) + gdb_ovsdb=true + gdb_ovsdb_ex=true ;; --gdb-ovn-northd) gdb_ovn_northd=true @@ -276,14 +297,26 @@ if $ovn; then run ovsdb-tool create ovnnb.db "$ovnnb_schema" ovsdb_server_args="ovnsb.db ovnnb.db conf.db" fi -rungdb $gdb_ovsdb ovsdb-server --detach --no-chdir --pidfile -vconsole:off --log-file \ +rungdb $gdb_ovsdb $gdb_ovsdb_ex ovsdb-server --detach --no-chdir --pidfile -vconsole:off --log-file \ --remote=punix:"$sandbox"/db.sock $ovsdb_server_args +#Add a small delay to allow ovsdb-server to launch. +sleep 0.1 + +#Wait for ovsdb-server to finish launching. +if test ! -e "$sandbox"/db.sock; then + echo -n "Waiting for ovsdb-server to start..." + while test ! -e "$sandbox"/db.sock; do + sleep 1; + done + echo " Done" +fi + # Initialize database. run ovs-vsctl --no-wait -- init # Start ovs-vswitchd. 
-rungdb $gdb_vswitchd ovs-vswitchd --detach --no-chdir --pidfile -vconsole:off --log-file \ +rungdb $gdb_vswitchd $gdb_vswitchd_ex ovs-vswitchd --detach --no-chdir --pidfile -vconsole:off --log-file \ --enable-dummy=override -vvconn -vnetdev_dummy if $ovn; then diff --git a/utilities/ovs-ctl.8 b/utilities/ovs-ctl.8 index 2720d8c8ec0..c08c7db4e09 100644 --- a/utilities/ovs-ctl.8 +++ b/utilities/ovs-ctl.8 @@ -22,6 +22,9 @@ ovs\-ctl \- OVS startup helper script .br \fBovs\-ctl stop .br +\fBovs\-ctl\fR \fB\-\-system\-id=random\fR|\fIuuid\fR +[\fIoptions\fR] \fBrestart +.br \fBovs\-ctl status .br \fBovs\-ctl version diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in index 6d2e9388a42..97716e9ff48 100755 --- a/utilities/ovs-ctl.in +++ b/utilities/ovs-ctl.in @@ -30,6 +30,19 @@ done ## start ## ## ----- ## +# Keep track of removed vports so we can reload them if needed +removed_vports="" + +insert_mods () { + # Try loading openvswitch again. + action "Inserting openvswitch module" modprobe openvswitch + + for vport in $removed_vports; do + # Don't treat failures to load vports as fatal error + action "Inserting $vport module" modprobe $vport || true + done +} + insert_mod_if_required () { # If this kernel has no module support, expect we're done. if test ! -e /proc/modules @@ -43,7 +56,7 @@ insert_mod_if_required () { return 0 # Load openvswitch. If that's successful then we're done. - action "Inserting openvswitch module" modprobe openvswitch && return 0 + insert_mods && return 0 # If the bridge module is loaded, then that might be blocking # openvswitch. Try to unload it, if there are no bridges. @@ -56,7 +69,7 @@ insert_mod_if_required () { action "removing bridge module" rmmod bridge || return 1 # Try loading openvswitch again. 
- action "Inserting openvswitch module" modprobe openvswitch + insert_mods } ovs_vsctl () { @@ -388,6 +401,13 @@ force_reload_kmod () { action "Removing datapath: $dp" ovs-dpctl del-dp "$dp" done + for vport in `awk '/^vport_/ { print $1 }' /proc/modules`; do + action "Removing $vport module" rmmod $vport + if ! grep -q $vport /proc/modules; then + removed_vports="$removed_vports $vport" + fi + done + # try both old and new names in case this is post upgrade if test -e /sys/module/openvswitch_mod; then action "Removing openvswitch module" rmmod openvswitch_mod diff --git a/utilities/ovs-dev.py b/utilities/ovs-dev.py index 9467df5211e..8128b08f9bf 100755 --- a/utilities/ovs-dev.py +++ b/utilities/ovs-dev.py @@ -280,6 +280,7 @@ def modinst(): _sh("modprobe", "openvswitch") _sh("dmesg | grep openvswitch | tail -1") + _sh("find /lib/modules/%s/ -iname vport-*.ko -exec insmod '{}' \;" % uname()) commands.append(modinst) @@ -316,6 +317,11 @@ def doc(): # Install the kernel module sudo insmod %(ovs)s/datapath/linux/openvswitch.ko + # If needed, manually load all required vport modules: + sudo insmod %(ovs)s/datapath/linux/vport-vxlan.ko + sudo insmod %(ovs)s/datapath/linux/vport-geneve.ko + [...] + # Run the switch. %(v)s run diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index ea3337b9301..c667aa4b5dc 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -2374,9 +2374,11 @@ Drop packets exceeding the band's rate limit. .IP \fBrate=\fIvalue\fR The relative rate limit for this band, in kilobits per second or packets per second, depending on the meter flags defined above. -.IP \fBburst_size=\fIport\fR -The maximum burst allowed for the band. If unspecified, the switch is free to -select some reasonable value depending on it's configuration. +.IP \fBburst_size=\fIsize\fR +The maximum burst allowed for the band. If \fBpktps\fR is specified, +then \fIsize\fR is a packet count, otherwise it is in kilobits. 
If +unspecified, the switch is free to select some reasonable value +depending on its configuration. .RE . .SH OPTIONS diff --git a/utilities/ovs-vsctl.8.in b/utilities/ovs-vsctl.8.in index 0a629f68ede..785857d95d4 100644 --- a/utilities/ovs-vsctl.8.in +++ b/utilities/ovs-vsctl.8.in @@ -282,7 +282,9 @@ is an error. With \fB\-\-may\-exist\fR, this command does nothing if .IP "[\fB\-\-fake\-iface\fR] \fBadd\-bond \fIbridge port iface\fR\&... [\fIcolumn\fR[\fB:\fIkey\fR]\fR=\fIvalue\fR]\&...\fR" Creates on \fIbridge\fR a new port named \fIport\fR that bonds together the network devices given as each \fIiface\fR. At least two -interfaces must be named. +interfaces must be named. If the interfaces are DPDK enabled then +the transaction will need to include operations to explicitly set the +interface type to 'dpdk'. .IP Optional arguments set values of column in the Port record created by the command. The syntax is the same as that for the \fBset\fR command diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index ea7d78838bd..d48cf7ff11e 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -217,16 +217,6 @@ static long long int stats_timer = LLONG_MIN; #define AA_REFRESH_INTERVAL (1000) /* In milliseconds. */ static long long int aa_refresh_timer = LLONG_MIN; -/* In some datapaths, creating and destroying OpenFlow ports can be extremely - * expensive. This can cause bridge_reconfigure() to take a long time during - * which no other work can be done. To deal with this problem, we limit port - * adds and deletions to a window of OFP_PORT_ACTION_WINDOW milliseconds per - * call to bridge_reconfigure(). If there is more work to do after the limit - * is reached, 'need_reconfigure', is flagged and it's done on the next loop. - * This allows the rest of the code to catch up on important things like - * forwarding packets. 
*/ -#define OFP_PORT_ACTION_WINDOW 10 - static void add_del_bridges(const struct ovsrec_open_vswitch *); static void bridge_run__(void); static void bridge_create(const struct ovsrec_bridge *); @@ -509,7 +499,8 @@ bridge_exit(void) * should not be and in fact is not directly involved in that. But * ovs-vswitchd needs to make sure that ovsdb-server can reach the managers, so * it has to tell in-band control where the managers are to enable that. - * (Thus, only managers connected in-band are collected.) + * (Thus, only managers connected in-band and with non-loopback addresses + * are collected.) */ static void collect_in_band_managers(const struct ovsrec_open_vswitch *ovs_cfg, @@ -545,9 +536,11 @@ collect_in_band_managers(const struct ovsrec_open_vswitch *ovs_cfg, struct sockaddr_in in; } sa; + /* Ignore loopback. */ if (stream_parse_target_with_default_port(target, OVSDB_PORT, &sa.ss) - && sa.ss.ss_family == AF_INET) { + && sa.ss.ss_family == AF_INET + && sa.in.sin_addr.s_addr != htonl(INADDR_LOOPBACK)) { managers[n_managers++] = sa.in; } } @@ -581,10 +574,6 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0), smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0)); - if (ovs_cfg) { - discover_types(ovs_cfg); - } - /* Destroy "struct bridge"s, "struct port"s, and "struct iface"s according * to 'ovs_cfg', with only very minimal configuration otherwise. 
* @@ -3920,12 +3909,12 @@ static void bridge_aa_refresh_queued(struct bridge *br) { struct ovs_list *list = xmalloc(sizeof *list); - struct bridge_aa_vlan *node; + struct bridge_aa_vlan *node, *next; list_init(list); ofproto_aa_vlan_get_queued(br->ofproto, list); - LIST_FOR_EACH(node, list_node, list) { + LIST_FOR_EACH_SAFE (node, next, list_node, list) { struct port *port; VLOG_INFO("ifname=%s, vlan=%u, oper=%u", node->port_name, node->vlan, diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in index 7f165eae886..b9eb0046121 100644 --- a/vswitchd/ovs-vswitchd.8.in +++ b/vswitchd/ovs-vswitchd.8.in @@ -239,6 +239,24 @@ type). .. .so lib/dpctl.man . +.SS "DPIF-NETDEV COMMANDS" +These commands are used to expose internal information (mostly statistics) +about the ``dpif-netdev'' userspace datapath. If there is only one datapath +(as is often the case, unless \fBdpctl/\fR commands are used), the \fIdp\fR +argument can be omitted. +.IP "\fBdpif-netdev/pmd-stats-show\fR [\fIdp\fR]" +Shows performance statistics for each pmd thread of the datapath \fIdp\fR. +The special thread ``main'' sums up the statistics of every non pmd thread. +The sum of ``emc hits'', ``masked hits'' and ``miss'' is the number of +packets received by the datapath. Cycles are counted using the TSC or similar +facilities (when available on the platform). To reset these counters use +\fBdpif-netdev/pmd-stats-clear\fR. The duration of one cycle depends on the +measuring infrastructure. +.IP "\fBdpif-netdev/pmd-stats-clear\fR [\fIdp\fR]" +Resets to zero the per pmd thread performance numbers shown by the +\fBdpif-netdev/pmd-stats-show\fR command. It will NOT reset datapath or +bridge statistics, only the values shown by the above command. +. 
.so ofproto/ofproto-dpif-unixctl.man .so ofproto/ofproto-unixctl.man .so lib/vlog-unixctl.man diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index 44d44f043b6..a1b33dad929 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -180,7 +180,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) usage(); case 'V': - ovs_print_version(OFP10_VERSION, OFP10_VERSION); + ovs_print_version(0, 0); exit(EXIT_SUCCESS); case OPT_MLOCKALL: diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 3256ce0a435..79b5606c7fa 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -1892,6 +1892,25 @@

+
stt
+
+ The Stateless TCP Tunnel (STT) is particularly useful when tunnel + endpoints are in end-systems, as it utilizes the capabilities of + standard network interface cards to improve performance. STT utilizes + a TCP-like header inside the IP header. It is stateless, i.e., there is + no TCP connection state of any kind associated with the tunnel. The + TCP-like header is used to leverage the capabilities of existing + network interface cards, but should not be interpreted as implying + any sort of connection state between endpoints. + Since the STT protocol does not engage in the usual TCP 3-way handshake, + so it will have difficulty traversing stateful firewalls. + The protocol is documented at + http://www.ietf.org/archive/id/draft-davie-stt-06.txt + + All traffic uses a default destination port of 7471. STT is only + available in kernel datapath on kernel 3.5 or newer. +
+
patch
A pair of virtual devices that act as a patch cable. @@ -1909,7 +1928,7 @@ These options apply to interfaces with of geneve, gre, ipsec_gre, gre64, ipsec_gre64, vxlan, - and lisp. + lisp and stt.

@@ -1998,8 +2017,8 @@

  • A positive 24-bit (for Geneve, VXLAN, and LISP), 32-bit (for GRE) - or 64-bit (for GRE64) number. The tunnel receives only packets - with the specified key. + or 64-bit (for GRE64 and STT) number. The tunnel receives only + packets with the specified key.
  • The word flow. The tunnel accepts packets with any @@ -2025,8 +2044,8 @@
  • A positive 24-bit (for Geneve, VXLAN and LISP), 32-bit (for GRE) or - 64-bit (for GRE64) number. Packets sent through the tunnel will - have the specified key. + 64-bit (for GRE64 and STT) number. Packets sent through the tunnel + will have the specified key.
  • The word flow. Packets sent through the tunnel will diff --git a/xenserver/openvswitch-xen.spec.in b/xenserver/openvswitch-xen.spec.in index 2902372e83c..7a0c30f4bec 100644 --- a/xenserver/openvswitch-xen.spec.in +++ b/xenserver/openvswitch-xen.spec.in @@ -18,14 +18,12 @@ # -D "kernel_flavor xen" # -bb /usr/src/redhat/SPECS/openvswitch-xen.spec # -# For XenServer version >= 6.5, replace kernel_flavor with xen_version which -# should be the `uname -r` output. +# For XenServer version >= 6.5, use kernel_uname which should be +# the `uname -r` output. # for example: # # rpmbuild -D "openvswitch_version 2.3.0+build123" -# -D "kernel_name NAME-xen" -# -D "kernel_version 3.10.41-323.380416" -# -D "xen_version 3.10.0+2" +# -D "kernel_uname 3.10.0+2" # -bb /usr/src/redhat/SPECS/openvswitch-xen.spec # # If tests have to be skipped while building, specify the '--without check' @@ -36,6 +34,11 @@ %define openvswitch_version @VERSION@ %endif +%if %{?kernel_uname:1}%{!?kernel_uname:0} +%define kernel_name kernel +%define kernel_version %{kernel_uname} +%endif + %if %{?kernel_name:0}%{!?kernel_name:1} %define kernel %(rpm -qa 'kernel*xen-devel' | head -1) %define kernel_name %(rpm -q --queryformat "%%{Name}" %{kernel} | sed 's/-devel//' | sed 's/kernel-//') @@ -44,7 +47,7 @@ %endif %if %{?xen_version:0}%{!?xen_version:1} -%define xen_version %{kernel_version}%{kernel_flavor} +%define xen_version %{kernel_version}%{?kernel_flavor:%{kernel_flavor}} %endif # bump this when breaking compatibility with userspace @@ -78,7 +81,12 @@ Summary: Open vSwitch kernel module Group: System Environment/Kernel License: GPLv2 Provides: %{name}-modules%{?kernel_flavor:-%{kernel_flavor}} = %{kernel_version}, openvswitch.ko.%{module_abi_version} +%if %{?kernel_uname:0}%{!?kernel_uname:1} Requires: kernel%{?kernel_flavor:-%{kernel_flavor}} = %{kernel_version} +%endif +%if %{?kernel_uname:1}%{!?kernel_uname:0} +Requires: kernel-uname-r = %{kernel_version} +%endif %description %{module_package} 
Open vSwitch Linux kernel module compiled against kernel version