diff --git a/conf/dpvs.bond.conf.sample b/conf/dpvs.bond.conf.sample index d933f7823..fe0f12b78 100755 --- a/conf/dpvs.bond.conf.sample +++ b/conf/dpvs.bond.conf.sample @@ -13,12 +13,12 @@ ! global config global_defs { log_level WARNING -! log_file /var/log/dpvs.log + ! log_file /var/log/dpvs.log } ! netif config netif_defs { - pktpool_size 2097151 + pktpool_size 1048575 pktpool_cache 256 device dpdk0 { @@ -31,8 +31,13 @@ netif_defs { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk0.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk0.kni } device dpdk1 { @@ -45,8 +50,13 @@ netif_defs { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk1.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk1.kni } @@ -60,8 +70,13 @@ netif_defs { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk2.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk2.kni } device dpdk3 { @@ -74,8 +89,13 @@ netif_defs { queue_number 8 descriptor_number 1024 } - ! promisc_mode - ! kni_name dpdk3.kni + fdir { + mode perfect + pballoc 64k + status matched + } + ! promisc_mode + ! kni_name dpdk3.kni } bonding bond0 { @@ -237,6 +257,7 @@ worker_defs { isol_rxq_ring_sz 1048576 } } + } ! timer config @@ -253,6 +274,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { + forwarding off default_ttl 64 fragment { bucket_number 4096 @@ -262,12 +284,22 @@ ipv4_defs { } } +! dpvs ipv6 config +ipv6_defs { + disable off + forwarding off + route6 { + method "hlist" + recycle_time 10 + } +} + ! control plane config ctrl_defs { lcore_msg { ring_size 4096 multicast_queue_length 256 - sync_msg_timeout_us 2000 + sync_msg_timeout_us 20000 } ipc_msg { unix_domain /var/run/dpvs_ctrl @@ -280,8 +312,9 @@ ipvs_defs { conn_pool_size 2097152 conn_pool_cache 256 conn_init_timeout 3 - ! expire_quiescent_template - ! fast_xmit_close + ! expire_quiescent_template + ! fast_xmit_close + ! redirect off } udp { @@ -313,19 +346,19 @@ ipvs_defs { mss 1452 ttl 63 sack - ! wscale - ! timestamp + ! wscale + ! timestamp } - ! defer_rs_syn + ! defer_rs_syn rs_syn_max_retry 3 ack_storm_thresh 10 max_ack_saved 3 conn_reuse_state { close time_wait - ! fin_wait - ! close_wait - ! last_ack + ! fin_wait + ! close_wait + ! last_ack } } } diff --git a/conf/dpvs.conf.items b/conf/dpvs.conf.items index 165fd016a..5ff7197cb 100755 --- a/conf/dpvs.conf.items +++ b/conf/dpvs.conf.items @@ -11,7 +11,7 @@ ! global config global_defs { - #daemon + #daemon log_level INFO log_file /var/log/dpvs.log } @@ -189,6 +189,7 @@ ipvs_defs { conn_init_timeout 3 <3, 1-31535999> expire_quiescent_template fast_xmit_close + redirect off } udp { @@ -243,4 +244,3 @@ ipvs_defs { sa_pool { pool_hash_size 16 <16, 1-128> } - diff --git a/conf/dpvs.conf.sample b/conf/dpvs.conf.sample index 3b99db0cf..c9e3a9740 100755 --- a/conf/dpvs.conf.sample +++ b/conf/dpvs.conf.sample @@ -227,7 +227,7 @@ neigh_defs { ! dpvs ipv4 config ipv4_defs { - forwarding off ! set this to on, dpvs will forward packets that NOT hit rules directly + forwarding off default_ttl 64 fragment { bucket_number 4096 @@ -242,7 +242,7 @@ ipv6_defs { disable off forwarding off route6 { - method "hlist" + method "hlist" recycle_time 10 } } @@ -267,6 +267,7 @@ ipvs_defs { conn_init_timeout 3 ! expire_quiescent_template ! fast_xmit_close + ! 
redirect off } udp { diff --git a/conf/dpvs.conf.single-bond.sample b/conf/dpvs.conf.single-bond.sample index db95307d3..ef94d0e39 100755 --- a/conf/dpvs.conf.single-bond.sample +++ b/conf/dpvs.conf.single-bond.sample @@ -12,8 +12,8 @@ ! global config global_defs { - log_level WARNING -! log_file /var/log/dpvs.log + log_level WARNING + ! log_file /var/log/dpvs.log } ! netif config @@ -162,6 +162,7 @@ worker_defs { ! isol_rxq_ring_sz 1048576 } } + } ! timer config @@ -216,8 +217,9 @@ ipvs_defs { conn_pool_size 2097152 conn_pool_cache 256 conn_init_timeout 3 - ! expire_quiescent_template - ! fast_xmit_close + ! expire_quiescent_template + ! fast_xmit_close + ! redirect off } udp { diff --git a/conf/dpvs.conf.single-nic.sample b/conf/dpvs.conf.single-nic.sample index da3191967..558a2b060 100755 --- a/conf/dpvs.conf.single-nic.sample +++ b/conf/dpvs.conf.single-nic.sample @@ -192,6 +192,7 @@ ipvs_defs { conn_init_timeout 3 ! expire_quiescent_template ! fast_xmit_close + ! redirect off } udp { diff --git a/doc/TODO.md b/doc/TODO.md index f8a2da07c..0e4c2ef77 100644 --- a/doc/TODO.md +++ b/doc/TODO.md @@ -1,40 +1,28 @@ DPVS TODO list ============== -Short-term ----------- - -* [x] Merge DPDK-17.05.2 -* [x] Basic traffic control -* [x] Neighbour (ARP) refactor -* [x] Tunnel Interface (gre/ipip) -* [x] NAT/Tunnel forwarding mode -* [x] Consistent hashing -* [x] Get real client IP for UDP, like TCP TOA. -* [x] Keepalive.conf support SNAT -* [x] Numa/fdir auto check. -* [ ] SNAT Related - - [ ] Multi-WIPs for schedule (auto switch to new WIP if one fails). - - [ ] Fixed group of WIPs for user, share or not share with other user. - - [ ] White/black list. - - [ ] Throughput and concurrency monitoring. - - [ ] Throughput and/or concurrency limiting. +* [x] IPv6 Support. +* [x] Documents update. +* [ ] NIC without Flow-Director (FDIR) + - [x] Packet redirect to workers. + - [ ]RSS pre-calcuating. +* [ ] Merge lastest DPDK stable +* [ ] SNAT ACL +* [ ] Refactor Keepalived (porting latest stable keepalived) +* [ ] Packet Capture and Tcpdump Support * [ ] Logging - [ ] Packet based logging. - [ ] Session based logging (creation, expire, statistics) -* [ ] CI, Test Automation setup. -* [ ] Performance optimization for 25G/40G NIC. -* [ ] Documents update. - -Long-term ---------- - -* [ ] VM support -* [ ] IP fragment support, for UDP apps. +* [ ] CI, Test Automation Setup. +* [ ] Performance Optimization + - [ ] CPU Performance Tuning + - [ ] Memory Performance Tuning + - [ ] Numa-aware NIC + - [ ] Minimal Running Resource +* [ ] 25G/40G NIC Supports +* [ ] VxLAN Support +* [ ] IPv6 Tunnel Device +* [ ] VM Support +* [ ] IP Fragment Support, for UDP APPs. * [ ] Session Sharing * [ ] ALG (ftp, sip, ...) -* [ ] VxLAN Support -* [ ] NIC without Flow-Director (fdir) - - Packet redirect to workers. - - RSS pre-calcuating. -* [ ] IPv6 Support. diff --git a/doc/tutorial.md b/doc/tutorial.md index 65b5cc670..6e42499a3 100644 --- a/doc/tutorial.md +++ b/doc/tutorial.md @@ -30,7 +30,7 @@ DPVS Tutorial About the concepts of *Full-NAT* (`FNAT`), `DR`, `Tunnel`, `toa`, `OSPF`/`ECMP` and `keepalived`, pls refer [LVS](www.linuxvirtualserver.org) and [Alibaba/LVS](https://github.com/alibaba/LVS/tree/master/docs). -Note `DPVS` support `FNAT`, `DR`, `Tunnel`, `SNAT` forwarding modes, and each mode can be configured as `one-arm` or `two-arm` topology, with or without `OSFP/ECMP`/`keepalived`. There're too many combinations, I cannot list all the examples here. 
Let's just give some popular working models used in our daily work. +Note `DPVS` supports `FNAT`, `DR`, `Tunnel`, `NAT`, `SNAT` forwarding modes, and each mode can be configured as `one-arm` or `two-arm` topology, with or without `OSFP/ECMP`/`keepalived`. There're too many combinations, I cannot list all the examples here. Let's just give some popular working models used in our daily work. @@ -177,13 +177,17 @@ You could refer to following links to get `TOA` source code and porting to your * [Huawai TOA](https://github.com/Huawei/TCP_option_address) * [IPVS CA](https://github.com/yubo/ip_vs_ca) +TOA source code is included into DPVS project(in directory `kmod/toa`) since v1.7 to support IPv6 and NAT64. It is derived from the Alibaba TOA. For IPv6 applications which need client's real IP address, we suggest to use this TOA version. + +Be aware that **application may need some changes** if you are using NAT64. An extra `getsockopt` should be called to obtain the client's real IPv6 address from the IPv4 socket on RS. As an example, we give a [NAT64 patch for nginx-1.14](../kmod/toa/example_nat64/nginx/nginx-1.14.0-nat64-toa.patch). By the way, if you do not need client's real IP address, application needs no changes. + ## Full-NAT with OSPF/ECMP (two-arm) -To work with *OSPF*, the patch in `patch/dpdk-stable-17.05.2/` must be applied to *dpdk-stable-17.05.2* and the correct `rte_kni.ko` should be installed. +To work with *OSPF*, the patch in `patch/dpdk-xxx/` must be applied to the corresponding DPDK source codes and the correct `rte_kni.ko` should be installed. -`DPVS` OSPF-cluster model looks like this, it leverage `OSPF/ECMP` for HA and high-scalability. This model is widely used in practice. +`DPVS` OSPF-cluster model looks like this, it leverages `OSPF/ECMP` for HA and high-scalability. This model is widely used in practice. ![fnat-ospf-two-arm](pics/fnat-ospf-two-arm.png) @@ -646,6 +650,7 @@ Hi, I am 10.140.18.33. client$ curl 192.168.0.89:80 Hi, I am 10.140.18.34. ``` +> Since v1.7.2, a solution is made for multi-lcore NAT mode forwarding. The principle is to redirect the outbound packets to the correct lcore where its session entry reside through a global redirection table and some lockless rings. Of course, it harms performance to some degree. If you want to use it, turn on the config swtich "ipvs_defs/conn/redirect" in /etc/dpvs.conf. @@ -770,7 +775,7 @@ host$ curl www.iqiyi.com # IPv6 Support -DPVS support IPv6 since 1.7-0. You can configure IPv6 fullnat just like IPv4: +DPVS support IPv6-IPv6 since v1.7 which means VIP/client IP/local IP/rs IP can be IPv6. You can configure IPv6 fullnat just like IPv4: ```bash #!/bin/sh - @@ -900,6 +905,61 @@ virtual_server group 2001-1-80 { } ``` +DPVS support IPv6-IPv4 for fullnat, which means VIP/client IP can be IPv6 and local IP/rs IP can be IPv4, you can configure it like this: + +```bash +#!/bin/sh - +# add VIP to WAN interface +./dpip addr add 2001::1/128 dev dpdk1 + +# route for WAN/LAN access +# add routes for other network or default route if needed. +./dpip route -6 add 2001::/64 dev dpdk1 +./dpip route add 10.0.0.0/8 dev dpdk0 + +# add service to forwarding, scheduling mode is RR. +# use ipvsadm --help for more info. 
+./ipvsadm -A -t [2001::1]:80 -s rr + +# add two RS for service, forwarding mode is FNAT (-b) +./ipvsadm -a -t [2001::1]:80 -r 10.0.0.1 -b +./ipvsadm -a -t [2001::1]:80 -r 10.0.0.2 -b + +# add at least one Local-IP (LIP) for FNAT on LAN interface +./ipvsadm --add-laddr -z 10.0.0.3 -t [2001::1]:80 -F dpdk0 +``` +OSPF can just be configured like IPv6-IPv6. If you prefer keepalived, you can configure it like IPv6-IPv6 except real_server/local_address_group. + +**IPv6 and Flow Director** + +We found there exists some NICs do not (fully) support Flow Director for IPv6. +For example, 82599 10GE Controller do not support IPv6 *perfect mode*, and IPv4/IPv6 *signature mode* supports only one locall IP. + +If you would like to use Flow Director signature mode, add the following lines into the device configs of `dpvs.conf`: + +``` +fdir { + mode signature + pballoc 64k + status matched +} +``` + +Another method to avoid Flow Director problem is to use the redirect forwarding, which forwards the recieved packets to the right lcore where the session resides by using lockless DPDK rings. +If you want to try this method, turn on the `redirect` switch in the `dpvs.conf`. + +``` +ipvs_defs { + conn { + ...... + redirect on + } + ...... +} +``` +It should note that the redirect forwarding may harm performance to a certain degree. Keep it in `off` state unless you have no other solutions. + + # Virtual Devices @@ -991,7 +1051,7 @@ To achieve this, 1. The kernel module `uoa.ko` is needed to be installed on `RS`, and 2. the program on `RS` just need a `getsockopt(2)` call to get the real client IP/port. -The example C code for RS to fetch Real Client IP can be found [here](../uoa/example/udp_serv.c). +The example C code for RS to fetch Real Client IP can be found [here](../kmod/uoa/example/udp_serv.c). ```bash rs$ insmod `uoa` diff --git a/include/common.h b/include/common.h index 66f69df2b..bf04cd918 100644 --- a/include/common.h +++ b/include/common.h @@ -30,18 +30,18 @@ #ifndef min #define min(x,y) ({ \ - typeof(x) _x = (x); \ - typeof(y) _y = (y); \ - (void) (&_x == &_y); \ - _x < _y ? _x : _y; }) + typeof(x) _x = (x); \ + typeof(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) #endif #ifndef max #define max(x,y) ({ \ - typeof(x) _x = (x); \ - typeof(y) _y = (y); \ - (void) (&_x == &_y); \ - _x > _y ? _x : _y; }) + typeof(x) _x = (x); \ + typeof(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? 
_x : _y; }) #endif #ifndef min_t diff --git a/include/conf/conn.h b/include/conf/conn.h index c7ffd3511..86da70b66 100644 --- a/include/conf/conn.h +++ b/include/conf/conn.h @@ -59,7 +59,8 @@ struct ip_vs_sockpair { typedef struct ip_vs_sockpair ipvs_sockpair_t; struct ip_vs_conn_entry { - uint16_t af; + uint16_t in_af; + uint16_t out_af; uint16_t proto; union inet_addr caddr; union inet_addr vaddr; diff --git a/include/conf/laddr.h b/include/conf/laddr.h index 4a39ff99c..32097eff4 100644 --- a/include/conf/laddr.h +++ b/include/conf/laddr.h @@ -43,7 +43,7 @@ struct dp_vs_laddr_entry { struct dp_vs_laddr_conf { /* identify service */ - int af; + int af_s; uint8_t proto; union inet_addr vaddr; uint16_t vport; @@ -54,6 +54,7 @@ struct dp_vs_laddr_conf { char oifname[IFNAMSIZ]; /* for set */ + int af_l; union inet_addr laddr; char ifname[IFNAMSIZ]; diff --git a/include/conf/neigh.h b/include/conf/neigh.h index fca935c79..9bb74764a 100644 --- a/include/conf/neigh.h +++ b/include/conf/neigh.h @@ -32,7 +32,7 @@ enum { }; enum { - DPVS_NUD_S_NONE = 0, + DPVS_NUD_S_NONE = 0, DPVS_NUD_S_SEND, DPVS_NUD_S_REACHABLE, DPVS_NUD_S_PROBE, @@ -83,4 +83,4 @@ static inline const char *nud_state_name(int state) #define NEIGHBOUR_HASHED 0x01 #define NEIGHBOUR_STATIC 0x02 -#endif +#endif diff --git a/include/inet.h b/include/inet.h index bdb7a1759..1c1167fba 100644 --- a/include/inet.h +++ b/include/inet.h @@ -98,7 +98,7 @@ static inline const char *inet_proto_name(uint8_t proto) static inline uint32_t inet_addr_fold(int af, const union inet_addr *addr) { - uint32_t addr_fold = 0; + uint32_t addr_fold = 0; if (af == AF_INET) { addr_fold = addr->in.s_addr; @@ -109,7 +109,7 @@ static inline uint32_t inet_addr_fold(int af, const union inet_addr *addr) return 0; } - return addr_fold; + return addr_fold; } /* ip1[-ip2][:port1[-port2]] */ @@ -308,7 +308,7 @@ int INET_HOOK(int af, unsigned int hook, struct rte_mbuf *mbuf, int inet_init(void); int inet_term(void); -bool inet_addr_equal(int af, const union inet_addr *a1, +bool inet_addr_equal(int af, const union inet_addr *a1, const union inet_addr *a2); const char *inet_proto_name(uint8_t proto); @@ -317,7 +317,7 @@ bool inet_is_addr_any(int af, const union inet_addr *addr); int inet_plen_to_mask(int af, uint8_t plen, union inet_addr *mask); -int inet_addr_net(int af, const union inet_addr *addr, +int inet_addr_net(int af, const union inet_addr *addr, const union inet_addr *mask, union inet_addr *net); diff --git a/include/inetaddr.h b/include/inetaddr.h index 4e7fce77b..4d8ba2b43 100644 --- a/include/inetaddr.h +++ b/include/inetaddr.h @@ -80,15 +80,15 @@ struct inet_ifaddr { #define this_sa_pool sa_pools[rte_lcore_id()] }; -int inet_addr_add(int af, const struct netif_port *dev, +int inet_addr_add(int af, const struct netif_port *dev, const union inet_addr *addr, uint8_t plen, - const union inet_addr *bcast, + const union inet_addr *bcast, uint32_t valid_lft, uint32_t prefered_lft, uint8_t scope, uint32_t flags); -int inet_addr_mod(int af, const struct netif_port *dev, +int inet_addr_mod(int af, const struct netif_port *dev, const union inet_addr *addr, uint8_t plen, - const union inet_addr *bcast, + const union inet_addr *bcast, uint32_t valid_lft, uint32_t prefered_lft, uint8_t scope); @@ -99,8 +99,8 @@ int inet_addr_flush(int af, struct netif_port *dev); struct netif_port *inet_addr_get_iface(int af, union inet_addr *addr); -void inet_addr_select(int af, const struct netif_port *dev, - const union inet_addr *dst, int scope, +void inet_addr_select(int af, const 
struct netif_port *dev, + const union inet_addr *dst, int scope, union inet_addr *addr); struct inet_ifaddr *inet_addr_ifa_get(int af, const struct netif_port *dev, diff --git a/include/ip_tunnel.h b/include/ip_tunnel.h index 4e6e39479..e3b9c5f85 100644 --- a/include/ip_tunnel.h +++ b/include/ip_tunnel.h @@ -70,7 +70,7 @@ struct ip_tunnel_param { __be16 o_flags; __be32 i_key; __be32 o_key; - struct iphdr iph; + struct iphdr iph; } __attribute__((__packed__)); #if defined(__DPVS__) diff --git a/include/ipv4.h b/include/ipv4.h index 40f4aeab3..d708dea64 100644 --- a/include/ipv4.h +++ b/include/ipv4.h @@ -31,10 +31,10 @@ int ipv4_term(void); void ipv4_keyword_value_init(void); void install_ipv4_keywords(void); -/* +/* * Output */ -/* 'flow4.daddr' & 'flow4.proto' is mandatory +/* 'flow4.daddr' & 'flow4.proto' is mandatory * while others are not. '0/NULL' for wildcard. */ int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4); @@ -45,14 +45,14 @@ int ipv4_output(struct rte_mbuf *mbuf); * Transport Protocols */ struct inet_protocol { - /* mbuf->userdata can be used to get IPv4 header, + /* mbuf->userdata can be used to get IPv4 header, * save it if protocols need ->userdata for other purpose. */ int (*handler)(struct rte_mbuf *mbuf); }; -int ipv4_register_protocol(struct inet_protocol *prot, +int ipv4_register_protocol(struct inet_protocol *prot, unsigned char protocol); -int ipv4_unregister_protocol(struct inet_protocol *prot, +int ipv4_unregister_protocol(struct inet_protocol *prot, unsigned char protocol); enum { @@ -144,4 +144,56 @@ static inline bool ip4_is_frag(struct ipv4_hdr *iph) & htons(IPV4_HDR_MF_FLAG | IPV4_HDR_OFFSET_MASK)) != 0; } +/* + * Process the pseudo-header checksum of an IPv4 header. + * + * Different from "rte_ipv4_phdr_cksum", "ip4_phdr_cksum" allows for ipv4 options. + * The checksum field must be set to 0 by the caller. + * + * @param iph + * The pointer to the contiguous IPv4 header. + * @param ol_flags + * The ol_flags of the associated mbuf. + * @return + * The non-complemented pseudo checksum to set in the L4 header. + */ +static inline uint16_t ip4_phdr_cksum(struct ipv4_hdr *iph, uint64_t ol_flags) +{ + uint16_t csum; + uint16_t total_length = iph->total_length; + + iph->total_length = htons(ntohs(total_length) - + ((iph->version_ihl & 0xf) << 2) + sizeof(struct ipv4_hdr)); + csum = rte_ipv4_phdr_cksum(iph, ol_flags); + + iph->total_length = total_length; + return csum; +} + +/* + * Process the IPv4 UDP or TCP checksum. + * + * Different from "rte_ipv4_udptcp_cksum", "ip4_udptcp_cksum" allows for ipv4 options. + * The IP and layer 4 checksum must be set to 0 in the packet by the caller. + * + * @param iph + * The pointer to the contiguous IPv4 header. + * @param l4_hdr + * The pointer to the beginning of the L4 header. + * @return + * The complemented checksum to set in the L4 header. 
+ */ +static inline uint16_t ip4_udptcp_cksum(struct ipv4_hdr *iph, const void *l4_hdr) +{ + uint16_t csum; + uint16_t total_length = iph->total_length; + + iph->total_length = htons(ntohs(total_length) - + ((iph->version_ihl & 0xf) << 2) + sizeof(struct ipv4_hdr)); + csum = rte_ipv4_udptcp_cksum(iph, l4_hdr); + + iph->total_length = total_length; + return csum; +} + #endif /* __DPVS_IPV4_H__ */ diff --git a/include/ipv4_frag.h b/include/ipv4_frag.h index 70cc69cae..1cfd49593 100644 --- a/include/ipv4_frag.h +++ b/include/ipv4_frag.h @@ -24,7 +24,7 @@ int ipv4_frag_init(void); int ipv4_frag_term(void); int ipv4_reassamble(struct rte_mbuf *mbuf); int ipv4_fragment(struct rte_mbuf *mbuf, unsigned int mtu, - int (*output)(struct rte_mbuf *)); + int (*output)(struct rte_mbuf *)); void ip4_frag_keyword_value_init(void); void install_ip4_frag_keywords(void); diff --git a/include/ipv6.h b/include/ipv6.h index 7313c6d18..e7760c24f 100644 --- a/include/ipv6.h +++ b/include/ipv6.h @@ -119,8 +119,8 @@ int ip6_hdrlen(const struct rte_mbuf *mbuf); * The non-complemented checksum to set in the L4 header. */ uint16_t ip6_phdr_cksum(struct ip6_hdr*, uint64_t ol_flags, - uint32_t exthdrlen, uint8_t l4_proto); + uint32_t exthdrlen, uint8_t l4_proto); uint16_t ip6_udptcp_cksum(struct ip6_hdr*, const void *l4_hdr, - uint32_t exthdrlen, uint8_t l4_proto); + uint32_t exthdrlen, uint8_t l4_proto); #endif /* __DPVS_IPV6_H__ */ diff --git a/include/ipvs/blklst.h b/include/ipvs/blklst.h index 30a02ff09..96d035e55 100644 --- a/include/ipvs/blklst.h +++ b/include/ipvs/blklst.h @@ -25,7 +25,7 @@ struct blklst_entry { struct list_head list; union inet_addr vaddr; uint16_t vport; - uint8_t proto; + uint8_t proto; union inet_addr blklst; }; diff --git a/include/ipvs/conn.h b/include/ipvs/conn.h index 28db75daf..dab8ef9b9 100644 --- a/include/ipvs/conn.h +++ b/include/ipvs/conn.h @@ -27,6 +27,7 @@ #include "ipvs/conn.h" #include "ipvs/proto.h" #include "ipvs/service.h" +#include "ipvs/redirect.h" enum { DPVS_CONN_DIR_INBOUND = 0, @@ -35,11 +36,12 @@ enum { }; enum { - DPVS_CONN_F_HASHED = 0x0040, - DPVS_CONN_F_INACTIVE = 0x0100, - DPVS_CONN_F_SYNPROXY = 0x8000, - DPVS_CONN_F_TEMPLATE = 0x1000, - DPVS_CONN_F_NOFASTXMIT = 0x2000, + DPVS_CONN_F_HASHED = 0x0040, + DPVS_CONN_F_REDIRECT_HASHED = 0x0080, + DPVS_CONN_F_INACTIVE = 0x0100, + DPVS_CONN_F_SYNPROXY = 0x8000, + DPVS_CONN_F_TEMPLATE = 0x1000, + DPVS_CONN_F_NOFASTXMIT = 0x2000, }; struct dp_vs_conn_param { @@ -147,6 +149,9 @@ struct dp_vs_conn { struct dp_vs_conn *control; /* master who controlls me */ rte_atomic32_t n_control; /* number of connections controlled by me*/ uint64_t ctime; /* create time */ + + /* connection redirect in fnat/snat/nat modes */ + struct dp_vs_redirect *redirect; } __rte_cache_aligned; /* for syn-proxy to save all ack packet in conn before rs's syn-ack arrives */ @@ -162,8 +167,8 @@ struct dp_vs_synproxy_ack_pakcet { int dp_vs_conn_init(void); int dp_vs_conn_term(void); -struct dp_vs_conn * -dp_vs_conn_new(struct rte_mbuf *mbuf, +struct dp_vs_conn * +dp_vs_conn_new(struct rte_mbuf *mbuf, const struct dp_vs_iphdr *iph, struct dp_vs_conn_param *param, struct dp_vs_dest *dest, @@ -171,9 +176,9 @@ dp_vs_conn_new(struct rte_mbuf *mbuf, int dp_vs_conn_del(struct dp_vs_conn *conn); struct dp_vs_conn * -dp_vs_conn_get(int af, uint16_t proto, - const union inet_addr *saddr, - const union inet_addr *daddr, +dp_vs_conn_get(int af, uint16_t proto, + const union inet_addr *saddr, + const union inet_addr *daddr, uint16_t sport, uint16_t dport, int *dir, 
bool reverse); @@ -190,8 +195,8 @@ void dp_vs_conn_put_no_reset(struct dp_vs_conn *conn); void ipvs_conn_keyword_value_init(void); void install_ipvs_conn_keywords(void); -static inline void dp_vs_conn_fill_param(int af, uint8_t proto, - const union inet_addr *caddr, const union inet_addr *vaddr, +static inline void dp_vs_conn_fill_param(int af, uint8_t proto, + const union inet_addr *caddr, const union inet_addr *vaddr, uint16_t cport, uint16_t vport, uint16_t ct_dport, struct dp_vs_conn_param *param) { @@ -270,4 +275,31 @@ static inline void dp_vs_control_add(struct dp_vs_conn *conn, struct dp_vs_conn rte_atomic32_inc(&ctl_conn->n_control); } +static inline bool +dp_vs_conn_is_redirect_hashed(struct dp_vs_conn *conn) +{ + return (conn->flags & DPVS_CONN_F_REDIRECT_HASHED) ? true : false; +} + +static inline void +dp_vs_conn_set_redirect_hashed(struct dp_vs_conn *conn) +{ + conn->flags |= DPVS_CONN_F_REDIRECT_HASHED; +} + +static inline void +dp_vs_conn_clear_redirect_hashed(struct dp_vs_conn *conn) +{ + conn->flags &= ~DPVS_CONN_F_REDIRECT_HASHED; +} + +inline uint32_t dp_vs_conn_hashkey(int af, + const union inet_addr *saddr, uint16_t sport, + const union inet_addr *daddr, uint16_t dport, + uint32_t mask); +int dp_vs_conn_pool_size(void); +int dp_vs_conn_pool_cache_size(void); + +extern bool dp_vs_redirect_disable; + #endif /* __DPVS_CONN_H__ */ diff --git a/include/ipvs/dest.h b/include/ipvs/dest.h index 94a7f6e0c..d27d041cf 100644 --- a/include/ipvs/dest.h +++ b/include/ipvs/dest.h @@ -78,7 +78,7 @@ struct dp_vs_dest { struct dp_vs_service *svc; /* service it belongs to */ union inet_addr vaddr; /* virtual IP address */ unsigned conn_timeout; /* conn timeout copied from svc*/ - unsigned limit_proportion; /* limit copied from svc*/ + unsigned limit_proportion; /* limit copied from svc*/ } __rte_cache_aligned; #endif @@ -149,16 +149,39 @@ struct dp_vs_dest_user{ }; #ifdef __DPVS__ +static inline bool +dp_vs_dest_is_avail(struct dp_vs_dest *dest) +{ + return (dest->flags & DPVS_DEST_F_AVAILABLE) ? true : false; +} + +static inline bool +dp_vs_dest_is_overload(struct dp_vs_dest *dest) +{ + return (dest->flags & DPVS_DEST_F_OVERLOAD) ? true : false; +} + +static inline int16_t +dp_vs_dest_get_weight(struct dp_vs_dest *dest) +{ + return rte_atomic16_read(&dest->weight); +} + +static inline bool +dp_vs_dest_is_valid(struct dp_vs_dest *dest) +{ + return (dest + && dp_vs_dest_is_avail(dest) + && !dp_vs_dest_is_overload(dest) + && dp_vs_dest_get_weight(dest) > 0) ? true : false; +} + int dp_vs_new_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest, struct dp_vs_dest **dest_p); -struct dp_vs_dest *dp_vs_lookup_dest(struct dp_vs_service *svc, +struct dp_vs_dest *dp_vs_lookup_dest(int af, struct dp_vs_service *svc, const union inet_addr *daddr, uint16_t dport); -struct dp_vs_dest *dp_vs_find_dest(int af, const union inet_addr *daddr, - uint16_t dport, const union inet_addr *vaddr, - uint16_t vport, uint16_t protocol); - struct dp_vs_dest *dp_vs_trash_get_dest(struct dp_vs_service *svc, const union inet_addr *daddr, uint16_t dport); diff --git a/include/ipvs/fo.h b/include/ipvs/fo.h new file mode 100644 index 000000000..1b223e7a4 --- /dev/null +++ b/include/ipvs/fo.h @@ -0,0 +1,26 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * yangxingwu , Feb 2019, initial. + * + */ + +#ifndef __DPVS_FO_H__ +#define __DPVS_FO_H__ + +#include "ipvs/service.h" +#include "ipvs/dest.h" +#include "ipvs/sched.h" + +int dp_vs_fo_init(void); +int dp_vs_fo_term(void); + +#endif diff --git a/include/ipvs/nat64.h b/include/ipvs/nat64.h new file mode 100644 index 000000000..c5052e9c6 --- /dev/null +++ b/include/ipvs/nat64.h @@ -0,0 +1,49 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __DPVS_NAT64_H__ +#define __DPVS_NAT64_H__ + +#include "ipv4.h" +#include "ipv6.h" + +static inline int mbuf_nat6to4_len(struct rte_mbuf *mbuf) +{ + int offset = sizeof(struct ip6_hdr); + uint8_t nexthdr = ip6_hdr(mbuf)->ip6_nxt; + int len; + + offset = ip6_skip_exthdr(mbuf, offset, &nexthdr); + len = mbuf->pkt_len - offset + sizeof(struct ipv4_hdr); + + return len; +} + +static inline int mbuf_nat4to6_len(struct rte_mbuf *mbuf) +{ + return (mbuf->pkt_len - ip4_hdrlen(mbuf) + sizeof(struct ip6_hdr)); +} + +int mbuf_6to4(struct rte_mbuf *mbuf, + const struct in_addr *saddr, + const struct in_addr *daddr); + +int mbuf_4to6(struct rte_mbuf *mbuf, + const struct in6_addr *saddr, + const struct in6_addr *daddr); + +#endif /* __DPVS_NAT64_H__ */ diff --git a/include/ipvs/proto.h b/include/ipvs/proto.h index 2a364e3c1..bf3fb0821 100644 --- a/include/ipvs/proto.h +++ b/include/ipvs/proto.h @@ -36,7 +36,7 @@ struct dp_vs_proto { int (*exit)(struct dp_vs_proto *proto); /* schedule RS and create new conn */ - int (*conn_sched)(struct dp_vs_proto *proto, + int (*conn_sched)(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, struct rte_mbuf *mbuf, struct dp_vs_conn **conn, @@ -45,12 +45,12 @@ struct dp_vs_proto { /* lookup conn by * return conn and direction or NULL if miss */ struct dp_vs_conn * - (*conn_lookup)(struct dp_vs_proto *proto, + (*conn_lookup)(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, - struct rte_mbuf *mbuf, int *direct, - bool reverse, bool *drop); + struct rte_mbuf *mbuf, int *direct, + bool reverse, bool *drop, lcoreid_t *peer_cid); - int (*conn_expire)(struct dp_vs_proto *proto, + int (*conn_expire)(struct dp_vs_proto *proto, struct dp_vs_conn *conn); /* for NAT mode */ @@ -87,13 +87,13 @@ struct dp_vs_proto { int (*csum_check)(struct dp_vs_proto *proto, int af, struct rte_mbuf *mbuf); int (*dump_packet)(struct dp_vs_proto *proto, int af, - struct rte_mbuf *mbuf, int off, + struct rte_mbuf *mbuf, int off, const char *msg); /* try trans connn's states by packet and direction */ - int (*state_trans)(struct dp_vs_proto *proto, - struct dp_vs_conn *conn, - struct rte_mbuf *mbuf, + int (*state_trans)(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + 
struct rte_mbuf *mbuf, int direct); const char * diff --git a/include/ipvs/proto_udp.h b/include/ipvs/proto_udp.h index e407882e8..8b28c6b71 100644 --- a/include/ipvs/proto_udp.h +++ b/include/ipvs/proto_udp.h @@ -30,7 +30,7 @@ extern int g_defence_udp_drop; void install_proto_udp_keywords(void); void udp_keyword_value_init(void); -void udp4_send_csum(struct ipv4_hdr *iph, struct udphdr *uh); -void udp6_send_csum(struct ipv6_hdr *iph, struct udphdr *uh); +void udp4_send_csum(struct ipv4_hdr *iph, struct udp_hdr *uh); +void udp6_send_csum(struct ipv6_hdr *iph, struct udp_hdr *uh); #endif diff --git a/include/ipvs/redirect.h b/include/ipvs/redirect.h new file mode 100644 index 000000000..fe94dd922 --- /dev/null +++ b/include/ipvs/redirect.h @@ -0,0 +1,61 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef __DPVS_REDIRECT_H__ +#define __DPVS_REDIRECT_H__ +#include "common.h" +#include "list.h" +#include "dpdk.h" +#include "netif.h" +#include "ipvs/conn.h" +#include "ipvs/dest.h" + +/* + * The conneciton redirect tuple is only for the reverse tuple + * (inside -> outside) in nat-mode. + */ +struct dp_vs_redirect { + struct list_head list; + + uint8_t af; + uint8_t proto; + lcoreid_t cid; + uint8_t padding; + + union inet_addr saddr; + union inet_addr daddr; + uint16_t sport; + uint16_t dport; + + struct rte_mempool *redirect_pool; +} __rte_cache_aligned; + +struct dp_vs_redirect *dp_vs_redirect_alloc(enum dpvs_fwd_mode fwdmode); +void dp_vs_redirect_free(struct dp_vs_conn *conn); +void dp_vs_redirect_hash(struct dp_vs_conn *conn); +void dp_vs_redirect_unhash(struct dp_vs_conn *conn); +struct dp_vs_redirect *dp_vs_redirect_get(int af, uint16_t proto, + const union inet_addr *saddr, const union inet_addr *daddr, + uint16_t sport, uint16_t dport); +void dp_vs_redirect_init(struct dp_vs_conn *conn); +int dp_vs_redirect_table_init(void); +int dp_vs_redirect_pkt(struct rte_mbuf *mbuf, lcoreid_t peer_cid); +void dp_vs_redirect_ring_proc(struct netif_queue_conf *qconf, lcoreid_t cid); +int dp_vs_redirects_init(void); +int dp_vs_redirects_term(void); + +#endif /* __DPVS_REDIRECT_H__ */ diff --git a/include/ipvs/sched.h b/include/ipvs/sched.h index 2903d3c8b..a8af48c85 100644 --- a/include/ipvs/sched.h +++ b/include/ipvs/sched.h @@ -20,6 +20,7 @@ #include "list.h" #include "dpdk.h" #include "common.h" +#include "ctrl.h" #include "ipvs/service.h" @@ -29,12 +30,13 @@ struct dp_vs_scheduler { // rte_atomic32_t refcnt; struct dp_vs_dest * - (*schedule)(struct dp_vs_service *svc, + (*schedule)(struct dp_vs_service *svc, const struct rte_mbuf *mbuf); int (*init_service)(struct dp_vs_service *svc); int (*exit_service)(struct dp_vs_service *svc); - int (*update_service)(struct dp_vs_service *svc); + int (*update_service)(struct dp_vs_service *svc, struct dp_vs_dest *dest, + sockoptid_t opt); } __rte_cache_aligned; int dp_vs_sched_init(void); diff --git a/include/ipvs/service.h 
b/include/ipvs/service.h index 34385cb2e..65adb3526 100644 --- a/include/ipvs/service.h +++ b/include/ipvs/service.h @@ -178,12 +178,12 @@ int dp_vs_add_service(struct dp_vs_service_conf *u, int dp_vs_del_service(struct dp_vs_service *svc); -int dp_vs_edit_service(struct dp_vs_service *svc, +int dp_vs_edit_service(struct dp_vs_service *svc, struct dp_vs_service_conf *u); struct dp_vs_service * dp_vs_service_lookup(int af, uint16_t protocol, - const union inet_addr *vaddr, + const union inet_addr *vaddr, uint16_t vport, uint32_t fwmark, const struct rte_mbuf *mbuf, const struct dp_vs_match *match); diff --git a/include/ipvs/xmit.h b/include/ipvs/xmit.h index f23e0f292..4d0b22a23 100644 --- a/include/ipvs/xmit.h +++ b/include/ipvs/xmit.h @@ -29,9 +29,9 @@ int dp_vs_out_xmit_fnat(struct dp_vs_proto *prot, struct dp_vs_conn *conn, struct rte_mbuf *mbuf); -void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, +void dp_vs_xmit_icmp(struct rte_mbuf *mbuf, struct dp_vs_proto *prot, - struct dp_vs_conn *conn, + struct dp_vs_conn *conn, int dir); int dp_vs_xmit_dr(struct dp_vs_proto *proto, diff --git a/include/linux_ipv6.h b/include/linux_ipv6.h index 3c42fbc6f..39a495ebd 100644 --- a/include/linux_ipv6.h +++ b/include/linux_ipv6.h @@ -15,8 +15,8 @@ * linux:include/net/ipv6.h * linux:net/ipv6/addrconf_core.c * - * Authors: - * Pedro Roque + * Authors: + * Pedro Roque */ #ifndef __LINUX_IPV6_H__ #define __LINUX_IPV6_H__ @@ -28,26 +28,26 @@ #include "inetaddr.h" #endif -#define IPV6_MAXPLEN 65535 +#define IPV6_MAXPLEN 65535 #define IPV6_MIN_MTU 1280 /* - * NextHeader field of IPv6 header + * NextHeader field of IPv6 header */ -#define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ -#define NEXTHDR_TCP 6 /* TCP segment. */ -#define NEXTHDR_UDP 17 /* UDP message. */ -#define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ -#define NEXTHDR_ROUTING 43 /* Routing header. */ -#define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ -#define NEXTHDR_GRE 47 /* GRE header. */ -#define NEXTHDR_ESP 50 /* Encapsulating security payload. */ -#define NEXTHDR_AUTH 51 /* Authentication header. */ -#define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ -#define NEXTHDR_NONE 59 /* No next header */ -#define NEXTHDR_DEST 60 /* Destination options header. */ -#define NEXTHDR_SCTP 132 /* SCTP message. */ -#define NEXTHDR_MOBILITY 135 /* Mobility header. */ +#define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ +#define NEXTHDR_TCP 6 /* TCP segment. */ +#define NEXTHDR_UDP 17 /* UDP message. */ +#define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ +#define NEXTHDR_ROUTING 43 /* Routing header. */ +#define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ +#define NEXTHDR_GRE 47 /* GRE header. */ +#define NEXTHDR_ESP 50 /* Encapsulating security payload. */ +#define NEXTHDR_AUTH 51 /* Authentication header. */ +#define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ +#define NEXTHDR_NONE 59 /* No next header */ +#define NEXTHDR_DEST 60 /* Destination options header. */ +#define NEXTHDR_SCTP 132 /* SCTP message. */ +#define NEXTHDR_MOBILITY 135 /* Mobility header. 
*/ #define NEXTHDR_MAX 255 @@ -55,74 +55,74 @@ #define IPV6_DEFAULT_MCASTHOPS 1 /* - * Addr type - * - * type - unicast | multicast - * scope - local | site | global - * v4 - compat - * v4mapped - * any - * loopback + * Addr type + * + * type - unicast | multicast + * scope - local | site | global + * v4 - compat + * v4mapped + * any + * loopback */ -#define IPV6_ADDR_ANY 0x0000U +#define IPV6_ADDR_ANY 0x0000U -#define IPV6_ADDR_UNICAST 0x0001U -#define IPV6_ADDR_MULTICAST 0x0002U +#define IPV6_ADDR_UNICAST 0x0001U +#define IPV6_ADDR_MULTICAST 0x0002U -#define IPV6_ADDR_LOOPBACK 0x0010U -#define IPV6_ADDR_LINKLOCAL 0x0020U -#define IPV6_ADDR_SITELOCAL 0x0040U +#define IPV6_ADDR_LOOPBACK 0x0010U +#define IPV6_ADDR_LINKLOCAL 0x0020U +#define IPV6_ADDR_SITELOCAL 0x0040U -#define IPV6_ADDR_COMPATv4 0x0080U +#define IPV6_ADDR_COMPATv4 0x0080U -#define IPV6_ADDR_SCOPE_MASK 0x00f0U +#define IPV6_ADDR_SCOPE_MASK 0x00f0U -#define IPV6_ADDR_MAPPED 0x1000U +#define IPV6_ADDR_MAPPED 0x1000U -#define IPV6_ADDR_RESERVED 0x2000U /* reserved address space */ +#define IPV6_ADDR_RESERVED 0x2000U /* reserved address space */ /* - * Addr scopes + * Addr scopes */ -#define IPV6_ADDR_MC_SCOPE(a) \ - ((a)->s6_addr[1] & 0x0f) /* nonstandard */ -#define __IPV6_ADDR_SCOPE_INVALID -1 -#define IPV6_ADDR_SCOPE_NODELOCAL 0x01 -#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 -#define IPV6_ADDR_SCOPE_SITELOCAL 0x05 -#define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 -#define IPV6_ADDR_SCOPE_GLOBAL 0x0e +#define IPV6_ADDR_MC_SCOPE(a) \ + ((a)->s6_addr[1] & 0x0f) /* nonstandard */ +#define __IPV6_ADDR_SCOPE_INVALID -1 +#define IPV6_ADDR_SCOPE_NODELOCAL 0x01 +#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 +#define IPV6_ADDR_SCOPE_SITELOCAL 0x05 +#define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 +#define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* - * Addr flags + * Addr flags */ -#define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ - ((a)->s6_addr[1] & 0x10) -#define IPV6_ADDR_MC_FLAG_PREFIX(a) \ - ((a)->s6_addr[1] & 0x20) -#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ - ((a)->s6_addr[1] & 0x40) +#define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ + ((a)->s6_addr[1] & 0x10) +#define IPV6_ADDR_MC_FLAG_PREFIX(a) \ + ((a)->s6_addr[1] & 0x20) +#define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ + ((a)->s6_addr[1] & 0x40) /* * choose an appropriate source address (RFC3484) */ enum { - IPV6_SADDR_RULE_INIT = 0, - IPV6_SADDR_RULE_LOCAL, - IPV6_SADDR_RULE_SCOPE, - IPV6_SADDR_RULE_PREFERRED, + IPV6_SADDR_RULE_INIT = 0, + IPV6_SADDR_RULE_LOCAL, + IPV6_SADDR_RULE_SCOPE, + IPV6_SADDR_RULE_PREFERRED, #ifdef CONFIG_IPV6_MIP6 - IPV6_SADDR_RULE_HOA, + IPV6_SADDR_RULE_HOA, #endif - IPV6_SADDR_RULE_OIF, - IPV6_SADDR_RULE_LABEL, + IPV6_SADDR_RULE_OIF, + IPV6_SADDR_RULE_LABEL, #ifdef CONFIG_IPV6_PRIVACY - IPV6_SADDR_RULE_PRIVACY, + IPV6_SADDR_RULE_PRIVACY, #endif - IPV6_SADDR_RULE_ORCHID, - IPV6_SADDR_RULE_PREFIX, - IPV6_SADDR_RULE_MAX + IPV6_SADDR_RULE_ORCHID, + IPV6_SADDR_RULE_PREFIX, + IPV6_SADDR_RULE_MAX }; #ifdef __DPVS__ @@ -146,189 +146,189 @@ struct ipv6_saddr_dst { /** * from linux:net/ipv6/addrconf_core.c */ -#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) -#define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ - { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } +#define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ + { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } } #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ - { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2 } } } + { { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2 } } } static const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; 
static const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; static inline unsigned int ipv6_addr_scope2type(unsigned int scope) { - switch (scope) { - case IPV6_ADDR_SCOPE_NODELOCAL: - return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | - IPV6_ADDR_LOOPBACK); - case IPV6_ADDR_SCOPE_LINKLOCAL: - return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | - IPV6_ADDR_LINKLOCAL); - case IPV6_ADDR_SCOPE_SITELOCAL: - return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | - IPV6_ADDR_SITELOCAL); - } - return IPV6_ADDR_SCOPE_TYPE(scope); + switch (scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); } static inline int __ipv6_addr_type(const struct in6_addr *addr) { - __be32 st; - - st = addr->s6_addr32[0]; - - /* Consider all addresses with the first three bits different of - 000 and 111 as unicasts. - */ - if ((st & htonl(0xE0000000)) != htonl(0x00000000) && - (st & htonl(0xE0000000)) != htonl(0xE0000000)) - return (IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); - - if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { - /* multicast */ - /* addr-select 3.1 */ - return (IPV6_ADDR_MULTICAST | - ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); - } - - if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) - return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ - if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) - return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ - if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) - return (IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ - - if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { - if (addr->s6_addr32[2] == 0) { - if (addr->s6_addr32[3] == 0) - return IPV6_ADDR_ANY; - - if (addr->s6_addr32[3] == htonl(0x00000001)) - return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ - - return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ - } - - if (addr->s6_addr32[2] == htonl(0x0000ffff)) - return (IPV6_ADDR_MAPPED | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ - } - - return (IPV6_ADDR_UNICAST | - IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ + __be32 st; + + st = addr->s6_addr32[0]; + + /* Consider all addresses with the first three bits different of + 000 and 111 as unicasts. 
+ */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ + + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ } static inline int ipv6_addr_type(const struct in6_addr *addr) { - return __ipv6_addr_type(addr) & 0xffff; + return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { - return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; + return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { - return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); + return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { - return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); + return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { - return type & IPV6_ADDR_LINKLOCAL || - (type & IPV6_ADDR_MULTICAST && - (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); + return type & IPV6_ADDR_LINKLOCAL || + (type & IPV6_ADDR_MULTICAST && + (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { - return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; + return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? 
iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { - return memcmp(a1, a2, sizeof(struct in6_addr)); + return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, - const struct in6_addr *a2) + const struct in6_addr *a2) { - return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | - ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | - ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | - ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); + return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | + ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | + ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | + ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); } static inline void ipv6_addr_prefix(struct in6_addr *pfx, - const struct in6_addr *addr, - int plen) + const struct in6_addr *addr, + int plen) { - /* caller must guarantee 0 <= plen <= 128 */ - int o = plen >> 3, - b = plen & 0x7; + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; - memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); - memcpy(pfx->s6_addr, addr, o); - if (b != 0) - pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); + memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); + memcpy(pfx->s6_addr, addr, o); + if (b != 0) + pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, - const struct in6_addr *pfx, - int plen) + const struct in6_addr *pfx, + int plen) { - /* caller must guarantee 0 <= plen <= 128 */ - int o = plen >> 3, - b = plen & 0x7; + /* caller must guarantee 0 <= plen <= 128 */ + int o = plen >> 3, + b = plen & 0x7; - memcpy(addr->s6_addr, pfx, o); - if (b != 0) { - addr->s6_addr[o] &= ~(0xff00 >> b); - addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); - } + memcpy(addr->s6_addr, pfx, o); + if (b != 0) { + addr->s6_addr[o] &= ~(0xff00 >> b); + addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); + } } static inline bool ipv6_addr_equal(const struct in6_addr *a1, - const struct in6_addr *a2) + const struct in6_addr *a2) { - return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | - (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | - (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | - (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; + return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | + (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | + (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | + (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, - const struct in6_addr *addr2, - unsigned int prefixlen) + const struct in6_addr *addr2, + unsigned int prefixlen) { - const __be32 *a1 = addr1->s6_addr32; - const __be32 *a2 = addr2->s6_addr32; - unsigned int pdw, pbi; + const __be32 *a1 = addr1->s6_addr32; + const __be32 *a2 = addr2->s6_addr32; + unsigned int pdw, pbi; - /* check complete u32 in prefix */ - pdw = prefixlen >> 5; - if (pdw && memcmp(a1, a2, pdw << 2)) - return false; + /* check complete u32 in prefix */ + pdw = prefixlen >> 5; + if (pdw && memcmp(a1, a2, pdw << 2)) + return false; - /* check incomplete u32 in prefix */ - pbi = prefixlen & 0x1f; - if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) - return false; + /* check incomplete u32 in prefix */ + pbi = prefixlen & 0x1f; + if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) + return false; - return true; + return 
true; } static inline bool ipv6_addr_any(const struct in6_addr *a) @@ -339,42 +339,42 @@ static inline bool ipv6_addr_any(const struct in6_addr *a) static inline bool ipv6_addr_loopback(const struct in6_addr *a) { - return (a->s6_addr32[0] | a->s6_addr32[1] | - a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0; + return (a->s6_addr32[0] | a->s6_addr32[1] | + a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0; } static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { - return ( - (unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | - (unsigned long)(a->s6_addr32[2] ^ - htonl(0x0000ffff))) == 0UL; + return ( + (unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | + (unsigned long)(a->s6_addr32[2] ^ + htonl(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_orchid(const struct in6_addr *a) { - return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); + return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { - return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); + return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } -static inline void ipv6_addr_set(struct in6_addr *addr, - uint32_t w1, uint32_t w2, - uint32_t w3, uint32_t w4) +static inline void ipv6_addr_set(struct in6_addr *addr, + uint32_t w1, uint32_t w2, + uint32_t w3, uint32_t w4) { - addr->s6_addr32[0] = w1; - addr->s6_addr32[1] = w2; - addr->s6_addr32[2] = w3; - addr->s6_addr32[3] = w4; + addr->s6_addr32[0] = w1; + addr->s6_addr32[1] = w2; + addr->s6_addr32[2] = w3; + addr->s6_addr32[3] = w4; } -static inline void ipv6_addr_copy(struct in6_addr *a1, +static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2) { - memcpy(a1, a2, sizeof(struct in6_addr)); + memcpy(a1, a2, sizeof(struct in6_addr)); } static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, @@ -389,103 +389,103 @@ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, /* net/addrconf.h */ static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { - return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | - addr->s6_addr32[1] | addr->s6_addr32[2] | - (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | addr->s6_addr32[2] | + (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { - return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | - addr->s6_addr32[1] | addr->s6_addr32[2] | - (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | addr->s6_addr32[2] | + (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { - return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); + return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { - return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | - addr->s6_addr32[1] | - (addr->s6_addr32[2] ^ htonl(0x00000001)) | - (addr->s6_addr[12] ^ 0xff)) == 0; + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | + (addr->s6_addr32[2] ^ htonl(0x00000001)) | + (addr->s6_addr[12] ^ 0xff)) == 0; } static inline int fls(int x) { - int r = 32; - - if (!x) - return 0; - if (!(x & 0xffff0000u)) { - x <<= 16; - r -= 16; - } - if (!(x & 0xff000000u)) { - x 
<<= 8; - r -= 8; - } - if (!(x & 0xf0000000u)) { - x <<= 4; - r -= 4; - } - if (!(x & 0xc0000000u)) { - x <<= 2; - r -= 2; - } - if (!(x & 0x80000000u)) { - x <<= 1; - r -= 1; - } - return r; + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; } static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { - const __be32 *a1 = token1, *a2 = token2; - int i; - - addrlen >>= 2; - - for (i = 0; i < addrlen; i++) { - __be32 xb = a1[i] ^ a2[i]; - if (xb) - return i * 32 + 32 - fls(ntohl(xb)); - } - - /* - * we should *never* get to this point since that - * would mean the addrs are equal - * - * However, we do get to it 8) And exacly, when - * addresses are equal 8) - * - * ip route add 1111::/128 via ... - * ip route add 1111::/64 via ... - * and we are here. - * - * Ideally, this function should stop comparison - * at prefix length. It does not, but it is still OK, - * if returned value is greater than prefix length. - * --ANK (980803) - */ - return (addrlen << 5); + const __be32 *a1 = token1, *a2 = token2; + int i; + + addrlen >>= 2; + + for (i = 0; i < addrlen; i++) { + __be32 xb = a1[i] ^ a2[i]; + if (xb) + return i * 32 + 32 - fls(ntohl(xb)); + } + + /* + * we should *never* get to this point since that + * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. + * --ANK (980803) + */ + return (addrlen << 5); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { - return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); + return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } static inline int ipv6_saddr_preferred(int type) { - if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| - IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) - return 1; - return 0; + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_RESERVED)) + return 1; + return 0; } #ifdef __DPVS__ @@ -493,10 +493,10 @@ static inline int ipv6_saddr_preferred(int type) /* * 1. Prefer same address. (i.e. destination is local machine) - * 2. Prefer appropriate scope. (i.e. smallest scope shared with the destination) - * 3. Avoid deprecated addresses. + * 2. Prefer appropriate scope. (i.e. smallest scope shared with the destination) + * 3. Avoid deprecated addresses. * 4. Prefer home addresses. (not support here!) - * 5. Prefer outgoing interface. (i.e. prefer an address on the interface we’re sending out of) + * 5. Prefer outgoing interface. (i.e. prefer an address on the interface we’re sending out of) * 6. Prefer matching label. (not support here!) * 7. Prefer public addresses. (not support here) * 8. Use longest matching prefix. 
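
For context between these hunks: the rules listed above are the RFC 3484 source-address-selection order, and rule 8 (longest matching prefix) maps directly onto `ipv6_addr_diff()` defined earlier in this header. Below is a minimal sketch, not part of the patch, that exercises that helper; the addresses are made-up examples and it assumes `include/linux_ipv6.h` can be included standalone (outside `__DPVS__`).

```c
/* Editor's sketch (not part of the patch): rule 8, longest matching
 * prefix, using ipv6_addr_diff() from include/linux_ipv6.h.
 * Addresses are made-up examples. */
#include <stdio.h>
#include <arpa/inet.h>
#include "linux_ipv6.h"

int main(void)
{
    struct in6_addr dst, cand[2];
    int i, best = 0, best_len = -1;

    inet_pton(AF_INET6, "2001:db8:1::80", &dst);
    inet_pton(AF_INET6, "2001:db8:1::1", &cand[0]);  /* long shared prefix */
    inet_pton(AF_INET6, "2001:db8:2::1", &cand[1]);  /* short shared prefix */

    for (i = 0; i < 2; i++) {
        /* ipv6_addr_diff() returns the number of leading bits the two
         * addresses have in common (128 when they are equal). */
        int len = ipv6_addr_diff(&cand[i], &dst);
        if (len > best_len) {
            best_len = len;
            best = i;
        }
    }

    printf("candidate %d shares the longest prefix (%d bits)\n", best, best_len);
    return 0;
}
```
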
@@ -505,7 +505,7 @@ static inline int ipv6_get_saddr_eval(struct ipv6_saddr_score *score, struct ipv6_saddr_dst *dst, int i) { - int ret; + int ret; if (i <= score->rule) { switch (i) { @@ -517,9 +517,9 @@ static inline int ipv6_get_saddr_eval(struct ipv6_saddr_score *score, break; default: ret = score->scorebits[i]; - } - goto out; - } + } + goto out; + } switch (i) { case IPV6_SADDR_RULE_INIT: @@ -574,10 +574,10 @@ static inline int ipv6_get_saddr_eval(struct ipv6_saddr_score *score, /* call me by lock */ static inline int ipv6_addr_select(struct inet_device *idev, - const union inet_addr *daddr, + const union inet_addr *daddr, union inet_addr *saddr) { - struct ipv6_saddr_score scores[2]; + struct ipv6_saddr_score scores[2]; struct ipv6_saddr_score *score = &scores[0], *hiscore = &scores[1]; struct ipv6_saddr_dst dst; int dst_type; diff --git a/include/list.h b/include/list.h index 753197f42..8f28bc778 100644 --- a/include/list.h +++ b/include/list.h @@ -59,12 +59,12 @@ struct hlist_node { #undef LIST_HEAD // conflict with GNU queue.h #define LIST_HEAD(name) \ - struct list_head name = LIST_HEAD_INIT(name) + struct list_head name = LIST_HEAD_INIT(name) static inline void INIT_LIST_HEAD(struct list_head *list) { - list->next = list; - list->prev = list; + list->next = list; + list->prev = list; } /* @@ -75,18 +75,18 @@ static inline void INIT_LIST_HEAD(struct list_head *list) */ #ifndef CONFIG_DEBUG_LIST static inline void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) + struct list_head *prev, + struct list_head *next) { - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; } #else extern void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next); + struct list_head *prev, + struct list_head *next); #endif /** @@ -99,7 +99,7 @@ extern void __list_add(struct list_head *new, */ static inline void list_add(struct list_head *new, struct list_head *head) { - __list_add(new, head, head->next); + __list_add(new, head, head->next); } @@ -113,7 +113,7 @@ static inline void list_add(struct list_head *new, struct list_head *head) */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { - __list_add(new, head->prev, head); + __list_add(new, head->prev, head); } /* @@ -125,8 +125,8 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) */ static inline void __list_del(struct list_head * prev, struct list_head * next) { - next->prev = prev; - prev->next = next; + next->prev = prev; + prev->next = next; } /** @@ -138,14 +138,14 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) #ifndef CONFIG_DEBUG_LIST static inline void __list_del_entry(struct list_head *entry) { - __list_del(entry->prev, entry->next); + __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { - __list_del(entry->prev, entry->next); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; } #else extern void __list_del_entry(struct list_head *entry); @@ -171,19 +171,19 @@ extern void list_force_poison(struct list_head *entry); * If @old was empty, it will be overwritten. 
*/ static inline void list_replace(struct list_head *old, - struct list_head *new) + struct list_head *new) { - new->next = old->next; - new->next->prev = new; - new->prev = old->prev; - new->prev->next = new; + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; } static inline void list_replace_init(struct list_head *old, - struct list_head *new) + struct list_head *new) { - list_replace(old, new); - INIT_LIST_HEAD(old); + list_replace(old, new); + INIT_LIST_HEAD(old); } /** @@ -192,8 +192,8 @@ static inline void list_replace_init(struct list_head *old, */ static inline void list_del_init(struct list_head *entry) { - __list_del_entry(entry); - INIT_LIST_HEAD(entry); + __list_del_entry(entry); + INIT_LIST_HEAD(entry); } /** @@ -203,8 +203,8 @@ static inline void list_del_init(struct list_head *entry) */ static inline void list_move(struct list_head *list, struct list_head *head) { - __list_del_entry(list); - list_add(list, head); + __list_del_entry(list); + list_add(list, head); } /** @@ -213,10 +213,10 @@ static inline void list_move(struct list_head *list, struct list_head *head) * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, - struct list_head *head) + struct list_head *head) { - __list_del_entry(list); - list_add_tail(list, head); + __list_del_entry(list); + list_add_tail(list, head); } /** @@ -225,9 +225,9 @@ static inline void list_move_tail(struct list_head *list, * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, - const struct list_head *head) + const struct list_head *head) { - return list->next == head; + return list->next == head; } /** @@ -236,7 +236,7 @@ static inline int list_is_last(const struct list_head *list, */ static inline int list_empty(const struct list_head *head) { - return (head->next) == head; + return (head->next) == head; } /** @@ -267,8 +267,8 @@ static inline int list_elems(const struct list_head *head) */ static inline int list_empty_careful(const struct list_head *head) { - struct list_head *next = head->next; - return (next == head) && (next == head->prev); + struct list_head *next = head->next; + return (next == head) && (next == head->prev); } /** @@ -277,12 +277,12 @@ static inline int list_empty_careful(const struct list_head *head) */ static inline void list_rotate_left(struct list_head *head) { - struct list_head *first; + struct list_head *first; - if (!list_empty(head)) { - first = head->next; - list_move_tail(first, head); - } + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } } /** @@ -291,19 +291,19 @@ static inline void list_rotate_left(struct list_head *head) */ static inline int list_is_singular(const struct list_head *head) { - return !list_empty(head) && (head->next == head->prev); + return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, - struct list_head *head, struct list_head *entry) + struct list_head *head, struct list_head *entry) { - struct list_head *new_first = entry->next; - list->next = head->next; - list->next->prev = list; - list->prev = entry; - entry->next = list; - head->next = new_first; - new_first->prev = head; + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; } /** @@ -311,7 +311,7 @@ static inline void __list_cut_position(struct 
list_head *list, * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself - * and if so we won't cut the list + * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should @@ -321,31 +321,31 @@ static inline void __list_cut_position(struct list_head *list, * */ static inline void list_cut_position(struct list_head *list, - struct list_head *head, struct list_head *entry) + struct list_head *head, struct list_head *entry) { - if (list_empty(head)) - return; - if (list_is_singular(head) && - (head->next != entry && head != entry)) - return; - if (entry == head) - INIT_LIST_HEAD(list); - else - __list_cut_position(list, head, entry); + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); } static inline void __list_splice(const struct list_head *list, - struct list_head *prev, - struct list_head *next) + struct list_head *prev, + struct list_head *next) { - struct list_head *first = list->next; - struct list_head *last = list->prev; + struct list_head *first = list->next; + struct list_head *last = list->prev; - first->prev = prev; - prev->next = first; + first->prev = prev; + prev->next = first; - last->next = next; - next->prev = last; + last->next = next; + next->prev = last; } /** @@ -354,10 +354,10 @@ static inline void __list_splice(const struct list_head *list, * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, - struct list_head *head) + struct list_head *head) { - if (!list_empty(list)) - __list_splice(list, head, head->next); + if (!list_empty(list)) + __list_splice(list, head, head->next); } /** @@ -366,10 +366,10 @@ static inline void list_splice(const struct list_head *list, * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, - struct list_head *head) + struct list_head *head) { - if (!list_empty(list)) - __list_splice(list, head->prev, head); + if (!list_empty(list)) + __list_splice(list, head->prev, head); } /** @@ -380,12 +380,12 @@ static inline void list_splice_tail(struct list_head *list, * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, - struct list_head *head) + struct list_head *head) { - if (!list_empty(list)) { - __list_splice(list, head, head->next); - INIT_LIST_HEAD(list); - } + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } } /** @@ -397,247 +397,247 @@ static inline void list_splice_init(struct list_head *list, * The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, - struct list_head *head) + struct list_head *head) { - if (!list_empty(list)) { - __list_splice(list, head->prev, head); - INIT_LIST_HEAD(list); - } + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } } /** * list_entry - get the struct for this entry - * @ptr: the &struct list_head pointer. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_head within the struct. + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. 
*/ #define list_entry(ptr, type, member) \ - container_of(ptr, type, member) + container_of(ptr, type, member) /** * list_first_entry - get the first element from a list - * @ptr: the list head to take the element from. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_head within the struct. + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ - list_entry((ptr)->next, type, member) + list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list - * @ptr: the list head to take the element from. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_head within the struct. + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ - list_entry((ptr)->prev, type, member) + list_entry((ptr)->prev, type, member) /** * list_first_entry_or_null - get the first element from a list - * @ptr: the list head to take the element from. - * @type: the type of the struct this is embedded in. - * @member: the name of the list_head within the struct. + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. */ #define list_first_entry_or_null(ptr, type, member) \ - (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) /** * list_next_entry - get the next element in list - * @pos: the type * to cursor - * @member: the name of the list_head within the struct. + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. */ #define list_next_entry(pos, member) \ - list_entry((pos)->member.next, typeof(*(pos)), member) + list_entry((pos)->member.next, typeof(*(pos)), member) /** * list_prev_entry - get the prev element in list - * @pos: the type * to cursor - * @member: the name of the list_head within the struct. + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. */ #define list_prev_entry(pos, member) \ - list_entry((pos)->member.prev, typeof(*(pos)), member) + list_entry((pos)->member.prev, typeof(*(pos)), member) /** - * list_for_each - iterate over a list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. */ #define list_for_each(pos, head) \ - for (pos = (head)->next; pos != (head); pos = pos->next) + for (pos = (head)->next; pos != (head); pos = pos->next) /** - * list_for_each_prev - iterate over a list backwards - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. 
*/ #define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; pos != (head); pos = pos->prev) + for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry - * @pos: the &struct list_head to use as a loop cursor. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry - * @pos: the &struct list_head to use as a loop cursor. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ - for (pos = (head)->prev, n = pos->prev; \ - pos != (head); \ - pos = n, n = pos->prev) + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) /** - * list_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. */ -#define list_for_each_entry(pos, head, member) \ - for (pos = list_first_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_next_entry(pos, member)) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. */ -#define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_last_entry(head, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_prev_entry(pos, member)) +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() - * @pos: the type * to use as a start point - * @head: the head of the list - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_head within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). */ #define list_prepare_entry(pos, head, member) \ - ((pos) ? : list_entry(head, typeof(*pos), member)) + ((pos) ? 
: list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ -#define list_for_each_entry_continue(pos, head, member) \ - for (pos = list_next_entry(pos, member); \ - &pos->member != (head); \ - pos = list_next_entry(pos, member)) +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ -#define list_for_each_entry_continue_reverse(pos, head, member) \ - for (pos = list_prev_entry(pos, member); \ - &pos->member != (head); \ - pos = list_prev_entry(pos, member)) +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing from current position. */ -#define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ - pos = list_next_entry(pos, member)) +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. */ -#define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_first_entry(head, typeof(*pos), member), \ - n = list_next_entry(pos, member); \ - &pos->member != (head); \ - pos = n, n = list_next_entry(n, member)) +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. 
- * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ -#define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = list_next_entry(pos, member), \ - n = list_next_entry(pos, member); \ - &pos->member != (head); \ - pos = n, n = list_next_entry(n, member)) +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_next_entry(pos, member), \ + n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ -#define list_for_each_entry_safe_from(pos, n, head, member) \ - for (n = list_next_entry(pos, member); \ - &pos->member != (head); \ - pos = n, n = list_next_entry(n, member)) +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal - * @pos: the type * to use as a loop cursor. - * @n: another type * to use as temporary storage - * @head: the head for your list. - * @member: the name of the list_head within the struct. + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. */ -#define list_for_each_entry_safe_reverse(pos, n, head, member) \ - for (pos = list_last_entry(head, typeof(*pos), member), \ - n = list_prev_entry(pos, member); \ - &pos->member != (head); \ - pos = n, n = list_prev_entry(n, member)) +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop - * @pos: the loop cursor used in the list_for_each_entry_safe loop - * @n: temporary storage used in list_for_each_entry_safe - * @member: the name of the list_head within the struct. + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_head within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An @@ -645,8 +645,8 @@ static inline void list_splice_tail_init(struct list_head *list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. 
*/ -#define list_safe_reset_next(pos, n, member) \ - n = list_next_entry(pos, member) +#define list_safe_reset_next(pos, n, member) \ + n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. @@ -660,85 +660,85 @@ static inline void list_splice_tail_init(struct list_head *list, #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { - h->next = NULL; - h->pprev = NULL; + h->next = NULL; + h->pprev = NULL; } static inline int hlist_unhashed(const struct hlist_node *h) { - return !h->pprev; + return !h->pprev; } static inline int hlist_empty(const struct hlist_head *h) { - return !(h->first); + return !(h->first); } static inline void __hlist_del(struct hlist_node *n) { - struct hlist_node *next = n->next; - struct hlist_node **pprev = n->pprev; + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; - *pprev = next; - if (next) - next->pprev = pprev; + *pprev = next; + if (next) + next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { - __hlist_del(n); - n->next = LIST_POISON1; - n->pprev = LIST_POISON2; + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; } static inline void hlist_del_init(struct hlist_node *n) { - if (!hlist_unhashed(n)) { - __hlist_del(n); - INIT_HLIST_NODE(n); - } + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { - struct hlist_node *first = h->first; - n->next = first; - if (first) - first->pprev = &n->next; - h->first = n; - n->pprev = &h->first; + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; } /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, - struct hlist_node *next) + struct hlist_node *next) { - n->pprev = next->pprev; - n->next = next; - next->pprev = &n->next; - *(n->pprev) = n; + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; } static inline void hlist_add_behind(struct hlist_node *n, - struct hlist_node *prev) + struct hlist_node *prev) { - n->next = prev->next; - prev->next = n; - n->pprev = &prev->next; + n->next = prev->next; + prev->next = n; + n->pprev = &prev->next; - if (n->next) - n->next->pprev = &n->next; + if (n->next) + n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ static inline void hlist_add_fake(struct hlist_node *n) { - n->pprev = &n->next; + n->pprev = &n->next; } static inline bool hlist_fake(struct hlist_node *h) { - return h->pprev == &h->next; + return h->pprev == &h->next; } /* @@ -746,68 +746,68 @@ static inline bool hlist_fake(struct hlist_node *h) * reference of the first entry if it exists. 
*/ static inline void hlist_move_list(struct hlist_head *old, - struct hlist_head *new) + struct hlist_head *new) { - new->first = old->first; - if (new->first) - new->first->pprev = &new->first; - old->first = NULL; + new->first = old->first; + if (new->first) + new->first->pprev = &new->first; + old->first = NULL; } #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ - for (pos = (head)->first; pos ; pos = pos->next) + for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ - for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ - pos = n) + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) #define hlist_entry_safe(ptr, type, member) \ - ({ typeof(ptr) ____ptr = (ptr); \ - ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ - }) + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ + }) /** - * hlist_for_each_entry - iterate over list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. + * hlist_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. */ -#define hlist_for_each_entry(pos, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ - pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_continue - iterate over a hlist continuing after current point - * @pos: the type * to use as a loop cursor. - * @member: the name of the hlist_node within the struct. + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. */ -#define hlist_for_each_entry_continue(pos, member) \ - for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ - pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_from - iterate over a hlist continuing from current point - * @pos: the type * to use as a loop cursor. - * @member: the name of the hlist_node within the struct. + * @pos: the type * to use as a loop cursor. + * @member: the name of the hlist_node within the struct. */ -#define hlist_for_each_entry_from(pos, member) \ - for (; pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) /** * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @pos: the type * to use as a loop cursor. - * @n: another &struct hlist_node to use as temporary storage - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. 
- */ -#define hlist_for_each_entry_safe(pos, n, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ - pos && ({ n = pos->member.next; 1; }); \ - pos = hlist_entry_safe(n, typeof(*pos), member)) + * @pos: the type * to use as a loop cursor. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ + pos && ({ n = pos->member.next; 1; }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) #endif diff --git a/include/mbuf.h b/include/mbuf.h index 168da8b0a..19aef0bb3 100644 --- a/include/mbuf.h +++ b/include/mbuf.h @@ -27,77 +27,77 @@ #include "rte_mbuf.h" /* for each mbuf including heading mbuf and segments */ -#define mbuf_foreach(m, pos) \ - for (pos = m; pos != NULL; pos = pos->next) +#define mbuf_foreach(m, pos) \ + for (pos = m; pos != NULL; pos = pos->next) /* for each segments of mbuf */ -#define mbuf_foreach_seg(m, s) \ - for (s = m->next; s != NULL; s = s->next) +#define mbuf_foreach_seg(m, s) \ + for (s = m->next; s != NULL; s = s->next) -#define mbuf_foreach_seg_safe(m, n, s) \ - for (s = m->next, n = s ? s->next : NULL; \ - s != NULL; \ - s = n, n = s ? s->next : NULL) +#define mbuf_foreach_seg_safe(m, n, s) \ + for (s = m->next, n = s ? s->next : NULL; \ + s != NULL; \ + s = n, n = s ? s->next : NULL) /** * mbuf_copy_bits - copy bits from mbuf to buffer. * see skb_copy_bits(). */ static inline int mbuf_copy_bits(const struct rte_mbuf *mbuf, - int offset, void *to, int len) + int offset, void *to, int len) { - const struct rte_mbuf *seg; - int start, copy, end; + const struct rte_mbuf *seg; + int start, copy, end; - if (offset + len > (int)mbuf->pkt_len) - return -1; + if (offset + len > (int)mbuf->pkt_len) + return -1; - start = 0; - mbuf_foreach(mbuf, seg) { - end = start + seg->data_len; + start = 0; + mbuf_foreach(mbuf, seg) { + end = start + seg->data_len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; - memcpy(to, rte_pktmbuf_mtod_offset( - seg, void *, offset - start), - copy); + memcpy(to, rte_pktmbuf_mtod_offset( + seg, void *, offset - start), + copy); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } - start = end; - } + start = end; + } - if (!len) - return 0; + if (!len) + return 0; - return -1; + return -1; } static inline void *mbuf_tail_point(const struct rte_mbuf *mbuf) { - return rte_pktmbuf_mtod_offset(mbuf, void *, mbuf->data_len); + return rte_pktmbuf_mtod_offset(mbuf, void *, mbuf->data_len); } static inline void *mbuf_header_pointer(const struct rte_mbuf *mbuf, - int offset, int len, void *buffer) + int offset, int len, void *buffer) { - if (unlikely(mbuf->data_len < offset + len)) { - if (unlikely(mbuf->pkt_len < offset + len)) - return NULL; + if (unlikely(mbuf->data_len < offset + len)) { + if (unlikely(mbuf->pkt_len < offset + len)) + return NULL; - if (mbuf_copy_bits(mbuf, offset, buffer, len) != 0) - return NULL; + if (mbuf_copy_bits(mbuf, offset, buffer, len) != 0) + return NULL; - return buffer; - } + return buffer; + } - return rte_pktmbuf_mtod_offset(mbuf, void *, offset); + return rte_pktmbuf_mtod_offset(mbuf, void *, offset); } /** diff --git a/include/md5.h b/include/md5.h index 515c04bb8..df61d9a61 
100644 --- a/include/md5.h +++ b/include/md5.h @@ -18,98 +18,98 @@ #ifndef __MD5_H__ #define __MD5_H__ -#define MD5_DIGEST_WORDS 4 -#define MD5_MESSAGE_BYTES 64 +#define MD5_DIGEST_WORDS 4 +#define MD5_MESSAGE_BYTES 64 -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) #define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) + (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) static inline void md5_transform(uint32_t *hash, uint32_t const *in) { - uint32_t a, b, c, d; + uint32_t a, b, c, d; - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; + a = hash[0]; + b = hash[1]; + c = hash[2]; + d = hash[3]; - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d,
in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + 
MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - hash[0] += a; - hash[1] += b; - hash[2] += c; - hash[3] += d; + hash[0] += a; + hash[1] += b; + hash[2] += c; + hash[3] += d; } #endif diff --git a/include/ndisc.h b/include/ndisc.h index 6b0ee9968..463289668 100644 --- a/include/ndisc.h +++ b/include/ndisc.h @@ -20,13 +20,13 @@ #include "neigh.h" -int ndisc_rcv(struct rte_mbuf *mbuf, +int ndisc_rcv(struct rte_mbuf *mbuf, struct netif_port *dev); -void ndisc_send_dad(struct netif_port *dev, +void ndisc_send_dad(struct netif_port *dev, const struct in6_addr* solicit); -void ndisc_solicit(struct neighbour_entry *neigh, +void ndisc_solicit(struct neighbour_entry *neigh, const struct in6_addr *saddr); #endif /* __DPVS_NDISC_H__ */ diff --git a/include/neigh.h b/include/neigh.h index a7814dcc4..bbac65e05 100644 --- a/include/neigh.h +++ b/include/neigh.h @@ -51,7 +51,7 @@ #define NEIGH_TAB_MASK (NEIGH_TAB_SIZE - 1) struct neighbour_entry { - int af; + int af; struct list_head neigh_list; union inet_addr ip_addr; struct ether_addr eth_addr; @@ -60,7 +60,7 @@ struct neighbour_entry { struct list_head queue_list; uint32_t que_num; uint32_t state; - uint32_t ts; + uint32_t ts; uint8_t flag; } __rte_cache_aligned; @@ -69,12 +69,12 @@ enum param_kind { NEIGH_PARAM }; -/* +/* * no matter which kind of ip_addr, just use 32 bit to hash * since neighbour table is not a large table */ static inline unsigned int neigh_hashkey(int af, - const union inet_addr *ip_addr, + const union inet_addr *ip_addr, struct netif_port *port) { return rte_be_to_cpu_32(inet_addr_fold(af, ip_addr)) \ & NEIGH_TAB_MASK; @@ -83,12 +83,12 @@ static inline unsigned int neigh_hashkey(int af, void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx); struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, - const struct netif_port *port, + const struct netif_port *port, unsigned int hashkey); void neigh_send_mbuf_cach(struct neighbour_entry *neighbour); -int neigh_edit(struct neighbour_entry *neighbour, +int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr *eth_addr); int neigh_init(void); @@ -100,13 +100,13 @@ void neigh_keyword_value_init(void); void install_neighbor_keywords(void); int neigh_output(int af, - union inet_addr *nexhop, - struct rte_mbuf *mbuf, + union inet_addr *nexhop, + struct rte_mbuf *mbuf, struct netif_port *port); -struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, +struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, const struct ether_addr *eth_addr, - struct netif_port *port, + struct netif_port *port, unsigned int hashkey, int flag); int neigh_gratuitous_arp(struct in_addr *src, struct netif_port *port); @@ -119,7 +119,7 @@ void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port); int neigh_sync_core(const void *param, bool add_del, enum param_kind kind); -static inline void ipv6_mac_mult(const struct in6_addr *mult_target, +static inline void ipv6_mac_mult(const struct in6_addr *mult_target, struct ether_addr *mult_eth) { uint8_t *w = (uint8_t *)mult_eth; @@ -146,7 +146,7 @@ ethAddrSwap(void *t, void *f) { uint16Swap(d++, s++); uint16Swap(d++, s++); - uint16Swap(d, s); + uint16Swap(d, s); } /* inetAddrSwap( void * t, void * f ) 
- Swap two IPv4 addresses */ @@ -165,7 +165,7 @@ inetAddrCopy(void *t, void *f) { uint32_t *d = (uint32_t *)t; uint32_t *s = (uint32_t *)f; - *d = *s; + *d = *s; } #endif /* __DPVS_NEIGH_H__ */ diff --git a/include/netif.h b/include/netif.h index df87389fb..ca3ef1d82 100644 --- a/include/netif.h +++ b/include/netif.h @@ -70,7 +70,7 @@ struct rx_partner; /* RX/TX queue conf for lcore */ struct netif_queue_conf { - queueid_t id; + queueid_t id; uint16_t len; uint16_t kni_len; struct rx_partner *isol_rxq; @@ -84,7 +84,7 @@ struct netif_queue_conf */ struct netif_port_conf { - portid_t id; + portid_t id; /* rx/tx queues for this lcore to process*/ int nrxq; int ntxq; @@ -99,7 +99,7 @@ struct netif_port_conf */ struct netif_lcore_conf { - lcoreid_t id; + lcoreid_t id; /* nic number of this lcore to process */ int nports; /* port list of this lcore to process */ @@ -283,13 +283,14 @@ int netif_lcore_loop_job_register(struct netif_lcore_loop_job *lcore_job); int netif_lcore_loop_job_unregister(struct netif_lcore_loop_job *lcore_job); int netif_lcore_start(void); bool is_lcore_id_valid(lcoreid_t cid); +bool netif_lcore_is_idle(lcoreid_t cid); /************************** protocol API *****************************/ int netif_register_pkt(struct pkt_type *pt); int netif_unregister_pkt(struct pkt_type *pt); /**************************** port API ******************************/ -int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, +int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, const struct rte_eth_fdir_filter *fdir_flt); void netif_mask_fdir_filter(int af, const struct netif_port *port, struct rte_eth_fdir_filter *filt); @@ -363,5 +364,7 @@ static inline char *eth_addr_dump(const struct ether_addr *ea, } portid_t netif_port_count(void); +void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs, + lcoreid_t cid, uint16_t count, bool pkts_from_ring); #endif /* __DPVS_NETIF_H__ */ diff --git a/include/route.h b/include/route.h index d78ca4b72..a6e621b91 100644 --- a/include/route.h +++ b/include/route.h @@ -64,7 +64,9 @@ int route_flush(void); static inline void route4_put(struct route_entry *route) { if(route){ - rte_atomic32_dec(&route->refcnt); + if (rte_atomic32_dec_and_test(&route->refcnt)) { + rte_free(route); + } } } @@ -79,7 +81,7 @@ static inline uint32_t __attribute__((pure)) depth_to_mask(uint8_t depth) { if (depth>0) { - return (int)0x80000000 >> (depth - 1); + return (int)0x80000000 >> (depth - 1); } else return (int)0x0; diff --git a/include/tc/cls.h b/include/tc/cls.h index d3810b58d..03a3be44a 100644 --- a/include/tc/cls.h +++ b/include/tc/cls.h @@ -73,7 +73,7 @@ struct tc_cls { static inline void *tc_cls_priv(struct tc_cls *cls) { - return (char *)cls + TC_ALIGN(sizeof(struct tc_cls)); + return (char *)cls + TC_ALIGN(sizeof(struct tc_cls)); } struct tc_cls *tc_cls_create(struct Qsch *sch, const char *kind, diff --git a/include/tc/sch.h b/include/tc/sch.h index 2e626acce..ea4dc6fe1 100644 --- a/include/tc/sch.h +++ b/include/tc/sch.h @@ -99,12 +99,12 @@ struct Qsch { }; struct qsch_rate { - uint64_t rate_bytes_ps; /* B/s */ + uint64_t rate_bytes_ps; /* B/s */ }; static inline void *qsch_priv(struct Qsch *sch) { - return (char *)sch + TC_ALIGN(sizeof(struct Qsch)); + return (char *)sch + TC_ALIGN(sizeof(struct Qsch)); } static inline struct netif_port *qsch_dev(struct Qsch *sch) diff --git a/include/timer.h b/include/timer.h index 934288900..cf6f9070a 100644 --- a/include/timer.h +++ b/include/timer.h @@ 
-20,7 +20,7 @@ #include #include "list.h" -/* +/* * __NOTE__ * timer handler should be as quick as possible * and do not block. @@ -55,7 +55,7 @@ int dpvs_timer_term(void); /** * if @global is 'true' it's system wide timer, or it's per-lcore. - * for per-lcore module pls set global to 'false'otherwise + * for per-lcore module pls set global to 'false'otherwise * set @global to 'true'. a timer is global or not must be consistent * all the time, DO NOT mix up. * @@ -65,16 +65,16 @@ int dpvs_timer_term(void); int dpvs_time_now(struct timeval *now, bool global); /* schedule one-shot timer expire at "time_now" + @delay */ -int dpvs_timer_sched(struct dpvs_timer *timer, struct timeval *delay, +int dpvs_timer_sched(struct dpvs_timer *timer, struct timeval *delay, dpvs_timer_cb_t handler, void *arg, bool global); /* schedule one-shot timer expire at @expire * it's abstract time not delta value */ -int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, +int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, dpvs_timer_cb_t handler, void *arg, bool global); /* schedule periodic timer with interval @intv */ -int dpvs_timer_sched_period(struct dpvs_timer *timer, struct timeval *intv, +int dpvs_timer_sched_period(struct dpvs_timer *timer, struct timeval *intv, dpvs_timer_cb_t handler, void *arg, bool global); int dpvs_timer_cancel(struct dpvs_timer *timer, bool global); @@ -83,7 +83,7 @@ int dpvs_timer_cancel(struct dpvs_timer *timer, bool global); int dpvs_timer_reset(struct dpvs_timer *timer, bool global); /* set timer with new delay (one-shot) or interval (periodic) */ -int dpvs_timer_update(struct dpvs_timer *timer, +int dpvs_timer_update(struct dpvs_timer *timer, struct timeval *delay, bool global); void dpvs_time_rand_delay(struct timeval *tv, long delay_us); diff --git a/include/uoa.h b/include/uoa.h index 6236fe054..5a60271ee 100644 --- a/include/uoa.h +++ b/include/uoa.h @@ -36,72 +36,74 @@ #define IPOLEN_UOA_IPV4 (sizeof(struct ipopt_uoa) + 4) #define IPOLEN_UOA_IPV6 (sizeof(struct ipopt_uoa) + 16) -/* +/* * UOA IP option * @op_code: operation code - * @op_len: length of struct ipopt_uoa + real op_addr (v4/v6) length + * @op_len: length of (struct ipopt_uoa) + real op_addr (v4/v6) length * i.e. IPOLEN_UOA_IPV4 or IPOLEN_UOA_IPV6 * @op_port: port number * @op_addr: real ipv4 or ipv6 address following it */ struct ipopt_uoa { - __u8 op_code; - __u8 op_len; - __be16 op_port; - __u8 op_addr[0]; + __u8 op_code; + __u8 op_len; + __be16 op_port; + __u8 op_addr[0]; } __attribute__((__packed__)); /* per-cpu statistics */ struct uoa_cpu_stats { - __u64 uoa_got; /* UDP packet got UOA. */ - __u64 uoa_none; /* UDP packet has no UOA. */ - __u64 uoa_saved; /* UOA saved to mapping table */ - __u64 uoa_ack_fail; /* Fail to send UOA ACK. */ - __u64 uoa_miss; /* Fail to get UOA info from pkt. */ + __u64 uoa_got; /* UDP packet got UOA. */ + __u64 uoa_none; /* UDP packet has no UOA. */ + __u64 uoa_saved; /* UOA saved to mapping table */ + __u64 uoa_ack_fail; /* Fail to send UOA ACK. */ + __u64 uoa_miss; /* Fail to get UOA info from pkt. */ - __u64 success; /* uoa address returned. */ - __u64 miss; /* no such uoa info . */ - __u64 invalid; /* bad uoa info found. */ + __u64 success; /* uoa address returned. */ + __u64 miss; /* no such uoa info . */ + __u64 invalid; /* bad uoa info found. 
*/ #ifdef __KERNEL__ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) - struct u64_stats_sync syncp; + struct u64_stats_sync syncp; #endif #endif } __attribute__((__packed__)); /* normal kernel statistics (global) */ struct uoa_kstats { - __u64 uoa_got; /* UDP packet got UOA. */ - __u64 uoa_none; /* UDP packet has no UOA. */ - __u64 uoa_saved; /* UOA saved to mapping table */ - __u64 uoa_ack_fail; /* Fail to shand UOA ACK. */ - __u64 uoa_miss; /* Fail to get UOA info from pkt. */ - - __u64 success; /* uoa address returned. */ - __u64 miss; /* no such uoa info . */ - __u64 invalid; /* bad uoa info found. */ + __u64 uoa_got; /* UDP packet got UOA. */ + __u64 uoa_none; /* UDP packet has no UOA. */ + __u64 uoa_saved; /* UOA saved to mapping table */ + __u64 uoa_ack_fail; /* Fail to shand UOA ACK. */ + __u64 uoa_miss; /* Fail to get UOA info from pkt. */ + + __u64 success; /* uoa address returned. */ + __u64 miss; /* no such uoa info . */ + __u64 invalid; /* bad uoa info found. */ } __attribute__((__packed__)); /* uoa socket options */ enum { - UOA_BASE_CTL = 2048, - /* set */ - UOA_SO_SET_MAX = UOA_BASE_CTL, - /* get */ - UOA_SO_GET_LOOKUP = UOA_BASE_CTL, - UOA_SO_GET_MAX = UOA_SO_GET_LOOKUP, + UOA_BASE_CTL = 2048, + /* set */ + UOA_SO_SET_MAX = UOA_BASE_CTL, + /* get */ + UOA_SO_GET_LOOKUP = UOA_BASE_CTL, + UOA_SO_GET_MAX = UOA_SO_GET_LOOKUP, }; struct uoa_param_map { - /* input */ - __be32 saddr; - __be32 daddr; - __be16 sport; - __be16 dport; - /* output */ - __be32 real_saddr; - __be16 real_sport; + /* input */ + __be16 af; + union inet_addr saddr; + union inet_addr daddr; + __be16 sport; + __be16 dport; + /* output */ + __be16 real_af; + union inet_addr real_saddr; + __be16 real_sport; } __attribute__((__packed__)); /** @@ -136,11 +138,11 @@ struct uoa_param_map { * 0x2 (2) for ipv6 address family, OPPHDR_IPV6 * Rsvd. Reserved bits, must be zero. * Protocol Next level protocol, e.g., IPPROTO_UDP. - * Length Length of fixed header and options, not include payloads. - * Options Compatible with IPv4 options, including IPOPT_UOA. + * Length Length of fixed header and options, not include payloads. + * Options Compatible with IPv4 options, including IPOPT_UOA. */ -#define IPPROTO_OPT 0xf8 /* 248 */ +#define IPPROTO_OPT 0xf8 /* 248 */ #define OPPHDR_IPV6 0x02 #define OPPHDR_IPV4 0x01 @@ -148,21 +150,21 @@ struct uoa_param_map { /* OPtion Protocol header */ struct opphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) || (__BYTE_ORDER == __LITTLE_ENDIAN) - unsigned int rsvd0:4; - unsigned int version:4; + unsigned int rsvd0:4; + unsigned int version:4; #elif defined (__BIG_ENDIAN_BITFIELD) || (__BYTE_ORDER == __BIG_ENDIAN) - unsigned int version:4; - unsigned int rsvd0:4; + unsigned int version:4; + unsigned int rsvd0:4; #else #ifndef __KERNEL__ -# error "Please fix " +# error "Please fix " #else -# error "Please fix " +# error "Please fix " #endif #endif - __u8 protocol; /* IPPROTO_XXX */ - __be16 length; /* length of fixed header and options */ - __u8 options[0]; + __u8 protocol; /* IPPROTO_XXX */ + __be16 length; /* length of fixed header and options */ + __u8 options[0]; } __attribute__((__packed__)); #endif diff --git a/include/vlan.h b/include/vlan.h index 0d86dbbce..8994adba3 100644 --- a/include/vlan.h +++ b/include/vlan.h @@ -80,19 +80,19 @@ struct vlan_dev_priv { /** * from linux kernel. 
* - * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) - * @h_dest: destination ethernet address - * @h_source: source ethernet address - * @h_vlan_proto: ethernet protocol - * @h_vlan_TCI: priority and VLAN ID - * @h_vlan_encapsulated_proto: packet type ID or len + * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) + * @h_dest: destination ethernet address + * @h_source: source ethernet address + * @h_vlan_proto: ethernet protocol + * @h_vlan_TCI: priority and VLAN ID + * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { - unsigned char h_dest[ETH_ALEN]; - unsigned char h_source[ETH_ALEN]; - __be16 h_vlan_proto; - __be16 h_vlan_TCI; - __be16 h_vlan_encapsulated_proto; + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; }; int vlan_add_dev(struct netif_port *real_dev, const char *ifname, diff --git a/kmod/toa/Makefile b/kmod/toa/Makefile index 0a091b6ba..8d42c507d 100644 --- a/kmod/toa/Makefile +++ b/kmod/toa/Makefile @@ -7,7 +7,8 @@ KDIR := $(KERNDIR) endif PWD := $(shell pwd) -ccflags-y := -DTOA_IPV6_ENABLE +ccflags-y += -DTOA_IPV6_ENABLE +ccflags-y += -DTOA_NAT64_ENABLE ifeq ($(DEBUG), 1) ccflags-y += -g -O0 diff --git a/kmod/toa/example_nat64/Makefile b/kmod/toa/example_nat64/Makefile new file mode 100644 index 000000000..a9195919b --- /dev/null +++ b/kmod/toa/example_nat64/Makefile @@ -0,0 +1,33 @@ +# +# DPVS is a software load balancer (Virtual Server) based on DPDK. +# +# Copyright (C) 2017 iQIYI (www.iqiyi.com). +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# + +# +# Makefile for example nat64 +# + +all: server client + +CFLAGS = -g -O0 + +server: server.c + gcc $(CFLAGS) -o server server.c + +client: client.c + gcc $(CFLAGS) -o client client.c + +clean: + rm -rf server client diff --git a/kmod/toa/example_nat64/client.c b/kmod/toa/example_nat64/client.c new file mode 100644 index 000000000..ab8bd24f9 --- /dev/null +++ b/kmod/toa/example_nat64/client.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXLINE 1024 +#define ADDR "127.0.0.1" +#define PORT 10004 +static char *sendbuf = "test"; + +int main(int argc,char **argv) +{ + char *servInetAddr = ADDR;//TODO + int socketfd; + struct sockaddr_in sockaddr; + int n; + socketfd = socket(AF_INET, SOCK_STREAM, 0); + memset(&sockaddr, 0, sizeof(sockaddr)); + sockaddr.sin_family = AF_INET; + sockaddr.sin_port = htons(PORT); + inet_pton(AF_INET, servInetAddr, &sockaddr.sin_addr); + if((connect(socketfd, (struct sockaddr*)&sockaddr, sizeof(sockaddr))) < 0 ) { + printf("connect error %s errno: %d\n", strerror(errno), errno); + exit(0); + } + printf("send message to server\n"); + if((send(socketfd, sendbuf, strlen(sendbuf), 0)) < 0) { + printf("send mes error: %s errno : %d", strerror(errno), errno); + exit(0); + } + close(socketfd); + printf("exit\n"); + exit(0); +} diff --git a/kmod/toa/example_nat64/nginx/README.md b/kmod/toa/example_nat64/nginx/README.md new file mode 100644 index 000000000..3e3190ee9 --- /dev/null +++ b/kmod/toa/example_nat64/nginx/README.md @@ -0,0 +1,32 @@ +This patch is for Nginx to get real client ip by 'toa_remote_addr' +when you are using NAT64 mode(VIP is IPv6 while RS is IPv4). +You can use this patch only when toa module is installed. + +Here is an exampe to configure http block in nginx.conf: + +``` +http { + include mime.types; + default_type application/octet-stream; + + log_format main '$toa_remote_addr $toa_remote_port $remote_addr - $remote_user [$time_local] "$request" ' + '$status $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + + access_log /data/nginx/logs/access.log main; + + keepalive_timeout 65; + + server { + listen 80; + server_name localhost; + + access_log /data/nginx/logs/access.log main; + + location / { + proxy_set_header X-Forwarded-For $toa_remote_addr; + proxy_pass http://192.168.1.1; + } + } +} +``` diff --git a/kmod/toa/example_nat64/nginx/nginx-1.14.0-nat64-toa.patch b/kmod/toa/example_nat64/nginx/nginx-1.14.0-nat64-toa.patch new file mode 100644 index 000000000..956db717a --- /dev/null +++ b/kmod/toa/example_nat64/nginx/nginx-1.14.0-nat64-toa.patch @@ -0,0 +1,164 @@ +diff -Nrup nginx-1.14.0/src/core/ngx_connection.h nginx-1.14.0_toa/src/core/ngx_connection.h +--- nginx-1.14.0/src/core/ngx_connection.h 2018-04-17 15:22:36.000000000 +0000 ++++ nginx-1.14.0_toa/src/core/ngx_connection.h 2018-11-22 07:02:37.632988544 +0000 +@@ -144,6 +144,8 @@ struct ngx_connection_s { + socklen_t socklen; + ngx_str_t addr_text; + ++ struct toa_nat64_peer *toaddr; ++ + ngx_str_t proxy_protocol_addr; + in_port_t proxy_protocol_port; + +diff -Nrup nginx-1.14.0/src/core/ngx_inet.h nginx-1.14.0_toa/src/core/ngx_inet.h +--- nginx-1.14.0/src/core/ngx_inet.h 2018-04-17 15:22:36.000000000 +0000 ++++ nginx-1.14.0_toa/src/core/ngx_inet.h 2018-11-22 06:13:19.869544046 +0000 +@@ -126,5 +126,19 @@ ngx_int_t ngx_cmp_sockaddr(struct sockad + in_port_t ngx_inet_get_port(struct sockaddr *sa); + void ngx_inet_set_port(struct sockaddr *sa, in_port_t port); + ++/* toa socket 
options, now only for nat64 */ ++enum { ++ TOA_BASE_CTL = 4096, ++ /* set */ ++ TOA_SO_SET_MAX = TOA_BASE_CTL, ++ /* get */ ++ TOA_SO_GET_LOOKUP = TOA_BASE_CTL, ++ TOA_SO_GET_MAX = TOA_SO_GET_LOOKUP, ++}; ++ ++struct toa_nat64_peer { ++ struct in6_addr saddr; ++ uint16_t sport; ++}; + + #endif /* _NGX_INET_H_INCLUDED_ */ +diff -Nrup nginx-1.14.0/src/event/ngx_event_accept.c nginx-1.14.0_toa/src/event/ngx_event_accept.c +--- nginx-1.14.0/src/event/ngx_event_accept.c 2018-04-17 15:22:36.000000000 +0000 ++++ nginx-1.14.0_toa/src/event/ngx_event_accept.c 2018-11-22 07:02:11.253981252 +0000 +@@ -22,7 +22,7 @@ static void ngx_debug_accepted_connectio + void + ngx_event_accept(ngx_event_t *ev) + { +- socklen_t socklen; ++ socklen_t socklen, len; + ngx_err_t err; + ngx_log_t *log; + ngx_uint_t level; +@@ -35,6 +35,7 @@ ngx_event_accept(ngx_event_t *ev) + #if (NGX_HAVE_ACCEPT4) + static ngx_uint_t use_accept4 = 1; + #endif ++ struct toa_nat64_peer uaddr; + + if (ev->timedout) { + if (ngx_enable_accept_events((ngx_cycle_t *) ngx_cycle) != NGX_OK) { +@@ -176,6 +177,21 @@ ngx_event_accept(ngx_event_t *ev) + + ngx_memcpy(c->sockaddr, &sa, socklen); + ++ /* get NAT64 remote addr/port */ ++ len = sizeof(struct toa_nat64_peer); ++ if (getsockopt(s, IPPROTO_IP, TOA_SO_GET_LOOKUP, &uaddr, &len) ++ == NGX_OK) { ++ c->toaddr = ngx_palloc(c->pool, len); ++ if (c->toaddr == NULL) { ++ ngx_close_accepted_connection(c); ++ return; ++ } ++ ++ ngx_memcpy(c->toaddr, &uaddr, len); ++ } else { ++ c->toaddr = NULL; ++ } ++ + log = ngx_palloc(c->pool, sizeof(ngx_log_t)); + if (log == NULL) { + ngx_close_accepted_connection(c); +diff -Nrup nginx-1.14.0/src/http/ngx_http_variables.c nginx-1.14.0_toa/src/http/ngx_http_variables.c +--- nginx-1.14.0/src/http/ngx_http_variables.c 2018-04-17 15:22:36.000000000 +0000 ++++ nginx-1.14.0_toa/src/http/ngx_http_variables.c 2018-12-04 06:43:16.014591001 +0000 +@@ -143,6 +143,10 @@ static ngx_int_t ngx_http_variable_time_ + static ngx_int_t ngx_http_variable_time_local(ngx_http_request_t *r, + ngx_http_variable_value_t *v, uintptr_t data); + ++static ngx_int_t ngx_http_variable_toa_remote_addr(ngx_http_request_t *r, ++ ngx_http_variable_value_t *v, uintptr_t data); ++static ngx_int_t ngx_http_variable_toa_remote_port(ngx_http_request_t *r, ++ ngx_http_variable_value_t *v, uintptr_t data); + /* + * TODO: + * Apache CGI: AUTH_TYPE, PATH_INFO (null), PATH_TRANSLATED +@@ -379,6 +383,10 @@ static ngx_http_variable_t ngx_http_cor + { ngx_string("arg_"), NULL, ngx_http_variable_argument, + 0, NGX_HTTP_VAR_NOCACHEABLE|NGX_HTTP_VAR_PREFIX, 0 }, + ++ { ngx_string("toa_remote_addr"), NULL, ngx_http_variable_toa_remote_addr, 0, 0, 0 }, ++ ++ { ngx_string("toa_remote_port"), NULL, ngx_http_variable_toa_remote_port, 0, 0, 0 }, ++ + ngx_http_null_variable + }; + +@@ -1314,6 +1322,61 @@ ngx_http_variable_remote_port(ngx_http_r + return NGX_OK; + } + ++static ngx_int_t ++ngx_http_variable_toa_remote_addr(ngx_http_request_t *r, ++ ngx_http_variable_value_t *v, uintptr_t data) ++{ ++ v->len = 0; ++ v->valid = 1; ++ v->no_cacheable = 0; ++ v->not_found = 0; ++ ++ if (r->connection->toaddr) { ++ v->data = ngx_pnalloc(r->pool, NGX_INET6_ADDRSTRLEN - 1); ++ if (v->data == NULL) { ++ return NGX_ERROR; ++ } ++ ++ inet_ntop(AF_INET6, &(r->connection->toaddr->saddr), (char *)v->data, NGX_INET6_ADDRSTRLEN); ++ v->len = ngx_strlen(v->data); ++ } else { ++ v->data = ngx_pnalloc(r->pool, 1); ++ ngx_memcpy(v->data, "-", 1); ++ v->len = 1; ++ } ++ ++ return NGX_OK; ++} ++ ++static ngx_int_t 
++ngx_http_variable_toa_remote_port(ngx_http_request_t *r, ++ ngx_http_variable_value_t *v, uintptr_t data) ++{ ++ ngx_uint_t port; ++ ++ v->len = 0; ++ v->valid = 1; ++ v->no_cacheable = 0; ++ v->not_found = 0; ++ ++ v->data = ngx_pnalloc(r->pool, sizeof("65535") - 1); ++ if (v->data == NULL) { ++ return NGX_ERROR; ++ } ++ ++ if (r->connection->toaddr) { ++ port = r->connection->toaddr->sport; ++ if (port > 0 && port < 65536) { ++ v->len = ngx_sprintf(v->data, "%ui", port) - v->data; ++ } ++ } else { ++ v->data = ngx_pnalloc(r->pool, 1); ++ ngx_memcpy(v->data, "-", 1); ++ v->len = 1; ++ } ++ ++ return NGX_OK; ++} + + static ngx_int_t + ngx_http_variable_proxy_protocol_addr(ngx_http_request_t *r, diff --git a/kmod/toa/example_nat64/server.c b/kmod/toa/example_nat64/server.c new file mode 100644 index 000000000..1c63230e6 --- /dev/null +++ b/kmod/toa/example_nat64/server.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "toa.h" +#define MAXLINE 1024 +#define PORT 10004 + +int main(int argc,char **argv) +{ + int listenfd,connfd; + struct sockaddr_in sockaddr, caddr; + char buff[MAXLINE]; + int n; + struct toa_nat64_peer uaddr; + int len = sizeof(struct toa_nat64_peer); + char from[40]; + int err; + + memset(&sockaddr,0,sizeof(sockaddr)); + memset(&caddr,0,sizeof(caddr)); + sockaddr.sin_family = AF_INET; + sockaddr.sin_addr.s_addr = htonl(INADDR_ANY); + sockaddr.sin_port = htons(PORT); + listenfd = socket(AF_INET,SOCK_STREAM,0); + if (err = bind(listenfd, (struct sockaddr *)&sockaddr, sizeof(sockaddr)) != 0) { + printf("bind error, code = %d\n", err); + exit(0); + } + if (listen(listenfd,1024) != 0) { + printf("listen error\n"); + exit(0); + } + printf("Please wait for the client information\n"); + + for(;;) { + socklen_t length = sizeof(caddr); + if((connfd = accept(listenfd, (struct sockaddr*)&caddr, &length))==-1) { + printf("accpet socket error: %s errno :%d\n", strerror(errno), errno); + continue; + } + if (err = recv(connfd, buff, MAXLINE, 0) == -1) { + printf("recv error\n"); + continue; + } + + if (getsockopt(connfd, IPPROTO_IP, TOA_SO_GET_LOOKUP, &uaddr, &len) == 0) { + inet_ntop(AF_INET6, &uaddr.saddr, from, sizeof(from)); + printf(" real client [%s]:%d\n", from, ntohs(uaddr.sport)); + } else { + printf("client is %s\n", inet_ntoa(caddr.sin_addr)); + } + + close(connfd); + } + + close(listenfd); +} diff --git a/kmod/toa/example_nat64/toa.h b/kmod/toa/example_nat64/toa.h new file mode 100644 index 000000000..ed8adf007 --- /dev/null +++ b/kmod/toa/example_nat64/toa.h @@ -0,0 +1,15 @@ +#include + /* toa socket options, now only for nat64 */ +enum { + TOA_BASE_CTL = 4096, + /* set */ + TOA_SO_SET_MAX = TOA_BASE_CTL, + /* get */ + TOA_SO_GET_LOOKUP = TOA_BASE_CTL, + TOA_SO_GET_MAX = TOA_SO_GET_LOOKUP, +}; + +struct toa_nat64_peer { + struct in6_addr saddr; + uint16_t sport; +}; diff --git a/kmod/toa/toa.c b/kmod/toa/toa.c index 63c0709ac..1040370f3 100644 --- a/kmod/toa/toa.c +++ b/kmod/toa/toa.c @@ -1,8 +1,9 @@ #include "toa.h" +#include /* - * TOA: Address is a new TCP Option - * Address include ip+port, Now support IPV4 and IPV6 + * TOA: Address is a new TCP Option + * Address include ip+port, Now support IPV4 and IPV6 */ unsigned long sk_data_ready_addr = 0; @@ -15,7 +16,7 @@ unsigned long sk_data_ready_addr = 0; ((unsigned char *)&addr)[2], \ ((unsigned char *)&addr)[3] -#ifdef TOA_IPV6_ENABLE +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) #define TOA_NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" 
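/*
 * A minimal userspace sketch of the NAT64 lookup interface added by the
 * example_nat64 files above: on a real server behind a NAT64 virtual
 * service (IPv6 VIP, IPv4 RS), the real IPv6 client is fetched from the
 * accepted IPv4 socket with a single getsockopt() call.  TOA_SO_GET_LOOKUP
 * and struct toa_nat64_peer are taken from kmod/toa/example_nat64/toa.h;
 * everything else here is illustrative only.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include "toa.h"

static void print_nat64_client(int connfd)
{
    struct toa_nat64_peer peer;
    socklen_t len = sizeof(peer);
    char addr[INET6_ADDRSTRLEN];

    /* Succeeds only when the toa module saved a NAT64 TOA option for this
     * connection; otherwise keep the peer address returned by accept(). */
    if (getsockopt(connfd, IPPROTO_IP, TOA_SO_GET_LOOKUP, &peer, &len) == 0) {
        inet_ntop(AF_INET6, &peer.saddr, addr, sizeof(addr));
        printf("real client [%s]:%u\n", addr, ntohs(peer.sport));
    }
}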
#define TOA_NIP6(addr) \ @@ -28,36 +29,250 @@ unsigned long sk_data_ready_addr = 0; ntohs((addr).s6_addr16[6]), \ ntohs((addr).s6_addr16[7]) +/* ipv6's toa list table array */ +#define TOA_IP6_TAB_BITS 12 +#define TOA_IP6_TAB_SIZE (1 << TOA_IP6_TAB_BITS) +#define TOA_IP6_TAB_MASK (TOA_IP6_TAB_SIZE - 1) + +struct toa_ip6_entry { + struct toa_ip6_data toa_data; + struct sock *sk; + + struct list_head list; +}; + +struct toa_ip6_list_head { + struct list_head toa_ip6_head; + spinlock_t lock; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +static struct toa_ip6_list_head +__toa_ip6_list_tab[TOA_IP6_TAB_SIZE] __cacheline_aligned; + +/* per-cpu lock for toa of ipv6 */ +struct toa_ip6_sk_lock { + /* lock for sk of ip6 toa */ + spinlock_t __percpu *lock; +}; + +static struct toa_ip6_sk_lock toa_ip6_sk_lock; +#endif + +#ifdef TOA_IPV6_ENABLE static struct proto_ops *inet6_stream_ops_p = NULL; static struct inet_connection_sock_af_ops *ipv6_specific_p = NULL; typedef struct sock *(*syn_recv_sock_func_pt)( - struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst); + struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); static syn_recv_sock_func_pt tcp_v6_syn_recv_sock_org_pt = NULL; #endif /* * Statistics of toa in proc /proc/net/toa_stats */ - struct toa_stats_entry toa_stats[] = { - TOA_STAT_ITEM("syn_recv_sock_toa", SYN_RECV_SOCK_TOA_CNT), - TOA_STAT_ITEM("syn_recv_sock_no_toa", SYN_RECV_SOCK_NO_TOA_CNT), - TOA_STAT_ITEM("getname_toa_ok", GETNAME_TOA_OK_CNT), - TOA_STAT_ITEM("getname_toa_mismatch", GETNAME_TOA_MISMATCH_CNT), - TOA_STAT_ITEM("getname_toa_bypass", GETNAME_TOA_BYPASS_CNT), - TOA_STAT_ITEM("getname_toa_empty", GETNAME_TOA_EMPTY_CNT), -#ifdef TOA_IPV6_ENABLE - TOA_STAT_ITEM("ip6_address_alloc", IP6_ADDR_ALLOC_CNT), - TOA_STAT_ITEM("ip6_address_free", IP6_ADDR_FREE_CNT), + TOA_STAT_ITEM("syn_recv_sock_toa", SYN_RECV_SOCK_TOA_CNT), + TOA_STAT_ITEM("syn_recv_sock_no_toa", SYN_RECV_SOCK_NO_TOA_CNT), + TOA_STAT_ITEM("getname_toa_ok", GETNAME_TOA_OK_CNT), + TOA_STAT_ITEM("getname_toa_mismatch", GETNAME_TOA_MISMATCH_CNT), + TOA_STAT_ITEM("getname_toa_bypass", GETNAME_TOA_BYPASS_CNT), + TOA_STAT_ITEM("getname_toa_empty", GETNAME_TOA_EMPTY_CNT), +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) + TOA_STAT_ITEM("ip6_address_alloc", IP6_ADDR_ALLOC_CNT), + TOA_STAT_ITEM("ip6_address_free", IP6_ADDR_FREE_CNT), #endif - TOA_STAT_END + TOA_STAT_END }; DEFINE_TOA_STAT(struct toa_stat_mib, ext_stats); +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,103) +/* more secured version of ipv6_addr_hash() */ +static inline u32 +__ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) +{ + u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; + + return jhash_3words(v, + (__force u32)a->s6_addr32[2], + (__force u32)a->s6_addr32[3], + initval); +} +#endif + +static void +toa_ip6_hash(struct toa_ip6_entry *ptr_ip6_entry) +{ + struct toa_ip6_data *ptr_toa_data = &ptr_ip6_entry->toa_data; + __u32 hash_key = + __ipv6_addr_jhash(&ptr_toa_data->in6_addr, ptr_toa_data->port) & TOA_IP6_TAB_MASK; + + spin_lock_bh(&__toa_ip6_list_tab[hash_key].lock); + + list_add(&ptr_ip6_entry->list, &__toa_ip6_list_tab[hash_key].toa_ip6_head); + + spin_unlock_bh(&__toa_ip6_list_tab[hash_key].lock); + + return; +} + +static void +toa_ip6_unhash(struct toa_ip6_entry *ptr_ip6_entry) +{ + struct toa_ip6_data *ptr_toa_data = &ptr_ip6_entry->toa_data; + __u32 hash_key = + 
__ipv6_addr_jhash(&ptr_toa_data->in6_addr, ptr_toa_data->port) & TOA_IP6_TAB_MASK; + + spin_lock_bh(&__toa_ip6_list_tab[hash_key].lock); + + list_del(&ptr_ip6_entry->list); + + spin_unlock_bh(&__toa_ip6_list_tab[hash_key].lock); +} + +static void +lock_all_toa_ip6_sk(void) +{ + int i; + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = per_cpu_ptr(toa_ip6_sk_lock.lock, i); + spin_lock_bh(lock); + } +} + +static void +unlock_all_toa_ip6_sk(void) +{ + int i; + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = per_cpu_ptr(toa_ip6_sk_lock.lock, i); + spin_unlock_bh(lock); + } +} + +static void +lock_cpu_toa_ip6_sk(void) +{ + spinlock_t *lock = this_cpu_ptr(toa_ip6_sk_lock.lock); + spin_lock_bh(lock); +} + +static void +unlock_cpu_toa_ip6_sk(void) +{ + spinlock_t *lock = this_cpu_ptr(toa_ip6_sk_lock.lock); + spin_unlock_bh(lock); +} + +static int +init_toa_ip6(void) +{ + int i; + + for_each_possible_cpu(i) { + spinlock_t *lock; + + lock = per_cpu_ptr(toa_ip6_sk_lock.lock, i); + spin_lock_init(lock); + } + + for (i = 0; i < TOA_IP6_TAB_SIZE; ++i) { + INIT_LIST_HEAD(&__toa_ip6_list_tab[i].toa_ip6_head); + spin_lock_init(&__toa_ip6_list_tab[i].lock); + } + + toa_ip6_sk_lock.lock = alloc_percpu(spinlock_t); + if (toa_ip6_sk_lock.lock == NULL) { + TOA_INFO("fail to alloc per cpu ip6's destruct lock\n"); + return -ENOMEM; + } + + return 0; +} + +static void +tcp_v6_sk_destruct_toa(struct sock *sk) { + + lock_cpu_toa_ip6_sk(); + + if (sk->sk_user_data) { + struct toa_ip6_entry* ptr_ip6_entry = sk->sk_user_data; + toa_ip6_unhash(ptr_ip6_entry); + sk->sk_destruct = inet_sock_destruct; + sk->sk_user_data = NULL; + kfree(ptr_ip6_entry); + TOA_INC_STATS(ext_stats, IP6_ADDR_FREE_CNT); + } + + inet_sock_destruct(sk); + + unlock_cpu_toa_ip6_sk(); +} + +static int +exit_toa_ip6(void) +{ + int i; + struct list_head *head; + struct toa_ip6_entry *ptr_ip6_entry; + struct sock *sk; + + lock_all_toa_ip6_sk(); + + for (i = 0; i < TOA_IP6_TAB_SIZE; ++i) { + + spin_lock_bh(&__toa_ip6_list_tab[i].lock); + + head = &__toa_ip6_list_tab[i].toa_ip6_head; + while (!list_empty(head)) { + ptr_ip6_entry = list_first_entry(head, struct toa_ip6_entry, list); + sk = ptr_ip6_entry->sk; + + if (sk && sk->sk_user_data && + (sk->sk_destruct == tcp_v6_sk_destruct_toa)) { + + sk->sk_destruct = inet_sock_destruct; + sk->sk_user_data = NULL; + + TOA_DBG("free ip6_entry in __toa_ip6_list_tab succ. " + "ptr_ip6_entry : %p, toa_ip6 : "TOA_NIP6_FMT", toa_port : %u\n", + ptr_ip6_entry, + TOA_NIP6(ptr_ip6_entry->toa_data.in6_addr), + ptr_ip6_entry->toa_data.port); + } else { + TOA_DBG("update sk of ip6_entry fail. " + "ptr_ip6_entry : %p\n", + ptr_ip6_entry); + } + + TOA_INC_STATS(ext_stats, IP6_ADDR_FREE_CNT); + + list_del(&ptr_ip6_entry->list); + kfree(ptr_ip6_entry); + } + + spin_unlock_bh(&__toa_ip6_list_tab[i].lock); + + } + + unlock_all_toa_ip6_sk(); + + synchronize_net(); + + free_percpu(toa_ip6_sk_lock.lock); + return 0; +} + +#endif + + /* * Funcs for toa hooks */ @@ -67,100 +282,112 @@ DEFINE_TOA_STAT(struct toa_stat_mib, ext_stats); * @return NULL if we don't get client ip/port; * value of toa_data in ret_ptr if we get client ip/port. 
*/ -static void *get_toa_data(int af, struct sk_buff *skb) +static void *get_toa_data(int af, struct sk_buff *skb, int *nat64) { - struct tcphdr *th; - int length; - unsigned char *ptr; - - TOA_DBG("get_toa_data called\n"); - - if (NULL != skb) { - th = tcp_hdr(skb); - length = (th->doff * 4) - sizeof(struct tcphdr); - ptr = (unsigned char *) (th + 1); - - while (length > 0) { - int opcode = *ptr++; - int opsize; - switch (opcode) { - case TCPOPT_EOL: - return NULL; - case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ - length--; - continue; - default: - opsize = *ptr++; - if (opsize < 2) /* "silly options" */ - return NULL; - if (opsize > length) - /* don't parse partial options */ - return NULL; - if (TCPOPT_TOA == opcode && - TCPOLEN_IP4_TOA == opsize) { - - struct toa_ip4_data tdata; - void *ret_ptr = NULL; - - memcpy(&tdata, ptr - 2, sizeof(tdata)); - TOA_DBG("af = %d, find toa data: ip = " - TOA_NIPQUAD_FMT", port = %u\n", - af, - TOA_NIPQUAD(tdata.ip), - ntohs(tdata.port)); - if (af == AF_INET) { - memcpy(&ret_ptr, &tdata, - sizeof(ret_ptr)); - TOA_DBG("coded ip4 toa data: %p\n", - ret_ptr); - return ret_ptr; - } + struct tcphdr *th; + int length; + unsigned char *ptr; + + TOA_DBG("get_toa_data called\n"); + + *nat64 = 0; + if (NULL != skb) { + th = tcp_hdr(skb); + length = (th->doff * 4) - sizeof(struct tcphdr); + ptr = (unsigned char *) (th + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + switch (opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return NULL; + if (opsize > length) + /* don't parse partial options */ + return NULL; + if (TCPOPT_TOA == opcode && + TCPOLEN_IP4_TOA == opsize) { + + struct toa_ip4_data tdata; + void *ret_ptr = NULL; + + memcpy(&tdata, ptr - 2, sizeof(tdata)); + TOA_DBG("af = %d, find toa data: ip = " + TOA_NIPQUAD_FMT", port = %u\n", + af, + TOA_NIPQUAD(tdata.ip), + ntohs(tdata.port)); + if (af == AF_INET) { + memcpy(&ret_ptr, &tdata, + sizeof(ret_ptr)); + TOA_DBG("coded ip4 toa data: %p\n", + ret_ptr); + return ret_ptr; + } #ifdef TOA_IPV6_ENABLE - else if (af == AF_INET6) { - struct toa_ip6_data *ptr_toa_ip6 = - kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC); - if (!ptr_toa_ip6) { - return NULL; - } - ptr_toa_ip6->opcode = opcode; - ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA; - ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0, - htonl(0x0000FFFF), tdata.ip); - TOA_DBG("coded ip6 toa data: %p\n", - ptr_toa_ip6); - TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); - return ptr_toa_ip6; - } + else if (af == AF_INET6) { + struct toa_ip6_data *ptr_toa_ip6; + struct toa_ip6_entry *ptr_toa_entry = + kzalloc(sizeof(struct toa_ip6_entry), GFP_ATOMIC); + if (!ptr_toa_entry) { + return NULL; + } + + ptr_toa_ip6 = &ptr_toa_entry->toa_data; + ptr_toa_ip6->opcode = opcode; + ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA; + ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0, + htonl(0x0000FFFF), tdata.ip); + ptr_toa_ip6->port = tdata.port; + TOA_DBG("coded ip6 toa data: %p\n", + ptr_toa_ip6); + TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); + return ptr_toa_entry; + } #endif - } - -#ifdef TOA_IPV6_ENABLE - if (TCPOPT_TOA == opcode && - TCPOLEN_IP6_TOA == opsize && - af == AF_INET6) { - struct toa_ip6_data *ptr_toa_ip6 = - kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC); - if (!ptr_toa_ip6) { - return NULL; - } - memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data)); - - TOA_DBG("find toa_v6 data : ip = " - TOA_NIP6_FMT", port = %u," - 
" coded ip6 toa data: %p\n", - TOA_NIP6(ptr_toa_ip6->in6_addr), - ptr_toa_ip6->port, - ptr_toa_ip6); - TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); - return ptr_toa_ip6; - } + } + +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) + if (TCPOPT_TOA == opcode && + TCPOLEN_IP6_TOA == opsize) { + struct toa_ip6_data *ptr_toa_ip6; + struct toa_ip6_entry *ptr_toa_entry = + kzalloc(sizeof(struct toa_ip6_entry), GFP_ATOMIC); + if (!ptr_toa_entry) { + return NULL; + } + + ptr_toa_ip6 = &ptr_toa_entry->toa_data; + memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data)); + + TOA_DBG("find toa_v6 data : ip = " + TOA_NIP6_FMT", port = %u," + " coded ip6 toa data: %p\n", + TOA_NIP6(ptr_toa_ip6->in6_addr), + ptr_toa_ip6->port, + ptr_toa_ip6); + TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT); + if (af == AF_INET6) + *nat64 = 0; + else + *nat64 = 1; + + return ptr_toa_entry; + } #endif - ptr += opsize - 2; - length -= opsize; - } - } - } - return NULL; + ptr += opsize - 2; + length -= opsize; + } + } + } + return NULL; } /* get client ip from socket @@ -173,100 +400,211 @@ static void *get_toa_data(int af, struct sk_buff *skb) */ static int inet_getname_toa(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int *uaddr_len, int peer) { - int retval = 0; - struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct toa_ip4_data tdata; - - TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n", - sk->sk_user_data); - - /* call orginal one */ - retval = inet_getname(sock, uaddr, uaddr_len, peer); - - /* set our value if need */ - if (retval == 0 && NULL != sk->sk_user_data && peer) { - if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { - memcpy(&tdata, &sk->sk_user_data, sizeof(tdata)); - if (TCPOPT_TOA == tdata.opcode && - TCPOLEN_IP4_TOA == tdata.opsize) { - TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); - TOA_DBG("inet_getname_toa: set new sockaddr, ip " - TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT - ", port %u -> %u\n", - TOA_NIPQUAD(sin->sin_addr.s_addr), - TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port), - ntohs(tdata.port)); - sin->sin_port = tdata.port; - sin->sin_addr.s_addr = tdata.ip; - } else { /* sk_user_data doesn't belong to us */ - TOA_INC_STATS(ext_stats, - GETNAME_TOA_MISMATCH_CNT); - TOA_DBG("inet_getname_toa: invalid toa data, " - "ip "TOA_NIPQUAD_FMT" port %u opcode %u " - "opsize %u\n", - TOA_NIPQUAD(tdata.ip), ntohs(tdata.port), - tdata.opcode, tdata.opsize); - } - } else { - TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); - } - } else { /* no need to get client ip */ - TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); - } - - return retval; + int retval = 0; + struct sock *sk = sock->sk; + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct toa_ip4_data tdata; + + TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n", + sk->sk_user_data); + + /* call orginal one */ + retval = inet_getname(sock, uaddr, uaddr_len, peer); + + /* set our value if need */ + if (retval == 0 && NULL != sk->sk_user_data && peer) { + if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready && + !sock_flag(sk, SOCK_NAT64)) { + memcpy(&tdata, &sk->sk_user_data, sizeof(tdata)); + if (TCPOPT_TOA == tdata.opcode && + TCPOLEN_IP4_TOA == tdata.opsize) { + TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); + TOA_DBG("inet_getname_toa: set new sockaddr, ip " + TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT + ", port %u -> %u\n", + TOA_NIPQUAD(sin->sin_addr.s_addr), + TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port), + ntohs(tdata.port)); + 
sin->sin_port = tdata.port; + sin->sin_addr.s_addr = tdata.ip; + } else { /* sk_user_data doesn't belong to us */ + TOA_INC_STATS(ext_stats, + GETNAME_TOA_MISMATCH_CNT); + TOA_DBG("inet_getname_toa: invalid toa data, " + "ip "TOA_NIPQUAD_FMT" port %u opcode %u " + "opsize %u\n", + TOA_NIPQUAD(tdata.ip), ntohs(tdata.port), + tdata.opcode, tdata.opsize); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); + } + } else { /* no need to get client ip */ + TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); + } + + return retval; } +/* NAT64 get client ip from socket + * Client ip is v6 and socket is v4 + * Find toa and copy_to_user + * This function will not return inet_getname, + * so users can get distinctions from normal v4 + * + * Notice: + * In fact, we can just use original api inet_getname_toa by uaddr_len judge. + * We didn't do this because RS developers may be confused about this api. + */ +#ifdef TOA_NAT64_ENABLE +static int +inet64_getname_toa(struct sock *sk, int cmd, void __user *user, int *len) +{ + struct inet_sock *inet; + struct toa_nat64_peer uaddr; + int ret; + + if (cmd != TOA_SO_GET_LOOKUP || !sk) { + TOA_INFO("%s: bad cmd\n", __func__); + return -EINVAL; + } + + if (*len < sizeof(struct toa_nat64_peer) || + NULL == user) { + TOA_INFO("%s: bad param len\n", __func__); + return -EINVAL; + } + + inet = inet_sk(sk); + /* refered to inet_getname */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + if (!inet->inet_dport || +#else + if (!inet->dport || +#endif + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) + return -ENOTCONN; + + ret = -EINVAL; + + lock_cpu_toa_ip6_sk(); + + if (NULL != sk->sk_user_data) { + struct toa_ip6_entry *ptr_ip6_entry; + struct toa_ip6_data *ptr_ip6_data; + + if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { + + if (!sock_flag(sk, SOCK_NAT64)) { + ret = -EFAULT; + goto out; + } + + ptr_ip6_entry = sk->sk_user_data; + ptr_ip6_data = &ptr_ip6_entry->toa_data; + + if (TCPOPT_TOA == ptr_ip6_data->opcode && + TCPOLEN_IP6_TOA == ptr_ip6_data->opsize) { + TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); + TOA_DBG("inet64_getname_toa: set new sockaddr, ip " + TOA_NIPQUAD_FMT" -> "TOA_NIP6_FMT + ", port %u -> %u\n", +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + TOA_NIPQUAD(inet->inet_saddr), +#else + TOA_NIPQUAD(inet->saddr), +#endif + TOA_NIP6(ptr_ip6_data->in6_addr), +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,33) + ntohs(inet->inet_sport), +#else + ntohs(inet->sport), +#endif + ntohs(ptr_ip6_data->port)); + uaddr.saddr = ptr_ip6_data->in6_addr; + uaddr.port = ptr_ip6_data->port; + + if (copy_to_user(user, &uaddr, + sizeof(struct toa_nat64_peer)) != 0) { + ret = -EFAULT; + goto out; + } + + *len = sizeof(struct toa_nat64_peer); + ret = 0; + goto out; + } else { + TOA_INC_STATS(ext_stats, + GETNAME_TOA_MISMATCH_CNT); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); + } + +out: + unlock_cpu_toa_ip6_sk(); + return ret; +} +#endif + #ifdef TOA_IPV6_ENABLE static int inet6_getname_toa(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int *uaddr_len, int peer) { - int retval = 0; - struct sock *sk = sock->sk; - struct sockaddr_in6 *sin = (struct sockaddr_in6 *) uaddr; - struct toa_ip6_data* t_ip6_data_ptr; - - TOA_DBG("inet6_getname_toa called, sk->sk_user_data is %p\n", - sk->sk_user_data); - - /* call orginal one */ - retval = inet6_getname(sock, uaddr, uaddr_len, peer); - - /* set our value if need */ - if (retval 
== 0 && NULL != sk->sk_user_data && peer) { - if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { - t_ip6_data_ptr = sk->sk_user_data; - if (TCPOPT_TOA == t_ip6_data_ptr->opcode && - TCPOLEN_IP6_TOA == t_ip6_data_ptr->opsize) { - TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); - TOA_DBG("inet6_getname_toa: set new sockaddr, ip " - TOA_NIP6_FMT" -> "TOA_NIP6_FMT - ", port %u -> %u\n", - TOA_NIP6(sin->sin6_addr), - TOA_NIP6(t_ip6_data_ptr->in6_addr), - ntohs(sin->sin6_port), - ntohs(t_ip6_data_ptr->port)); - sin->sin6_port = t_ip6_data_ptr->port; - sin->sin6_addr = t_ip6_data_ptr->in6_addr; - } else { /* sk_user_data doesn't belong to us */ - TOA_INC_STATS(ext_stats, - GETNAME_TOA_MISMATCH_CNT); - } - } else { - TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); - } - } else { /* no need to get client ip */ - TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); - } - - return retval; + int retval = 0; + struct sock *sk = sock->sk; + struct sockaddr_in6 *sin = (struct sockaddr_in6 *) uaddr; + + TOA_DBG("inet6_getname_toa called, sk->sk_user_data is %p\n", + sk->sk_user_data); + + /* call orginal one */ + retval = inet6_getname(sock, uaddr, uaddr_len, peer); + + /* set our value if need */ + lock_cpu_toa_ip6_sk(); + + if (retval == 0 && NULL != sk->sk_user_data && peer) { + if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) { + struct toa_ip6_entry* ptr_ip6_entry = sk->sk_user_data; + struct toa_ip6_data* ptr_ip6_data = &ptr_ip6_entry->toa_data; + + if (sk == ptr_ip6_entry->sk && + TCPOPT_TOA == ptr_ip6_data->opcode && + TCPOLEN_IP6_TOA == ptr_ip6_data->opsize) { + TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT); + TOA_DBG("inet6_getname_toa: set new sockaddr, ip " + TOA_NIP6_FMT" -> "TOA_NIP6_FMT + ", port %u -> %u\n", + TOA_NIP6(sin->sin6_addr), + TOA_NIP6(ptr_ip6_data->in6_addr), + ntohs(sin->sin6_port), + ntohs(ptr_ip6_data->port)); + sin->sin6_port = ptr_ip6_data->port; + sin->sin6_addr = ptr_ip6_data->in6_addr; + } else { /* sk_user_data doesn't belong to us */ + TOA_INC_STATS(ext_stats, + GETNAME_TOA_MISMATCH_CNT); + } + } else { + TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT); + } + } else { /* no need to get client ip */ + TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT); + } + + unlock_cpu_toa_ip6_sk(); + + return retval; } -static inline int +static inline int get_kernel_ipv6_symbol(void) { inet6_stream_ops_p = @@ -274,24 +612,24 @@ get_kernel_ipv6_symbol(void) if (inet6_stream_ops_p == NULL) { TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol inet6_stream_ops\n", smp_processor_id()); - - return -1; - } + + return -1; + } ipv6_specific_p = (struct inet_connection_sock_af_ops *)kallsyms_lookup_name("ipv6_specific"); if (ipv6_specific_p == NULL) { TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol ipv6_specific\n", smp_processor_id()); - return -1; - } + return -1; + } tcp_v6_syn_recv_sock_org_pt = (syn_recv_sock_func_pt)kallsyms_lookup_name("tcp_v6_syn_recv_sock"); if (tcp_v6_syn_recv_sock_org_pt == NULL) { TOA_INFO("CPU [%u] kallsyms_lookup_name cannot find symbol tcp_v6_syn_recv_sock\n", smp_processor_id()); - return -1; - } - return 0; + return -1; + } + return 0; } #endif @@ -306,70 +644,78 @@ get_kernel_ipv6_symbol(void) */ static struct sock * tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, struct dst_entry *dst) + struct request_sock *req, struct dst_entry *dst) { - struct sock *newsock = NULL; - - TOA_DBG("tcp_v4_syn_recv_sock_toa called\n"); - - /* call orginal one */ - newsock = 
tcp_v4_syn_recv_sock(sk, skb, req, dst); - - /* set our value if need */ - if (NULL != newsock && NULL == newsock->sk_user_data) { - newsock->sk_user_data = get_toa_data(AF_INET, skb); - if (NULL != newsock->sk_user_data) - TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); - else - TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); - - TOA_DBG("tcp_v4_syn_recv_sock_toa: set " - "sk->sk_user_data to %p\n", - newsock->sk_user_data); - } - return newsock; + struct sock *newsock = NULL; + int nat64 = 0; + + TOA_DBG("tcp_v4_syn_recv_sock_toa called\n"); + + /* call orginal one */ + newsock = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + /* set our value if need */ + if (NULL != newsock && NULL == newsock->sk_user_data) { + newsock->sk_user_data = get_toa_data(AF_INET, skb, &nat64); + sock_reset_flag(newsock, SOCK_NAT64); + if (NULL != newsock->sk_user_data) { + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); +#ifdef TOA_NAT64_ENABLE + if (nat64) { + struct toa_ip6_entry *ptr_ip6_entry = newsock->sk_user_data; + ptr_ip6_entry->sk = newsock; + toa_ip6_hash(ptr_ip6_entry); + + newsock->sk_destruct = tcp_v6_sk_destruct_toa; + + sock_set_flag(newsock, SOCK_NAT64); + } +#endif + } + else + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); + + TOA_DBG("tcp_v4_syn_recv_sock_toa: set " + "sk->sk_user_data to %p\n", + newsock->sk_user_data); + } + return newsock; } #ifdef TOA_IPV6_ENABLE -static void -tcp_v6_sk_destruct_toa(struct sock *sk) { - if (sk->sk_user_data) { - kfree(sk->sk_user_data); - sk->sk_user_data = NULL; - TOA_INC_STATS(ext_stats, IP6_ADDR_FREE_CNT); - } - inet_sock_destruct(sk); -} - static struct sock * tcp_v6_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, struct dst_entry *dst) + struct request_sock *req, struct dst_entry *dst) { - struct sock *newsock = NULL; - - TOA_DBG("tcp_v6_syn_recv_sock_toa called\n"); - - /* call orginal one */ - newsock = tcp_v6_syn_recv_sock_org_pt(sk, skb, req, dst); - - /* set our value if need */ - if (NULL != newsock && NULL == newsock->sk_user_data) { - newsock->sk_user_data = get_toa_data(AF_INET6, skb); - if (NULL != newsock->sk_user_data) { - newsock->sk_destruct = tcp_v6_sk_destruct_toa; - TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); - } else { - TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); - } - - TOA_DBG("tcp_v6_syn_recv_sock_toa: set " - "sk->sk_user_data to %p\n", - newsock->sk_user_data); - } - return newsock; + struct sock *newsock = NULL; + int nat64 = 0; + + TOA_DBG("tcp_v6_syn_recv_sock_toa called\n"); + + /* call orginal one */ + newsock = tcp_v6_syn_recv_sock_org_pt(sk, skb, req, dst); + + /* set our value if need */ + if (NULL != newsock && NULL == newsock->sk_user_data) { + newsock->sk_user_data = get_toa_data(AF_INET6, skb, &nat64); + sock_reset_flag(newsock, SOCK_NAT64); + if (NULL != newsock->sk_user_data) { + struct toa_ip6_entry *ptr_ip6_entry = newsock->sk_user_data; + ptr_ip6_entry->sk = newsock; + toa_ip6_hash(ptr_ip6_entry); + + newsock->sk_destruct = tcp_v6_sk_destruct_toa; + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT); + } else { + TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT); + } + + TOA_DBG("tcp_v6_syn_recv_sock_toa: set " + "sk->sk_user_data to %p\n", + newsock->sk_user_data); + } + return newsock; } - - #endif /* @@ -380,69 +726,69 @@ tcp_v6_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb, static inline int hook_toa_functions(void) { - /* hook inet_getname for ipv4 */ - struct proto_ops *inet_stream_ops_p = - (struct proto_ops *)&inet_stream_ops; - /* hook 
tcp_v4_syn_recv_sock for ipv4 */ - struct inet_connection_sock_af_ops *ipv4_specific_p = - (struct inet_connection_sock_af_ops *)&ipv4_specific; - - inet_stream_ops_p->getname = inet_getname_toa; - TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n", - smp_processor_id(), inet_getname, inet_stream_ops_p->getname); - - ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa; - TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n", - smp_processor_id(), tcp_v4_syn_recv_sock, - ipv4_specific_p->syn_recv_sock); + /* hook inet_getname for ipv4 */ + struct proto_ops *inet_stream_ops_p = + (struct proto_ops *)&inet_stream_ops; + /* hook tcp_v4_syn_recv_sock for ipv4 */ + struct inet_connection_sock_af_ops *ipv4_specific_p = + (struct inet_connection_sock_af_ops *)&ipv4_specific; + + inet_stream_ops_p->getname = inet_getname_toa; + TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n", + smp_processor_id(), inet_getname, inet_stream_ops_p->getname); + + ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa; + TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n", + smp_processor_id(), tcp_v4_syn_recv_sock, + ipv4_specific_p->syn_recv_sock); #ifdef TOA_IPV6_ENABLE - inet6_stream_ops_p->getname = inet6_getname_toa; - TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n", - smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname); - - ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa; - TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n", - smp_processor_id(), tcp_v6_syn_recv_sock_org_pt, - ipv6_specific_p->syn_recv_sock); + inet6_stream_ops_p->getname = inet6_getname_toa; + TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n", + smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname); + + ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa; + TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n", + smp_processor_id(), tcp_v6_syn_recv_sock_org_pt, + ipv6_specific_p->syn_recv_sock); #endif - return 0; + return 0; } /* replace the functions to original ones */ static int unhook_toa_functions(void) { - /* unhook inet_getname for ipv4 */ - struct proto_ops *inet_stream_ops_p = - (struct proto_ops *)&inet_stream_ops; - /* unhook tcp_v4_syn_recv_sock for ipv4 */ - struct inet_connection_sock_af_ops *ipv4_specific_p = - (struct inet_connection_sock_af_ops *)&ipv4_specific; + /* unhook inet_getname for ipv4 */ + struct proto_ops *inet_stream_ops_p = + (struct proto_ops *)&inet_stream_ops; + /* unhook tcp_v4_syn_recv_sock for ipv4 */ + struct inet_connection_sock_af_ops *ipv4_specific_p = + (struct inet_connection_sock_af_ops *)&ipv4_specific; - inet_stream_ops_p->getname = inet_getname; - TOA_INFO("CPU [%u] unhooked inet_getname\n", - smp_processor_id()); + inet_stream_ops_p->getname = inet_getname; + TOA_INFO("CPU [%u] unhooked inet_getname\n", + smp_processor_id()); - ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock; - TOA_INFO("CPU [%u] unhooked tcp_v4_syn_recv_sock\n", - smp_processor_id()); + ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock; + TOA_INFO("CPU [%u] unhooked tcp_v4_syn_recv_sock\n", + smp_processor_id()); #ifdef TOA_IPV6_ENABLE - if (inet6_stream_ops_p) { - inet6_stream_ops_p->getname = inet6_getname; - TOA_INFO("CPU [%u] unhooked inet6_getname\n", - smp_processor_id()); - } - if (ipv6_specific_p) { - ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_org_pt; - TOA_INFO("CPU [%u] unhooked tcp_v6_syn_recv_sock\n", - smp_processor_id()); - } + if (inet6_stream_ops_p) { + 
inet6_stream_ops_p->getname = inet6_getname; + TOA_INFO("CPU [%u] unhooked inet6_getname\n", + smp_processor_id()); + } + if (ipv6_specific_p) { + ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_org_pt; + TOA_INFO("CPU [%u] unhooked tcp_v6_syn_recv_sock\n", + smp_processor_id()); + } #endif - return 0; + return 0; } /* @@ -450,124 +796,158 @@ unhook_toa_functions(void) */ static int toa_stats_show(struct seq_file *seq, void *v) { - int i, j, cpu_nr; - - /* print CPU first */ - seq_printf(seq, " "); - cpu_nr = num_possible_cpus(); - for (i = 0; i < cpu_nr; i++) - if (cpu_online(i)) - seq_printf(seq, "CPU%d ", i); - seq_putc(seq, '\n'); - - i = 0; - while (NULL != toa_stats[i].name) { - seq_printf(seq, "%-25s:", toa_stats[i].name); - for (j = 0; j < cpu_nr; j++) { - if (cpu_online(j)) { - seq_printf(seq, "%10lu ", *( - ((unsigned long *) per_cpu_ptr( - ext_stats, j)) + toa_stats[i].entry - )); - } - } - seq_putc(seq, '\n'); - i++; - } - return 0; + int i, j, cpu_nr; + + /* print CPU first */ + seq_printf(seq, " "); + cpu_nr = num_possible_cpus(); + for (i = 0; i < cpu_nr; i++) + if (cpu_online(i)) + seq_printf(seq, "CPU%d ", i); + seq_putc(seq, '\n'); + + i = 0; + while (NULL != toa_stats[i].name) { + seq_printf(seq, "%-25s:", toa_stats[i].name); + for (j = 0; j < cpu_nr; j++) { + if (cpu_online(j)) { + seq_printf(seq, "%10lu ", *( + ((unsigned long *) per_cpu_ptr( + ext_stats, j)) + toa_stats[i].entry + )); + } + } + seq_putc(seq, '\n'); + i++; + } + return 0; } static int toa_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, toa_stats_show, NULL); + return single_open(file, toa_stats_show, NULL); } static const struct file_operations toa_stats_fops = { - .owner = THIS_MODULE, - .open = toa_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .owner = THIS_MODULE, + .open = toa_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef TOA_NAT64_ENABLE +static struct nf_sockopt_ops toa_sockopts = { + .pf = PF_INET, + .owner = THIS_MODULE, + /* Nothing to do in set */ + /* get */ + .get_optmin = TOA_BASE_CTL, + .get_optmax = TOA_SO_GET_MAX+1, + .get = inet64_getname_toa, }; +#endif /* * TOA module init and destory */ -#if LINUX_VERSION_CODE >=KERNEL_VERSION(3,9,0) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) static struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, mode_t mode, const struct file_operations *fops) + const char *name, mode_t mode, const struct file_operations *fops) { - return proc_create(name, mode, net->proc_net, fops); + return proc_create(name, mode, net->proc_net, fops); } static void proc_net_remove(struct net *net, const char *name) { - remove_proc_entry(name, net->proc_net); + remove_proc_entry(name, net->proc_net); } #endif - /* module init */ static int __init toa_init(void) { - TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n"); - - /* alloc statistics array for toa */ - ext_stats = alloc_percpu(struct toa_stat_mib); - if (NULL == ext_stats) - return 1; - proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops); - - /* get the address of function sock_def_readable - * so later we can know whether the sock is for rpc, tux or others - */ - sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable"); - TOA_INFO("CPU [%u] sk_data_ready_addr = " - "kallsyms_lookup_name(sock_def_readable) = %lu\n", - smp_processor_id(), sk_data_ready_addr); - if (0 == sk_data_ready_addr) { - TOA_INFO("cannot find 
sock_def_readable.\n"); - goto err; - } + TOA_INFO("TOA " TOA_VERSION " by qlb of iqiyi.\n"); + + /* alloc statistics array for toa */ + ext_stats = alloc_percpu(struct toa_stat_mib); + if (NULL == ext_stats) + return 1; + proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops); + + /* get the address of function sock_def_readable + * so later we can know whether the sock is for rpc, tux or others + */ + sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable"); + TOA_INFO("CPU [%u] sk_data_ready_addr = " + "kallsyms_lookup_name(sock_def_readable) = %lu\n", + smp_processor_id(), sk_data_ready_addr); + if (0 == sk_data_ready_addr) { + TOA_INFO("cannot find sock_def_readable.\n"); + goto err; + } + +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) + if (0 != init_toa_ip6()) { + TOA_INFO("init toa ip6 fail.\n"); + goto err; + } +#endif #ifdef TOA_IPV6_ENABLE - if (0 != get_kernel_ipv6_symbol()) { - TOA_INFO("get ipv6 struct from kernel fail.\n"); - goto err; - } + if (0 != get_kernel_ipv6_symbol()) { + TOA_INFO("get ipv6 struct from kernel fail.\n"); + goto err; + } #endif - - /* hook funcs for parse and get toa */ - hook_toa_functions(); - TOA_INFO("toa loaded\n"); - return 0; +#ifdef TOA_NAT64_ENABLE + if (0 != nf_register_sockopt(&toa_sockopts)) { + TOA_INFO("fail to register sockopt\n"); + goto err; + } +#endif + + /* hook funcs for parse and get toa */ + hook_toa_functions(); + + TOA_INFO("toa loaded\n"); + return 0; err: - proc_net_remove(&init_net, "toa_stats"); - if (NULL != ext_stats) { - free_percpu(ext_stats); - ext_stats = NULL; - } + proc_net_remove(&init_net, "toa_stats"); + if (NULL != ext_stats) { + free_percpu(ext_stats); + ext_stats = NULL; + } - return 1; + return 1; } /* module cleanup*/ static void __exit toa_exit(void) { - unhook_toa_functions(); - synchronize_net(); - - proc_net_remove(&init_net, "toa_stats"); - if (NULL != ext_stats) { - free_percpu(ext_stats); - ext_stats = NULL; - } - TOA_INFO("toa unloaded\n"); + unhook_toa_functions(); +#ifdef TOA_NAT64_ENABLE + nf_unregister_sockopt(&toa_sockopts); +#endif + synchronize_net(); + +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) + if (0 != exit_toa_ip6()) { + TOA_INFO("exit toa ip6 fail.\n"); + } +#endif + + proc_net_remove(&init_net, "toa_stats"); + if (NULL != ext_stats) { + free_percpu(ext_stats); + ext_stats = NULL; + } + TOA_INFO("toa unloaded\n"); } module_init(toa_init); diff --git a/kmod/toa/toa.h b/kmod/toa/toa.h index ebda933e0..7f05abe74 100644 --- a/kmod/toa/toa.h +++ b/kmod/toa/toa.h @@ -19,87 +19,112 @@ #include #include #include +#include -#define TOA_VERSION "1.0.0.0" - -//#define TOA_DEBUG_ENABLE -//#define TOA_IPV6_ENABLE +#define TOA_VERSION "2.0.0.0" #ifdef TOA_DEBUG_ENABLE -#define TOA_DBG(msg...) \ - do { \ - printk(KERN_DEBUG "[DEBUG] TOA: " msg); \ - } while (0) +#define TOA_DBG(msg...) \ + do { \ + printk(KERN_DEBUG "[DEBUG] TOA: " msg); \ + } while (0) #else #define TOA_DBG(msg...) #endif -#define TOA_INFO(msg...) \ - do { \ - if (net_ratelimit()) \ - printk(KERN_INFO "TOA: " msg); \ - } while (0) +#define TOA_INFO(msg...) \ + do { \ + if (net_ratelimit()) \ + printk(KERN_INFO "TOA: " msg); \ + } while (0) #define TCPOPT_TOA 254 /* MUST be 4n !!!! 
*/ -#define TCPOLEN_IP4_TOA 8 /* |opcode|size|ip+port| = 1 + 1 + 6 */ -#define TCPOLEN_IP6_TOA 20 /* |opcode|size|ip_of_v6+port| = 1 + 1 + 18 */ +#define TCPOLEN_IP4_TOA 8 /* |opcode|size|ip+port| = 1 + 1 + 6 */ +#define TCPOLEN_IP6_TOA 20 /* |opcode|size|ip_of_v6+port| = 1 + 1 + 18 */ /* MUST be 4 bytes alignment */ struct toa_ip4_data { - __u8 opcode; - __u8 opsize; - __u16 port; - __u32 ip; + __u8 opcode; + __u8 opsize; + __u16 port; + __u32 ip; +}; + +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) +struct toa_ip6_data { + __u8 opcode; + __u8 opsize; + __u16 port; + struct in6_addr in6_addr; }; +#endif -struct toa_ip6_data{ - __u8 opcode; - __u8 opsize; - __u16 port; - struct in6_addr in6_addr; +#ifdef TOA_NAT64_ENABLE +struct toa_nat64_peer { + struct in6_addr saddr; + __u16 port; }; +/* toa socket options, now only for nat64 */ +enum { + TOA_BASE_CTL = 4096, + /* set */ + TOA_SO_SET_MAX = TOA_BASE_CTL, + /* get */ + TOA_SO_GET_LOOKUP = TOA_BASE_CTL, + TOA_SO_GET_MAX = TOA_SO_GET_LOOKUP, +}; +#endif + +/*should be larger than enum sock_flags(net/sock.h)*/ +enum toa_sock_flags { +#if defined(__x86_64__) + SOCK_NAT64 = 63 +#else + SOCK_NAT64 = 31 +#endif +}; /* statistics about toa in proc /proc/net/toa_stat */ enum { - SYN_RECV_SOCK_TOA_CNT = 1, - SYN_RECV_SOCK_NO_TOA_CNT, - GETNAME_TOA_OK_CNT, - GETNAME_TOA_MISMATCH_CNT, - GETNAME_TOA_BYPASS_CNT, - GETNAME_TOA_EMPTY_CNT, -#ifdef TOA_IPV6_ENABLE - IP6_ADDR_ALLOC_CNT, - IP6_ADDR_FREE_CNT, + SYN_RECV_SOCK_TOA_CNT = 1, + SYN_RECV_SOCK_NO_TOA_CNT, + GETNAME_TOA_OK_CNT, + GETNAME_TOA_MISMATCH_CNT, + GETNAME_TOA_BYPASS_CNT, + GETNAME_TOA_EMPTY_CNT, +#if (defined(TOA_IPV6_ENABLE) || defined(TOA_NAT64_ENABLE)) + IP6_ADDR_ALLOC_CNT, + IP6_ADDR_FREE_CNT, #endif - TOA_STAT_LAST + TOA_STAT_LAST }; struct toa_stats_entry { - char *name; - int entry; + char *name; + int entry; }; #define TOA_STAT_ITEM(_name, _entry) { \ - .name = _name, \ - .entry = _entry, \ + .name = _name, \ + .entry = _entry, \ } -#define TOA_STAT_END { \ - NULL, \ - 0, \ +#define TOA_STAT_END { \ + NULL, \ + 0, \ } struct toa_stat_mib { - unsigned long mibs[TOA_STAT_LAST]; + unsigned long mibs[TOA_STAT_LAST]; }; #define DEFINE_TOA_STAT(type, name) \ - __typeof__(type) *name + __typeof__(type) *name #define TOA_INC_STATS(mib, field) \ - (per_cpu_ptr(mib, smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib, smp_processor_id())->mibs[field]++) #endif diff --git a/kmod/uoa/Makefile b/kmod/uoa/Makefile index 7da8bf1eb..5c5052948 100644 --- a/kmod/uoa/Makefile +++ b/kmod/uoa/Makefile @@ -7,7 +7,7 @@ KDIR := $(KERNDIR) endif PWD := $(shell pwd) -ccflags-y := -I$(src)/../include +ccflags-y := -I$(src)/../../include ifeq ($(DEBUG), 1) ccflags-y += -g -O0 diff --git a/kmod/uoa/example/Makefile b/kmod/uoa/example/Makefile new file mode 100644 index 000000000..bc6924acf --- /dev/null +++ b/kmod/uoa/example/Makefile @@ -0,0 +1,37 @@ +# +# DPVS is a software load balancer (Virtual Server) based on DPDK. +# +# Copyright (C) 2017 iQIYI (www.iqiyi.com). +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# + +# +# Makefile for example uoa +# + +all: udp_serv uperf opp + +CFLAGS = -g -O0 +CFLAGS += -I ../../../include -I ../ + +udp_serv: udp_serv.c + gcc $(CFLAGS) -o udp_serv udp_serv.c + +uperf: uperf.c + gcc $(CFLAGS) -lrt -o uperf uperf.c + +opp: opp.c + gcc $(CFLAGS) -o opp opp.c + +clean: + rm -rf udp_serv uperf opp diff --git a/kmod/uoa/example/make.sh b/kmod/uoa/example/make.sh deleted file mode 100755 index 2b576e5f2..000000000 --- a/kmod/uoa/example/make.sh +++ /dev/null @@ -1,3 +0,0 @@ -gcc udp_serv.c -o udp_serv -I ../../include/ -Wall -g -O0 -gcc uperf.c -o uperf -Wall -g -O0 -gcc opp.c -o opp -I../../include -Wall -g -O0 diff --git a/kmod/uoa/example/opp.c b/kmod/uoa/example/opp.c index cb5795cd5..e56a22cbb 100644 --- a/kmod/uoa/example/opp.c +++ b/kmod/uoa/example/opp.c @@ -28,6 +28,9 @@ #include #include #include "common.h" + +/* for union inet_addr only */ +#include "uoa_extra.h" #include "uoa.h" /** @@ -35,126 +38,126 @@ */ static inline __u16 csum_fold(__u32 csum) { - __u32 sum = (__u32)csum; - sum += (sum >> 16) | (sum << 16); - return ~(__u16)(sum >> 16); + __u32 sum = (__u32)csum; + sum += (sum >> 16) | (sum << 16); + return ~(__u16)(sum >> 16); } static inline __u16 ip_fast_csum(const void *iph, unsigned int ihl) { - __uint128_t tmp; - uint64_t sum; - - tmp = *(const __uint128_t *)iph; - iph += 16; - ihl -= 4; - tmp += ((tmp >> 64) | (tmp << 64)); - sum = tmp >> 64; - do { - sum += *(const __u32 *)iph; - iph += 4; - } while (--ihl); - - sum += ((sum >> 32) | (sum << 32)); - return csum_fold((__u32)(sum >> 32)); + __uint128_t tmp; + uint64_t sum; + + tmp = *(const __uint128_t *)iph; + iph += 16; + ihl -= 4; + tmp += ((tmp >> 64) | (tmp << 64)); + sum = tmp >> 64; + do { + sum += *(const __u32 *)iph; + iph += 4; + } while (--ihl); + + sum += ((sum >> 32) | (sum << 32)); + return csum_fold((__u32)(sum >> 32)); } /* Generate a checksum for an outgoing IP datagram. */ static void ip_send_check(struct iphdr *iph) { - iph->check = 0; - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } int main(int argc, char *argv[]) { - int sockfd; - struct iphdr *iph; - struct opphdr *opph; - struct ipopt_uoa *uoa; - struct udphdr *uh; - __u8 pkt[4096] = {0}; - __u8 payload[] = {1, 2, 3, 4, 5, 6, 7, 8}; - int v = 1; - struct sockaddr_in sin; - - if (argc != 5) { - fprintf(stderr, "usage: a.out SRC-IP DST-IP CLI-IP CLI-PORT\n"); - exit(1); - } - - sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); - if (sockfd < 0) { - perror("socket"); - exit(1); - } - - if (setsockopt(sockfd, IPPROTO_IP, IP_HDRINCL, &v, sizeof(v)) < 0) { - perror("setsockopt"); - exit(1); - } - - /* build IP header */ - iph = (void *)pkt; - iph->version = 0x4; - iph->ihl = sizeof(struct iphdr) / 4; - iph->tos = 0x0; - iph->tot_len = htons(sizeof(*iph) + sizeof(*opph) + \ - sizeof(*uoa) + sizeof(*uh) + sizeof(payload)); - iph->id = htons(1234); // just for test. 
- iph->frag_off = 0x0; - iph->ttl = 64; - iph->protocol = IPPROTO_OPT; - - if (inet_pton(AF_INET, argv[1], &iph->saddr) <= 0) { - fprintf(stderr, "bad src-ip\n"); - exit(1); - } - - if (inet_pton(AF_INET, argv[2], &iph->daddr) <= 0) { - fprintf(stderr, "bad dst-ip\n"); - exit(1); - } - - /* build Option Protocol fixed header */ - opph = (void *)iph + (iph->ihl << 2); - opph->version = 0x1; - opph->protocol = IPPROTO_UDP; - opph->length = htons(sizeof(*opph) + sizeof(*uoa)); - - /* uoa option */ - uoa = (void *)opph->options; - uoa->op_code = IPOPT_UOA; - uoa->op_len = IPOLEN_UOA_IPV4; - uoa->op_port = htons(atoi(argv[4])); - - if (inet_pton(AF_INET, argv[3], &uoa->op_addr) <= 0) { - fprintf(stderr, "bad cli-ip\n"); - exit(1); - } - - ip_send_check(iph); - - /* udp header */ - uh = (void *)opph + ntohs(opph->length); - uh->source = htons(1122); - uh->dest = htons(3344); - uh->len = htons(sizeof(*uh) + sizeof(payload)); - uh->check = 0; /* ok for UDP */ - - /* payload */ - memcpy(uh + 1, payload, sizeof(payload)); - - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = iph->daddr; - - if (sendto(sockfd, pkt, ntohs(iph->tot_len), 0, - (struct sockaddr *)&sin, sizeof(sin)) < 0) { - perror("sendto"); - exit(1); - } - - close(sockfd); - exit(0); + int sockfd; + struct iphdr *iph; + struct opphdr *opph; + struct ipopt_uoa *uoa; + struct udphdr *uh; + __u8 pkt[4096] = {0}; + __u8 payload[] = {1, 2, 3, 4, 5, 6, 7, 8}; + int v = 1; + struct sockaddr_in sin; + + if (argc != 5) { + fprintf(stderr, "usage: a.out SRC-IP DST-IP CLI-IP CLI-PORT\n"); + exit(1); + } + + sockfd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + if (sockfd < 0) { + perror("socket"); + exit(1); + } + + if (setsockopt(sockfd, IPPROTO_IP, IP_HDRINCL, &v, sizeof(v)) < 0) { + perror("setsockopt"); + exit(1); + } + + /* build IP header */ + iph = (void *)pkt; + iph->version = 0x4; + iph->ihl = sizeof(struct iphdr) / 4; + iph->tos = 0x0; + iph->tot_len = htons(sizeof(*iph) + sizeof(*opph) + \ + sizeof(*uoa) + sizeof(*uh) + sizeof(payload)); + iph->id = htons(1234); // just for test. 
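/*
 * Layout of the test packet assembled by this tool, summarized from the
 * fields it sets (the authoritative definitions of struct opphdr and
 * struct ipopt_uoa come from the uoa headers included above and are only
 * paraphrased here):
 *
 *   struct iphdr      protocol = IPPROTO_OPT
 *   struct opphdr     version 0x1, protocol = IPPROTO_UDP,
 *                     length = sizeof(opphdr) + sizeof(ipopt_uoa)
 *   struct ipopt_uoa  op_code = IPOPT_UOA, op_len = IPOLEN_UOA_IPV4,
 *                     op_port/op_addr = CLI-PORT/CLI-IP from the command line
 *   struct udphdr     fixed test ports 1122 -> 3344
 *   payload           8 bytes {1, 2, ..., 8}
 */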
+ iph->frag_off = 0x0; + iph->ttl = 64; + iph->protocol = IPPROTO_OPT; + + if (inet_pton(AF_INET, argv[1], &iph->saddr) <= 0) { + fprintf(stderr, "bad src-ip\n"); + exit(1); + } + + if (inet_pton(AF_INET, argv[2], &iph->daddr) <= 0) { + fprintf(stderr, "bad dst-ip\n"); + exit(1); + } + + /* build Option Protocol fixed header */ + opph = (void *)iph + (iph->ihl << 2); + opph->version = 0x1; + opph->protocol = IPPROTO_UDP; + opph->length = htons(sizeof(*opph) + sizeof(*uoa)); + + /* uoa option */ + uoa = (void *)opph->options; + uoa->op_code = IPOPT_UOA; + uoa->op_len = IPOLEN_UOA_IPV4; + uoa->op_port = htons(atoi(argv[4])); + + if (inet_pton(AF_INET, argv[3], &uoa->op_addr) <= 0) { + fprintf(stderr, "bad cli-ip\n"); + exit(1); + } + + ip_send_check(iph); + + /* udp header */ + uh = (void *)opph + ntohs(opph->length); + uh->source = htons(1122); + uh->dest = htons(3344); + uh->len = htons(sizeof(*uh) + sizeof(payload)); + uh->check = 0; /* ok for UDP */ + + /* payload */ + memcpy(uh + 1, payload, sizeof(payload)); + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = iph->daddr; + + if (sendto(sockfd, pkt, ntohs(iph->tot_len), 0, + (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("sendto"); + exit(1); + } + + close(sockfd); + exit(0); } diff --git a/kmod/uoa/example/udp_serv.c b/kmod/uoa/example/udp_serv.c index cfb0117ef..db9acfd1e 100644 --- a/kmod/uoa/example/udp_serv.c +++ b/kmod/uoa/example/udp_serv.c @@ -32,8 +32,11 @@ #include #include #include -#include "common.h" /* for __u8, __be16, __be32, __u64 only, - just define them if not want common.h */ +/* for __u8, __be16, __be32, __u64 only */ +#include "common.h" + +/* for union inet_addr only */ +#include "uoa_extra.h" #include "uoa.h" #define MAX_SUPP_AF 2 @@ -50,6 +53,7 @@ void handle_reply(int efd, int fd) struct uoa_param_map map; socklen_t len, mlen; int n; + uint8_t af = AF_INET; len = sizeof(peer); n = recvfrom(fd, buff, sizeof(buff), 0, (SA *)&peer, &len); @@ -58,8 +62,9 @@ void handle_reply(int efd, int fd) exit(1); } buff[n]='\0'; + af = ((SA *)&peer)->sa_family; - if (((SA *)&peer)->sa_family == AF_INET) { + if (AF_INET == af) { sin = (struct sockaddr_in *)&peer; inet_ntop(AF_INET, &sin->sin_addr.s_addr, from, sizeof(from)); printf("Receive %d bytes from %s:%d -- %s\n", @@ -72,13 +77,13 @@ void handle_reply(int efd, int fd) * lookup for daddr (or local IP) is supported. 
* */ memset(&map, 0, sizeof(map)); - map.saddr = sin->sin_addr.s_addr; + map.af = af; map.sport = sin->sin_port; - map.daddr = htonl(INADDR_ANY); map.dport = htons(SERV_PORT); + memmove(&map.saddr, &sin->sin_addr.s_addr, sizeof(struct in_addr)); mlen = sizeof(map); if (getsockopt(fd, IPPROTO_IP, UOA_SO_GET_LOOKUP, &map, &mlen) == 0) { - inet_ntop(AF_INET, &map.real_saddr, from, sizeof(from)); + inet_ntop(map.real_af, &map.real_saddr.in, from, sizeof(from)); printf(" real client %s:%d\n", from, ntohs(map.real_sport)); } @@ -89,9 +94,20 @@ void handle_reply(int efd, int fd) inet_ntop(AF_INET6, &sin6->sin6_addr, from, sizeof(from)); printf("Receive %d bytes from %s:%d -- %s\n", n, from, ntohs(sin6->sin6_port), buff); + /* get real client address */ + memset(&map, 0, sizeof(map)); + map.af = af; + map.sport = sin6->sin6_port; + map.dport = htons(SERV_PORT); + memmove(&map.saddr, &sin6->sin6_addr, sizeof(struct in6_addr)); + mlen = sizeof(map); - /* Todo: IPv6 uoa support */ + if (getsockopt(fd, IPPROTO_IP, UOA_SO_GET_LOOKUP, &map, &mlen) == 0) { + inet_ntop(map.real_af, &map.real_saddr.in6, from, sizeof(from)); + printf(" real client %s:%d\n", from, ntohs(map.real_sport)); + } + len = sizeof(peer); sendto(fd, buff, n, 0, (SA *)&peer, len); } } diff --git a/kmod/uoa/uoa.c b/kmod/uoa/uoa.c index 39a039c6d..7430c762f 100644 --- a/kmod/uoa/uoa.c +++ b/kmod/uoa/uoa.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -51,21 +52,35 @@ #include #include +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) +#include /* ipv6_skip_exthdr */ +#endif + +#define UOA_NEED_EXTRA +#include "uoa_extra.h" #include "uoa.h" +struct kr_ipopt_uoa { + __u8 op_code; + __u8 op_len; + __be16 op_port; + union inet_addr op_addr; +} __attribute__((__packed__)); + /* uoa mapping hash table */ struct uoa_map { - struct hlist_node hlist; - atomic_t refcnt; - struct timer_list timer; - - /* tuples as hash key */ - __be32 saddr; - __be32 daddr; - __be16 sport; - __be16 dport; - - struct ipopt_uoa optuoa; + struct hlist_node hlist; + atomic_t refcnt; + struct timer_list timer; + + /* tuples as hash key */ + __be16 af; + union inet_addr saddr; + union inet_addr daddr; + __be16 sport; + __be16 dport; + + struct kr_ipopt_uoa optuoa; }; static int uoa_debug = 0; @@ -88,14 +103,15 @@ static struct kmem_cache *uoa_map_cache __read_mostly; static unsigned int uoa_map_rnd __read_mostly; static atomic_t uoa_map_count = ATOMIC_INIT(0); +static int ipv6_hdrlen(const struct sk_buff *skb); /* uoa mapping table lock array */ -#define UOA_MAP_LOCKARR_BITS 5 -#define UOA_MAP_LOCKARR_SIZE (1<= KERNEL_VERSION(4,1,0) #define UOA_STATS_INC(_f_) do { \ - struct uoa_cpu_stats *s = this_cpu_ptr(uoa_stats.cpustats); \ - u64_stats_update_begin(&s->syncp); \ - s->_f_++; \ - u64_stats_update_end(&s->syncp); \ - uoa_stats.kstats._f_++; \ + struct uoa_cpu_stats *s = this_cpu_ptr(uoa_stats.cpustats); \ + u64_stats_update_begin(&s->syncp); \ + s->_f_++; \ + u64_stats_update_end(&s->syncp); \ + uoa_stats.kstats._f_++; \ } while (0) #else #define UOA_STATS_INC(_f_) do { \ - struct uoa_cpu_stats *s = this_cpu_ptr(uoa_stats.cpustats); \ - s->_f_++; \ - uoa_stats.kstats._f_++; \ + struct uoa_cpu_stats *s = this_cpu_ptr(uoa_stats.cpustats); \ + s->_f_++; \ + uoa_stats.kstats._f_++; \ } while (0) #endif static int uoa_stats_show(struct seq_file *seq, void *arg) { - struct uoa_kstats ks; + struct uoa_kstats ks; - spin_lock_bh(&uoa_stats.lock); - ks = uoa_stats.kstats; - spin_unlock_bh(&uoa_stats.lock); + spin_lock_bh(&uoa_stats.lock); + 
ks = uoa_stats.kstats; + spin_unlock_bh(&uoa_stats.lock); - seq_puts(seq, " Success Miss Invalid|UOA Got None Saved Ack-Fail\n"); + seq_puts(seq, " Success Miss Invalid|UOA Got None Saved Ack-Fail\n"); - seq_printf(seq, "%8llu %8llu %8llu %8llu %8llu %8llu %8llu\n", - ks.success, ks.miss, ks.invalid, - ks.uoa_got, ks.uoa_none, ks.uoa_saved, ks.uoa_ack_fail); + seq_printf(seq, "%8llu %8llu %8llu %8llu %8llu %8llu %8llu\n", + ks.success, ks.miss, ks.invalid, + ks.uoa_got, ks.uoa_none, ks.uoa_saved, ks.uoa_ack_fail); - return 0; + return 0; } static int uoa_stats_percpu_show(struct seq_file *seq, void *arg) { - int i; + int i; - seq_puts(seq, "CPU Success Miss Invalid|UOA Got None Saved Ack-Fail\n"); + seq_puts(seq, "CPU Success Miss Invalid|UOA Got None Saved Ack-Fail\n"); - for_each_possible_cpu(i) { - struct uoa_cpu_stats *s = per_cpu_ptr(uoa_stats.cpustats, i); - __u64 success, miss, invalid, got, none, saved, ack_fail; + for_each_possible_cpu(i) { + struct uoa_cpu_stats *s = per_cpu_ptr(uoa_stats.cpustats, i); + __u64 success, miss, invalid, got, none, saved, ack_fail; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) - unsigned int start; + unsigned int start; - do { - start = u64_stats_fetch_begin_irq(&s->syncp); + do { + start = u64_stats_fetch_begin_irq(&s->syncp); #endif - - success = s->success; - miss = s->miss; - invalid = s->invalid; - got = s->uoa_got; - none = s->uoa_none; - saved = s->uoa_saved; - ack_fail = s->uoa_ack_fail; + success = s->success; + miss = s->miss; + invalid = s->invalid; + got = s->uoa_got; + none = s->uoa_none; + saved = s->uoa_saved; + ack_fail = s->uoa_ack_fail; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) - } while (u64_stats_fetch_retry_irq(&s->syncp, start)); + } while (u64_stats_fetch_retry_irq(&s->syncp, start)); #endif - seq_printf(seq, - "%3X %8llu %8llu %8llu %8llu %8llu %8llu %8llu\n", - i, success, miss, invalid, got, none, saved, ack_fail); - } + seq_printf(seq, + "%3X %8llu %8llu %8llu %8llu %8llu %8llu %8llu\n", + i, success, miss, invalid, got, none, saved, ack_fail); + } - return 0; + return 0; } static int uoa_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, uoa_stats_show, NULL); + return single_open(file, uoa_stats_show, NULL); } static int uoa_stats_percpu_seq_open(struct inode *inode, struct file *file) { - return single_open(file, uoa_stats_percpu_show, NULL); + return single_open(file, uoa_stats_percpu_show, NULL); } static const struct file_operations uoa_stats_fops = { - .owner = THIS_MODULE, - .open = uoa_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .owner = THIS_MODULE, + .open = uoa_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static const struct file_operations uoa_stats_percpu_fops = { - .owner = THIS_MODULE, - .open = uoa_stats_percpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, + .owner = THIS_MODULE, + .open = uoa_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static int uoa_stats_init(void) { - int i; + int i; - spin_lock_init(&uoa_stats.lock); - memset(&uoa_stats.kstats, 0, sizeof(struct uoa_kstats)); + spin_lock_init(&uoa_stats.lock); + memset(&uoa_stats.kstats, 0, sizeof(struct uoa_kstats)); - uoa_stats.cpustats = alloc_percpu(struct uoa_cpu_stats); - if (!uoa_stats.cpustats) { - pr_err("fail to alloc percpu stats\n"); - return -ENOMEM; - } + uoa_stats.cpustats = alloc_percpu(struct uoa_cpu_stats); + if 
(!uoa_stats.cpustats) { + pr_err("fail to alloc percpu stats\n"); + return -ENOMEM; + } - for_each_possible_cpu(i) { - struct uoa_cpu_stats *cs; + for_each_possible_cpu(i) { + struct uoa_cpu_stats *cs; - cs = per_cpu_ptr(uoa_stats.cpustats, i); + cs = per_cpu_ptr(uoa_stats.cpustats, i); #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) - u64_stats_init(&cs->syncp); + u64_stats_init(&cs->syncp); #endif - } + } - proc_create("uoa_stats", 0, init_net.proc_net, &uoa_stats_fops); - proc_create("uoa_stats_percpu", 0, init_net.proc_net, - &uoa_stats_percpu_fops); + proc_create("uoa_stats", 0, init_net.proc_net, &uoa_stats_fops); + proc_create("uoa_stats_percpu", 0, init_net.proc_net, + &uoa_stats_percpu_fops); - return 0; + return 0; } static void uoa_stats_exit(void) { - remove_proc_entry("uoa_stats", init_net.proc_net); - remove_proc_entry("uoa_stats_percpu", init_net.proc_net); - free_percpu(uoa_stats.cpustats); + remove_proc_entry("uoa_stats", init_net.proc_net); + remove_proc_entry("uoa_stats_percpu", init_net.proc_net); + free_percpu(uoa_stats.cpustats); } static inline void uoa_map_dump(const struct uoa_map *um, const char *pref) { - if (likely(!uoa_debug)) - return; - - pr_info("%s %pI4:%d->%pI4:%d real %pI4:%d, refcnt %d\n", pref ? : "", - &um->saddr, ntohs(um->sport), &um->daddr, ntohs(um->dport), - &um->optuoa.op_addr, ntohs(um->optuoa.op_port), - atomic_read(&um->refcnt)); + int real_af; + + if (likely(!uoa_debug)) + return; + + if (um->optuoa.op_len == IPOLEN_UOA_IPV6) + real_af = AF_INET6; + else + real_af = AF_INET; + + if (AF_INET == um->af) { + if (real_af == AF_INET) { + pr_info("%s %pI4:%d->%pI4:%d real %pI4:%d, refcnt %d\n", pref ? : "", + &um->saddr.in, ntohs(um->sport), &um->daddr.in, ntohs(um->dport), + &um->optuoa.op_addr.in, ntohs(um->optuoa.op_port), + atomic_read(&um->refcnt)); + } else { + pr_info("%s %pI4:%d->%pI4:%d real [%pI6]:%d, refcnt %d\n", pref ? : "", + &um->saddr.in, ntohs(um->sport), &um->daddr.in, ntohs(um->dport), + &um->optuoa.op_addr.in6, ntohs(um->optuoa.op_port), + atomic_read(&um->refcnt)); + } + } else { + if (real_af == AF_INET) { + pr_info("%s [%pI6]:%d->[%pI6]:%d real %pI4:%d, refcnt %d\n", pref ? : "", + &um->saddr.in6, ntohs(um->sport), &um->daddr.in6, ntohs(um->dport), + &um->optuoa.op_addr.in, ntohs(um->optuoa.op_port), + atomic_read(&um->refcnt)); + } else { + pr_info("%s [%pI6]:%d->[%pI6]:%d real [%pI6]:%d, refcnt %d\n", pref ? 
: "", + &um->saddr.in6, ntohs(um->sport), &um->daddr.in6, ntohs(um->dport), + &um->optuoa.op_addr.in6, ntohs(um->optuoa.op_port), + atomic_read(&um->refcnt)); + } + } } -static inline unsigned int __uoa_map_hash_key(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +static inline unsigned int __uoa_map_hash_key(__be16 af, + const union inet_addr *saddr, + const union inet_addr *daddr, + __be16 sport, __be16 dport) { - /* do not cal daddr, it could be zero for wildcard lookup */ - return jhash_3words(saddr, sport, dport, uoa_map_rnd) & - uoa_map_tab_mask; + /* do not cal daddr, it could be zero for wildcard lookup */ + uint32_t saddr_fold; + saddr_fold = inet_addr_fold(af, saddr); + return jhash_3words(saddr_fold, sport, dport, uoa_map_rnd) & + uoa_map_tab_mask; } static inline unsigned int uoa_map_hash_key(const struct uoa_map *um) { - return __uoa_map_hash_key(um->saddr, um->daddr, um->sport, um->dport); + return __uoa_map_hash_key(um->af, &um->saddr, &um->daddr, + um->sport, um->dport); } static inline void uoa_map_hash(struct uoa_map *um) { - unsigned int hash = uoa_map_hash_key(um); - struct hlist_head *head = &uoa_map_tab[hash]; - struct uoa_map *cur; + unsigned int hash = uoa_map_hash_key(um); + struct hlist_head *head = &uoa_map_tab[hash]; + struct uoa_map *cur; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) - struct hlist_node *node; + struct hlist_node *node; #endif - um_lock_bh(hash); + um_lock_bh(hash); - /* overwrite existing mapping */ + /* overwrite existing mapping */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) - hlist_for_each_entry_rcu(cur, head, hlist) { + hlist_for_each_entry_rcu(cur, head, hlist) { #else - hlist_for_each_entry_rcu(cur, node, head, hlist) { + hlist_for_each_entry_rcu(cur, node, head, hlist) { #endif - if (um->saddr == cur->saddr && - um->daddr == cur->daddr && - um->sport == cur->sport && - um->dport == cur->dport) { - /* update */ - memcpy(&cur->optuoa, &um->optuoa, IPOLEN_UOA_IPV4); - - mod_timer(&cur->timer, jiffies + uoa_map_timeout * HZ); + if (um->af == cur->af && + inet_addr_equal(um->af, &um->saddr, &cur->saddr) && + inet_addr_equal(um->af, &um->daddr, &cur->daddr) && + um->sport == cur->sport && + um->dport == cur->dport) { + /* update */ + memmove(&cur->optuoa, &um->optuoa, sizeof(cur->optuoa)); + mod_timer(&cur->timer, jiffies + uoa_map_timeout * HZ); - kmem_cache_free(uoa_map_cache, um); + kmem_cache_free(uoa_map_cache, um); - uoa_map_dump(cur, "upd:"); - goto hashed; - } - } + uoa_map_dump(cur, "update:"); + goto hashed; + } + } - /* not exist */ - hlist_add_head_rcu(&um->hlist, head); + /* not exist */ + hlist_add_head_rcu(&um->hlist, head); #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) - timer_setup(&um->timer, uoa_map_expire, 0); + timer_setup(&um->timer, uoa_map_expire, 0); #else - setup_timer(&um->timer, uoa_map_expire, (unsigned long)um); + setup_timer(&um->timer, uoa_map_expire, (unsigned long)um); #endif - mod_timer(&um->timer, jiffies + uoa_map_timeout * HZ); + mod_timer(&um->timer, jiffies + uoa_map_timeout * HZ); - atomic_inc(&uoa_map_count); - uoa_map_dump(um, "new:"); + atomic_inc(&uoa_map_count); + uoa_map_dump(um, "new:"); hashed: - um_unlock_bh(hash); + um_unlock_bh(hash); } static inline int uoa_map_unhash(struct uoa_map *um) { - unsigned int hash = uoa_map_hash_key(um); - int err = -1; - - um_lock_bh(hash); - if (atomic_read(&um->refcnt) == 0) { - hlist_del_rcu(&um->hlist); - atomic_dec(&uoa_map_count); - err = 0; - } - um_unlock_bh(hash); - - return err; + unsigned int hash = uoa_map_hash_key(um); + int err 
= -1; + + um_lock_bh(hash); + if (atomic_read(&um->refcnt) == 0) { + hlist_del_rcu(&um->hlist); + atomic_dec(&uoa_map_count); + err = 0; + } + um_unlock_bh(hash); + + return err; } -static inline struct uoa_map *uoa_map_get(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +static inline struct uoa_map *uoa_map_get(__be16 af, + union inet_addr *saddr, + union inet_addr *daddr, + __be16 sport, __be16 dport) { - unsigned int hash = __uoa_map_hash_key(saddr, daddr, sport, dport); - struct hlist_head *head = &uoa_map_tab[hash]; - struct uoa_map *um = NULL; + unsigned int hash = __uoa_map_hash_key(af, saddr, daddr, sport, dport); + struct hlist_head *head = &uoa_map_tab[hash]; + struct uoa_map *um = NULL; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0) - struct hlist_node *node; + struct hlist_node *node; #endif - um_lock_bh(hash); + um_lock_bh(hash); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) - hlist_for_each_entry_rcu(um, head, hlist) { + hlist_for_each_entry_rcu(um, head, hlist) { #else - hlist_for_each_entry_rcu(um, node, head, hlist) { + hlist_for_each_entry_rcu(um, node, head, hlist) { #endif - /* we allow daddr being set to wildcard (zero), - * since UDP server may bind INADDR_ANY */ - if (um->saddr == saddr && (daddr == 0 || um->daddr == daddr) && - um->sport == sport && um->dport == dport) { - mod_timer(&um->timer, jiffies + uoa_map_timeout * HZ); - atomic_inc(&um->refcnt); - - um_unlock_bh(hash); - return um; - } - } - - um_unlock_bh(hash); - - return NULL; + /* we allow daddr being set to wildcard (zero), + * since UDP server may bind INADDR_ANY */ + if (um->af == af && + inet_addr_equal(af, &um->saddr, saddr) && + (inet_is_addr_any(af, daddr) || + inet_addr_equal(af, &um->daddr, daddr)) && + um->sport == sport && + um->dport == dport) { + mod_timer(&um->timer, jiffies + uoa_map_timeout * HZ); + atomic_inc(&um->refcnt); + + um_unlock_bh(hash); + return um; + } + } + + um_unlock_bh(hash); + + return NULL; } static inline void uoa_map_put(struct uoa_map *um) { - atomic_dec(&um->refcnt); + atomic_dec(&um->refcnt); } static inline void __uoa_map_expire(struct uoa_map *um, struct timer_list *timer) { - if (uoa_map_unhash(um) != 0) { - /* try again if some one is using it */ - mod_timer(timer, jiffies + uoa_map_timeout * HZ); + if (uoa_map_unhash(um) != 0) { + /* try again if some one is using it */ + mod_timer(timer, jiffies + uoa_map_timeout * HZ); - uoa_map_dump(um, "expire delayed:"); - return; - } + uoa_map_dump(um, "expire delayed:"); + return; + } - uoa_map_dump(um, "del:"); - del_timer(&um->timer); - kmem_cache_free(uoa_map_cache, um); + uoa_map_dump(um, "del:"); + del_timer(&um->timer); + kmem_cache_free(uoa_map_cache, um); } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) static void uoa_map_expire(struct timer_list *timer) { - struct uoa_map *um = from_timer(um, timer, timer); + struct uoa_map *um = from_timer(um, timer, timer); - __uoa_map_expire(um, timer); + __uoa_map_expire(um, timer); } #else static void uoa_map_expire(unsigned long data) { - struct uoa_map *um = (struct uoa_map *)data; + struct uoa_map *um = (struct uoa_map *)data; - __uoa_map_expire(um, &um->timer); + __uoa_map_expire(um, &um->timer); } #endif static void uoa_map_flush(void) { - int i; + int i; flush_again: - for (i = 0; i < uoa_map_tab_size; i++) { - struct uoa_map *um; - struct hlist_node *n; - struct hlist_head *head = &uoa_map_tab[i]; + for (i = 0; i < uoa_map_tab_size; i++) { + struct uoa_map *um; + struct hlist_node *n; + struct hlist_head *head = &uoa_map_tab[i]; #if LINUX_VERSION_CODE 
< KERNEL_VERSION(3,8,0) - struct hlist_node *node; + struct hlist_node *node; #endif - um_lock_bh(i); + um_lock_bh(i); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) - hlist_for_each_entry_safe(um, n, head, hlist) { + hlist_for_each_entry_safe(um, n, head, hlist) { #else - hlist_for_each_entry_safe(um, node, n, head, hlist) { + hlist_for_each_entry_safe(um, node, n, head, hlist) { #endif - if (timer_pending(&um->timer)) - del_timer(&um->timer); - - if (atomic_read(&um->refcnt) != 0) - continue; + if (timer_pending(&um->timer)) + del_timer(&um->timer); - uoa_map_dump(um, "flu:"); + if (atomic_read(&um->refcnt) != 0) + continue; - hlist_del_rcu(&um->hlist); - atomic_dec(&uoa_map_count); - kmem_cache_free(uoa_map_cache, um); - } + uoa_map_dump(um, "flu:"); + hlist_del_rcu(&um->hlist); + atomic_dec(&uoa_map_count); + kmem_cache_free(uoa_map_cache, um); + } - um_unlock_bh(i); - } + um_unlock_bh(i); + } - if (atomic_read(&uoa_map_count) > 0) { - pr_debug("%s: again\n", __func__); - schedule(); - goto flush_again; - } + if (atomic_read(&uoa_map_count) > 0) { + pr_debug("%s: again\n", __func__); + schedule(); + goto flush_again; + } } static int uoa_so_set(struct sock *sk, int cmd, void __user *user, - unsigned int len) + unsigned int len) { - return 0; + return 0; } static int uoa_so_get(struct sock *sk, int cmd, void __user *user, int *len) { - struct uoa_param_map map; - struct uoa_map *um; - int err; - - if (cmd != UOA_SO_GET_LOOKUP) { - pr_warn("%s: bad cmd\n", __func__); - return -EINVAL; - } - - if (*len < sizeof(struct uoa_param_map)) { - pr_warn("%s: bad param len\n", __func__); - return -EINVAL; - } - - if (copy_from_user(&map, user, sizeof(struct uoa_param_map)) != 0) - return -EFAULT; - - /* lookup uap mapping table */ - um = uoa_map_get(map.saddr, map.daddr, map.sport, map.dport); - if (!um) { - if (uoa_debug) { - pr_warn("%s: not found: %pI4:%d->%pI4:%d\n", __func__, - &map.saddr, ntohs(map.sport), - &map.daddr, ntohs(map.dport)); - } - UOA_STATS_INC(miss); - return -ENOENT; - } - - uoa_map_dump(um, "hit:"); - - if (likely(um->optuoa.op_code == IPOPT_UOA && - um->optuoa.op_len == IPOLEN_UOA_IPV4)) { - memcpy(&map.real_saddr, um->optuoa.op_addr, sizeof(map.real_saddr)); - map.real_sport = um->optuoa.op_port; - UOA_STATS_INC(success); - err = 0; - } else { - UOA_STATS_INC(invalid); - err = -EFAULT; - } - - if (copy_to_user(user, &map, sizeof(struct uoa_param_map)) != 0) - err = -EFAULT; - *len = sizeof(struct uoa_param_map); - - uoa_map_put(um); - - return err; + struct uoa_param_map map; + struct uoa_map *um; + int err; + + if (cmd != UOA_SO_GET_LOOKUP) { + pr_warn("%s: bad cmd\n", __func__); + return -EINVAL; + } + + if (*len < sizeof(struct uoa_param_map)) { + pr_warn("%s: bad param len\n", __func__); + return -EINVAL; + } + + if (copy_from_user(&map, user, sizeof(struct uoa_param_map)) != 0) + return -EFAULT; + + /* lookup uap mapping table */ + um = uoa_map_get(map.af, &map.saddr, &map.daddr, map.sport, map.dport); + + if (!um) { + if (uoa_debug) { + if (AF_INET == map.af) { + pr_warn("%s: not found: %pI4:%d->%pI4:%d\n", __func__, + &map.saddr.in, ntohs(map.sport), + &map.daddr.in, ntohs(map.dport)); + } else { + pr_warn("%s: not found: [%pI6]:%d->[%pI6]:%d\n", __func__, + &map.saddr.in6, ntohs(map.sport), + &map.daddr.in6, ntohs(map.dport)); + } + } + UOA_STATS_INC(miss); + return -ENOENT; + } + + uoa_map_dump(um, "hit:"); + + if (likely(um->optuoa.op_code == IPOPT_UOA)) { + if (um->optuoa.op_len == IPOLEN_UOA_IPV4) { + map.real_af = AF_INET; + memmove(&map.real_saddr.in, 
&um->optuoa.op_addr.in, + sizeof(map.real_saddr.in)); + map.real_sport = um->optuoa.op_port; + UOA_STATS_INC(success); + err = 0; + } else { + if (um->optuoa.op_len == IPOLEN_UOA_IPV6) { + map.real_af = AF_INET6; + memmove(&map.real_saddr.in6, &um->optuoa.op_addr.in6, + sizeof(map.real_saddr.in6)); + map.real_sport = um->optuoa.op_port; + UOA_STATS_INC(success); + err = 0; + } else { + UOA_STATS_INC(invalid); + err = -EFAULT; + } + } + } else { + UOA_STATS_INC(invalid); + err = -EFAULT; + } + + if (copy_to_user(user, &map, sizeof(struct uoa_param_map)) != 0) + err = -EFAULT; + *len = sizeof(struct uoa_param_map); + + uoa_map_put(um); + + return err; } static struct nf_sockopt_ops uoa_sockopts = { - .pf = PF_INET, - .owner = THIS_MODULE, - /* set */ - .set_optmin = UOA_BASE_CTL, - .set_optmax = UOA_SO_SET_MAX+1, - .set = uoa_so_set, - /* get */ - .get_optmin = UOA_BASE_CTL, - .get_optmax = UOA_SO_GET_MAX+1, - .get = uoa_so_get, + .pf = PF_INET, + .owner = THIS_MODULE, + /* set */ + .set_optmin = UOA_BASE_CTL, + .set_optmax = UOA_SO_SET_MAX + 1, + .set = uoa_so_set, + /* get */ + .get_optmin = UOA_BASE_CTL, + .get_optmax = UOA_SO_GET_MAX + 1, + .get = uoa_so_get, }; static int uoa_map_init(void) { - int i, err; - - /* mapping table */ - uoa_map_tab_size = 1 << uoa_map_tab_bits; - uoa_map_tab_mask = uoa_map_tab_size - 1; - - uoa_map_tab = vmalloc(uoa_map_tab_size * sizeof(*uoa_map_tab)); - if (!uoa_map_tab) { - pr_err("no memory for uoa mapping table\n"); - return -ENOMEM; - } - - atomic_set(&uoa_map_count, 0); - get_random_bytes(&uoa_map_rnd, sizeof(uoa_map_rnd)); - - for (i = 0; i < uoa_map_tab_size; i++) - INIT_HLIST_HEAD(&uoa_map_tab[i]); - - for (i = 0; i < UOA_MAP_LOCKARR_SIZE; i++) - spin_lock_init(&__uoa_map_tab_lock_array[i].lock); - - /* mapping cache */ - uoa_map_cache = kmem_cache_create("uoa_map", - sizeof(struct uoa_map), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!uoa_map_cache) { - pr_err("fail to create uoa_map cache\n"); - vfree(uoa_map_tab); - return -ENOMEM; - } - - /* socket option */ - err = nf_register_sockopt(&uoa_sockopts); - if (err != 0) { - pr_err("fail to register sockopt\n"); - kmem_cache_destroy(uoa_map_cache); - vfree(uoa_map_tab); - return -ENOMEM; - } - - pr_debug("mapping hash initialed, size %d\n", uoa_map_tab_size); - return 0; + int i, err; + + /* mapping table */ + uoa_map_tab_size = 1 << uoa_map_tab_bits; + uoa_map_tab_mask = uoa_map_tab_size - 1; + + uoa_map_tab = vmalloc(uoa_map_tab_size * sizeof(*uoa_map_tab)); + if (!uoa_map_tab) { + pr_err("no memory for uoa mapping table\n"); + return -ENOMEM; + } + + atomic_set(&uoa_map_count, 0); + get_random_bytes(&uoa_map_rnd, sizeof(uoa_map_rnd)); + + for (i = 0; i < uoa_map_tab_size; i++) + INIT_HLIST_HEAD(&uoa_map_tab[i]); + + for (i = 0; i < UOA_MAP_LOCKARR_SIZE; i++) + spin_lock_init(&__uoa_map_tab_lock_array[i].lock); + + /* mapping cache */ + uoa_map_cache = kmem_cache_create("uoa_map", + sizeof(struct uoa_map), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!uoa_map_cache) { + pr_err("fail to create uoa_map cache\n"); + vfree(uoa_map_tab); + return -ENOMEM; + } + + /* socket option */ + err = nf_register_sockopt(&uoa_sockopts); + if (err != 0) { + pr_err("fail to register sockopt\n"); + kmem_cache_destroy(uoa_map_cache); + vfree(uoa_map_tab); + return -ENOMEM; + } + + pr_debug("mapping hash initialed, size %d\n", uoa_map_tab_size); + return 0; } static void uoa_map_exit(void) { - nf_unregister_sockopt(&uoa_sockopts); - kmem_cache_destroy(uoa_map_cache); - vfree(uoa_map_tab); + nf_unregister_sockopt(&uoa_sockopts); + 
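/* Sketch only (hypothetical helper, not in the patch): the op_len-driven
 * dispatch that uoa_so_get() performs above.  A UOA option of length
 * IPOLEN_UOA_IPV4 carries an IPv4 real-client address, IPOLEN_UOA_IPV6 an
 * IPv6 one; anything else is counted as invalid.  struct kr_ipopt_uoa and
 * union inet_addr are the definitions introduced by this patch; IPOPT_UOA
 * and the IPOLEN_* constants are assumed to come from uoa.h. */
static int uoa_opt_real_addr(const struct kr_ipopt_uoa *opt,
                             int *real_af, union inet_addr *addr,
                             __be16 *port)
{
    if (opt->op_code != IPOPT_UOA)
        return -1;

    if (opt->op_len == IPOLEN_UOA_IPV4) {
        *real_af = AF_INET;
        memcpy(&addr->in, &opt->op_addr.in, sizeof(struct in_addr));
    } else if (opt->op_len == IPOLEN_UOA_IPV6) {
        *real_af = AF_INET6;
        memcpy(&addr->in6, &opt->op_addr.in6, sizeof(struct in6_addr));
    } else {
        return -1;                  /* corresponds to UOA_STATS_INC(invalid) */
    }

    *port = opt->op_port;           /* already in network byte order */
    return 0;
}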
kmem_cache_destroy(uoa_map_cache); + vfree(uoa_map_tab); } /* @@ -596,167 +671,203 @@ static void uoa_map_exit(void) */ static int uoa_send_ack(const struct sk_buff *oskb) { - /* TODO: */ - return 0; + /* TODO: */ + return 0; } -static struct uoa_map *uoa_parse_ipopt(unsigned char *optptr, int optlen, - __be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) +static struct uoa_map *uoa_parse_ipopt(__be16 af, unsigned char *optptr, + int optlen, void *iph, + __be16 sport, __be16 dport) { - int l; - struct uoa_map *um = NULL; - - for (l = optlen; l > 0; ) { - switch (*optptr) { - case IPOPT_END: - break; - case IPOPT_NOOP: - l--; - optptr++; - continue; - } - - if (unlikely(l < 2)) - goto out; /* invalid */ - - optlen = optptr[1]; - if (unlikely(optlen < 2 || optlen > l)) - goto out; /* invalid */ - - if (*optptr == IPOPT_UOA && optlen == IPOLEN_UOA_IPV4) { - UOA_STATS_INC(uoa_got); - - um = kmem_cache_alloc(uoa_map_cache, GFP_ATOMIC); - if (!um) { - UOA_STATS_INC(uoa_miss); - goto out; - } - - atomic_set(&um->refcnt, 0); - um->saddr = saddr; - um->daddr = daddr; - um->sport = sport; - um->dport = dport; - - memcpy(&um->optuoa, optptr, IPOLEN_UOA_IPV4); - - UOA_STATS_INC(uoa_saved); - return um; - } - - l -= optlen; - optptr += optlen; - continue; - } - - /* no UOA option */ - UOA_STATS_INC(uoa_none); + int l; + struct uoa_map *um = NULL; + + for (l = optlen; l > 0; ) { + switch (*optptr) { + case IPOPT_END: + break; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + + if (unlikely(l < 2)) + goto out; /* invalid */ + + optlen = optptr[1]; + if (unlikely(optlen < 2 || optlen > l)) + goto out; /* invalid */ + + if (*optptr == IPOPT_UOA) { + UOA_STATS_INC(uoa_got); + um = kmem_cache_alloc(uoa_map_cache, GFP_ATOMIC); + if (!um) { + UOA_STATS_INC(uoa_miss); + goto out; + } + + atomic_set(&um->refcnt, 0); + um->af = af; + if (AF_INET == af) { + memmove(&um->saddr.in, &((struct iphdr *)iph)->saddr, + sizeof(struct in_addr)); + memmove(&um->daddr.in, &((struct iphdr *)iph)->daddr, + sizeof(struct in_addr)); + } else { + /* ipv6 */ + memmove(&um->saddr.in6, &((struct ipv6hdr *)iph)->saddr, + sizeof(struct in6_addr)); + memmove(&um->daddr.in6, &((struct ipv6hdr *)iph)->daddr, + sizeof(struct in6_addr)); + } + um->sport = sport; + um->dport = dport; + memcpy(&um->optuoa, optptr, optlen); + + UOA_STATS_INC(uoa_saved); + return um; + } + + l -= optlen; + optptr += optlen; + continue; + } + + /* no UOA option */ + UOA_STATS_INC(uoa_none); out: - return NULL; + return NULL; } /* get uoa info from uoa-option in IP header. 
*/ static struct uoa_map *uoa_iph_rcv(const struct iphdr *iph, struct sk_buff *skb) { - struct udphdr *uh; - int optlen; - unsigned char *optptr; - struct uoa_map *um = NULL; + struct udphdr *uh; + int optlen; + unsigned char *optptr; + struct uoa_map *um = NULL; - if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct udphdr))) - return NULL; + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct udphdr))) + return NULL; - uh = (void *)iph + ip_hdrlen(skb); + uh = (void *)iph + ip_hdrlen(skb); - optlen = ip_hdrlen(skb) - sizeof(struct iphdr); - optptr = (unsigned char *)(iph + 1); + optlen = ip_hdrlen(skb) - sizeof(struct iphdr); + optptr = (unsigned char *)(iph + 1); - um = uoa_parse_ipopt(optptr, optlen, iph->saddr, iph->daddr, - uh->source, uh->dest); + um = uoa_parse_ipopt(AF_INET, optptr, optlen, + (void *)iph, uh->source, uh->dest); - if (um && uoa_send_ack(skb) != 0) { - UOA_STATS_INC(uoa_ack_fail); - pr_warn("fail to send UOA ACK\n"); - } + if (um && uoa_send_ack(skb) != 0) { + UOA_STATS_INC(uoa_ack_fail); + pr_warn("fail to send UOA ACK\n"); + } - return um; + return um; } /* get uoa info from private option protocol. */ -static struct uoa_map *uoa_opp_rcv(struct iphdr *iph, struct sk_buff *skb) +static struct uoa_map *uoa_opp_rcv(__be16 af, void *iph, struct sk_buff *skb) { - struct opphdr *opph; - struct udphdr *uh; - int optlen, opplen; - unsigned char *optptr; - struct uoa_map *um = NULL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct opphdr))) - return NULL; - - opph = (void *)iph + ip_hdrlen(skb); - opplen = ntohs(opph->length); - - if (unlikely(opph->version != 0x01 || opph->protocol != IPPROTO_UDP)) { - pr_warn("bad opp header\n"); - return NULL; - } - - if (!pskb_may_pull(skb, ip_hdrlen(skb) + opplen + sizeof(*uh))) - return NULL; - - uh = (void *)iph + ip_hdrlen(skb) + opplen; - - optlen = opplen - sizeof(*opph); - optptr = (unsigned char *)(opph + 1); - - /* try parse UOA option from ip-options */ - um = uoa_parse_ipopt(optptr, optlen, iph->saddr, iph->daddr, - uh->source, uh->dest); - - if (um && uoa_send_ack(skb) != 0) { - UOA_STATS_INC(uoa_ack_fail); - pr_warn("fail to send UOA ACK\n"); - } - - /* - * "remove" private option protocol, then adjust IP header - * protocol, tot_len and checksum. these could be slow ? - */ - skb_set_transport_header(skb, ip_hdrlen(skb) + opplen); - - /* Old kernel like 2.6.32 use "iph->ihl" rather "skb->transport_header" - * to get UDP header offset. The UOA private protocol data should be - * erased here, but this should move skb data and harm perfomance. As a - * compromise, we convert the private protocol data into NOP IP option - * data if possible.*/ - if (iph->ihl + (opplen >> 2) < 16) { - iph->ihl = (iph->ihl) + (opplen >> 2); - memset(opph, opplen, IPOPT_NOOP); - } else { - pr_warn("IP header has no room to convert uoa data into option.\n"); - } - - /* need change it to parse transport layer */ - iph->protocol = opph->protocol; - ip_send_check(iph); + struct opphdr *opph; + struct udphdr *uh; + int optlen, opplen; + unsigned char *optptr; + struct uoa_map *um = NULL; + int iphdrlen = ((AF_INET6 == af) ? 
ipv6_hdrlen(skb) : ip_hdrlen(skb)); + + if (!pskb_may_pull(skb, iphdrlen + sizeof(struct opphdr))) + return NULL; + + opph = iph + iphdrlen; + opplen = ntohs(opph->length); + + if (unlikely(opph->protocol != IPPROTO_UDP)) { + pr_warn("bad opp header\n"); + return NULL; + } + + if (!pskb_may_pull(skb, iphdrlen + opplen + sizeof(*uh))) + return NULL; + + uh = iph + iphdrlen + opplen; + optlen = opplen - sizeof(*opph); + optptr = (unsigned char *)(opph + 1); + + /* try parse UOA option from ip-options */ + um = uoa_parse_ipopt(af, optptr, optlen, iph, uh->source, uh->dest); + + if (um && uoa_send_ack(skb) != 0) { + UOA_STATS_INC(uoa_ack_fail); + pr_warn("fail to send UOA ACK\n"); + } + + /* + * "remove" private option protocol, then adjust IP header + * protocol, tot_len and checksum. these could be slow ? + */ + + skb_set_transport_header(skb, iphdrlen + opplen); + + /* Old kernel like 2.6.32 use "iph->ihl" rather "skb->transport_header" + * to get UDP header offset. The UOA private protocol data should be + * erased here, but this should move skb data and harm perfomance. As a + * compromise, we convert the private protocol data into NOP IP option + * data if possible.*/ + if (AF_INET == af) { + if (((struct iphdr *)iph)->ihl + (opplen >> 2) < 16) { + ((struct iphdr *)iph)->ihl += (opplen >> 2); + memset(opph, opplen, IPOPT_NOOP); + + /* need change it to parse transport layer */ + ((struct iphdr *)iph)->protocol = opph->protocol; + } else { + pr_warn("IP header has no room to convert uoa data into option.\n"); + } + /* re-calc checksum */ + ip_send_check(iph); + } else { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) + /* do as upper ipv4, handle for old kernel version */ + int payload_len = ntohs(((struct ipv6hdr *)iph)->payload_len); + ((struct ipv6hdr *)iph)->payload_len = htons(payload_len - opplen); + ((struct ipv6hdr *)iph)->nexthdr = opph->protocol; + memmove(iph + iphdrlen, uh, ntohs(uh->len)); + skb_set_transport_header(skb, iphdrlen); +#else + ((struct ipv6hdr *)iph)->nexthdr = opph->protocol; +#endif + } - return um; + return um; } static struct uoa_map *uoa_skb_rcv_opt(struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); - - if (unlikely(iph->ihl > 5) && iph->protocol == IPPROTO_UDP) - return uoa_iph_rcv(iph, skb); - else if (unlikely(iph->protocol == IPPROTO_OPT)) - return uoa_opp_rcv(iph, skb); - - UOA_STATS_INC(uoa_none); - return NULL; + struct iphdr *iph = ip_hdr(skb); + __be16 af = ((6 == iph->version) ? 
AF_INET6 : AF_INET); + + if (AF_INET6 == af) { + struct ipv6hdr *ip6h = ipv6_hdr(skb); + if (ipv6_hdrlen(skb) != sizeof(struct ipv6hdr)) { + if (uoa_debug) { + pr_info("we not support uoa with ipv6 ext header now."); + } + } + if (unlikely(ip6h->nexthdr == IPPROTO_OPT)) { + return uoa_opp_rcv(af, (void *)ip6h, skb); + } + } else { + if (unlikely(iph->ihl > 5) && iph->protocol == IPPROTO_UDP) + return uoa_iph_rcv(iph, skb); + else if (unlikely(iph->protocol == IPPROTO_OPT)) + return uoa_opp_rcv(af, (void *)iph, skb); + } + + UOA_STATS_INC(uoa_none); + return NULL; } /* @@ -765,99 +876,137 @@ static struct uoa_map *uoa_skb_rcv_opt(struct sk_buff *skb) */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) static unsigned int uoa_ip_local_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) + const struct nf_hook_state *state) #elif RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,2) static unsigned int uoa_ip_local_in(const struct nf_hook_ops *ops, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_hook_state *state) + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_hook_state *state) #elif RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(6,4) static unsigned int uoa_ip_local_in(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) #else #error "Pls modify the definition according to kernel version." #endif { - struct uoa_map *um; + struct uoa_map *um; - um = uoa_skb_rcv_opt(skb); - if (um) - uoa_map_hash(um); + um = uoa_skb_rcv_opt(skb); + if (um) + uoa_map_hash(um); - return NF_ACCEPT; + return NF_ACCEPT; } /* * use nf LOCAL_IN hook to get UOA option. */ static struct nf_hook_ops uoa_nf_hook_ops[] __read_mostly = { - { - .hook = uoa_ip_local_in, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_NAT_SRC + 1, - }, + { + .hook = uoa_ip_local_in, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC + 1, + }, +}; + +static struct nf_hook_ops uoa_nf_hook_ops6[] __read_mostly = { + { + .hook = uoa_ip_local_in, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC + 1, + }, }; static __init int uoa_init(void) { - int err = -ENOMEM; - - /* uoa mapping hash table. */ - err = uoa_map_init(); - if (err != 0) - return err; - - /* statistics */ - err = uoa_stats_init(); - if (err != 0) - goto stats_failed; - - /* - * no way to hook udp_rcv() and udp_recvmsg() is difficult - * to be overwirten since it handles multiple skbs. - */ + int err = -ENOMEM; + + /* uoa mapping hash table. */ + err = uoa_map_init(); + if (err != 0) + return err; + + /* statistics */ + err = uoa_stats_init(); + if (err != 0) + goto stats_failed; + + /* + * no way to hook udp_rcv() and udp_recvmsg() is difficult + * to be overwirten since it handles multiple skbs. 
+ */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) - err = nf_register_net_hooks(&init_net, uoa_nf_hook_ops, - ARRAY_SIZE(uoa_nf_hook_ops)); + err = nf_register_net_hooks(&init_net, uoa_nf_hook_ops, + ARRAY_SIZE(uoa_nf_hook_ops)); + if (err < 0) { + pr_err("fail to register netfilter hooks.\n"); + goto hook_failed; + } + err = nf_register_net_hooks(&init_net, uoa_nf_hook_ops6, + ARRAY_SIZE(uoa_nf_hook_ops6)); #else - err = nf_register_hooks(uoa_nf_hook_ops, ARRAY_SIZE(uoa_nf_hook_ops)); + err = nf_register_hooks(uoa_nf_hook_ops, ARRAY_SIZE(uoa_nf_hook_ops)); + if (err < 0) { + pr_err("fail to register netfilter hooks.\n"); + goto hook_failed; + } + err = nf_register_hooks(uoa_nf_hook_ops6, ARRAY_SIZE(uoa_nf_hook_ops6)); #endif - if (err < 0) { - pr_err("fail to register netfilter hooks.\n"); - goto hook_failed; - } + if (err < 0) { + pr_err("fail to register netfilter hooks.\n"); + goto hook_failed; + } - pr_info("UOA module installed %s\n", uoa_debug ? "with debug" : ""); - return 0; + pr_info("UOA module installed %s\n", uoa_debug ? "with debug" : ""); + return 0; hook_failed: - uoa_stats_exit(); + uoa_stats_exit(); stats_failed: - uoa_map_exit(); - return err; + uoa_map_exit(); + return err; } static __exit void uoa_exit(void) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) - nf_unregister_net_hooks(&init_net, uoa_nf_hook_ops, - ARRAY_SIZE(uoa_nf_hook_ops)); + nf_unregister_net_hooks(&init_net, uoa_nf_hook_ops, + ARRAY_SIZE(uoa_nf_hook_ops)); + nf_unregister_net_hooks(&init_net, uoa_nf_hook_ops6, + ARRAY_SIZE(uoa_nf_hook_ops6)); #else - nf_unregister_hooks(uoa_nf_hook_ops, ARRAY_SIZE(uoa_nf_hook_ops)); + nf_unregister_hooks(uoa_nf_hook_ops, ARRAY_SIZE(uoa_nf_hook_ops)); + nf_unregister_hooks(uoa_nf_hook_ops6, ARRAY_SIZE(uoa_nf_hook_ops6)); #endif - synchronize_net(); + synchronize_net(); + + uoa_stats_exit(); - uoa_stats_exit(); + uoa_map_flush(); + uoa_map_exit(); + + pr_info("UOA module removed\n"); +} - uoa_map_flush(); - uoa_map_exit(); +static int ipv6_hdrlen(const struct sk_buff *skb) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + uint8_t ip6nxt = ip6h->nexthdr; +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) + int ip6_hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip6nxt); +#else + __be16 frag_off; + int ip6_hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &ip6nxt, &frag_off); +#endif - pr_info("UOA module removed\n"); + return (ip6_hdrlen >= 0) ? 
ip6_hdrlen : sizeof(struct ipv6hdr); } module_init(uoa_init); diff --git a/kmod/uoa/uoa_extra.h b/kmod/uoa/uoa_extra.h new file mode 100644 index 000000000..b653db460 --- /dev/null +++ b/kmod/uoa/uoa_extra.h @@ -0,0 +1,65 @@ +#ifndef __UOA_EXTRA_H__ +#define __UOA_EXTRA_H__ + +#ifdef UOA_NEED_EXTRA +#include +#endif + +union inet_addr { + struct in_addr in; + struct in6_addr in6; +}; + +#ifdef UOA_NEED_EXTRA +static inline uint32_t inet_addr_fold(int af, const union inet_addr *addr) +{ + uint32_t addr_fold = 0; + + if (af == AF_INET) { + addr_fold = addr->in.s_addr; + } else if (af == AF_INET6) { + addr_fold = addr->in6.s6_addr32[0] ^ addr->in6.s6_addr32[1] ^ + addr->in6.s6_addr32[2] ^ addr->in6.s6_addr32[3]; + } else { + return 0; + } + + return addr_fold; +} + +static inline bool inet_addr_equal(int af, const union inet_addr *a1, + const union inet_addr *a2) +{ + switch (af) { + case AF_INET: + return a1->in.s_addr == a2->in.s_addr; + case AF_INET6: + return memcmp(a1->in6.s6_addr, a2->in6.s6_addr, 16) == 0; + default: + return memcmp(a1, a2, sizeof(union inet_addr)) == 0; + } +} + +#define IN6_ARE_ADDR_EQUAL(a,b) \ + ((((const uint32_t *) (a))[0] == ((const uint32_t *) (b))[0]) \ + && (((const uint32_t *) (a))[1] == ((const uint32_t *) (b))[1]) \ + && (((const uint32_t *) (a))[2] == ((const uint32_t *) (b))[2]) \ + && (((const uint32_t *) (a))[3] == ((const uint32_t *) (b))[3])) + +static inline bool inet_is_addr_any(int af, const union inet_addr *addr) +{ + switch (af) { + case AF_INET: + return addr->in.s_addr == htonl(INADDR_ANY); + case AF_INET6: + { + struct in6_addr ip6adummy = IN6ADDR_ANY_INIT; + return IN6_ARE_ADDR_EQUAL(&addr->in6, &ip6adummy); + } + default: + return false; + } +} +#endif + +#endif /* ifndef __UOA_EXTRA_H_ */ diff --git a/src/VERSION b/src/VERSION index b53d8c658..322f7953d 100755 --- a/src/VERSION +++ b/src/VERSION @@ -1,9 +1,9 @@ #!/bin/sh - # program: dpvs -# commit: ff3e0b3159bb7562844b332b24178812f5612ea1 -# Jun 21, 2018 +# Feb 26, 2019 +# Features: NAT64, connection redirect -export VERSION=1.6 -export RELEASE=1 +export VERSION=1.7 +export RELEASE=2 echo $VERSION-$RELEASE diff --git a/src/ctrl.c b/src/ctrl.c index bd6d0ed26..e1ac96056 100644 --- a/src/ctrl.c +++ b/src/ctrl.c @@ -68,7 +68,6 @@ struct multicast_wait_list { struct list_head list; }; struct multicast_wait_list mc_wait_list; -rte_rwlock_t mc_wait_lock; /* per-lcore msg queue */ struct rte_ring *msg_ring[DPVS_MAX_LCORE]; @@ -482,9 +481,7 @@ int multicast_msg_send(struct dpvs_msg *msg, uint32_t flags, struct dpvs_multica mc_msg->org_msg = msg; /* save original msg */ INIT_LIST_HEAD(&mc_msg->mq); - rte_rwlock_write_lock(&mc_wait_lock); if (mc_wait_list.free_cnt <= 0) { - rte_rwlock_write_unlock(&mc_wait_lock); RTE_LOG(WARNING, MSGMGR, "%s: multicast msg wait queue full, " "msg dropped and try later...\n", __func__); add_msg_flags(msg, DPVS_MSG_F_STATE_DROP); @@ -492,7 +489,6 @@ int multicast_msg_send(struct dpvs_msg *msg, uint32_t flags, struct dpvs_multica } list_add_tail(&mc_msg->list, &mc_wait_list.list); --mc_wait_list.free_cnt; - rte_rwlock_write_unlock(&mc_wait_lock); if (flags & DPVS_MSG_F_ASYNC) return EDPVS_OK; @@ -1252,8 +1248,6 @@ int ctrl_init(void) { int ret; - rte_rwlock_init(&mc_wait_lock); - ret = msg_init(); if (unlikely(ret < 0)) { RTE_LOG(ERR, MSGMGR, "%s: msg module initialization failed!\n", __func__); diff --git a/src/global_conf.c b/src/global_conf.c index 2beed2758..b586a9f5f 100644 --- a/src/global_conf.c +++ b/src/global_conf.c @@ -78,7 +78,7 @@ static int 
set_log_file(const char *log_file) __func__, log_file); return EDPVS_DPDKAPIFAIL; } - + log_current_time(); return EDPVS_OK; } diff --git a/src/icmp.c b/src/icmp.c index 1de9bee36..55ad44b67 100644 --- a/src/icmp.c +++ b/src/icmp.c @@ -38,7 +38,7 @@ static void icmp_dump_hdr(const struct rte_mbuf *mbuf) lcoreid_t lcore = rte_lcore_id(); fprintf(stderr, "lcore %d port %d icmp type %u code %u id %u seq %u\n", - lcore, mbuf->port, ich->icmp_type, ich->icmp_code, + lcore, mbuf->port, ich->icmp_type, ich->icmp_code, ntohs(ich->icmp_ident), ntohs(ich->icmp_seq_nb)); return; diff --git a/src/inet.c b/src/inet.c index afe56c626..54c7b8148 100644 --- a/src/inet.c +++ b/src/inet.c @@ -127,7 +127,7 @@ int inet_term(void) return EDPVS_OK; } -bool inet_addr_equal(int af, const union inet_addr *a1, +bool inet_addr_equal(int af, const union inet_addr *a1, const union inet_addr *a2) { switch (af) { @@ -166,7 +166,7 @@ int inet_plen_to_mask(int af, uint8_t plen, union inet_addr *mask) } } -int inet_addr_net(int af, const union inet_addr *addr, +int inet_addr_net(int af, const union inet_addr *addr, const union inet_addr *mask, union inet_addr *net) { diff --git a/src/inetaddr.c b/src/inetaddr.c index 299a70078..c19eb99e9 100644 --- a/src/inetaddr.c +++ b/src/inetaddr.c @@ -70,7 +70,7 @@ static uint32_t inline in_addr_hash(struct in_addr *in) return hash % INET_ADDR_HSIZE; } -static inline bool ifa_prefix_check(int af, const union inet_addr *addr, +static inline bool ifa_prefix_check(int af, const union inet_addr *addr, uint8_t plen) { if ((af != AF_INET && af != AF_INET6) @@ -83,7 +83,7 @@ static inline bool ifa_prefix_check(int af, const union inet_addr *addr, } /* zero for infinity lifetime */ -static void ifa_set_lifetime(struct inet_ifaddr *ifa, +static void ifa_set_lifetime(struct inet_ifaddr *ifa, uint32_t valid_lft, uint32_t prefered_lft) { /* XXX: do not support prefered_lft */ @@ -99,8 +99,8 @@ static void ifa_set_lifetime(struct inet_ifaddr *ifa, return; } -static struct inet_ifaddr *__ifa_lookup(struct inet_device *idev, - const union inet_addr *addr, +static struct inet_ifaddr *__ifa_lookup(struct inet_device *idev, + const union inet_addr *addr, uint8_t plen, int af) { struct inet_ifaddr *ifa; @@ -139,7 +139,7 @@ static inline void ___ifa_remove(struct inet_ifaddr *ifa) } /* make lookup and remove atmomic, also cancel the timer */ -static int __ifa_remove(struct inet_device *idev, const union inet_addr *addr, +static int __ifa_remove(struct inet_device *idev, const union inet_addr *addr, uint8_t plen, struct inet_ifaddr **ifa, int af) { struct inet_ifaddr *ent; @@ -162,7 +162,7 @@ static int __ifa_add_route4(struct inet_ifaddr *ifa) int err; union inet_addr net; - err = route_add(&ifa->addr.in, 32, RTF_LOCALIN, + err = route_add(&ifa->addr.in, 32, RTF_LOCALIN, NULL, ifa->idev->dev, NULL, 0, 0); /* may already added by same IP with diff plen */ if (err != EDPVS_OK && err != EDPVS_EXIST) @@ -175,7 +175,7 @@ static int __ifa_add_route4(struct inet_ifaddr *ifa) if (err != EDPVS_OK) goto errout; - err = route_add(&net.in, ifa->plen, RTF_FORWARD, + err = route_add(&net.in, ifa->plen, RTF_FORWARD, NULL, ifa->idev->dev, &ifa->addr.in, 0, 0); /* may already added by another IP */ if (err != EDPVS_OK && err != EDPVS_EXIST) @@ -184,7 +184,7 @@ static int __ifa_add_route4(struct inet_ifaddr *ifa) return EDPVS_OK; errout: - route_del(&ifa->addr.in, ifa->plen, RTF_LOCALIN, + route_del(&ifa->addr.in, ifa->plen, RTF_LOCALIN, NULL, ifa->idev->dev, NULL, 0, 0); return err; } @@ -195,7 +195,7 @@ static int 
__ifa_add_route6(struct inet_ifaddr *ifa) struct in6_addr net; err = route6_add(&ifa->addr.in6, 128, RTF_LOCALIN, - &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev, &in6addr_any, ifa->idev->dev->mtu); if (err != EDPVS_OK && err != EDPVS_EXIST) @@ -237,7 +237,7 @@ static int __ifa_del_route4(struct inet_ifaddr *ifa) int err; union inet_addr net; - err = route_del(&ifa->addr.in, 32, RTF_LOCALIN, + err = route_del(&ifa->addr.in, 32, RTF_LOCALIN, NULL, ifa->idev->dev, NULL, 0, 0); if (err != EDPVS_OK && err != EDPVS_NOTEXIST) RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); @@ -249,7 +249,7 @@ static int __ifa_del_route4(struct inet_ifaddr *ifa) if (err != EDPVS_OK) RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); - err = route_del(&net.in, ifa->plen, RTF_FORWARD, + err = route_del(&net.in, ifa->plen, RTF_FORWARD, NULL, ifa->idev->dev, &ifa->addr.in, 0, 0); if (err != EDPVS_OK && err != EDPVS_NOTEXIST) RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); @@ -263,7 +263,7 @@ static int __ifa_del_route6(struct inet_ifaddr *ifa) struct in6_addr net; err = route6_del(&ifa->addr.in6, 128, RTF_LOCALIN, - &in6addr_any, ifa->idev->dev, + &in6addr_any, ifa->idev->dev, &in6addr_any, ifa->idev->dev->mtu); if (err != EDPVS_OK && err != EDPVS_NOTEXIST) RTE_LOG(WARNING, IFA, "%s: fail to delete route", __func__); @@ -292,7 +292,7 @@ static int ifa_del_route(struct inet_ifaddr *ifa) return EDPVS_NOTSUPP; } -static struct inet_ifmcaddr *__imc_lookup( int af, const struct inet_device *idev, +static struct inet_ifmcaddr *__imc_lookup( int af, const struct inet_device *idev, const union inet_addr *maddr) { struct inet_ifmcaddr *imc; @@ -306,7 +306,7 @@ static struct inet_ifmcaddr *__imc_lookup( int af, const struct inet_device *ide return NULL; } -static int idev_mc_add(int af, struct inet_device *idev, +static int idev_mc_add(int af, struct inet_device *idev, const union inet_addr *maddr) { struct inet_ifmcaddr *imc; @@ -330,7 +330,7 @@ static int idev_mc_add(int af, struct inet_device *idev, return EDPVS_OK; } -static int idev_mc_del(int af, struct inet_device *idev, +static int idev_mc_del(int af, struct inet_device *idev, const union inet_addr *maddr) { struct inet_ifmcaddr *imc; @@ -403,7 +403,7 @@ static int inet_ifaddr_dad_completed(void *arg) } /* change timer callback, refer to 'addrconf_mod_timer' */ -static void inet_ifaddr_mod_timer(struct inet_ifaddr *ifa, +static void inet_ifaddr_mod_timer(struct inet_ifaddr *ifa, enum ifaddr_timer_t what, struct timeval *when) { @@ -411,13 +411,13 @@ static void inet_ifaddr_mod_timer(struct inet_ifaddr *ifa, switch (what) { case INET_DAD: - dpvs_timer_sched(&ifa->timer, when, inet_ifaddr_dad_completed, + dpvs_timer_sched(&ifa->timer, when, inet_ifaddr_dad_completed, ifa, true); break; /* TODO: other timer support */ default: break; - } + } } static void inet_ifaddr_dad_stop(struct inet_ifaddr *ifa, int dad_failed) @@ -463,8 +463,8 @@ static void inet_ifaddr_dad_start(struct inet_ifaddr *ifa) ndisc_send_dad(ifa->idev->dev, &ifa->addr.in6); } -/* - * no need to rollback, dpvs can not start successfully; +/* + * no need to rollback, dpvs can not start successfully; * should not be init in 'inetaddr_init'; * because multicast address should be added after port_start */ @@ -555,9 +555,9 @@ static int ifa_expire(void *arg) return DTIMER_STOP; } -static int ifa_add_set(int af, const struct netif_port *dev, +static int ifa_add_set(int af, const struct netif_port *dev, const union inet_addr *addr, uint8_t plen, - const union 
inet_addr *bcast, + const union inet_addr *bcast, uint32_t valid_lft, uint32_t prefered_lft, uint8_t scope, uint32_t flags, bool create) { @@ -565,6 +565,7 @@ static int ifa_add_set(int af, const struct netif_port *dev, struct inet_ifaddr *ifa = NULL; struct timeval timeo = {0}; int err; + char addr_str[64]; if (!dev || !ifa_prefix_check(af, addr, plen)) return EDPVS_INVAL; @@ -573,6 +574,9 @@ static int ifa_add_set(int af, const struct netif_port *dev, if (!idev) return EDPVS_RESOURCE; + inet_ntop(af, &addr->in.s_addr, addr_str, sizeof(addr_str)); + RTE_LOG(INFO, IFA, "try to add %s in %s \n", addr_str, __func__); + rte_rwlock_write_lock(&in_addr_lock); ifa = __ifa_lookup(idev, addr, plen, af); @@ -583,7 +587,7 @@ static int ifa_add_set(int af, const struct netif_port *dev, err = EDPVS_NOTEXIST; goto errout; } - + if (!ifa) { ifa = rte_calloc(NULL, 1, sizeof(*ifa), RTE_CACHE_LINE_SIZE); if (!ifa) { @@ -610,7 +614,7 @@ static int ifa_add_set(int af, const struct netif_port *dev, /* set routes for local and network */ err = ifa_add_route(ifa); - if (err != EDPVS_OK) + if (err != EDPVS_OK && err != EDPVS_EXIST) goto del_mc; err = __ifa_insert(idev, ifa); @@ -678,26 +682,27 @@ static int ifa_add_set(int af, const struct netif_port *dev, errout: rte_rwlock_write_unlock(&in_addr_lock); idev_put(idev); + RTE_LOG(WARNING, IFA, "add %s in %s failed\n", addr_str, __func__); return err; } -int inet_addr_add(int af, const struct netif_port *dev, +int inet_addr_add(int af, const struct netif_port *dev, const union inet_addr *addr, uint8_t plen, - const union inet_addr *bcast, + const union inet_addr *bcast, uint32_t valid_lft, uint32_t prefered_lft, uint8_t scope, uint32_t flags) { - return ifa_add_set(af, dev, addr, plen, bcast, valid_lft, prefered_lft, + return ifa_add_set(af, dev, addr, plen, bcast, valid_lft, prefered_lft, scope, flags, true); } -int inet_addr_mod(int af, const struct netif_port *dev, +int inet_addr_mod(int af, const struct netif_port *dev, const union inet_addr *addr, uint8_t plen, - const union inet_addr *bcast, + const union inet_addr *bcast, uint32_t valid_lft, uint32_t prefered_lft, uint8_t scope) { - return ifa_add_set(af, dev, addr, plen, bcast, valid_lft, prefered_lft, + return ifa_add_set(af, dev, addr, plen, bcast, valid_lft, prefered_lft, scope, 0, false); } @@ -707,6 +712,7 @@ int inet_addr_del(int af, struct netif_port *dev, struct inet_ifaddr *ifa; struct inet_device *idev; int err; + char addr_str[64]; if (!dev || !ifa_prefix_check(af, addr, plen)) return EDPVS_INVAL; @@ -730,6 +736,9 @@ int inet_addr_del(int af, struct netif_port *dev, } rte_rwlock_write_unlock(&in_addr_lock); + inet_ntop(af, &addr->in.s_addr, addr_str, sizeof(addr_str)); + RTE_LOG(INFO, IFA, "del %s in %s \n", addr_str, __func__); + idev_put(idev); return err; } @@ -756,8 +765,8 @@ int inet_addr_flush(int af, struct netif_port *dev) INIT_LIST_HEAD(&ifa->h_list); if (rte_atomic32_read(&ifa->refcnt) > 2) { - RTE_LOG(ERR, IFA, "%s: address %s/%d is in use\n", __func__, - inet_ntop(af, &ifa->addr, buf, sizeof(buf)) ? buf : "::", + RTE_LOG(ERR, IFA, "%s: address %s/%d is in use\n", __func__, + inet_ntop(af, &ifa->addr, buf, sizeof(buf)) ? 
buf : "::", ifa->plen); continue; } @@ -810,8 +819,8 @@ struct netif_port *inet_addr_get_iface(int af, union inet_addr *addr) return dev; } -void inet_addr_select(int af, const struct netif_port *dev, - const union inet_addr *dst, int scope, +void inet_addr_select(int af, const struct netif_port *dev, + const union inet_addr *dst, int scope, union inet_addr *addr) { struct inet_device *idev = dev_get_idev(dev); @@ -833,7 +842,7 @@ void inet_addr_select(int af, const struct netif_port *dev, /* for each primary address */ if (af == AF_INET) { list_for_each_entry(ifa, &idev->ifa_list, d_list) { - if ((ifa->flags & IFA_F_SECONDARY) || + if ((ifa->flags & IFA_F_SECONDARY) || (ifa->flags & IFA_F_TENTATIVE)) continue; if (ifa->scope > scope) @@ -901,7 +910,7 @@ bool inet_chk_mcast_addr(int af, struct netif_port *dev, if (af != AF_INET6) return true; - + idev = dev_get_idev(dev); if (idev) { @@ -916,7 +925,7 @@ bool inet_chk_mcast_addr(int af, struct netif_port *dev, ret = true; } } - + rte_rwlock_read_unlock(&in_addr_lock); idev_put(idev); } @@ -942,23 +951,23 @@ static int ifa_sockopt_set(sockoptid_t opt, const void *conf, size_t size) dev = netif_port_get_by_name(param->ifname); if (!dev) { - RTE_LOG(WARNING, IFA, "%s: no such device: %s\n", + RTE_LOG(WARNING, IFA, "%s: no such device: %s\n", __func__, param->ifname); return EDPVS_NOTEXIST; } switch (opt) { case SOCKOPT_SET_IFADDR_ADD: - return inet_addr_add(param->af, dev, ¶m->addr, param->plen, - ¶m->bcast, param->valid_lft, + return inet_addr_add(param->af, dev, ¶m->addr, param->plen, + ¶m->bcast, param->valid_lft, param->prefered_lft, param->scope, param->flags); case SOCKOPT_SET_IFADDR_DEL: return inet_addr_del(param->af, dev, ¶m->addr, param->plen); case SOCKOPT_SET_IFADDR_SET: - return inet_addr_mod(param->af, dev, ¶m->addr, param->plen, - ¶m->bcast, param->valid_lft, + return inet_addr_mod(param->af, dev, ¶m->addr, param->plen, + ¶m->bcast, param->valid_lft, param->prefered_lft, param->scope); case SOCKOPT_SET_IFADDR_FLUSH: @@ -1001,7 +1010,7 @@ static void ifa_fill_param(int af, struct inet_addr_param *param, } } -static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, +static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, void **out, size_t *outsize) { const struct inet_addr_param *param = conf; @@ -1017,7 +1026,7 @@ static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, if (opt != SOCKOPT_GET_IFADDR_SHOW) return EDPVS_NOTSUPP; - if (param->af != AF_INET && + if (param->af != AF_INET && param->af != AF_UNSPEC && param->af != AF_INET6) return EDPVS_NOTSUPP; @@ -1025,7 +1034,7 @@ static int ifa_sockopt_get(sockoptid_t opt, const void *conf, size_t size, if (strlen(param->ifname)) { dev = netif_port_get_by_name(param->ifname); if (!dev) { - RTE_LOG(WARNING, IFA, "%s: no such device: %s\n", + RTE_LOG(WARNING, IFA, "%s: no such device: %s\n", __func__, param->ifname); return EDPVS_NOTEXIST; } diff --git a/src/ip_gre.c b/src/ip_gre.c index abbadb104..8d1d60778 100644 --- a/src/ip_gre.c +++ b/src/ip_gre.c @@ -52,47 +52,47 @@ static struct ip_tunnel_tab gre_tunnel_tab; /* linux: gre_flags_to_tnl_flags */ static inline __be16 flags_gre2tnl(__be16 flags) { - __be16 tflags = 0; - - if (flags & GRE_F_CSUM) - tflags |= TUNNEL_F_CSUM; - if (flags & GRE_F_ROUTING) - tflags |= TUNNEL_F_ROUTING; - if (flags & GRE_F_KEY) - tflags |= TUNNEL_F_KEY; - if (flags & GRE_F_SEQ) - tflags |= TUNNEL_F_SEQ; - if (flags & GRE_F_STRICT) - tflags |= TUNNEL_F_STRICT; - if (flags & GRE_F_REC) - tflags |= 
TUNNEL_F_REC; - if (flags & GRE_F_VERSION) - tflags |= TUNNEL_F_VERSION; - - return tflags; + __be16 tflags = 0; + + if (flags & GRE_F_CSUM) + tflags |= TUNNEL_F_CSUM; + if (flags & GRE_F_ROUTING) + tflags |= TUNNEL_F_ROUTING; + if (flags & GRE_F_KEY) + tflags |= TUNNEL_F_KEY; + if (flags & GRE_F_SEQ) + tflags |= TUNNEL_F_SEQ; + if (flags & GRE_F_STRICT) + tflags |= TUNNEL_F_STRICT; + if (flags & GRE_F_REC) + tflags |= TUNNEL_F_REC; + if (flags & GRE_F_VERSION) + tflags |= TUNNEL_F_VERSION; + + return tflags; } /* linux: gre_tnl_flags_to_gre_flags */ static inline __be16 flags_tnl2gre(__be16 tflags) { - __be16 flags = 0; - - if (tflags & TUNNEL_F_CSUM) - flags |= GRE_F_CSUM; - if (tflags & TUNNEL_F_ROUTING) - flags |= GRE_F_ROUTING; - if (tflags & TUNNEL_F_KEY) - flags |= GRE_F_KEY; - if (tflags & TUNNEL_F_SEQ) - flags |= GRE_F_SEQ; - if (tflags & TUNNEL_F_STRICT) - flags |= GRE_F_STRICT; - if (tflags & TUNNEL_F_REC) - flags |= GRE_F_REC; - if (tflags & TUNNEL_F_VERSION) - flags |= GRE_F_VERSION; - - return flags; + __be16 flags = 0; + + if (tflags & TUNNEL_F_CSUM) + flags |= GRE_F_CSUM; + if (tflags & TUNNEL_F_ROUTING) + flags |= GRE_F_ROUTING; + if (tflags & TUNNEL_F_KEY) + flags |= GRE_F_KEY; + if (tflags & TUNNEL_F_SEQ) + flags |= GRE_F_SEQ; + if (tflags & TUNNEL_F_STRICT) + flags |= GRE_F_STRICT; + if (tflags & TUNNEL_F_REC) + flags |= GRE_F_REC; + if (tflags & TUNNEL_F_VERSION) + flags |= GRE_F_VERSION; + + return flags; } static inline __be16 gre_checksum(struct rte_mbuf *mbuf) diff --git a/src/ip_tunnel.c b/src/ip_tunnel.c index 69a87ec2b..4608a401b 100644 --- a/src/ip_tunnel.c +++ b/src/ip_tunnel.c @@ -392,7 +392,7 @@ static int tunnel_so_set(sockoptid_t opt, const void *arg, size_t inlen) return EDPVS_INVAL; } } - + if (!ops && (opt == SOCKOPT_TUNNEL_ADD || opt == SOCKOPT_TUNNEL_REPLACE)) { RTE_LOG(ERR, TUNNEL, "%s: cannot determine tunnel mode\n", __func__); return EDPVS_INVAL; @@ -456,7 +456,7 @@ static int tunnel_so_get(sockoptid_t opt, const void *arg, size_t inlen, assert(params && inlen >= sizeof(*params) && out && outlen); rte_rwlock_read_lock(&ip_tunnel_lock); - + /* device name is indicated */ if (strlen(params->ifname)) { dev = netif_port_get_by_name(params->ifname); @@ -687,7 +687,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_tab *tab, remote != tnl->params.iph.daddr || !(tnl->dev->flag & NETIF_PORT_FLAG_RUNNING)) continue; - + if (!tunnel_key_match(&tnl->params, flags, key)) continue; diff --git a/src/ipv4.c b/src/ipv4.c index e5e1a9282..9e619f933 100644 --- a/src/ipv4.c +++ b/src/ipv4.c @@ -532,7 +532,8 @@ int ipv4_xmit(struct rte_mbuf *mbuf, const struct flow4 *fl4) /* output route decision: out-dev, source address, ... */ rt = route4_output(fl4); - if (!rt) { + /* not support loopback */ + if (!rt || !(rt->flag & RTF_FORWARD)) { rte_pktmbuf_free(mbuf); IP4_INC_STATS(outnoroutes); return EDPVS_NOROUTE; diff --git a/src/ipv4_frag.c b/src/ipv4_frag.c index abf84dacd..6a49a18a3 100644 --- a/src/ipv4_frag.c +++ b/src/ipv4_frag.c @@ -29,11 +29,11 @@ #define IP4FRAG #define RTE_LOGTYPE_IP4FRAG RTE_LOGTYPE_USER1 -#define IP4FRAG_PREFETCH_OFFSET 3 +#define IP4FRAG_PREFETCH_OFFSET 3 struct ipv4_frag { - struct rte_ip_frag_tbl *reasm_tbl; - struct rte_ip_frag_death_row death_tbl; /* frags to be free */ + struct rte_ip_frag_tbl *reasm_tbl; + struct rte_ip_frag_death_row death_tbl; /* frags to be free */ }; /* parameters */ @@ -156,12 +156,12 @@ void install_ip4_frag_keywords(void) /* * per-lcore reassamble table. 
* - * RTE_DEFINE_PER_LCORE has no way to traverse the table + * RTE_DEFINE_PER_LCORE has no way to traverse the table * it need to use rte_eal_mp_remote_launch with additional func. * that's not straightforward, so let's use array. */ static struct ipv4_frag ip4_frags[RTE_MAX_LCORE]; -#define this_ip4_frag (ip4_frags[rte_socket_id()]) +#define this_ip4_frag (ip4_frags[rte_socket_id()]) /* * change mbuf in-place or have to change proto-type @@ -170,266 +170,266 @@ static struct ipv4_frag ip4_frags[RTE_MAX_LCORE]; */ int ipv4_reassamble(struct rte_mbuf *mbuf) { - struct rte_mbuf *asm_mbuf, *next, *seg, *prev; - struct ipv4_hdr *iph = ip4_hdr(mbuf); - - assert(mbuf->l3_len > 0); - - /* dpdk frag lib need mbuf->data_off of fragments - * start with l2 header if exist. */ - rte_pktmbuf_prepend(mbuf, mbuf->l2_len); - - asm_mbuf = rte_ipv4_frag_reassemble_packet( - this_ip4_frag.reasm_tbl, - &this_ip4_frag.death_tbl, - mbuf, rte_rdtsc(), iph); - - if (!asm_mbuf) /* no way to distinguish error and in-progress */ - return EDPVS_INPROGRESS; - - rte_pktmbuf_adj(asm_mbuf, mbuf->l2_len); - - /* as kernel, make this frag as heading mbuf. - * the latest fragment (mbuf) should be linear. */ - - /* now mbuf is a seg of asm_mbuf, replace it with a new seg. */ - if ((seg = rte_pktmbuf_alloc(mbuf->pool)) == NULL) { - RTE_LOG(ERR, IP4FRAG, "%s: no memory.", __func__); - rte_pktmbuf_free(asm_mbuf); - return EDPVS_NOMEM; - } - for (prev = asm_mbuf; prev; prev = prev->next) - if (prev->next == mbuf) - break; - if (!prev) { - RTE_LOG(ERR, IP4FRAG, "%s: mbuf is not a seg.", __func__); - rte_pktmbuf_free(asm_mbuf); - rte_pktmbuf_free(seg); - return EDPVS_NOMEM; - } - memcpy(rte_pktmbuf_mtod(seg, void *), - rte_pktmbuf_mtod(mbuf, void *), mbuf->data_len); - seg->data_len = mbuf->data_len; - seg->pkt_len = mbuf->pkt_len; - prev->next = seg; - seg->next = mbuf->next; - mbuf->next = NULL; - - /* make mbuf as heading frag. */ - if (!rte_pktmbuf_is_contiguous(mbuf)) { - RTE_LOG(ERR, IP4FRAG, "%s: mbuf is not linear.", __func__); - rte_pktmbuf_free(asm_mbuf); - return EDPVS_NOROOM; - } - - if (mbuf->data_off + asm_mbuf->data_len > mbuf->buf_len) { - RTE_LOG(ERR, IP4FRAG, "%s: no room.", __func__); - rte_pktmbuf_free(asm_mbuf); - return EDPVS_NOROOM; - } - - memcpy(rte_pktmbuf_mtod(mbuf, void *), - rte_pktmbuf_mtod(asm_mbuf, void *), asm_mbuf->data_len); - mbuf->data_len = asm_mbuf->data_len; - mbuf->pkt_len = mbuf->data_len; - - /* move segs to new heading mbuf. */ - prev = mbuf; - mbuf_foreach_seg_safe(asm_mbuf, next, seg) { - assert(asm_mbuf->next == seg); - - asm_mbuf->next = next; - asm_mbuf->nb_segs--; - asm_mbuf->pkt_len -= seg->data_len; - - prev->next = seg; - prev = seg; - mbuf->nb_segs++; - mbuf->pkt_len += seg->data_len; - } - - /* now asm_mbuf has no segs */ - rte_pktmbuf_free(asm_mbuf); - return EDPVS_OK; + struct rte_mbuf *asm_mbuf, *next, *seg, *prev; + struct ipv4_hdr *iph = ip4_hdr(mbuf); + + assert(mbuf->l3_len > 0); + + /* dpdk frag lib need mbuf->data_off of fragments + * start with l2 header if exist. */ + rte_pktmbuf_prepend(mbuf, mbuf->l2_len); + + asm_mbuf = rte_ipv4_frag_reassemble_packet( + this_ip4_frag.reasm_tbl, + &this_ip4_frag.death_tbl, + mbuf, rte_rdtsc(), iph); + + if (!asm_mbuf) /* no way to distinguish error and in-progress */ + return EDPVS_INPROGRESS; + + rte_pktmbuf_adj(asm_mbuf, mbuf->l2_len); + + /* as kernel, make this frag as heading mbuf. + * the latest fragment (mbuf) should be linear. */ + + /* now mbuf is a seg of asm_mbuf, replace it with a new seg. 
*/ + if ((seg = rte_pktmbuf_alloc(mbuf->pool)) == NULL) { + RTE_LOG(ERR, IP4FRAG, "%s: no memory.", __func__); + rte_pktmbuf_free(asm_mbuf); + return EDPVS_NOMEM; + } + for (prev = asm_mbuf; prev; prev = prev->next) + if (prev->next == mbuf) + break; + if (!prev) { + RTE_LOG(ERR, IP4FRAG, "%s: mbuf is not a seg.", __func__); + rte_pktmbuf_free(asm_mbuf); + rte_pktmbuf_free(seg); + return EDPVS_NOMEM; + } + memcpy(rte_pktmbuf_mtod(seg, void *), + rte_pktmbuf_mtod(mbuf, void *), mbuf->data_len); + seg->data_len = mbuf->data_len; + seg->pkt_len = mbuf->pkt_len; + prev->next = seg; + seg->next = mbuf->next; + mbuf->next = NULL; + + /* make mbuf as heading frag. */ + if (!rte_pktmbuf_is_contiguous(mbuf)) { + RTE_LOG(ERR, IP4FRAG, "%s: mbuf is not linear.", __func__); + rte_pktmbuf_free(asm_mbuf); + return EDPVS_NOROOM; + } + + if (mbuf->data_off + asm_mbuf->data_len > mbuf->buf_len) { + RTE_LOG(ERR, IP4FRAG, "%s: no room.", __func__); + rte_pktmbuf_free(asm_mbuf); + return EDPVS_NOROOM; + } + + memcpy(rte_pktmbuf_mtod(mbuf, void *), + rte_pktmbuf_mtod(asm_mbuf, void *), asm_mbuf->data_len); + mbuf->data_len = asm_mbuf->data_len; + mbuf->pkt_len = mbuf->data_len; + + /* move segs to new heading mbuf. */ + prev = mbuf; + mbuf_foreach_seg_safe(asm_mbuf, next, seg) { + assert(asm_mbuf->next == seg); + + asm_mbuf->next = next; + asm_mbuf->nb_segs--; + asm_mbuf->pkt_len -= seg->data_len; + + prev->next = seg; + prev = seg; + mbuf->nb_segs++; + mbuf->pkt_len += seg->data_len; + } + + /* now asm_mbuf has no segs */ + rte_pktmbuf_free(asm_mbuf); + return EDPVS_OK; } /* this function consumes mbuf also free route. */ int ipv4_fragment(struct rte_mbuf *mbuf, unsigned int mtu, - int (*output)(struct rte_mbuf *)) + int (*output)(struct rte_mbuf *)) { - struct ipv4_hdr *iph = ip4_hdr(mbuf); - struct route_entry *rt = mbuf->userdata; - struct rte_mbuf *frag; - unsigned int left, len, hlen; - int offset, err, from; - void *to; - assert(rt); - - if (iph->fragment_offset & IPV4_HDR_DF_FLAG) { - icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - err = EDPVS_FRAG; - goto out; - } - - hlen = ip4_hdrlen(mbuf); - mtu -= hlen; /* IP payload space */ - left = mbuf->pkt_len - hlen; - from = hlen; - offset = 0; - - while (left > 0) { - len = left < mtu ? left : mtu; /* min(left, mtu) */ - - /* if we are not last frag, - * ensure next start on eight byte boundary */ - if (len < left) - len &= ~7; - - /* mbuf should have enough headroom, - * but no way to extend tail room. 
*/ - frag = rte_pktmbuf_alloc(mbuf->pool); - if (!frag) { - err = EDPVS_NOMEM; - goto out; - } - - /* copy metadata from orig pkt */ - route4_get(rt); - frag->userdata = rt; /* no need to hold before consume mbuf */ - frag->port = mbuf->port; - frag->ol_flags = 0; /* do not offload csum for frag */ - frag->l2_len = mbuf->l2_len; - frag->l3_len = mbuf->l3_len; - - /* copy IP header */ - if (unlikely((to = rte_pktmbuf_append(frag, hlen)) == NULL) - || mbuf_copy_bits(mbuf, 0, to, hlen) != 0) { - err = EDPVS_NOROOM; - route4_put(rt); - rte_pktmbuf_free(frag); - goto out; - } - - /* copy data block */ - if (unlikely((to = rte_pktmbuf_append(frag, len)) == NULL) - || mbuf_copy_bits(mbuf, from, to, len) != 0) { - err = EDPVS_NOROOM; - route4_put(rt); - rte_pktmbuf_free(frag); - goto out; - } - left -= len; - - /* adjust new IP header fields */ - iph = ip4_hdr(frag); - iph->fragment_offset = htons(offset >> 3); - /* TODO: if (offset == 0) ip_fragment_options(frag); */ - - if (left > 0) - iph->fragment_offset |= htons(IPV4_HDR_MF_FLAG); - offset += len; - from += len; - - iph->total_length = htons(len + hlen); - ip4_send_csum(iph); - - /* consumes frag and it's route */ - err = output(frag); - if (err != EDPVS_OK) - goto out; - - IP4_INC_STATS(fragcreates); - } - - err = EDPVS_OK; + struct ipv4_hdr *iph = ip4_hdr(mbuf); + struct route_entry *rt = mbuf->userdata; + struct rte_mbuf *frag; + unsigned int left, len, hlen; + int offset, err, from; + void *to; + assert(rt); + + if (iph->fragment_offset & IPV4_HDR_DF_FLAG) { + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + err = EDPVS_FRAG; + goto out; + } + + hlen = ip4_hdrlen(mbuf); + mtu -= hlen; /* IP payload space */ + left = mbuf->pkt_len - hlen; + from = hlen; + offset = 0; + + while (left > 0) { + len = left < mtu ? left : mtu; /* min(left, mtu) */ + + /* if we are not last frag, + * ensure next start on eight byte boundary */ + if (len < left) + len &= ~7; + + /* mbuf should have enough headroom, + * but no way to extend tail room. 
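+     * every fragment is therefore built in a fresh mbuf: the hlen-byte IP
+     * header is copied first, then at most mtu payload bytes (mtu was
+     * reduced to the payload space above).  non-last fragments are rounded
+     * down to a multiple of 8 because fragment_offset is carried in 8-byte
+     * units, e.g. a 1500-byte link MTU with a 20-byte header gives 1480
+     * payload bytes per full fragment, already 8-byte aligned.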
*/ + frag = rte_pktmbuf_alloc(mbuf->pool); + if (!frag) { + err = EDPVS_NOMEM; + goto out; + } + + /* copy metadata from orig pkt */ + route4_get(rt); + frag->userdata = rt; /* no need to hold before consume mbuf */ + frag->port = mbuf->port; + frag->ol_flags = 0; /* do not offload csum for frag */ + frag->l2_len = mbuf->l2_len; + frag->l3_len = mbuf->l3_len; + + /* copy IP header */ + if (unlikely((to = rte_pktmbuf_append(frag, hlen)) == NULL) + || mbuf_copy_bits(mbuf, 0, to, hlen) != 0) { + err = EDPVS_NOROOM; + route4_put(rt); + rte_pktmbuf_free(frag); + goto out; + } + + /* copy data block */ + if (unlikely((to = rte_pktmbuf_append(frag, len)) == NULL) + || mbuf_copy_bits(mbuf, from, to, len) != 0) { + err = EDPVS_NOROOM; + route4_put(rt); + rte_pktmbuf_free(frag); + goto out; + } + left -= len; + + /* adjust new IP header fields */ + iph = ip4_hdr(frag); + iph->fragment_offset = htons(offset >> 3); + /* TODO: if (offset == 0) ip_fragment_options(frag); */ + + if (left > 0) + iph->fragment_offset |= htons(IPV4_HDR_MF_FLAG); + offset += len; + from += len; + + iph->total_length = htons(len + hlen); + ip4_send_csum(iph); + + /* consumes frag and it's route */ + err = output(frag); + if (err != EDPVS_OK) + goto out; + + IP4_INC_STATS(fragcreates); + } + + err = EDPVS_OK; out: - route4_put(rt); - rte_pktmbuf_free(mbuf); - if (err == EDPVS_OK) - IP4_INC_STATS(fragoks); - else - IP4_INC_STATS(fragfails); - return err; + route4_put(rt); + rte_pktmbuf_free(mbuf); + if (err == EDPVS_OK) + IP4_INC_STATS(fragoks); + else + IP4_INC_STATS(fragfails); + return err; } static void ipv4_frag_job(void *arg) { - struct ipv4_frag *f = &ip4_frags[rte_lcore_id()]; + struct ipv4_frag *f = &ip4_frags[rte_lcore_id()]; - rte_ip_frag_free_death_row(&f->death_tbl, IP4FRAG_PREFETCH_OFFSET); - return; + rte_ip_frag_free_death_row(&f->death_tbl, IP4FRAG_PREFETCH_OFFSET); + return; } static struct netif_lcore_loop_job frag_job; int ipv4_frag_init(void) { - lcoreid_t cid; - int socket_id; /* NUMA-socket ID */ - uint64_t max_cycles; - int err; - struct ipv4_frag *f4; - - if (ip4_frag_bucket_entries <=0 || - ip4_frag_max_entries > ip4_frag_buckets * ip4_frag_bucket_entries) { - RTE_LOG(WARNING, IP4FRAG, "invalid ip4_frag_max_entries %d (should be no " - "bigger than ip4_frag_buckets(%d) * ip4_frag_bucket_entries(%d), using " - "%d instead\n", ip4_frag_max_entries, - ip4_frag_buckets, ip4_frag_bucket_entries, - ip4_frag_buckets * ip4_frag_bucket_entries / 2); - ip4_frag_max_entries = ip4_frag_buckets * ip4_frag_bucket_entries / 2; - } - - /* this magic expression comes from DPDK ip_reassembly example */ - max_cycles = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S * - (ip4_frag_ttl * MS_PER_S); - - for (cid = 0; cid < RTE_MAX_LCORE; cid++) { - if (!rte_lcore_is_enabled(cid)) - continue; - - f4 = &ip4_frags[cid]; - memset(f4, 0, sizeof(struct ipv4_frag)); - socket_id = rte_lcore_to_socket_id(cid); - - f4->reasm_tbl = rte_ip_frag_table_create( - ip4_frag_buckets, - ip4_frag_bucket_entries, - ip4_frag_max_entries, - max_cycles, - socket_id); - if (!f4->reasm_tbl) { - RTE_LOG(ERR, IP4FRAG, - "[%d] fail to create frag table.\n", cid); - return EDPVS_DPDKAPIFAIL; - } - } - - snprintf(frag_job.name, sizeof(frag_job.name) - 1, "%s", "ipv4_frag"); - frag_job.func = ipv4_frag_job; - frag_job.data = NULL; - frag_job.type = NETIF_LCORE_JOB_SLOW; - frag_job.skip_loops = IP4_FRAG_FREE_DEATH_ROW_INTERVAL; - err = netif_lcore_loop_job_register(&frag_job); - if (err != EDPVS_OK) { - RTE_LOG(ERR, IP4FRAG, "fail to register loop job.\n"); - 
return err; - } - - return EDPVS_OK; + lcoreid_t cid; + int socket_id; /* NUMA-socket ID */ + uint64_t max_cycles; + int err; + struct ipv4_frag *f4; + + if (ip4_frag_bucket_entries <=0 || + ip4_frag_max_entries > ip4_frag_buckets * ip4_frag_bucket_entries) { + RTE_LOG(WARNING, IP4FRAG, "invalid ip4_frag_max_entries %d (should be no " + "bigger than ip4_frag_buckets(%d) * ip4_frag_bucket_entries(%d), using " + "%d instead\n", ip4_frag_max_entries, + ip4_frag_buckets, ip4_frag_bucket_entries, + ip4_frag_buckets * ip4_frag_bucket_entries / 2); + ip4_frag_max_entries = ip4_frag_buckets * ip4_frag_bucket_entries / 2; + } + + /* this magic expression comes from DPDK ip_reassembly example */ + max_cycles = (rte_get_tsc_hz() + MS_PER_S - 1) / MS_PER_S * + (ip4_frag_ttl * MS_PER_S); + + for (cid = 0; cid < RTE_MAX_LCORE; cid++) { + if (!rte_lcore_is_enabled(cid)) + continue; + + f4 = &ip4_frags[cid]; + memset(f4, 0, sizeof(struct ipv4_frag)); + socket_id = rte_lcore_to_socket_id(cid); + + f4->reasm_tbl = rte_ip_frag_table_create( + ip4_frag_buckets, + ip4_frag_bucket_entries, + ip4_frag_max_entries, + max_cycles, + socket_id); + if (!f4->reasm_tbl) { + RTE_LOG(ERR, IP4FRAG, + "[%d] fail to create frag table.\n", cid); + return EDPVS_DPDKAPIFAIL; + } + } + + snprintf(frag_job.name, sizeof(frag_job.name) - 1, "%s", "ipv4_frag"); + frag_job.func = ipv4_frag_job; + frag_job.data = NULL; + frag_job.type = NETIF_LCORE_JOB_SLOW; + frag_job.skip_loops = IP4_FRAG_FREE_DEATH_ROW_INTERVAL; + err = netif_lcore_loop_job_register(&frag_job); + if (err != EDPVS_OK) { + RTE_LOG(ERR, IP4FRAG, "fail to register loop job.\n"); + return err; + } + + return EDPVS_OK; } int ipv4_frag_term(void) { - int err; + int err; - err = netif_lcore_loop_job_unregister(&frag_job); - if (err != EDPVS_OK) { - RTE_LOG(ERR, IP4FRAG, "fail to unregister loop job.\n"); - return err; - } + err = netif_lcore_loop_job_unregister(&frag_job); + if (err != EDPVS_OK) { + RTE_LOG(ERR, IP4FRAG, "fail to unregister loop job.\n"); + return err; + } - return EDPVS_OK; + return EDPVS_OK; } diff --git a/src/ipv6/icmp6.c b/src/ipv6/icmp6.c index 2a4d12bba..231d6024b 100644 --- a/src/ipv6/icmp6.c +++ b/src/ipv6/icmp6.c @@ -134,7 +134,7 @@ void icmp6_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) struct inet_ifaddr *ifa; int room, err; int addr_type = 0; - + ifa = inet_addr_ifa_get(AF_INET6, netif_port_get(imbuf->port), (union inet_addr *)&iph->ip6_dst); if (ifa) { @@ -153,7 +153,7 @@ void icmp6_send(struct rte_mbuf *imbuf, int type, int code, uint32_t info) !(type == ICMP6_PARAM_PROB && code == ICMP6_PARAMPROB_OPTION && (icmp6_opt_unrec(imbuf, info)))) { - + RTE_LOG(DEBUG, ICMP6, "%s: l2 broadcast or l3 multicast don't support the error.\n", __func__); diff --git a/src/ipv6/ipv6.c b/src/ipv6/ipv6.c index d3f451ba4..18e1dd70a 100644 --- a/src/ipv6/ipv6.c +++ b/src/ipv6/ipv6.c @@ -190,7 +190,7 @@ static int ip6_local_in_fin(struct rte_mbuf *mbuf) /* check mcast, if failed, kni may like it. 
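     * i.e. a multicast packet whose group this port has not joined is not
     * dropped here; it falls through to the kni device so the kernel stack
     * can still see it.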
*/ if (ipv6_addr_is_multicast(&hdr->ip6_dst) && !inet_chk_mcast_addr(AF_INET6, netif_port_get(mbuf->port), - (union inet_addr *)&hdr->ip6_dst, + (union inet_addr *)&hdr->ip6_dst, (union inet_addr *)&hdr->ip6_src)) { rte_rwlock_read_unlock(&inet6_prot_lock); goto kni; @@ -246,7 +246,7 @@ static int ip6_mc_local_in(struct rte_mbuf *mbuf) IP6_UPD_PO_STATS(inmcast, mbuf->pkt_len); - if (inet_chk_mcast_addr(AF_INET6, netif_port_get(mbuf->port), + if (inet_chk_mcast_addr(AF_INET6, netif_port_get(mbuf->port), (union inet_addr *)&iph->ip6_dst, NULL)) return ip6_local_in(mbuf); else @@ -690,7 +690,7 @@ int ipv6_xmit(struct rte_mbuf *mbuf, struct flow6 *fl6) if (unlikely(ipv6_addr_is_multicast(&fl6->fl6_daddr))) { /* only support linklocal now */ - if (IPV6_ADDR_MC_SCOPE(&fl6->fl6_daddr) + if (IPV6_ADDR_MC_SCOPE(&fl6->fl6_daddr) != IPV6_ADDR_SCOPE_LINKLOCAL) { IP6_INC_STATS(outnoroutes); rte_pktmbuf_free(mbuf); diff --git a/src/ipv6/ipv6_ctrl.c b/src/ipv6/ipv6_ctrl.c index 5c6abb47d..7134817d0 100644 --- a/src/ipv6/ipv6_ctrl.c +++ b/src/ipv6/ipv6_ctrl.c @@ -43,7 +43,7 @@ static int ip6_msg_get_stats(struct dpvs_msg *msg) rte_free(stats); return err; } - + msg->reply.len = sizeof(*stats); msg->reply.data = stats; diff --git a/src/ipv6/ndisc.c b/src/ipv6/ndisc.c index 16cd0e378..a96701e44 100644 --- a/src/ipv6/ndisc.c +++ b/src/ipv6/ndisc.c @@ -48,7 +48,7 @@ struct nd_msg { #define __ND_OPT_ARRAY_MAX 256 struct ndisc_options { - struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; + struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; struct nd_opt_hdr *nd_useropts; struct nd_opt_hdr *nd_useropts_end; }; @@ -61,7 +61,7 @@ struct ndisc_options { #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] /* ipv6 neighbour */ -static inline uint8_t *ndisc_opt_addr_data(struct nd_opt_hdr *p, +static inline uint8_t *ndisc_opt_addr_data(struct nd_opt_hdr *p, struct netif_port *dev) { uint8_t *lladdr = (uint8_t *)(p + 1); @@ -147,11 +147,11 @@ static struct ndisc_options *ndisc_parse_options(uint8_t *opt, int opt_len, return ndopts; } -static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev, +static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev, const struct in6_addr *daddr, - const struct in6_addr *saddr, + const struct in6_addr *saddr, const struct icmp6_hdr *icmp6h, - const struct in6_addr *target, + const struct in6_addr *target, int llinfo) { struct rte_mbuf *mbuf; @@ -196,9 +196,9 @@ static struct rte_mbuf *ndisc_build_mbuf(struct netif_port *dev, return mbuf; } -static void ndisc_send_na(struct netif_port *dev, - const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, +static void ndisc_send_na(struct netif_port *dev, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, int solicited, int override, int inc_opt) { struct inet_ifaddr *ifa; @@ -225,7 +225,7 @@ static void ndisc_send_na(struct netif_port *dev, icmp6h.icmp6_pptr |= ND_NA_FLAG_OVERRIDE; /*ndisc*/ - mbuf = ndisc_build_mbuf(dev, daddr, src_addr, &icmp6h, solicited_addr, + mbuf = ndisc_build_mbuf(dev, daddr, src_addr, &icmp6h, solicited_addr, inc_opt ? 
ND_OPT_TARGET_LINKADDR : 0); if (!mbuf) return; @@ -241,9 +241,9 @@ static void ndisc_send_na(struct netif_port *dev, } /* saddr can be 0 in ns for dad in addrconf_dad_timer */ -static void ndisc_send_ns(struct netif_port *dev, +static void ndisc_send_ns(struct netif_port *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, + const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rte_mbuf *mbuf; @@ -262,7 +262,7 @@ static void ndisc_send_ns(struct netif_port *dev, memset(&icmp6h, 0, sizeof(icmp6h)); icmp6h.icmp6_type = ND_NEIGHBOR_SOLICIT; - mbuf = ndisc_build_mbuf(dev, daddr, saddr, &icmp6h, solicit, + mbuf = ndisc_build_mbuf(dev, daddr, saddr, &icmp6h, solicit, !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LINKADDR : 0); if (!mbuf) return; @@ -277,7 +277,7 @@ static void ndisc_send_ns(struct netif_port *dev, ipv6_xmit(mbuf, &fl6); } -void ndisc_send_dad(struct netif_port *dev, +void ndisc_send_dad(struct netif_port *dev, const struct in6_addr* solicit) { struct in6_addr mcaddr; @@ -285,7 +285,7 @@ void ndisc_send_dad(struct netif_port *dev, ndisc_send_ns(dev, solicit, &mcaddr, &in6addr_any); } -void ndisc_solicit(struct neighbour_entry *neigh, +void ndisc_solicit(struct neighbour_entry *neigh, const struct in6_addr *saddr) { struct in6_addr mcaddr; @@ -364,7 +364,7 @@ static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev) inc = ipv6_addr_is_multicast(daddr); - /* + /* * dad response src_addr should be link local, daddr should be multi ff02::1 * optimistic addr not support */ @@ -390,7 +390,7 @@ static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev) neigh_entry_state_trans(neigh, 1); neigh_sync_core(neigh, 1, NEIGH_ENTRY); } else { - neigh = neigh_add_table(AF_INET6, (union inet_addr *)saddr, + neigh = neigh_add_table(AF_INET6, (union inet_addr *)saddr, (struct ether_addr *)lladdr, dev, hashkey, 0); if (!neigh){ RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); @@ -400,7 +400,7 @@ static int ndisc_recv_ns(struct rte_mbuf *mbuf, struct netif_port *dev) neigh_sync_core(neigh, 1, NEIGH_ENTRY); } neigh_send_mbuf_cach(neigh); - + ndisc_send_na(dev, saddr, &msg->target, 1, inc, inc); @@ -447,10 +447,10 @@ static int ndisc_recv_na(struct rte_mbuf *mbuf, struct netif_port *dev) RTE_LOG(ERR, NEIGHBOUR, "ICMPv6 NA: someone advertises our address.\n"); if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)) { inet_ifaddr_dad_failure(ifa); - } + } inet_addr_ifa_put(ifa); return EDPVS_KNICONTINUE; - } + } if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); diff --git a/src/ipv6/route6_hlist.c b/src/ipv6/route6_hlist.c index 3c6b3c796..b27f8a8c0 100644 --- a/src/ipv6/route6_hlist.c +++ b/src/ipv6/route6_hlist.c @@ -101,7 +101,7 @@ static inline int rt6_hlist_hashkey(const struct in6_addr *addr, int plen, int n static inline bool rt6_match(const struct route6 *rt6, const struct dp_vs_route6_conf *cf) { - /* Note: Do not use `ipv6_masked_addr_cmp` here for performance consideration + /* Note: Do not use `ipv6_masked_addr_cmp` here for performance consideration * here. We ensure the route6 entry is masked when added to route table. 
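     * since every stored prefix is already masked to its plen when it is
     * inserted, a plain ipv6_addr_cmp of the pre-masked addresses is enough
     * here and the per-lookup masking cost is avoided.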
*/ if (ipv6_addr_cmp(&rt6->rt6_dst.addr, &cf->dst.addr) != 0) return false; diff --git a/src/ipvs/ip_vs_blklst.c b/src/ipvs/ip_vs_blklst.c index d035f1390..2a903d5af 100644 --- a/src/ipvs/ip_vs_blklst.c +++ b/src/ipvs/ip_vs_blklst.c @@ -88,7 +88,7 @@ static int dp_vs_blklst_add_lcore(uint8_t proto, const union inet_addr *vaddr, new = rte_zmalloc("new_blklst_entry", sizeof(struct blklst_entry), 0); if (new == NULL) return EDPVS_NOMEM; - + memcpy(&new->vaddr, vaddr,sizeof(union inet_addr)); new->vport = vport; new->proto = proto; @@ -103,13 +103,13 @@ static int dp_vs_blklst_del_lcore(uint8_t proto, const union inet_addr *vaddr, uint16_t vport, const union inet_addr *blklst) { struct blklst_entry *blklst_node; - + blklst_node = dp_vs_blklst_lookup(proto, vaddr, vport, blklst); if (blklst_node != NULL) { list_del(&blklst_node->list); rte_free(blklst_node); rte_atomic32_dec(&this_num_blklsts); - return EDPVS_OK; + return EDPVS_OK; } return EDPVS_NOTEXIST; } @@ -121,7 +121,7 @@ static int dp_vs_blklst_add(uint8_t proto, const union inet_addr *vaddr, int err; struct dpvs_msg *msg; struct dp_vs_blklst_conf cf; - + if (cid != rte_get_master_lcore()) { RTE_LOG(INFO, SERVICE, "[%s] must set from master lcore\n", __func__); return EDPVS_NOTSUPP; @@ -131,7 +131,7 @@ static int dp_vs_blklst_add(uint8_t proto, const union inet_addr *vaddr, memcpy(&(cf.vaddr), vaddr,sizeof(union inet_addr)); memcpy(&(cf.blklst), blklst, sizeof(union inet_addr)); cf.vport = vport; - cf.proto = proto; + cf.proto = proto; /*set blklst ip on master lcore*/ err = dp_vs_blklst_add_lcore(proto, vaddr, vport, blklst); @@ -145,7 +145,7 @@ static int dp_vs_blklst_add(uint8_t proto, const union inet_addr *vaddr, cid, sizeof(struct dp_vs_blklst_conf), &cf); if (!msg) return EDPVS_NOMEM; - err = multicast_msg_send(msg, 0, NULL); + err = multicast_msg_send(msg, 0, NULL); if (err != EDPVS_OK) { msg_destroy(&msg); RTE_LOG(INFO, SERVICE, "[%s] fail to send multicast message\n", __func__); @@ -209,7 +209,7 @@ void dp_vs_blklst_flush(struct dp_vs_service *svc) entry->vport, &entry->blklst); } } - return; + return; } static void dp_vs_blklst_flush_all(void) @@ -219,14 +219,14 @@ static void dp_vs_blklst_flush_all(void) for (hash = 0; hash < DPVS_BLKLST_TAB_SIZE; hash++) { list_for_each_entry_safe(entry, next, &this_blklst_tab[hash], list) { - dp_vs_blklst_del(entry->proto, &entry->vaddr, + dp_vs_blklst_del(entry->proto, &entry->vaddr, entry->vport, &entry->blklst); } } return; } -/* +/* * for control plane */ static int blklst_sockopt_set(sockoptid_t opt, const void *conf, size_t size) @@ -239,7 +239,7 @@ static int blklst_sockopt_set(sockoptid_t opt, const void *conf, size_t size) switch (opt) { case SOCKOPT_SET_BLKLST_ADD: - err = dp_vs_blklst_add(blklst_conf->proto, &blklst_conf->vaddr, + err = dp_vs_blklst_add(blklst_conf->proto, &blklst_conf->vaddr, blklst_conf->vport, &blklst_conf->blklst); break; case SOCKOPT_SET_BLKLST_DEL: @@ -265,7 +265,7 @@ static void blklst_fill_conf(int af, struct dp_vs_blklst_conf *cf, cf->vport = entry->vport; } -static int blklst_sockopt_get(sockoptid_t opt, const void *conf, size_t size, +static int blklst_sockopt_get(sockoptid_t opt, const void *conf, size_t size, void **out, size_t *outsize) { struct dp_vs_blklst_conf_array *array; @@ -274,7 +274,7 @@ static int blklst_sockopt_get(sockoptid_t opt, const void *conf, size_t size, int off = 0; naddr = rte_atomic32_read(&this_num_blklsts); - *outsize = sizeof(struct dp_vs_blklst_conf_array) + + *outsize = sizeof(struct dp_vs_blklst_conf_array) + naddr * 
sizeof(struct dp_vs_blklst_conf); *out = rte_calloc_socket(NULL, 1, *outsize, 0, rte_socket_id()); if (!(*out)) @@ -288,7 +288,7 @@ static int blklst_sockopt_get(sockoptid_t opt, const void *conf, size_t size, break; blklst_fill_conf(AF_INET, &array->blklsts[off++], entry); } - } + } return EDPVS_OK; } @@ -299,7 +299,7 @@ static int blklst_msg_process(bool add, struct dpvs_msg *msg) struct dp_vs_blklst_conf *cf; int err; assert(msg); - + if (msg->len != sizeof(struct dp_vs_blklst_conf)){ RTE_LOG(ERR, SERVICE, "%s: bad message.\n", __func__); return EDPVS_INVAL; @@ -341,7 +341,7 @@ static int blklst_lcore_init(void *args) { int i; if (!rte_lcore_is_enabled(rte_lcore_id())) - return EDPVS_DISABLED; + return EDPVS_DISABLED; this_blklst_tab = rte_malloc_socket(NULL, sizeof(struct list_head) * DPVS_BLKLST_TAB_SIZE, RTE_CACHE_LINE_SIZE, rte_socket_id()); @@ -349,7 +349,7 @@ static int blklst_lcore_init(void *args) return EDPVS_NOMEM; for (i = 0; i < DPVS_BLKLST_TAB_SIZE; i++) - INIT_LIST_HEAD(&this_blklst_tab[i]); + INIT_LIST_HEAD(&this_blklst_tab[i]); return EDPVS_OK; } @@ -361,7 +361,7 @@ static int blklst_lcore_term(void *args) dp_vs_blklst_flush_all(); - if (this_blklst_tab) { + if (this_blklst_tab) { rte_free(this_blklst_tab); this_blklst_tab = NULL; } @@ -384,7 +384,7 @@ int dp_vs_blklst_init(void) return err; } } - + memset(&msg_type, 0, sizeof(struct dpvs_msg_type)); msg_type.type = MSG_TYPE_BLKLST_ADD; msg_type.mode = DPVS_MSG_MULTICAST; @@ -410,7 +410,7 @@ int dp_vs_blklst_init(void) if ((err = sockopt_register(&blklst_sockopts)) != EDPVS_OK) return err; dp_vs_blklst_rnd = (uint32_t)random(); - + return EDPVS_OK; } diff --git a/src/ipvs/ip_vs_conhash.c b/src/ipvs/ip_vs_conhash.c index 1339b1f39..2c95bf7cb 100644 --- a/src/ipvs/ip_vs_conhash.c +++ b/src/ipvs/ip_vs_conhash.c @@ -16,20 +16,35 @@ * */ +#include #include #include "ipv4.h" #include "ipv6.h" #include "libconhash/conhash.h" #include "ipvs/conhash.h" +struct conhash_node { + struct list_head list; + uint16_t weight; + int af; /* address family */ + union inet_addr addr; /* IP address of the server */ + uint16_t port; /* port number of the server */ + struct node_s node; /* node in libconhash */ +}; + +struct conhash_sched_data { + struct list_head nodes; /* node list */ + struct conhash_s *conhash; /* consistent hash meta data */ +}; + #define REPLICA 160 #define QUIC_PACKET_8BYTE_CONNECTION_ID (1 << 3) -/* +/* * QUIC CID hash target for quic* * QUIC CID(qid) should be configured in UDP service */ -static int get_quic_hash_target(int af, const struct rte_mbuf *mbuf, +static int get_quic_hash_target(int af, const struct rte_mbuf *mbuf, uint64_t *quic_cid) { uint8_t pub_flags; @@ -44,14 +59,14 @@ static int get_quic_hash_target(int af, const struct rte_mbuf *mbuf, } else udphoff = ip4_hdrlen(mbuf); - + quic_len = udphoff + sizeof(struct udp_hdr) + sizeof(pub_flags) + sizeof(*quic_cid); if (mbuf_may_pull((struct rte_mbuf *)mbuf, quic_len) != 0) return EDPVS_NOTEXIST; - quic_data = rte_pktmbuf_mtod_offset(mbuf, char *, + quic_data = rte_pktmbuf_mtod_offset(mbuf, char *, udphoff + sizeof(struct udp_hdr)); pub_flags = *((uint8_t *)quic_data); @@ -67,7 +82,7 @@ static int get_quic_hash_target(int af, const struct rte_mbuf *mbuf, } /*source ip hash target*/ -static int get_sip_hash_target(int af, const struct rte_mbuf *mbuf, +static int get_sip_hash_target(int af, const struct rte_mbuf *mbuf, uint32_t *addr_fold) { if (af == AF_INET) { @@ -122,87 +137,251 @@ dp_vs_conhash_get(struct dp_vs_service *svc, struct conhash_s *conhash, return 
node == NULL? NULL: node->data; } -/* - * Assign dest to connhash. - */ -static int -dp_vs_conhash_assign(struct dp_vs_service *svc) +static void node_fini(struct node_s *node) { - struct dp_vs_dest *dest; - struct node_s *p_node; - uint32_t addr_fold; - int weight = 0; + struct conhash_node *p_conhash_node = NULL; + + if (!node) + return; + + if (node->data) { + rte_atomic32_dec(&(((struct dp_vs_dest *)(node->data))->refcnt)); + node->data = NULL; + } + + p_conhash_node = container_of(node, struct conhash_node, node); + list_del(&(p_conhash_node->list)); + rte_free(p_conhash_node); +} + +static int dp_vs_conhash_add_dest(struct dp_vs_service *svc, + struct dp_vs_dest *dest) +{ + int ret; char str[40]; + uint32_t addr_fold; + int16_t weight = 0; + struct node_s *p_node; + struct conhash_node *p_conhash_node; + struct conhash_sched_data *p_sched_data; - list_for_each_entry(dest, &svc->dests, n_list) { - weight = rte_atomic16_read(&dest->weight); - if (weight > 0) { - p_node = rte_zmalloc("p_node", sizeof(struct node_s), RTE_CACHE_LINE_SIZE); - if (p_node == NULL) { - return EDPVS_NOMEM; + p_sched_data = (struct conhash_sched_data *)(svc->sched_data); + + weight = rte_atomic16_read(&dest->weight); + if (weight < 0) { + RTE_LOG(ERR, SERVICE, "%s: add dest with weight(%d) less than 0\n", + __func__, weight); + return EDPVS_INVAL; + } + + p_conhash_node = rte_zmalloc(NULL, sizeof(struct conhash_node), + RTE_CACHE_LINE_SIZE); + if (!p_conhash_node) { + RTE_LOG(ERR, SERVICE, "%s: alloc conhash node failed\n", __func__); + return EDPVS_NOMEM; + } + + INIT_LIST_HEAD(&(p_conhash_node->list)); + p_conhash_node->af = dest->af; + p_conhash_node->addr = dest->addr; + p_conhash_node->port = dest->port; + p_conhash_node->weight = weight; + + // add node to conhash + p_node = &(p_conhash_node->node); + addr_fold = inet_addr_fold(dest->af, &dest->addr); + snprintf(str, sizeof(str), "%u%d", addr_fold, dest->port); + + conhash_set_node(p_node, str, weight * REPLICA); + ret = conhash_add_node(p_sched_data->conhash, p_node); + if (ret < 0) { + RTE_LOG(ERR, SERVICE, "%s: conhash_add_node failed\n", __func__); + rte_free(p_conhash_node); + return EDPVS_INVAL; + } + + // set node data + rte_atomic32_inc(&dest->refcnt); + p_node->data = dest; + + // add conhash node to list + list_add(&(p_conhash_node->list), &(p_sched_data->nodes)); + + return EDPVS_OK; +} + +static int dp_vs_conhash_del_dest(struct dp_vs_service *svc, + struct dp_vs_dest *dest) +{ + int ret; + struct node_s *p_node; + struct conhash_node *p_conhash_node; + struct conhash_sched_data *p_sched_data; + + p_sched_data = (struct conhash_sched_data *)(svc->sched_data); + + list_for_each_entry(p_conhash_node, &(p_sched_data->nodes), list) { + if (p_conhash_node->af == dest->af && + inet_addr_equal(dest->af, &p_conhash_node->addr, &dest->addr) && + p_conhash_node->port == dest->port) { + p_node = &(p_conhash_node->node); + ret = conhash_del_node(p_sched_data->conhash, p_node); + if (ret < 0) { + RTE_LOG(ERR, SERVICE, "%s: conhash_del_node failed\n", __func__); + return EDPVS_INVAL; } + node_fini(p_node); + return EDPVS_OK; + } + } + + return EDPVS_NOTEXIST; +} - rte_atomic32_inc(&dest->refcnt); - p_node->data = dest; +static int dp_vs_conhash_edit_dest(struct dp_vs_service *svc, + struct dp_vs_dest *dest) +{ + int ret; + char str[40]; + uint32_t addr_fold; + int16_t weight; + struct node_s *p_node; + struct conhash_node *p_conhash_node; + struct conhash_sched_data *p_sched_data; + + weight = rte_atomic16_read(&dest->weight); + p_sched_data = (struct 
conhash_sched_data *)(svc->sched_data); + + // find node by addr and port + list_for_each_entry(p_conhash_node, &(p_sched_data->nodes), list) { + if (p_conhash_node->af == dest->af && + inet_addr_equal(dest->af, &p_conhash_node->addr, &dest->addr) && + p_conhash_node->port == dest->port) { + if (p_conhash_node->weight == weight) + return EDPVS_OK; + + // del from conhash + p_node = &(p_conhash_node->node); + ret = conhash_del_node(p_sched_data->conhash, p_node); + if (ret < 0) { + RTE_LOG(ERR, SERVICE, "%s: conhash_del_node failed\n", __func__); + return EDPVS_INVAL; + } - addr_fold = inet_addr_fold(dest->af, &dest->addr); - snprintf(str, sizeof(str), "%u%d", addr_fold, dest->port); + // adjust weight + p_conhash_node->weight = weight; + addr_fold = inet_addr_fold(dest->af, &dest->addr); + snprintf(str, sizeof(str), "%u%d", addr_fold, dest->port); + conhash_set_node(p_node, str, weight * REPLICA); + + // add to conhash again + ret = conhash_add_node(p_sched_data->conhash, p_node); + if (ret < 0) { + RTE_LOG(ERR, SERVICE, "%s: conhash_set_node failed\n", __func__); + return EDPVS_INVAL; + } - conhash_set_node(p_node, str, weight*REPLICA); - conhash_add_node(svc->sched_data, p_node); + return EDPVS_OK; } } - return EDPVS_OK; + + return EDPVS_NOTEXIST; } -static void node_fini(struct node_s *node) +/* + * Assign dest to connhash. + */ +static int +dp_vs_conhash_assign(struct dp_vs_service *svc) { - if (!node) - return; + int err; + struct dp_vs_dest *dest; - if (node->data) { - rte_atomic32_dec(&(((struct dp_vs_dest *)(node->data))->refcnt)); - node->data = NULL; + list_for_each_entry(dest, &svc->dests, n_list) { + err = dp_vs_conhash_add_dest(svc, dest); + if (err != EDPVS_OK) { + RTE_LOG(ERR, SERVICE, "%s: add dest to conhash failed\n", __func__); + return err; + } } - - rte_free(node); + + return EDPVS_OK; } static int dp_vs_conhash_init_svc(struct dp_vs_service *svc) { - svc->sched_data = conhash_init(NULL); + struct conhash_sched_data *sched_data = NULL; + + svc->sched_data = NULL; - if (!svc->sched_data) { + // alloc schedule data + sched_data = rte_zmalloc(NULL, sizeof(struct conhash_sched_data), + RTE_CACHE_LINE_SIZE); + if (!sched_data) { + RTE_LOG(ERR, SERVICE, "%s: alloc schedule data faild\n", __func__); + return EDPVS_NOMEM; + } + + // init conhash + sched_data->conhash = conhash_init(NULL); + if (!sched_data->conhash) { RTE_LOG(ERR, SERVICE, "%s: conhash init faild!\n", __func__); + rte_free(sched_data); return EDPVS_NOMEM; } - dp_vs_conhash_assign(svc); + // init node list + INIT_LIST_HEAD(&(sched_data->nodes)); - return EDPVS_OK; + // assign node + svc->sched_data = sched_data; + return dp_vs_conhash_assign(svc); } static int dp_vs_conhash_done_svc(struct dp_vs_service *svc) { - conhash_fini(svc->sched_data, node_fini); - - return EDPVS_OK; -} + struct conhash_sched_data *sched_data = + (struct conhash_sched_data *)(svc->sched_data); + struct conhash_node *p_conhash_node, *p_conhash_node_next; -static int dp_vs_conhash_update_svc(struct dp_vs_service *svc) -{ - conhash_fini(svc->sched_data, node_fini); + conhash_fini(sched_data->conhash, node_fini); - svc->sched_data = conhash_init(NULL); + // del nodes left in list when rs weight is 0 + list_for_each_entry_safe(p_conhash_node, p_conhash_node_next, + &(sched_data->nodes), list) { + node_fini(&(p_conhash_node->node)); + } - dp_vs_conhash_assign(svc); + rte_free(svc->sched_data); + svc->sched_data = NULL; - return 0; + return EDPVS_OK; } -static inline int is_overloaded(struct dp_vs_dest *dest) +static int 
dp_vs_conhash_update_svc(struct dp_vs_service *svc, + struct dp_vs_dest *dest, sockoptid_t opt) { - return dest->flags & DPVS_DEST_F_OVERLOAD; + int ret; + + switch (opt) { + case DPVS_SO_SET_ADDDEST: + ret = dp_vs_conhash_add_dest(svc, dest); + break; + case DPVS_SO_SET_DELDEST: + ret = dp_vs_conhash_del_dest(svc, dest); + break; + case DPVS_SO_SET_EDITDEST: + ret = dp_vs_conhash_edit_dest(svc, dest); + break; + default: + ret = EDPVS_INVAL; + break; + } + + if (ret != EDPVS_OK) + RTE_LOG(ERR, SERVICE, "%s: update service faild!\n", __func__); + + return ret; } /* @@ -212,18 +391,12 @@ static struct dp_vs_dest * dp_vs_conhash_schedule(struct dp_vs_service *svc, const struct rte_mbuf *mbuf) { struct dp_vs_dest *dest; + struct conhash_sched_data *sched_data = + (struct conhash_sched_data *)(svc->sched_data); - dest = dp_vs_conhash_get(svc, (struct conhash_s *)svc->sched_data, mbuf); - - if (!dest - || !(dest->flags & DPVS_DEST_F_AVAILABLE) - || rte_atomic16_read(&dest->weight) <= 0 - || is_overloaded(dest)) { + dest = dp_vs_conhash_get(svc, sched_data->conhash, mbuf); - return NULL; - } - else - return dest; + return dp_vs_dest_is_valid(dest) ? dest : NULL; } /* diff --git a/src/ipvs/ip_vs_conn.c b/src/ipvs/ip_vs_conn.c index 470899fad..6173dd0cb 100644 --- a/src/ipvs/ip_vs_conn.c +++ b/src/ipvs/ip_vs_conn.c @@ -36,41 +36,44 @@ #include "conf/conn.h" #include "sys_time.h" -#define DPVS_CONN_TAB_BITS 20 -#define DPVS_CONN_TAB_SIZE (1 << DPVS_CONN_TAB_BITS) -#define DPVS_CONN_TAB_MASK (DPVS_CONN_TAB_SIZE - 1) +#define DPVS_CONN_TBL_BITS 20 +#define DPVS_CONN_TBL_SIZE (1 << DPVS_CONN_TBL_BITS) +#define DPVS_CONN_TBL_MASK (DPVS_CONN_TBL_SIZE - 1) /* too big ? adjust according to free mem ?*/ #define DPVS_CONN_POOL_SIZE_DEF 2097152 #define DPVS_CONN_POOL_SIZE_MIN 65536 -static int conn_pool_size = DPVS_CONN_POOL_SIZE_DEF; #define DPVS_CONN_CACHE_SIZE_DEF 256 + +static int conn_pool_size = DPVS_CONN_POOL_SIZE_DEF; static int conn_pool_cache = DPVS_CONN_CACHE_SIZE_DEF; #define DPVS_CONN_INIT_TIMEOUT_DEF 3 /* sec */ static int conn_init_timeout = DPVS_CONN_INIT_TIMEOUT_DEF; /* helpers */ -#define this_conn_tab (RTE_PER_LCORE(dp_vs_conn_tab)) +#define this_conn_tbl (RTE_PER_LCORE(dp_vs_conn_tbl)) #ifdef CONFIG_DPVS_IPVS_CONN_LOCK -#define this_conn_lock (RTE_PER_LCORE(dp_vs_conn_lock)) +#define this_conn_lock (RTE_PER_LCORE(dp_vs_conn_lock)) #endif -#define this_conn_count (RTE_PER_LCORE(dp_vs_conn_count)) -#define this_conn_cache (dp_vs_conn_cache[rte_socket_id()]) +#define this_conn_count (RTE_PER_LCORE(dp_vs_conn_count)) +#define this_conn_cache (dp_vs_conn_cache[rte_socket_id()]) /* dpvs control variables */ static bool conn_expire_quiescent_template = false; +bool dp_vs_redirect_disable = true; + /* * per-lcore dp_vs_conn{} hash table. 
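     * each enabled, non-idle worker lcore owns a private bucket array of
     * DPVS_CONN_TBL_SIZE (1 << DPVS_CONN_TBL_BITS) list heads; buckets are
     * picked with dp_vs_conn_hashkey() masked by DPVS_CONN_TBL_MASK, while
     * template connections go to the shared dp_vs_ct_tbl under dp_vs_ct_lock.
     *
     * lookup sketch (names as used elsewhere in this file):
     *
     *   hash = dp_vs_conn_hashkey(af, saddr, sport, daddr, dport,
     *                             DPVS_CONN_TBL_MASK);
     *   list_for_each_entry(tuphash, &this_conn_tbl[hash], list)
     *       ... match the tuple, then tuplehash_to_conn(tuphash) ...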
*/ -static RTE_DEFINE_PER_LCORE(struct list_head *, dp_vs_conn_tab); +static RTE_DEFINE_PER_LCORE(struct list_head *, dp_vs_conn_tbl); #ifdef CONFIG_DPVS_IPVS_CONN_LOCK static RTE_DEFINE_PER_LCORE(rte_spinlock_t, dp_vs_conn_lock); #endif /* global connection template table */ -static struct list_head *dp_vs_ct_tab; +static struct list_head *dp_vs_ct_tbl; static rte_spinlock_t dp_vs_ct_lock; static RTE_DEFINE_PER_LCORE(uint32_t, dp_vs_conn_count); @@ -82,49 +85,94 @@ static uint32_t dp_vs_conn_rnd; /* hash random */ */ static struct rte_mempool *dp_vs_conn_cache[DPVS_MAX_SOCKET]; +static struct dp_vs_conn *dp_vs_conn_alloc(void) +{ + struct dp_vs_conn *conn; + + if (unlikely(rte_mempool_get(this_conn_cache, (void **)&conn) != 0)) { + RTE_LOG(ERR, IPVS, "%s: no memory for connection\n", __func__); + return NULL; + } + + memset(conn, 0, sizeof(struct dp_vs_conn)); + conn->connpool = this_conn_cache; + this_conn_count++; + + return conn; +} + +static void dp_vs_conn_free(struct dp_vs_conn *conn) +{ + if (!conn) + return; + + dp_vs_redirect_free(conn); + + rte_mempool_put(conn->connpool, conn); + this_conn_count--; +} + static inline struct dp_vs_conn * tuplehash_to_conn(const struct conn_tuple_hash *thash) { return container_of(thash, struct dp_vs_conn, tuplehash[thash->direct]); } -static inline uint32_t conn_hashkey(int af, - const union inet_addr *saddr, uint16_t sport, - const union inet_addr *daddr, uint16_t dport) +inline uint32_t dp_vs_conn_hashkey(int af, + const union inet_addr *saddr, uint16_t sport, + const union inet_addr *daddr, uint16_t dport, + uint32_t mask) { - if (AF_INET == af) + switch (af) { + case AF_INET: return rte_jhash_3words((uint32_t)saddr->in.s_addr, (uint32_t)daddr->in.s_addr, ((uint32_t)sport) << 16 | (uint32_t)dport, - dp_vs_conn_rnd) & DPVS_CONN_TAB_MASK; - - if (AF_INET6 == af) { - uint32_t vect[9]; - vect[0] = ((uint32_t)sport) << 16 | (uint32_t)dport; - memcpy(&vect[1], &saddr->in6, 16); - memcpy(&vect[5], &daddr->in6, 16); - return rte_jhash_32b(vect, 9, dp_vs_conn_rnd) & DPVS_CONN_TAB_MASK; - } + dp_vs_conn_rnd) & mask; + + case AF_INET6: + { + uint32_t vect[9]; + + vect[0] = ((uint32_t)sport) << 16 | (uint32_t)dport; + memcpy(&vect[1], &saddr->in6, 16); + memcpy(&vect[5], &daddr->in6, 16); + + return rte_jhash_32b(vect, 9, dp_vs_conn_rnd) & mask; + } - RTE_LOG(WARNING, IPVS, "%s: hashing unsupported protocol %d\n", __func__, af); - return 0; + default: + RTE_LOG(WARNING, IPVS, "%s: hashing unsupported protocol %d\n", __func__, af); + return 0; + } } -static inline int __conn_hash(struct dp_vs_conn *conn, - uint32_t ihash, uint32_t ohash) +static inline int __dp_vs_conn_hash(struct dp_vs_conn *conn, uint32_t mask) { + uint32_t ihash, ohash; + if (unlikely(conn->flags & DPVS_CONN_F_HASHED)) return EDPVS_EXIST; + ihash = dp_vs_conn_hashkey(tuplehash_in(conn).af, + &tuplehash_in(conn).saddr, tuplehash_in(conn).sport, + &tuplehash_in(conn).daddr, tuplehash_in(conn).dport, + mask); + + ohash = dp_vs_conn_hashkey(tuplehash_out(conn).af, + &tuplehash_out(conn).saddr, tuplehash_out(conn).sport, + &tuplehash_out(conn).daddr, tuplehash_out(conn).dport, + mask); + if (conn->flags & DPVS_CONN_F_TEMPLATE) { /* lock is complusory for template */ rte_spinlock_lock(&dp_vs_ct_lock); - list_add(&tuplehash_in(conn).list, &dp_vs_ct_tab[ihash]); - list_add(&tuplehash_out(conn).list, &dp_vs_ct_tab[ohash]); + list_add(&tuplehash_in(conn).list, &dp_vs_ct_tbl[ihash]); + list_add(&tuplehash_out(conn).list, &dp_vs_ct_tbl[ohash]); rte_spinlock_unlock(&dp_vs_ct_lock); } else { - 
list_add(&tuplehash_in(conn).list, &this_conn_tab[ihash]); - list_add(&tuplehash_out(conn).list, &this_conn_tab[ohash]); + list_add(&tuplehash_in(conn).list, &this_conn_tbl[ihash]); + list_add(&tuplehash_out(conn).list, &this_conn_tbl[ohash]); } conn->flags |= DPVS_CONN_F_HASHED; @@ -133,31 +181,26 @@ static inline int __conn_hash(struct dp_vs_conn *conn, return EDPVS_OK; } -static inline int conn_hash(struct dp_vs_conn *conn) +static inline int dp_vs_conn_hash(struct dp_vs_conn *conn) { - uint32_t ihash, ohash; int err; - ihash = conn_hashkey(conn->af, - &tuplehash_in(conn).saddr, tuplehash_in(conn).sport, - &tuplehash_in(conn).daddr, tuplehash_in(conn).dport); - - ohash = conn_hashkey(conn->af, - &tuplehash_out(conn).saddr, tuplehash_out(conn).sport, - &tuplehash_out(conn).daddr, tuplehash_out(conn).dport); - #ifdef CONFIG_DPVS_IPVS_CONN_LOCK rte_spinlock_lock(&this_conn_lock); #endif - err = __conn_hash(conn, ihash, ohash); + + err = __dp_vs_conn_hash(conn, DPVS_CONN_TBL_MASK); + #ifdef CONFIG_DPVS_IPVS_CONN_LOCK rte_spinlock_unlock(&this_conn_lock); #endif + dp_vs_redirect_hash(conn); + return err; } -static inline int conn_unhash(struct dp_vs_conn *conn) +static inline int dp_vs_conn_unhash(struct dp_vs_conn *conn) { int err; @@ -168,6 +211,8 @@ static inline int conn_unhash(struct dp_vs_conn *conn) if (rte_atomic32_read(&conn->refcnt) != 2) { err = EDPVS_BUSY; } else { + dp_vs_redirect_unhash(conn); + if (conn->flags & DPVS_CONN_F_TEMPLATE) { rte_spinlock_lock(&dp_vs_ct_lock); list_del(&tuplehash_in(conn).list); @@ -179,6 +224,7 @@ static inline int conn_unhash(struct dp_vs_conn *conn) } conn->flags &= ~DPVS_CONN_F_HASHED; rte_atomic32_dec(&conn->refcnt); + err = EDPVS_OK; } } else { @@ -280,12 +326,12 @@ static inline void conn_dump(const char *msg, struct dp_vs_conn *conn) char cbuf[64], vbuf[64], lbuf[64], dbuf[64]; const char *caddr, *vaddr, *laddr, *daddr; - caddr = inet_ntop(conn->af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::"; - vaddr = inet_ntop(conn->af, &conn->vaddr, vbuf, sizeof(vbuf)) ? vbuf : "::"; - laddr = inet_ntop(conn->af, &conn->laddr, lbuf, sizeof(lbuf)) ? lbuf : "::"; - daddr = inet_ntop(conn->af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; + caddr = inet_ntop(conn->tuplehash[DPVS_CONN_DIR_INBOUND].af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::"; + vaddr = inet_ntop(conn->tuplehash[DPVS_CONN_DIR_INBOUND].af, &conn->vaddr, vbuf, sizeof(vbuf)) ? vbuf : "::"; + laddr = inet_ntop(conn->tuplehash[DPVS_CONN_DIR_OUTBOUND].af, &conn->laddr, lbuf, sizeof(lbuf)) ? lbuf : "::"; + daddr = inet_ntop(conn->tuplehash[DPVS_CONN_DIR_OUTBOUND].af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; - RTE_LOG(DEBUG, IPVS, "%s [%d] %s %s:%u %s:%u %s:%u %s:%u refs %d\n", + RTE_LOG(DEBUG, IPVS, "%s [%d] %s %s/%u %s/%u %s/%u %s/%u refs %d\n", msg ? msg : "", rte_lcore_id(), inet_proto_name(conn->proto), caddr, ntohs(conn->cport), vaddr, ntohs(conn->vport), laddr, ntohs(conn->lport), daddr, ntohs(conn->dport), @@ -301,14 +347,14 @@ static inline void conn_tuplehash_dump(const char *msg, saddr = inet_ntop(t->af, &t->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::"; daddr = inet_ntop(t->af, &t->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; - RTE_LOG(DEBUG, IPVS, "%s%s %s %s:%u->%s:%u\n", + RTE_LOG(DEBUG, IPVS, "%s%s %s %s/%u->%s/%u\n", msg ? msg : "", t->direct == DPVS_CONN_DIR_INBOUND ? 
"in " : "out", inet_proto_name(t->proto), saddr, ntohs(t->sport), daddr, ntohs(t->dport)); } -static inline void conn_tab_dump(void) +static inline void conn_table_dump(void) { int i; struct conn_tuple_hash *tuphash; @@ -319,13 +365,13 @@ static inline void conn_tab_dump(void) rte_spinlock_lock(&this_conn_lock); #endif - for (i = 0; i < DPVS_CONN_TAB_SIZE; i++) { - if (list_empty(&this_conn_tab[i])) + for (i = 0; i < DPVS_CONN_TBL_SIZE; i++) { + if (list_empty(&this_conn_tbl[i])) continue; RTE_LOG(DEBUG, IPVS, " hash %d\n", i); - list_for_each_entry(tuphash, &this_conn_tab[i], list) { + list_for_each_entry(tuphash, &this_conn_tbl[i], list) { conn_tuplehash_dump(" ", tuphash); } } @@ -348,7 +394,7 @@ static inline void conn_stats_dump(const char *msg, struct dp_vs_conn *conn) laddr = inet_ntop(conn->af, &conn->laddr, lbuf, sizeof(lbuf)) ? lbuf : "::"; daddr = inet_ntop(conn->af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; - RTE_LOG(DEBUG, IPVS, "[%s->%s]%s [%d] %s %s:%u %s:%u %s:%u %s:%u" + RTE_LOG(DEBUG, IPVS, "[%s->%s]%s [%d] %s %s/%u %s/%u %s/%u %s/%u" " inpkts=%ld, inbytes=%ld, outpkts=%ld, outbytes=%ld\n", cycles_to_stime(conn->ctime), sys_localtime_str(), msg ? msg : "", rte_lcore_id(), inet_proto_name(conn->proto), @@ -403,7 +449,7 @@ static int conn_expire(void *priv) RTE_LOG(WARNING, IPVS, "%s: no route for syn_proxy rs's syn " "retransmit\n", __func__); } else { - cloned_syn_mbuf = rte_pktmbuf_clone(conn->syn_mbuf, pool); + cloned_syn_mbuf = mbuf_copy(conn->syn_mbuf, pool); if (unlikely(!cloned_syn_mbuf)) { RTE_LOG(WARNING, IPVS, "%s: no memory for syn_proxy rs's syn " "retransmit\n", __func__); @@ -430,7 +476,7 @@ static int conn_expire(void *priv) /* unhash it then no further user can get it, * even we cannot del it now. */ - conn_unhash(conn); + dp_vs_conn_unhash(conn); /* refcnt == 1 means we are the only referer. * no one is using the conn and it's timed out. 
*/ @@ -454,7 +500,7 @@ static int conn_expire(void *priv) memset(&saddr, 0, sizeof(saddr)); memset(&daddr, 0, sizeof(daddr)); if (AF_INET == conn->af) { - struct sockaddr_in *daddr4 = (struct sockaddr_in *)&saddr; + struct sockaddr_in *daddr4 = (struct sockaddr_in *)&daddr; struct sockaddr_in *saddr4 = (struct sockaddr_in *)&saddr; daddr4->sin_family = AF_INET; @@ -500,19 +546,19 @@ static int conn_expire(void *priv) rte_atomic32_dec(&conn->refcnt); - rte_mempool_put(conn->connpool, conn); - this_conn_count--; - #ifdef CONFIG_DPVS_IPVS_STATS_DEBUG conn_stats_dump("del conn", conn); #endif #ifdef CONFIG_DPVS_IPVS_DEBUG conn_dump("del conn: ", conn); #endif + + dp_vs_conn_free(conn); + return DTIMER_STOP; } - conn_hash(conn); + dp_vs_conn_hash(conn); /* some one is using it when expire, * try del it again later */ @@ -534,8 +580,8 @@ static void conn_flush(void) #ifdef CONFIG_DPVS_IPVS_CONN_LOCK rte_spinlock_lock(&this_conn_lock); #endif - for (i = 0; i < DPVS_CONN_TAB_SIZE; i++) { - list_for_each_entry_safe(tuphash, next, &this_conn_tab[i], list) { + for (i = 0; i < DPVS_CONN_TBL_SIZE; i++) { + list_for_each_entry_safe(tuphash, next, &this_conn_tbl[i], list) { conn = tuplehash_to_conn(tuphash); if (conn->flags & DPVS_CONN_F_TEMPLATE) @@ -547,7 +593,7 @@ static void conn_flush(void) if (rte_atomic32_read(&conn->refcnt) != 2) { rte_atomic32_dec(&conn->refcnt); } else { - conn_unhash(conn); + dp_vs_conn_unhash(conn); if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT && conn->proto != IPPROTO_ICMP && @@ -590,8 +636,8 @@ static void conn_flush(void) dp_vs_laddr_unbind(conn); rte_atomic32_dec(&conn->refcnt); - rte_mempool_put(conn->connpool, conn); - this_conn_count--; + dp_vs_conn_free(conn); + #ifdef CONFIG_DPVS_IPVS_STATS_DEBUG conn_stats_dump("conn flush", conn); #endif @@ -609,6 +655,7 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, struct dp_vs_dest *dest, uint32_t flags) { struct dp_vs_conn *new; + struct dp_vs_redirect *new_r = NULL; struct conn_tuple_hash *t; uint16_t rport; __be16 _ports[2], *ports; @@ -616,12 +663,16 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, assert(mbuf && param && dest); - if (unlikely(rte_mempool_get(this_conn_cache, (void **)&new) != 0)) { - RTE_LOG(WARNING, IPVS, "%s: no memory\n", __func__); - return NULL; + /* no need to create redirect for the global template connection */ + if ((flags & DPVS_CONN_F_TEMPLATE) == 0) { + new_r = dp_vs_redirect_alloc(dest->fwdmode); } - memset(new, 0, sizeof(struct dp_vs_conn)); - new->connpool = this_conn_cache; + + new = dp_vs_conn_alloc(); + if (unlikely(!new)) + goto errout_redirect; + + new->redirect = new_r; /* set proper RS port */ if ((flags & DPVS_CONN_F_TEMPLATE) || param->ct_dport != 0) @@ -656,12 +707,13 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, /* init outbound conn tuple hash */ t = &tuplehash_out(new); t->direct = DPVS_CONN_DIR_OUTBOUND; - t->af = param->af; + t->af = dest->af; t->proto = param->proto; - if (dest->fwdmode == DPVS_FWD_MODE_SNAT) + if (dest->fwdmode == DPVS_FWD_MODE_SNAT) { t->saddr = iph->saddr; - else + } else { t->saddr = dest->addr; + } t->sport = rport; t->daddr = *param->caddr; /* non-FNAT */ t->dport = param->cport; /* non-FNAT */ @@ -676,19 +728,22 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, new->vport = param->vport; new->laddr = *param->caddr; /* non-FNAT */ new->lport = param->cport; /* non-FNAT */ - if (dest->fwdmode == DPVS_FWD_MODE_SNAT) { + if (dest->fwdmode == DPVS_FWD_MODE_SNAT) new->daddr = iph->saddr; - } else { + else 
new->daddr = dest->addr; - } new->dport = rport; /* neighbour confirm cache */ - if (AF_INET == param->af) { + if (AF_INET == tuplehash_in(new).af) { new->in_nexthop.in.s_addr = htonl(INADDR_ANY); - new->out_nexthop.in.s_addr = htonl(INADDR_ANY); - } else if (AF_INET6 == param->af) { + } else { new->in_nexthop.in6 = in6addr_any; + } + + if (AF_INET == tuplehash_out(new).af) { + new->out_nexthop.in.s_addr = htonl(INADDR_ANY); + } else { new->out_nexthop.in6 = in6addr_any; } @@ -722,8 +777,11 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, goto unbind_dest; } + /* init redirect if it exists */ + dp_vs_redirect_init(new); + /* add to hash table (dual dir for each bucket) */ - if ((err = conn_hash(new)) != EDPVS_OK) + if ((err = dp_vs_conn_hash(new)) != EDPVS_OK) goto unbind_laddr; /* timer */ @@ -767,8 +825,6 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, new->timeout.tv_sec = pp->timeout_table[new->state = DPVS_TCP_S_SYN_SENT]; } - this_conn_count++; - /* schedule conn timer */ dpvs_time_rand_delay(&new->timeout, 1000000); if (new->flags & DPVS_CONN_F_TEMPLATE) @@ -786,7 +842,9 @@ struct dp_vs_conn *dp_vs_conn_new(struct rte_mbuf *mbuf, unbind_dest: conn_unbind_dest(new); errout: - rte_mempool_put(this_conn_cache, new); + dp_vs_conn_free(new); +errout_redirect: + dp_vs_redirect_free(new); return NULL; } @@ -809,16 +867,19 @@ struct dp_vs_conn *dp_vs_conn_get(int af, uint16_t proto, char sbuf[64], dbuf[64]; #endif - if (unlikely(reverse)) - hash = conn_hashkey(af, daddr, dport, saddr, sport); - else - hash = conn_hashkey(af, saddr, sport, daddr, dport); + if (unlikely(reverse)) { + hash = dp_vs_conn_hashkey(af, daddr, dport, saddr, sport, + DPVS_CONN_TBL_MASK); + } else { + hash = dp_vs_conn_hashkey(af, saddr, sport, daddr, dport, + DPVS_CONN_TBL_MASK); + } #ifdef CONFIG_DPVS_IPVS_CONN_LOCK rte_spinlock_lock(&this_conn_lock); #endif if (unlikely(reverse)) { /* swap source/dest for lookup */ - list_for_each_entry(tuphash, &this_conn_tab[hash], list) { + list_for_each_entry(tuphash, &this_conn_tbl[hash], list) { if (tuphash->sport == dport && tuphash->dport == sport && inet_addr_equal(af, &tuphash->saddr, daddr) @@ -834,7 +895,7 @@ struct dp_vs_conn *dp_vs_conn_get(int af, uint16_t proto, } } } else { - list_for_each_entry(tuphash, &this_conn_tab[hash], list) { + list_for_each_entry(tuphash, &this_conn_tbl[hash], list) { if (tuphash->sport == sport && tuphash->dport == dport && inet_addr_equal(af, &tuphash->saddr, saddr) @@ -855,7 +916,7 @@ struct dp_vs_conn *dp_vs_conn_get(int af, uint16_t proto, #endif #ifdef CONFIG_DPVS_IPVS_DEBUG - RTE_LOG(DEBUG, IPVS, "conn lookup: [%d] %s %s:%d -> %s:%d %s %s\n", + RTE_LOG(DEBUG, IPVS, "conn lookup: [%d] %s %s/%d -> %s/%d %s %s\n", rte_lcore_id(), inet_proto_name(proto), inet_ntop(af, saddr, sbuf, sizeof(sbuf)) ? sbuf : "::", ntohs(sport), inet_ntop(af, daddr, dbuf, sizeof(dbuf)) ? 
dbuf : "::", ntohs(dport), @@ -878,10 +939,11 @@ struct dp_vs_conn *dp_vs_ct_in_get(int af, uint16_t proto, char sbuf[64], dbuf[64]; #endif - hash = conn_hashkey(af, saddr, sport, daddr, dport); + hash = dp_vs_conn_hashkey(af, saddr, sport, daddr, dport, + DPVS_CONN_TBL_MASK); rte_spinlock_lock(&dp_vs_ct_lock); - list_for_each_entry(tuphash, &dp_vs_ct_tab[hash], list) { + list_for_each_entry(tuphash, &dp_vs_ct_tbl[hash], list) { conn = tuplehash_to_conn(tuphash); if (tuphash->sport == sport && tuphash->dport == dport && inet_addr_equal(af, &tuphash->saddr, saddr) @@ -898,7 +960,7 @@ struct dp_vs_conn *dp_vs_ct_in_get(int af, uint16_t proto, rte_spinlock_unlock(&dp_vs_ct_lock); #ifdef CONFIG_DPVS_IPVS_DEBUG - RTE_LOG(DEBUG, IPVS, "conn-template lookup: [%d] %s %s:%d -> %s:%d %s\n", + RTE_LOG(DEBUG, IPVS, "conn-template lookup: [%d] %s %s/%d -> %s/%d %s\n", rte_lcore_id(), inet_proto_name(proto), inet_ntop(af, saddr, sbuf, sizeof(sbuf)) ? sbuf : "::", ntohs(sport), inet_ntop(af, daddr, dbuf, sizeof(dbuf)) ? dbuf : "::", ntohs(dport), @@ -923,7 +985,7 @@ int dp_vs_check_template(struct dp_vs_conn *ct) rte_atomic16_read(&dest->weight) == 0)) { #ifdef CONFIG_DPVS_IPVS_DEBUG RTE_LOG(DEBUG, IPVS, "%s: check_template: dest not available for " - "protocol %s s:%s:%u v:%s:%u -> l:%s:%u d:%s:%u\n", + "protocol %s s:%s/%u v:%s/%u -> l:%s/%u d:%s/%u\n", __func__, inet_proto_name(ct->proto), inet_ntop(ct->af, &ct->caddr, sbuf, sizeof(sbuf)) ? sbuf : "::", ntohs(ct->cport), @@ -936,12 +998,12 @@ int dp_vs_check_template(struct dp_vs_conn *ct) #endif /* invalidate the connection */ if (ct->vport != htons(0xffff)) { - if (conn_unhash(ct)) { + if (dp_vs_conn_unhash(ct)) { ct->dport = htonl(0xffff); ct->vport = htonl(0xffff); ct->lport = 0; ct->cport = 0; - conn_hash(ct); + dp_vs_conn_hash(ct); } } /* simply decrease the refcnt of the template, do not restart its timer */ @@ -974,14 +1036,17 @@ static int conn_init_lcore(void *arg) if (!rte_lcore_is_enabled(rte_lcore_id())) return EDPVS_DISABLED; - this_conn_tab = rte_malloc_socket(NULL, - sizeof(struct list_head) * DPVS_CONN_TAB_SIZE, + if (netif_lcore_is_idle(rte_lcore_id())) + return EDPVS_IDLE; + + this_conn_tbl = rte_malloc_socket(NULL, + sizeof(struct list_head) * DPVS_CONN_TBL_SIZE, RTE_CACHE_LINE_SIZE, rte_socket_id()); - if (!this_conn_tab) + if (!this_conn_tbl) return EDPVS_NOMEM; - for (i = 0; i < DPVS_CONN_TAB_SIZE; i++) - INIT_LIST_HEAD(&this_conn_tab[i]); + for (i = 0; i < DPVS_CONN_TBL_SIZE; i++) + INIT_LIST_HEAD(&this_conn_tbl[i]); #ifdef CONFIG_DPVS_IPVS_CONN_LOCK rte_spinlock_init(&this_conn_lock); @@ -998,15 +1063,14 @@ static int conn_term_lcore(void *arg) conn_flush(); - if (this_conn_tab) { - rte_free(this_conn_tab); - this_conn_tab = NULL; + if (this_conn_tbl) { + rte_free(this_conn_tbl); + this_conn_tbl = NULL; } return EDPVS_OK; } - /* * ctrl plane support for commands: * ipvsadm -ln -c @@ -1102,22 +1166,16 @@ static inline char* get_conn_state_name(uint16_t proto, uint16_t state) static inline void sockopt_fill_conn_entry(const struct dp_vs_conn *conn, ipvs_conn_entry_t *entry) { - entry->af = conn->af; + entry->in_af = tuplehash_in(conn).af; + entry->out_af = tuplehash_out(conn).af; entry->proto = conn->proto; entry->lcoreid = rte_lcore_id(); snprintf(entry->state, sizeof(entry->state), "%s", get_conn_state_name(conn->proto, conn->state)); - if (AF_INET == conn->af) { - entry->caddr.in = conn->caddr.in; - entry->vaddr.in = conn->vaddr.in; - entry->laddr.in = conn->laddr.in; - entry->daddr.in = conn->daddr.in; - } else if (AF_INET6 
== conn->af) { - entry->caddr.in6 = conn->caddr.in6; - entry->vaddr.in6 = conn->vaddr.in6; - entry->laddr.in6 = conn->laddr.in6; - entry->daddr.in6 = conn->daddr.in6; - } + entry->caddr = conn->caddr; + entry->vaddr = conn->vaddr; + entry->laddr = conn->laddr; + entry->daddr = conn->daddr; entry->cport = conn->cport; entry->vport = conn->vport; entry->lport = conn->lport; @@ -1186,7 +1244,7 @@ static int __lcore_conn_table_dump(const struct list_head *cplist) struct dp_vs_conn *conn; struct ip_vs_conn_array_list *cparr = NULL; - for (i = 0; i < DPVS_CONN_TAB_SIZE; i++) { + for (i = 0; i < DPVS_CONN_TBL_SIZE; i++) { list_for_each_entry(tuphash, &cplist[i], list) { if (tuphash->direct != DPVS_CONN_DIR_INBOUND) continue; @@ -1266,7 +1324,7 @@ static int sockopt_conn_get_all(const struct ip_vs_conn_req *conn_req, if ((conn_req->flag & GET_IPVS_CONN_FLAG_TEMPLATE) && (cid == rte_get_master_lcore())) { /* persist conns */ rte_spinlock_lock(&dp_vs_ct_lock); - res = __lcore_conn_table_dump(dp_vs_ct_tab); + res = __lcore_conn_table_dump(dp_vs_ct_tbl); rte_spinlock_unlock(&dp_vs_ct_lock); if (res != EDPVS_OK) { conn_arr->nconns = got; @@ -1438,7 +1496,7 @@ static int conn_get_msgcb_slave(struct dpvs_msg *msg) static int conn_get_all_msgcb_slave(struct dpvs_msg *msg) { - return __lcore_conn_table_dump(this_conn_tab); + return __lcore_conn_table_dump(this_conn_tbl); } static int register_conn_get_msg(void) @@ -1551,10 +1609,11 @@ int dp_vs_conn_init(void) char poolname[32]; /* init connection template table */ - dp_vs_ct_tab = rte_malloc_socket(NULL, sizeof(struct list_head) * DPVS_CONN_TAB_SIZE, + dp_vs_ct_tbl = rte_malloc_socket(NULL, sizeof(struct list_head) * DPVS_CONN_TBL_SIZE, RTE_CACHE_LINE_SIZE, rte_socket_id()); - for (i = 0; i < DPVS_CONN_TAB_SIZE; i++) - INIT_LIST_HEAD(&dp_vs_ct_tab[i]); + + for (i = 0; i < DPVS_CONN_TBL_SIZE; i++) + INIT_LIST_HEAD(&dp_vs_ct_tbl[i]); rte_spinlock_init(&dp_vs_ct_lock); /* @@ -1612,21 +1671,33 @@ int dp_vs_conn_term(void) return EDPVS_OK; } +int dp_vs_conn_pool_size(void) +{ + return conn_pool_size; +} + +int dp_vs_conn_pool_cache_size(void) +{ + return conn_pool_cache; +} + static void conn_pool_size_handler(vector_t tokens) { char *str = set_value(tokens); - int pktpool_size; + int pool_size; assert(str); - pktpool_size = atoi(str); - if (pktpool_size < DPVS_CONN_POOL_SIZE_MIN) { + + pool_size = atoi(str); + + if (pool_size < DPVS_CONN_POOL_SIZE_MIN) { RTE_LOG(WARNING, IPVS, "invalid conn_pool_size %s, using default %d\n", str, DPVS_CONN_POOL_SIZE_DEF); conn_pool_size = DPVS_CONN_POOL_SIZE_DEF; } else { - is_power2(pktpool_size, 0, &pktpool_size); - RTE_LOG(INFO, IPVS, "conn_pool_size = %d (round to 2^n)\n", pktpool_size); - conn_pool_size = pktpool_size; + is_power2(pool_size, 0, &pool_size); + RTE_LOG(INFO, IPVS, "conn_pool_size = %d (round to 2^n)\n", pool_size); + conn_pool_size = pool_size; } FREE_PTR(str); @@ -1635,13 +1706,14 @@ static void conn_pool_size_handler(vector_t tokens) static void conn_pool_cache_handler(vector_t tokens) { char *str = set_value(tokens); - int pktpool_cache; + int pool_cache; assert(str); - if ((pktpool_cache = atoi(str)) > 0) { - is_power2(pktpool_cache, 0, &pktpool_cache); - RTE_LOG(INFO, IPVS, "conn_pool_cache = %d (round to 2^n)\n", pktpool_cache); - conn_pool_cache = pktpool_cache; + + if ((pool_cache = atoi(str)) > 0) { + is_power2(pool_cache, 0, &pool_cache); + RTE_LOG(INFO, IPVS, "conn_pool_cache = %d (round to 2^n)\n", pool_cache); + conn_pool_cache = pool_cache; } else { RTE_LOG(WARNING, IPVS, "invalid 
conn_pool_cache %s, using default %d\n", str, DPVS_CONN_CACHE_SIZE_DEF); @@ -1657,7 +1729,9 @@ static void conn_init_timeout_handler(vector_t tokens) int init_timeout; assert(str); + init_timeout = atoi(str); + if (init_timeout > IPVS_TIMEOUT_MIN && init_timeout < IPVS_TIMEOUT_MAX) { RTE_LOG(INFO, IPVS, "conn_init_timeout = %d\n", init_timeout); conn_init_timeout = init_timeout; @@ -1676,12 +1750,31 @@ static void conn_expire_quiscent_template_handler(vector_t tokens) conn_expire_quiescent_template = true; } +static void conn_redirect_handler(vector_t tokens) +{ + char *str = set_value(tokens); + + assert(str); + + if (strcasecmp(str, "on") == 0) + dp_vs_redirect_disable = false; + else if (strcasecmp(str, "off") == 0) + dp_vs_redirect_disable = true; + else + RTE_LOG(WARNING, IPVS, "invalid conn:redirect %s\n", str); + + RTE_LOG(INFO, IPVS, "conn:redirect = %s\n", dp_vs_redirect_disable ? "off" : "on"); + + FREE_PTR(str); +} + void ipvs_conn_keyword_value_init(void) { if (dpvs_state_get() == DPVS_STATE_INIT) { /* KW_TYPE_INIT keyword */ conn_pool_size = DPVS_CONN_POOL_SIZE_DEF; conn_pool_cache = DPVS_CONN_CACHE_SIZE_DEF; + dp_vs_redirect_disable = true; } /* KW_TYPE_NORMAL keyword */ conn_init_timeout = DPVS_CONN_INIT_TIMEOUT_DEF; @@ -1696,6 +1789,7 @@ void install_ipvs_conn_keywords(void) install_keyword("conn_init_timeout", conn_init_timeout_handler, KW_TYPE_NORMAL); install_keyword("expire_quiescent_template", conn_expire_quiscent_template_handler, KW_TYPE_NORMAL); + install_keyword("redirect", conn_redirect_handler, KW_TYPE_INIT); install_xmit_keywords(); install_sublevel_end(); } diff --git a/src/ipvs/ip_vs_core.c b/src/ipvs/ip_vs_core.c index 33a3134b5..0f533750e 100644 --- a/src/ipvs/ip_vs_core.c +++ b/src/ipvs/ip_vs_core.c @@ -36,6 +36,7 @@ #include "ipvs/blklst.h" #include "ipvs/proto_udp.h" #include "route6.h" +#include "ipvs/redirect.h" static inline int dp_vs_fill_iphdr(int af, struct rte_mbuf *mbuf, struct dp_vs_iphdr *iph) @@ -96,7 +97,7 @@ static struct dp_vs_conn *dp_vs_sched_persist(struct dp_vs_service *svc, return NULL; #ifdef CONFIG_DPVS_IPVS_DEBUG - RTE_LOG(DEBUG, IPVS, "%s: persist-schedule: src %s:%u dest %s:%u snet %s\n", + RTE_LOG(DEBUG, IPVS, "%s: persist-schedule: src %s/%u dest %s/%u snet %s\n", __func__, inet_ntop(svc->af, &iph->saddr, sbuf, sizeof(sbuf)), ntohs(ports[0]), @@ -269,7 +270,7 @@ static struct dp_vs_conn *dp_vs_snat_schedule(struct dp_vs_dest *dest, } /* select an RS by service's scheduler and create a connection */ -struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, +struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, const struct dp_vs_iphdr *iph, struct rte_mbuf *mbuf, bool is_synproxy_on) @@ -284,7 +285,7 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, ports = mbuf_header_pointer(mbuf, iph->len, sizeof(_ports), _ports); if (!ports) return NULL; - + /* persistent service */ if (svc->flags & DP_VS_SVC_F_PERSISTENT) return dp_vs_sched_persist(svc, iph, mbuf, is_synproxy_on); @@ -297,7 +298,7 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, #endif return NULL; } - + if (dest->fwdmode == DPVS_FWD_MODE_SNAT) return dp_vs_snat_schedule(dest, iph, ports, mbuf); @@ -344,8 +345,8 @@ struct dp_vs_conn *dp_vs_schedule(struct dp_vs_service *svc, } /* return verdict INET_XXX */ -static int xmit_outbound(struct rte_mbuf *mbuf, - struct dp_vs_proto *prot, +static int xmit_outbound(struct rte_mbuf *mbuf, + struct dp_vs_proto *prot, struct dp_vs_conn *conn) { int err; @@ -383,7 +384,7 @@ static int 
xmit_inbound(struct rte_mbuf *mbuf, dp_vs_conn_put(conn); return INET_DROP; } - + /* is dest avaible to forward the packet ? */ if (!conn->dest) { /* silently drop packet without reset connection timer. @@ -418,7 +419,7 @@ static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, struct ipv4_hdr *iph = ip4_hdr(mbuf); /* no translation needed for DR/TUN. */ - if (conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && + if (conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && conn->dest->fwdmode != DPVS_FWD_MODE_NAT && conn->dest->fwdmode != DPVS_FWD_MODE_SNAT) { if (!conn->packet_out_xmit) { @@ -440,10 +441,10 @@ static int __xmit_outbound_icmp4(struct rte_mbuf *mbuf, return EDPVS_NOROUTE; } - if ((mbuf->pkt_len > rt->mtu) + if ((mbuf->pkt_len > rt->mtu) && (ip4_hdr(mbuf)->fragment_offset & IPV4_HDR_DF_FLAG)) { route4_put(rt); - icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); rte_pktmbuf_free(mbuf); return EDPVS_FRAG; @@ -531,9 +532,9 @@ static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, struct ipv4_hdr *iph = ip4_hdr(mbuf); /* no translation needed for DR/TUN. */ - if (conn->dest->fwdmode != DPVS_FWD_MODE_NAT && - conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && - conn->dest->fwdmode != DPVS_FWD_MODE_SNAT) { + if (conn->dest->fwdmode != DPVS_FWD_MODE_NAT && + conn->dest->fwdmode != DPVS_FWD_MODE_FNAT && + conn->dest->fwdmode != DPVS_FWD_MODE_SNAT) { if (!conn->packet_xmit) { RTE_LOG(WARNING, IPVS, "%s: missing packet_xmit\n", __func__); rte_pktmbuf_free(mbuf); @@ -553,10 +554,10 @@ static int __xmit_inbound_icmp4(struct rte_mbuf *mbuf, return EDPVS_NOROUTE; } - if ((mbuf->pkt_len > rt->mtu) + if ((mbuf->pkt_len > rt->mtu) && (ip4_hdr(mbuf)->fragment_offset & IPV4_HDR_DF_FLAG)) { route4_put(rt); - icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); rte_pktmbuf_free(mbuf); return EDPVS_FRAG; @@ -646,9 +647,11 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) struct dp_vs_proto *prot; struct dp_vs_conn *conn; int off, dir, err; + lcoreid_t cid, peer_cid; bool drop = false; *related = 0; /* not related until found matching conn */ + cid = peer_cid = rte_lcore_id(); if (unlikely(ip4_is_frag(iph))) { if (ip4_defrag(mbuf, IP_DEFRAG_VS_FWD) != EDPVS_OK) @@ -689,9 +692,9 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) return INET_DROP; } - /* + /* * lookup conn with inner IP pkt. - * it need to move mbuf.data_off to inner IP pkt, + * it need to move mbuf.data_off to inner IP pkt, * and restore it later. although it looks strange. */ rte_pktmbuf_adj(mbuf, off); @@ -699,13 +702,25 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) return INET_DROP; dp_vs_fill_iphdr(AF_INET, mbuf, &dciph); - conn = prot->conn_lookup(prot, &dciph, mbuf, &dir, true, &drop); - if (!conn) - return INET_ACCEPT; + conn = prot->conn_lookup(prot, &dciph, mbuf, &dir, true, &drop, &peer_cid); + + /* + * The connection is not locally found, however the redirect is found so + * forward the packet to the remote redirect owner core. + */ + if (cid != peer_cid) { + /* recover mbuf.data_off to outer Ether header */ + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr) + off); + + return dp_vs_redirect_pkt(mbuf, peer_cid); + } /* recover mbuf.data_off to outer IP header. 
*/ rte_pktmbuf_prepend(mbuf, off); + if (!conn) + return INET_ACCEPT; + /* so the ICMP is related to existing conn */ *related = 1; @@ -728,7 +743,7 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) dp_vs_conn_put(conn); return INET_DROP; } - /* note + /* note * 1. the direction of inner IP pkt is reversed with ICMP pkt. * 2. but we use (@reverse == true) for prot->conn_lookup() * as a result, @dir is same with icmp packet. */ @@ -737,13 +752,26 @@ static int __dp_vs_in_icmp4(struct rte_mbuf *mbuf, int *related) else err = xmit_outbound_icmp(mbuf, prot, conn); if (err != EDPVS_OK) - RTE_LOG(WARNING, IPVS, "%s: xmit icmp error: %s\n", + RTE_LOG(WARNING, IPVS, "%s: xmit icmp error: %s\n", __func__, dpvs_strerror(err)); dp_vs_conn_put_no_reset(conn); return INET_STOLEN; } +#ifdef CONFIG_DPVS_IPVS_DEBUG +static void __dp_vs_icmp6_show(struct ip6_hdr *ip6h, struct icmp6_hdr *ic6h) +{ + char src_addr_buff[64], dst_addr_buff[64]; + + inet_ntop(AF_INET6, &ip6h->ip6_src, src_addr_buff, sizeof(src_addr_buff)); + inet_ntop(AF_INET6, &ip6h->ip6_dst, dst_addr_buff, sizeof(dst_addr_buff)); + + RTE_LOG(DEBUG, IPVS, "%s: ICMP6 (%d, %d) %s->%s\n", + __func__, ic6h->icmp6_type, ntohs(icmp6h_id(ic6h)), src_addr_buff, dst_addr_buff); +} +#endif + /* return verdict INET_XXX */ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) { @@ -754,13 +782,12 @@ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) struct dp_vs_proto *prot; struct dp_vs_conn *conn; int off, ic6h_off, dir, err; + lcoreid_t cid, peer_cid; bool drop = false; uint8_t nexthdr = ip6h->ip6_nxt; -#ifdef CONFIG_DPVS_IPVS_DEBUG - char src_addr_buff[64], dst_addr_buff[64]; -#endif *related = 0; /* not related until found matching conn */ + cid = peer_cid = rte_lcore_id(); // don't suppurt frag now if (unlikely(ip6_is_frag(ip6h))) { @@ -782,10 +809,7 @@ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) return INET_DROP; #ifdef CONFIG_DPVS_IPVS_DEBUG - inet_ntop(AF_INET6, &ip6h->ip6_src, src_addr_buff, sizeof(src_addr_buff)); - inet_ntop(AF_INET6, &ip6h->ip6_dst, dst_addr_buff, sizeof(dst_addr_buff)); - RTE_LOG(DEBUG, IPVS, "ICMP6 (%d,%d) %s->%s\n", - ic6h->icmp6_type, ntohs(icmp6h_id(ic6h)), src_addr_buff, dst_addr_buff); + __dp_vs_icmp6_show(ip6h, ic6h); #endif /* support these related error types only, @@ -821,13 +845,25 @@ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) if (!prot) return INET_ACCEPT; - conn = prot->conn_lookup(prot, &dcip6h, mbuf, &dir, true, &drop); - if (!conn) - return INET_ACCEPT; + conn = prot->conn_lookup(prot, &dcip6h, mbuf, &dir, true, &drop, &peer_cid); + + /* + * The connection is not locally found, however the redirect is found so + * forward the packet to the remote redirect owner core. + */ + if (cid != peer_cid) { + /* recover mbuf.data_off to outer Ether header */ + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr) + off); + + return dp_vs_redirect_pkt(mbuf, peer_cid); + } /* recover mbuf.data_off to outer IP header. 
*/ rte_pktmbuf_prepend(mbuf, off); + if (!conn) + return INET_ACCEPT; + /* so the ICMP is related to existing conn */ *related = 1; @@ -862,7 +898,7 @@ static int __dp_vs_in_icmp6(struct rte_mbuf *mbuf, int *related) else err = xmit_outbound_icmp(mbuf, prot, conn); if (err != EDPVS_OK) - RTE_LOG(WARNING, IPVS, "%s: xmit icmp error: %s\n", + RTE_LOG(WARNING, IPVS, "%s: xmit icmp error: %s\n", __func__, dpvs_strerror(err)); dp_vs_conn_put_no_reset(conn); @@ -893,9 +929,12 @@ static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf, struct dp_vs_conn *conn; int dir, verdict, err, related; bool drop = false; + lcoreid_t cid, peer_cid; eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */ assert(mbuf && state); + cid = peer_cid = rte_lcore_id(); + if (unlikely(etype != ETH_PKT_HOST)) return INET_ACCEPT; @@ -908,7 +947,7 @@ static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf, verdict = dp_vs_in_icmp(af, mbuf, &related); if (related || verdict != INET_ACCEPT) return verdict; - /* let unrelated and valid ICMP goes down, + /* let unrelated and valid ICMP goes down, * may implement ICMP fwd in the futher. */ } @@ -934,13 +973,24 @@ static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf, } /* packet belongs to existing connection ? */ - conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop); + conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid); if (unlikely(drop)) { RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__); return INET_DROP; } + /* + * The connection is not locally found, however the redirect is found so + * forward the packet to the remote redirect owner core. + */ + if (cid != peer_cid) { + /* recover mbuf.data_off to outer Ether header */ + rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr)); + + return dp_vs_redirect_pkt(mbuf, peer_cid); + } + if (unlikely(!conn)) { /* try schedule RS and create new connection */ if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) { @@ -1111,6 +1161,12 @@ int dp_vs_init(void) goto err_conn; } + err = dp_vs_redirects_init(); + if (err != EDPVS_OK) { + RTE_LOG(ERR, IPVS, "fail to init redirect: %s\n", dpvs_strerror(err)); + goto err_redirect; + } + err = dp_vs_synproxy_init(); if (err != EDPVS_OK) { RTE_LOG(ERR, IPVS, "fail to init synproxy: %s\n", dpvs_strerror(err)); @@ -1128,16 +1184,19 @@ int dp_vs_init(void) RTE_LOG(ERR, IPVS, "fail to init serv: %s\n", dpvs_strerror(err)); goto err_serv; } + err = dp_vs_blklst_init(); if (err != EDPVS_OK) { RTE_LOG(ERR, IPVS, "fail to init blklst: %s\n", dpvs_strerror(err)); goto err_blklst; } + err = dp_vs_stats_init(); if (err != EDPVS_OK) { RTE_LOG(ERR, IPVS, "fail to init stats: %s\n", dpvs_strerror(err)); goto err_stats; } + err = inet_register_hooks(dp_vs_ops, NELEMS(dp_vs_ops)); if (err != EDPVS_OK) { RTE_LOG(ERR, IPVS, "fail to register hooks: %s\n", dpvs_strerror(err)); @@ -1158,6 +1217,8 @@ int dp_vs_init(void) err_sched: dp_vs_synproxy_term(); err_synproxy: + dp_vs_redirects_term(); +err_redirect: dp_vs_conn_term(); err_conn: dp_vs_laddr_term(); @@ -1181,7 +1242,7 @@ int dp_vs_term(void) err = dp_vs_blklst_term(); if (err != EDPVS_OK) - RTE_LOG(ERR, IPVS, "fail to terminate blklst: %s\n", dpvs_strerror(err)); + RTE_LOG(ERR, IPVS, "fail to terminate blklst: %s\n", dpvs_strerror(err)); err = dp_vs_service_term(); if (err != EDPVS_OK) @@ -1195,6 +1256,10 @@ int dp_vs_term(void) if (err != EDPVS_OK) RTE_LOG(ERR, IPVS, "fail to terminate synproxy: %s\n", dpvs_strerror(err)); + err = dp_vs_redirects_term(); + if (err != 
EDPVS_OK) + RTE_LOG(ERR, IPVS, "fail to terminate redirect: %s\n", dpvs_strerror(err)); + err = dp_vs_conn_term(); if (err != EDPVS_OK) RTE_LOG(ERR, IPVS, "fail to terminate conn: %s\n", dpvs_strerror(err)); diff --git a/src/ipvs/ip_vs_dest.c b/src/ipvs/ip_vs_dest.c index a69d3bc61..7f884d315 100644 --- a/src/ipvs/ip_vs_dest.c +++ b/src/ipvs/ip_vs_dest.c @@ -86,37 +86,22 @@ static int dp_vs_rs_unhash(struct dp_vs_dest *dest) } -struct dp_vs_dest *dp_vs_lookup_dest(struct dp_vs_service *svc, - const union inet_addr *daddr, +struct dp_vs_dest *dp_vs_lookup_dest(int af, + struct dp_vs_service *svc, + const union inet_addr *daddr, uint16_t dport) { struct dp_vs_dest *dest; list_for_each_entry(dest, &svc->dests, n_list){ - if ((dest->af == svc->af) - && inet_addr_equal(svc->af, &dest->addr, daddr) + if ((dest->af == af) + && inet_addr_equal(af, &dest->addr, daddr) && (dest->port == dport)) return dest; } return NULL; } -struct dp_vs_dest *dp_vs_find_dest(int af, const union inet_addr *daddr, - uint16_t dport, const union inet_addr *vaddr, - uint16_t vport, uint16_t protocol) -{ - struct dp_vs_dest *dest; - struct dp_vs_service *svc; - svc = dp_vs_service_lookup(af, protocol, vaddr, vport, 0, NULL, NULL); - if(!svc) - return NULL; - dest = dp_vs_lookup_dest(svc, daddr, dport); - if(dest) - rte_atomic32_inc(&dest->refcnt); - dp_vs_service_put(svc); - return dest; -} - /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed @@ -128,7 +113,7 @@ struct dp_vs_dest *dp_vs_find_dest(int af, const union inet_addr *daddr, * scheduling. */ struct dp_vs_dest *dp_vs_trash_get_dest(struct dp_vs_service *svc, - const union inet_addr *daddr, + const union inet_addr *daddr, uint16_t dport) { struct dp_vs_dest *dest, *nxt; @@ -176,7 +161,7 @@ void dp_vs_trash_cleanup(void) } static void __dp_vs_update_dest(struct dp_vs_service *svc, - struct dp_vs_dest *dest, + struct dp_vs_dest *dest, struct dp_vs_dest_conf *udest) { int conn_flags; @@ -211,14 +196,12 @@ static void __dp_vs_update_dest(struct dp_vs_service *svc, } -int dp_vs_new_dest(struct dp_vs_service *svc, +int dp_vs_new_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest, struct dp_vs_dest **dest_p) { int size; struct dp_vs_dest *dest; -#ifdef CONFIG_IP_VS_IPV6 -#endif size = RTE_CACHE_LINE_ROUNDUP(sizeof(struct dp_vs_dest)); dest = rte_zmalloc("dpvs_new_dest", size, 0); if(dest == NULL){ @@ -227,7 +210,7 @@ int dp_vs_new_dest(struct dp_vs_service *svc, } assert(dest->svc == NULL); - dest->af = svc->af; + dest->af = udest->af; dest->proto = svc->proto; dest->vaddr = svc->addr; dest->vport = svc->port; @@ -279,7 +262,7 @@ dp_vs_add_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) /* * Check if the dest already exists in the list */ - dest = dp_vs_lookup_dest(svc, &daddr, dport); + dest = dp_vs_lookup_dest(udest->af, svc, &daddr, dport); if (dest != NULL) { RTE_LOG(DEBUG, SERVICE, "%s: dest already exists.\n", __func__); @@ -317,7 +300,7 @@ dp_vs_add_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) /* call the update_service function of its scheduler */ if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); + svc->scheduler->update_service(svc, dest, DPVS_SO_SET_ADDDEST); rte_rwlock_write_unlock(&__dp_vs_svc_lock); return EDPVS_OK; @@ -349,7 +332,7 @@ dp_vs_add_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) /* call the update_service function of its scheduler */ if (svc->scheduler->update_service) - 
svc->scheduler->update_service(svc); + svc->scheduler->update_service(svc, dest, DPVS_SO_SET_ADDDEST); rte_rwlock_write_unlock(&__dp_vs_svc_lock); @@ -380,7 +363,7 @@ dp_vs_edit_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) /* * Lookup the destination list */ - dest = dp_vs_lookup_dest(svc, &daddr, dport); + dest = dp_vs_lookup_dest(udest->af, svc, &daddr, dport); if (dest == NULL) { RTE_LOG(DEBUG, SERVICE,"%s(): dest doesn't exist\n", __func__); @@ -410,7 +393,7 @@ dp_vs_edit_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) /* call the update_service, because server weight may be changed */ if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); + svc->scheduler->update_service(svc, dest, DPVS_SO_SET_EDITDEST); rte_rwlock_write_unlock(&__dp_vs_svc_lock); @@ -480,7 +463,7 @@ void __dp_vs_unlink_dest(struct dp_vs_service *svc, * Call the update_service function of its scheduler */ if (svcupd && svc->scheduler->update_service) - svc->scheduler->update_service(svc); + svc->scheduler->update_service(svc, dest, DPVS_SO_SET_DELDEST); } int @@ -489,7 +472,7 @@ dp_vs_del_dest(struct dp_vs_service *svc, struct dp_vs_dest_conf *udest) struct dp_vs_dest *dest; uint16_t dport = udest->port; - dest = dp_vs_lookup_dest(svc, &udest->addr, dport); + dest = dp_vs_lookup_dest(udest->af, svc, &udest->addr, dport); if (dest == NULL) { RTE_LOG(DEBUG, SERVICE,"%s(): destination not found!\n", __func__); @@ -542,6 +525,8 @@ int dp_vs_get_dest_entries(const struct dp_vs_service *svc, entry.inactconns = rte_atomic32_read(&dest->inactconns); entry.persistconns = rte_atomic32_read(&dest->persistconns); ret = dp_vs_copy_stats(&(entry.stats), dest->stats); + if (ret != EDPVS_OK) + break; memcpy(&uptr->entrytable[count], &entry, sizeof(entry)); count++; diff --git a/src/ipvs/ip_vs_fo.c b/src/ipvs/ip_vs_fo.c new file mode 100644 index 000000000..6b2ffcef4 --- /dev/null +++ b/src/ipvs/ip_vs_fo.c @@ -0,0 +1,58 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * weighted fail over module + * see net/netfilter/ipvs/ip_vs_fo.c for reference + * + * yangxingwu , Feb 2019, initial. 
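
The new ip_vs_fo.c added here registers a weighted fail-over ("fo") scheduler: traffic always goes to the available, non-overloaded destination with the highest weight, so lower-weight servers only receive traffic when every heavier one is down. A minimal sketch of that selection rule, using simplified stand-in types rather than the dpvs structs (struct rs and fo_select are illustrative names only):

    #include <stddef.h>
    #include <stdint.h>

    struct rs {             /* stand-in for struct dp_vs_dest */
        int16_t weight;
        int     available;  /* DPVS_DEST_F_AVAILABLE */
        int     overloaded; /* DPVS_DEST_F_OVERLOAD  */
    };

    /* pick the usable RS with the highest weight, or NULL if none is usable */
    static struct rs *fo_select(struct rs *servers, size_t n)
    {
        struct rs *best = NULL;
        int16_t hw = 0;
        size_t i;

        for (i = 0; i < n; i++) {
            if (servers[i].available && !servers[i].overloaded &&
                servers[i].weight > hw) {
                best = &servers[i];
                hw   = servers[i].weight;
            }
        }
        return best;
    }

As in dp_vs_fo_schedule below, the strict '>' comparison means that among equally weighted candidates the first one in list order wins.
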
+ * + */ + +#include "ipvs/fo.h" + +/* weighted fail over scheduling */ +static struct dp_vs_dest *dp_vs_fo_schedule(struct dp_vs_service *svc, + const struct rte_mbuf *mbuf __rte_unused) +{ + + struct dp_vs_dest *dest, *hweight = NULL; + int16_t hw = 0; /* track highest weight */ + + /* basic failover functionality + * find virtual server with highest weight and send it traffic + */ + list_for_each_entry(dest, &svc->dests, n_list) { + if (!(dest->flags & DPVS_DEST_F_OVERLOAD) && + (dest->flags & DPVS_DEST_F_AVAILABLE) && + (rte_atomic16_read(&dest->weight) > hw)) { + hweight = dest; + hw = rte_atomic16_read(&dest->weight); + } + } + + return hweight; +} + +static struct dp_vs_scheduler dp_vs_fo_scheduler = { + .name = "fo", + .n_list = LIST_HEAD_INIT(dp_vs_fo_scheduler.n_list), + .schedule = dp_vs_fo_schedule, +}; + +int dp_vs_fo_init(void) +{ + return register_dp_vs_scheduler(&dp_vs_fo_scheduler); +} + +int dp_vs_fo_term(void) +{ + return unregister_dp_vs_scheduler(&dp_vs_fo_scheduler); +} diff --git a/src/ipvs/ip_vs_laddr.c b/src/ipvs/ip_vs_laddr.c index f04100d90..050d57e47 100644 --- a/src/ipvs/ip_vs_laddr.c +++ b/src/ipvs/ip_vs_laddr.c @@ -35,13 +35,13 @@ #include "conf/laddr.h" /* - * Local Address (LIP) and port (lport) allocation for FNAT mode, + * Local Address (LIP) and port (lport) allocation for FNAT mode, * * 1. Four tuple of IP connection must be unique. - * we cannot control RS's while we really need support - * millions of connections. so one laddr is not enough (only lport + * we cannot control RS's while we really need support + * millions of connections. so one laddr is not enough (only lport * is variable means 2^16 is the max connection number). - * + * * So we need more laddr and an algorithm to select it, so as lport. * * 2. laddr maintained by service. @@ -53,11 +53,11 @@ * consider conn table is per-lcore, we must * make sure outbound flow handled by same lcore. * so we use FDIR to set oubound flow to same lcore as inbound. - * note FDIR has limited filter number (8K), + * note FDIR has limited filter number (8K), * both 2^32*2^16 and 2^16 are too big. * * actually we just need N fdir filter, while N >= #lcore - * so we use LSB B bits of lport for fdir mask, let + * so we use LSB B bits of lport for fdir mask, let * 2^B >= (N == #lcore) * * further more, for the case inbound/outbound port are same, @@ -70,7 +70,7 @@ * * MSB was used, it makes lport range continuous and more clear * for each lcores, for example - * + * * lcore lport-range * 0 0~4095 * 1 4096~8191 @@ -78,7 +78,7 @@ * But consider global min/max limitations, like we should * skip port 0~1024 or 50000~65535. it causes lport resource * of some lcore exhausted prematurely. That's not acceptable. - * + * * Using LSB bits solves this issue, although the lports for * each lcore is distributed. * @@ -89,9 +89,9 @@ * b) select laddr according to lcore * c) set laddr to FDIR. * - * to use lport-mask we can save laddr, but it's not easy set + * to use lport-mask we can save laddr, but it's not easy set * fdir for TCP/UDP related ICMP (or too complex). - * and using laddr-lcore 1:1 mapping, it consumes more laddr, + * and using laddr-lcore 1:1 mapping, it consumes more laddr, * but note one laddr supports at about 6W conn (same rip:rport). * It man not make sence to let #lcore bigger then #laddr. */ @@ -177,7 +177,7 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) /* * some time allocate lport fails for one laddr, * but there's also some resource on another laddr. 
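
A small arithmetic sketch of the LSB lport-mask scheme described in the comment block above, assuming 8 worker lcores, i.e. B = 3 and a mask of 0x7 (the macro and helper names here are illustrative, and ports are shown in host byte order for clarity):

    #include <stdint.h>

    #define LPORT_LCORE_BITS 3                           /* 2^3 >= 8 lcores */
    #define LPORT_LCORE_MASK ((1u << LPORT_LCORE_BITS) - 1)

    /* FDIR matches only the low B bits of the port, so the lcore that owns
     * any outbound flow can be recovered from the lport alone */
    static inline uint8_t lport_owner_lcore(uint16_t lport)
    {
        return lport & LPORT_LCORE_MASK;
    }

    /* lcore N must allocate lports whose low bits equal N, e.g. lcore 2 may
     * use 2, 10, 18, ... and lcore 5 may use 5, 13, 21, ...; global min/max
     * port limits and wrap-around checks are omitted here */
    static inline uint16_t lport_next_for_lcore(uint16_t prev, uint8_t lcore)
    {
        return (uint16_t)((((prev >> LPORT_LCORE_BITS) + 1) << LPORT_LCORE_BITS) | lcore);
    }

Because the low bits are spread across the whole 0-65535 range, skipping reserved port ranges consumes ports evenly from every lcore's share, which is exactly the advantage over an MSB split that the comment above points out.
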
- * use write lock since + * use write lock since * 1. __get_laddr will change svc->laddr_curr; * 2. we uses svc->num_laddrs; */ @@ -235,7 +235,7 @@ int dp_vs_laddr_bind(struct dp_vs_conn *conn, struct dp_vs_service *svc) if (!laddr || sport == 0) { #ifdef CONFIG_DPVS_IPVS_DEBUG - RTE_LOG(ERR, IPVS, "%s: [%d] no lport available !!\n", + RTE_LOG(ERR, IPVS, "%s: [%d] no lport available !!\n", __func__, rte_lcore_id()); #endif if (laddr) @@ -308,7 +308,7 @@ int dp_vs_laddr_add(struct dp_vs_service *svc, if (!svc || !addr) return EDPVS_INVAL; - new = rte_malloc_socket(NULL, sizeof(*new), + new = rte_malloc_socket(NULL, sizeof(*new), RTE_CACHE_LINE_SIZE, rte_socket_id()); if (!new) return EDPVS_NOMEM; @@ -356,9 +356,9 @@ int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *ad /* found */ if (rte_atomic32_read(&laddr->refcnt) == 0) { - /* update svc->curr_laddr */ - if (svc->laddr_curr == &laddr->list) - svc->laddr_curr = laddr->list.next; + /* update svc->curr_laddr */ + if (svc->laddr_curr == &laddr->list) + svc->laddr_curr = laddr->list.next; list_del(&laddr->list); rte_free(laddr); svc->num_laddrs--; @@ -379,7 +379,7 @@ int dp_vs_laddr_del(struct dp_vs_service *svc, int af, const union inet_addr *ad } /* if success, it depend on caller to free @addrs by rte_free() */ -static int dp_vs_laddr_getall(struct dp_vs_service *svc, +static int dp_vs_laddr_getall(struct dp_vs_service *svc, struct dp_vs_laddr_entry **addrs, size_t *naddr) { struct dp_vs_laddr *laddr; @@ -445,7 +445,7 @@ int dp_vs_laddr_flush(struct dp_vs_service *svc) return err; } -/* +/* * for control plane */ static int laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) @@ -458,12 +458,12 @@ static int laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) if (!conf && size < sizeof(*laddr_conf)) return EDPVS_INVAL; - if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, - laddr_conf->iifname, laddr_conf->oifname, + if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, + laddr_conf->iifname, laddr_conf->oifname, &match) != EDPVS_OK) return EDPVS_INVAL; - svc = dp_vs_service_lookup(laddr_conf->af, laddr_conf->proto, + svc = dp_vs_service_lookup(laddr_conf->af_s, laddr_conf->proto, &laddr_conf->vaddr, laddr_conf->vport, laddr_conf->fwmark, NULL, &match); if (!svc) @@ -471,11 +471,11 @@ static int laddr_sockopt_set(sockoptid_t opt, const void *conf, size_t size) switch (opt) { case SOCKOPT_SET_LADDR_ADD: - err = dp_vs_laddr_add(svc, laddr_conf->af, &laddr_conf->laddr, + err = dp_vs_laddr_add(svc, laddr_conf->af_l, &laddr_conf->laddr, laddr_conf->ifname); break; case SOCKOPT_SET_LADDR_DEL: - err = dp_vs_laddr_del(svc, laddr_conf->af, &laddr_conf->laddr); + err = dp_vs_laddr_del(svc, laddr_conf->af_l, &laddr_conf->laddr); break; case SOCKOPT_SET_LADDR_FLUSH: err = dp_vs_laddr_flush(svc); @@ -503,13 +503,13 @@ static int laddr_sockopt_get(sockoptid_t opt, const void *conf, size_t size, if (!conf && size < sizeof(*laddr_conf)) return EDPVS_INVAL; - if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, - laddr_conf->iifname, laddr_conf->oifname, + if (dp_vs_match_parse(laddr_conf->srange, laddr_conf->drange, + laddr_conf->iifname, laddr_conf->oifname, &match) != EDPVS_OK) return EDPVS_INVAL; - svc = dp_vs_service_lookup(laddr_conf->af, laddr_conf->proto, + svc = dp_vs_service_lookup(laddr_conf->af_s, laddr_conf->proto, &laddr_conf->vaddr, laddr_conf->vport, laddr_conf->fwmark, NULL, &match); if (!svc) diff --git a/src/ipvs/ip_vs_nat64.c b/src/ipvs/ip_vs_nat64.c new 
file mode 100644 index 000000000..44dc724be --- /dev/null +++ b/src/ipvs/ip_vs_nat64.c @@ -0,0 +1,101 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "ipvs/nat64.h" +#include "ipvs/ipvs.h" +#include "uoa.h" + +int mbuf_6to4(struct rte_mbuf *mbuf, + const struct in_addr *saddr, + const struct in_addr *daddr) +{ + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct ipv4_hdr *ip4h; + uint8_t next_prot; + uint8_t ttl; + + /* + * ext_hdr not support yet + */ + if (ip6h->ip6_nxt != IPPROTO_TCP && + ip6h->ip6_nxt != IPPROTO_UDP && + ip6h->ip6_nxt != IPPROTO_ICMPV6 && + ip6h->ip6_nxt != IPPROTO_OPT) { + return EDPVS_NOTSUPP; + } + if (rte_pktmbuf_adj(mbuf, mbuf->l3_len) == NULL) + return EDPVS_DROP; + + next_prot = ip6h->ip6_nxt; + ttl = ip6h->ip6_hlim; + ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); + if (!ip4h) + return EDPVS_NOROOM; + + memset(ip4h, 0, sizeof(struct ipv4_hdr)); + ip4h->version_ihl = ((4 << 4) | 5); + ip4h->type_of_service = 0; + ip4h->total_length = htons(mbuf->pkt_len); + ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->time_to_live = ttl; + ip4h->next_proto_id = next_prot; + ip4h->src_addr = saddr->s_addr; + ip4h->dst_addr = daddr->s_addr; + ip4h->packet_id = 0; // NO FRAG, so 0 is OK? 
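/* NB: with DF set above and no fragmentation performed, this is an "atomic"
 * datagram, and RFC 6864 allows the IPv4 Identification field of atomic
 * datagrams to take any value, including zero, since it is never used for
 * reassembly. */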
+ + mbuf->l3_len = sizeof(struct ipv4_hdr); + + return EDPVS_OK; +} + +int mbuf_4to6(struct rte_mbuf *mbuf, + const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + struct ip6_hdr *ip6h; + uint16_t plen; + uint8_t hops; + uint8_t next_prot; + + if (mbuf->l3_len != sizeof(struct ipv4_hdr)) { + return EDPVS_NOTSUPP; + } + if (rte_pktmbuf_adj(mbuf, mbuf->l3_len) == NULL) + return EDPVS_DROP; + + plen = mbuf->pkt_len; + next_prot = ip4h->next_proto_id; + hops = ip4h->time_to_live; + ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct ip6_hdr)); + if (!ip6h) + return EDPVS_NOROOM; + + memset(ip6h, 0, sizeof(struct ip6_hdr)); + ip6h->ip6_vfc = 0x60; + ip6h->ip6_plen = htons(plen); + ip6h->ip6_nxt = next_prot; + ip6h->ip6_hlim = hops; + ip6h->ip6_src = *saddr; + ip6h->ip6_dst = *daddr; + + mbuf->l3_len = sizeof(struct ip6_hdr); + + return EDPVS_OK; +} + diff --git a/src/ipvs/ip_vs_proto.c b/src/ipvs/ip_vs_proto.c index 88bfc4ec0..30188a34a 100644 --- a/src/ipvs/ip_vs_proto.c +++ b/src/ipvs/ip_vs_proto.c @@ -42,7 +42,7 @@ static int proto_register(struct dp_vs_proto *proto) if (proto->init) proto->init(proto); - + return EDPVS_OK; } @@ -57,7 +57,7 @@ static int proto_unregister(struct dp_vs_proto *proto) if (proto->exit) proto->exit(proto); - + return EDPVS_OK; } diff --git a/src/ipvs/ip_vs_proto_icmp.c b/src/ipvs/ip_vs_proto_icmp.c index 2fc29e439..b4345b800 100644 --- a/src/ipvs/ip_vs_proto_icmp.c +++ b/src/ipvs/ip_vs_proto_icmp.c @@ -34,6 +34,7 @@ #include "ipvs/proto_icmp.h" #include "ipvs/conn.h" #include "ipvs/service.h" +#include "ipvs/redirect.h" /* * o ICMP tuple @@ -181,8 +182,9 @@ static bool is_icmp6_reply(uint8_t type) { static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, - struct rte_mbuf *mbuf, int *direct, - bool reverse, bool *drop) + struct rte_mbuf *mbuf, int *direct, + bool reverse, bool *drop, + lcoreid_t *peer_cid) { void *ich = NULL; __be16 sport, dport; /* dummy ports */ @@ -191,6 +193,7 @@ static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, /* true icmp type/code, used for v4/v6 */ uint8_t icmp_type = 0; uint8_t icmp_code = 0; + struct dp_vs_conn *conn; assert(proto && iph && mbuf); if (AF_INET6 == af) { @@ -217,6 +220,7 @@ static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, (void *)&_icmph); if (unlikely(!ich)) return NULL; + /* icmp v4 */ icmp_type = ((struct icmphdr *)ich)->type; icmp_code = ((struct icmphdr *)ich)->code; @@ -231,8 +235,22 @@ static struct dp_vs_conn *icmp_conn_lookup(struct dp_vs_proto *proto, } } - return dp_vs_conn_get(iph->af, iph->proto, &iph->saddr, &iph->daddr, + conn = dp_vs_conn_get(iph->af, iph->proto, &iph->saddr, &iph->daddr, sport, dport, direct, reverse); + if (conn) { + return conn; + } else { + struct dp_vs_redirect *r; + + r = dp_vs_redirect_get(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + sport, dport); + if (r) { + *peer_cid = r->cid; + } + } + + return conn; } static int icmp6_csum_handler(struct dp_vs_proto *proto, diff --git a/src/ipvs/ip_vs_proto_tcp.c b/src/ipvs/ip_vs_proto_tcp.c index 3a78639d7..57b26e43e 100644 --- a/src/ipvs/ip_vs_proto_tcp.c +++ b/src/ipvs/ip_vs_proto_tcp.c @@ -36,6 +36,7 @@ * like tcphdr.syn, so use standard definition. 
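
Throughout this patch every conn_lookup implementation (ICMP above, TCP and UDP below) gains an lcoreid_t *peer_cid out-parameter: when no local connection matches but dp_vs_redirect_get finds a redirect entry, the callee records the owning lcore there and the caller ships the mbuf to that lcore instead of trying to schedule a new connection. A condensed sketch of the caller-side contract for the plain (non-ICMP) case, where only the Ether header has to be restored before hand-off; the wrapper itself is illustrative and assumes the dpvs headers already included by the surrounding files:

    /* illustrative wrapper, not part of the patch */
    static int lookup_or_redirect(struct dp_vs_proto *prot,
                                  const struct dp_vs_iphdr *iph,
                                  struct rte_mbuf *mbuf,
                                  struct dp_vs_conn **conn)
    {
        int dir;
        bool drop = false;
        lcoreid_t cid, peer_cid;

        cid = peer_cid = rte_lcore_id();
        *conn = prot->conn_lookup(prot, iph, mbuf, &dir, false, &drop, &peer_cid);
        if (drop)
            return INET_DROP;

        if (cid != peer_cid) {
            /* recover mbuf.data_off to the Ether header, then enqueue the
             * packet to the lcore that owns the connection */
            rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));
            return dp_vs_redirect_pkt(mbuf, peer_cid);
        }
        return INET_ACCEPT;  /* *conn may still be NULL: schedule a new one */
    }

For ICMP errors the same pattern applies, except that the inner-header offset adjusted earlier must be undone as well, which is why __dp_vs_in_icmp4/6 prepend sizeof(struct ether_hdr) + off.
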
*/ #include #include +#include "ipvs/redirect.h" static int g_defence_tcp_drop = 0; @@ -72,22 +73,22 @@ static const char *tcp_state_names[] = { #endif static struct tcp_state tcp_states[] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR}}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW}}, /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES}}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR}}, -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR}}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW}}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES}}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL}}, -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR}}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW}}, /*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES}}, @@ -140,7 +141,7 @@ inline struct tcphdr *tcp_hdr(const struct rte_mbuf *mbuf) inline void tcp4_send_csum(struct ipv4_hdr *iph, struct tcphdr *th) { th->check = 0; - th->check = rte_ipv4_udptcp_cksum(iph, th); + th->check = ip4_udptcp_cksum(iph, th); } /* @@ -155,6 +156,52 @@ inline void tcp6_send_csum(struct ipv6_hdr *iph, struct tcphdr *th) { (void *)th - (void *)iph, IPPROTO_TCP); } +static inline int tcp_send_csum(int af, int iphdrlen, struct tcphdr *th, + const struct dp_vs_conn *conn, struct rte_mbuf *mbuf) +{ + /* leverage HW TX TCP csum offload if possible */ + + struct netif_port *dev = NULL; + + if (AF_INET6 == af) { + struct route6 *rt6 = mbuf->userdata; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + if (rt6 && rt6->rt6_dev) + dev = rt6->rt6_dev; + else if (conn->out_dev) + dev = conn->out_dev; + if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); + th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return EDPVS_INVPKT; + tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + } + } else { /* AF_INET */ + struct route_entry *rt = mbuf->userdata; + struct ipv4_hdr *iph = ip4_hdr(mbuf); + if (rt && rt->port) + dev = rt->port; + else if (conn->out_dev) + dev = conn->out_dev; + if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { + mbuf->l4_len = ntohs(iph->total_length) - iphdrlen; + mbuf->l3_len = iphdrlen; + mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + th->check = ip4_phdr_cksum(iph, mbuf->ol_flags); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return EDPVS_INVPKT; + tcp4_send_csum(iph, th); + } + } + + return EDPVS_OK; +} + static inline uint32_t seq_scale(uint32_t seq) { struct timespec now; @@ -186,7 +233,7 @@ static inline uint32_t tcp_secure_sequence_number(uint32_t saddr, uint32_t daddr return seq_scale(*(uint32_t *)&hash0); } -static inline void tcp_in_init_seq(struct dp_vs_conn *conn, +static inline void 
tcp_in_init_seq(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, struct tcphdr *th) { struct dp_vs_seq *fseq = &conn->fnat_seq; @@ -198,7 +245,7 @@ static inline void tcp_in_init_seq(struct dp_vs_conn *conn, if (fseq->isn) return; - fseq->isn = tcp_secure_sequence_number(conn->laddr.in.s_addr, + fseq->isn = tcp_secure_sequence_number(conn->laddr.in.s_addr, conn->daddr.in.s_addr, conn->lport, conn->dport); fseq->delta = fseq->isn - seq; @@ -235,10 +282,10 @@ static void tcp_in_remove_ts(struct tcphdr *tcph) continue; default: opsize = *ptr++; - if (opsize < 2) /* silly options */ + if (opsize < 2) /* silly options */ return; if (opsize > len) - return; /* partial options */ + return; /* partial options */ if ((opcode == TCP_OPT_TIMESTAMP) && (opsize == TCP_OLEN_TIMESTAMP)) { for (i = 0; i < TCP_OLEN_TIMESTAMP; i++) @@ -259,9 +306,9 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, uint32_t mtu; struct tcpopt_addr *toa; uint32_t tcp_opt_len; - uint8_t *p, *q, *tail; struct route_entry *rt; + struct route6 *rt6; if (unlikely(conn->af != AF_INET && conn->af != AF_INET6)) return EDPVS_NOTSUPP; @@ -271,8 +318,10 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, * check if we can add the new option */ /* skb length and tcp option length checking */ - if ((rt = mbuf->userdata) != NULL) { + if (tuplehash_out(conn).af == AF_INET && (rt = mbuf->userdata) != NULL) { mtu = rt->mtu; + } else if (tuplehash_out(conn).af == AF_INET6 && (rt6 = mbuf->userdata) != NULL) { + mtu = rt6->rt6_mtu; } else if (conn->in_dev) { /* no route for fast-xmit */ mtu = conn->in_dev->mtu; } else { @@ -337,34 +386,37 @@ static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, /* reset tcp header length */ tcph->doff += tcp_opt_len >> 2; - /* reset ip header total length */ - if (conn->af == AF_INET) + /* + * reset ip header total length, notice nat64 + * toa is always for rs which is tuplehash_out conn + */ + if (tuplehash_out(conn).af == AF_INET) ip4_hdr(mbuf)->total_length = htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len); else ip6_hdr(mbuf)->ip6_plen = htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len); - /* tcp csum will be recalc later, + /* tcp csum will be recalc later, * so as IP hdr csum since iph.tot_len has been chagned. */ return EDPVS_OK; } -static void tcp_out_save_seq(struct rte_mbuf *mbuf, +static void tcp_out_save_seq(struct rte_mbuf *mbuf, struct dp_vs_conn *conn, struct tcphdr *th) { if (th->rst) return; /* out of order ? 
*/ - if (seq_before(ntohl(th->ack_seq), ntohl(conn->rs_end_ack)) + if (seq_before(ntohl(th->ack_seq), ntohl(conn->rs_end_ack)) && conn->rs_end_ack != 0) return; if (th->syn && th->ack) conn->rs_end_seq = htonl(ntohl(th->seq) + 1); else - conn->rs_end_seq = htonl(ntohl(th->seq) + mbuf->pkt_len + conn->rs_end_seq = htonl(ntohl(th->seq) + mbuf->pkt_len - ip4_hdrlen(mbuf) - (th->doff << 2)); conn->rs_end_ack = th->ack_seq; @@ -396,10 +448,10 @@ static void tcp_out_adjust_mss(int af, struct tcphdr *tcph) continue; default: opsize = *ptr++; - if (opsize < 2) /* "silly options" */ + if (opsize < 2) /* "silly options" */ return; if (opsize > length) - return; /* partial options */ + return; /* partial options */ if ((opcode == TCP_OPT_MSS) && (opsize == TCP_OLEN_MSS)) { uint16_t in_mss = ntohs(*(__be16 *) ptr); @@ -493,14 +545,15 @@ static void tcp_out_init_seq(struct dp_vs_conn *conn, struct tcphdr *th) } /* set @verdict if failed to schedule */ -static int tcp_conn_sched(struct dp_vs_proto *proto, +static int tcp_conn_sched(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, - struct rte_mbuf *mbuf, + struct rte_mbuf *mbuf, struct dp_vs_conn **conn, int *verdict) { struct tcphdr *th, _tcph; struct dp_vs_service *svc; + assert(proto && iph && mbuf && conn && verdict); th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph); @@ -527,7 +580,7 @@ static int tcp_conn_sched(struct dp_vs_proto *proto, daddr = inet_ntop(iph->af, &iph->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; saddr = inet_ntop(iph->af, &iph->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::"; RTE_LOG(DEBUG, IPVS, - "%s: [%d] try sched non-SYN packet: [%c%c%c%c] %s:%d->%s:%d\n", + "%s: [%d] try sched non-SYN packet: [%c%c%c%c] %s/%d->%s/%d\n", __func__, rte_lcore_id(), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 
'R' : '.', @@ -546,7 +599,7 @@ static int tcp_conn_sched(struct dp_vs_proto *proto, return EDPVS_INVAL; } - svc = dp_vs_service_lookup(iph->af, iph->proto, + svc = dp_vs_service_lookup(iph->af, iph->proto, &iph->daddr, th->dest, 0, mbuf, NULL); if (!svc) { /* Drop tcp packet which is send to vip and !vport */ @@ -574,7 +627,8 @@ static int tcp_conn_sched(struct dp_vs_proto *proto, static struct dp_vs_conn * tcp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, - struct rte_mbuf *mbuf, int *direct, bool reverse, bool *drop) + struct rte_mbuf *mbuf, int *direct, bool reverse, bool *drop, + lcoreid_t *peer_cid) { struct tcphdr *th, _tcph; struct dp_vs_conn *conn; @@ -583,30 +637,41 @@ tcp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, th = mbuf_header_pointer(mbuf, iph->len, sizeof(_tcph), &_tcph); if (unlikely(!th)) return NULL; - + if (dp_vs_blklst_lookup(iph->proto, &iph->daddr, th->dest, &iph->saddr)) { *drop = true; return NULL; } - conn = dp_vs_conn_get(iph->af, iph->proto, + conn = dp_vs_conn_get(iph->af, iph->proto, &iph->saddr, &iph->daddr, th->source, th->dest, direct, reverse); /* * L2 confirm neighbour - * pkt in from client confirm neighbour to client - * pkt out from rs confirm neighbour to rs + * pkt in from client confirm neighbour to client + * pkt out from rs confirm neighbour to rs */ if (conn != NULL) { if (th->ack) { - if ((*direct == DPVS_CONN_DIR_INBOUND) && conn->out_dev - && (!inet_is_addr_any(conn->af, &conn->out_nexthop))) { - neigh_confirm(conn->af, &conn->out_nexthop, conn->out_dev); - } else if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev - && (!inet_is_addr_any(conn->af, &conn->in_nexthop))) { - neigh_confirm(conn->af, &conn->in_nexthop, conn->in_dev); + if ((*direct == DPVS_CONN_DIR_INBOUND) && conn->out_dev + && (!inet_is_addr_any(tuplehash_in(conn).af, &conn->out_nexthop))) { + neigh_confirm(tuplehash_in(conn).af, &conn->out_nexthop, + conn->out_dev); + } else if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev + && (!inet_is_addr_any(tuplehash_out(conn).af, &conn->in_nexthop))) { + neigh_confirm(tuplehash_out(conn).af, &conn->in_nexthop, + conn->in_dev); } } + } else { + struct dp_vs_redirect *r; + + r = dp_vs_redirect_get(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + th->source, th->dest); + if (r) { + *peer_cid = r->cid; + } } return conn; @@ -616,9 +681,8 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - struct route_entry *rt = mbuf->userdata; - struct netif_port *dev = NULL; - int af = conn->af; + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ + int af = tuplehash_out(conn).af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) @@ -631,7 +695,7 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) return EDPVS_INVPKT; - /* + /* * for SYN packet * 1. remove tcp timestamp option * laddress for different client have diff timestamp. 
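
As the comments above note, for NAT64 the two sides of an FNAT connection carry different address families (client side IPv6, RS side IPv4), so the FNAT L4 handlers take the family from the tuple that matches the mbuf they are rewriting: tuplehash_out(conn).af in the in-handler and tuplehash_in(conn).af in the out-handler. A rough sketch of the inbound ordering this implies, with the IP-header rewrite reduced to the mbuf_6to4 call from this patch (the surrounding glue lives in the real xmit path, and the exact call-site arguments here are assumptions):

    /* sketch of the NAT64 FNAT inbound leg, not the actual xmit function */
    static int fnat64_in_sketch(struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
    {
        struct tcphdr *th;
        int af, iphdrlen;

        /* 1. translate the client's IPv6 header into an IPv4 header
         *    addressed laddr -> RS (see mbuf_6to4 earlier in this patch) */
        if (mbuf_6to4(mbuf, &conn->laddr.in, &conn->daddr.in) != EDPVS_OK)
            return EDPVS_DROP;

        /* 2. from here on the packet carries the RS-side family */
        af       = tuplehash_out(conn).af;   /* AF_INET for NAT64 */
        iphdrlen = ip4_hdrlen(mbuf);

        /* 3. rewrite L4 ports, then checksum via the shared helper */
        th = tcp_hdr(mbuf);
        th->source = conn->lport;
        th->dest   = conn->dport;
        return tcp_send_csum(af, iphdrlen, th, conn, mbuf);
    }

The out-handler mirrors this with mbuf_4to6 and tuplehash_in(conn).af, while tcp_state_trans, which inspects the packet as received rather than as rewritten, picks the family the other way around.
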
@@ -657,49 +721,21 @@ static int tcp_fnat_in_handler(struct dp_vs_proto *proto, th->source = conn->lport; th->dest = conn->dport; - if (rt && rt->port) - dev = rt->port; - else if (conn->in_dev) - dev = conn->in_dev; - if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - if (AF_INET6 == af) { - struct ip6_hdr *ip6h = ip6_hdr(mbuf); - mbuf->l3_len = iphdrlen; - mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); - th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); - } else { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; - mbuf->l3_len = iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); - } - } else { - if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) - return EDPVS_INVPKT; - if (AF_INET6 == af) { - tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); - } else { - tcp4_send_csum(ip4_hdr(mbuf), th); - } - } - - return EDPVS_OK; + return tcp_send_csum(af, iphdrlen, th, conn, mbuf); } static int tcp_fnat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - struct route_entry *rt = mbuf->userdata; - struct netif_port *dev = NULL; - int af = conn->af; + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4*/ + int af = tuplehash_in(conn).af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) return EDPVS_INVPKT; - + th = tcp_hdr(mbuf); if (unlikely(!th)) return EDPVS_INVPKT; @@ -724,43 +760,13 @@ static int tcp_fnat_out_handler(struct dp_vs_proto *proto, if (th->syn && th->ack) tcp_out_init_seq(conn, th); - if (rt && rt->port) - dev = rt->port; - else if (conn->out_dev) - dev = conn->out_dev; - - if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - if (AF_INET6 == af) { - struct ip6_hdr *ip6h = ip6_hdr(mbuf); - mbuf->l3_len = iphdrlen; - mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); - th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); - } else { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; - mbuf->l3_len = iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); - } - } else { - if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) - return EDPVS_INVPKT; - if (AF_INET6 == af) { - tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); - } else { - tcp4_send_csum(ip4_hdr(mbuf), th); - } - } - - return EDPVS_OK; + return tcp_send_csum(af, iphdrlen, th, conn, mbuf); } static int tcp_snat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - struct netif_port *dev = NULL; - struct route_entry *rt = mbuf->userdata; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? 
ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); @@ -778,42 +784,13 @@ static int tcp_snat_in_handler(struct dp_vs_proto *proto, th->dest = conn->dport; /* L4 re-checksum */ - if (rt && rt->port) - dev = rt->port; - - /* leverage HW TX TCP csum offload if possible */ - if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - if (AF_INET6 == af) { - struct ip6_hdr *ip6h = ip6_hdr(mbuf); - mbuf->l3_len = iphdrlen; - mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); - th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); - } else { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; - mbuf->l3_len = iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); - } - } else { - if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) - return EDPVS_INVPKT; - if (AF_INET6 == af) { - tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); - } else { - tcp4_send_csum(ip4_hdr(mbuf), th); - } - } - - return EDPVS_OK; + return tcp_send_csum(af, iphdrlen, th, conn, mbuf); } static int tcp_snat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; - struct netif_port *dev = NULL; - struct route_entry *rt = mbuf->userdata; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); @@ -831,34 +808,7 @@ static int tcp_snat_out_handler(struct dp_vs_proto *proto, th->source = conn->vport; /* L4 re-checksum */ - if (rt && rt->port) - dev = rt->port; - - /* leverage HW TX TCP csum offload if possible */ - if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) { - if (AF_INET6 == af) { - struct ip6_hdr *ip6h = ip6_hdr(mbuf); - mbuf->l3_len = iphdrlen; - mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) - iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IPV6); - th->check = ip6_phdr_cksum(ip6h, mbuf->ol_flags, iphdrlen, IPPROTO_TCP); - } else { - mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - iphdrlen; - mbuf->l3_len = iphdrlen; - mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); - th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags); - } - } else { - if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) - return EDPVS_INVPKT; - if (AF_INET6 == af) { - tcp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), th); - } else { - tcp4_send_csum(ip4_hdr(mbuf), th); - } - } - - return EDPVS_OK; + return tcp_send_csum(af, iphdrlen, th, conn, mbuf); } static inline int tcp_state_idx(struct tcphdr *th) @@ -898,6 +848,11 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, const char *daddr, *caddr; #endif + if (dir == DPVS_CONN_DIR_INBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT) + af = tuplehash_in(conn).af; + else if (dir == DPVS_CONN_DIR_OUTBOUND && dest->fwdmode == DPVS_FWD_MODE_FNAT) + af = tuplehash_out(conn).af; + int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); th = mbuf_header_pointer(mbuf, iphdrlen, sizeof(_tcph), &_tcph); if (unlikely(!th)) @@ -925,16 +880,16 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, /* state changed */ #ifdef CONFIG_DPVS_IPVS_DEBUG - daddr = inet_ntop(conn->af, &conn->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::"; - caddr = inet_ntop(conn->af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::"; + daddr = inet_ntop(tuplehash_out(conn).af, &conn->daddr, dbuf, sizeof(dbuf)) ? 
dbuf : "::"; + caddr = inet_ntop(tuplehash_in(conn).af, &conn->caddr, cbuf, sizeof(cbuf)) ? cbuf : "::"; RTE_LOG(DEBUG, IPVS, "state trans: %s %s [%c%c%c%c] %s:%u->%s:%u " " state %s->%s conn.refcnt %d\n", proto->name, dir == DPVS_CONN_DIR_OUTBOUND ? "out" : "in", th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - caddr, ntohs(conn->cport), - daddr, ntohs(conn->dport), + caddr, ntohs(conn->cport), + daddr, ntohs(conn->dport), tcp_state_name(conn->state), tcp_state_name(new_state), rte_atomic32_read(&conn->refcnt)); @@ -954,7 +909,7 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, } if (dest) { - if (!(conn->flags & DPVS_CONN_F_INACTIVE) + if (!(conn->flags & DPVS_CONN_F_INACTIVE) && (new_state != DPVS_TCP_S_ESTABLISHED)) { rte_atomic32_dec(&dest->actconns); rte_atomic32_inc(&dest->inactconns); @@ -973,13 +928,19 @@ static int tcp_state_trans(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mempool *get_mbuf_pool(const struct dp_vs_conn *conn, int dir) { struct netif_port *dev; + int af; - /* we need oif for correct rte_mempoll, + /* we need oif for correct rte_mempoll, * most likely oif is conn->in/out_dev (fast-xmit), * if not, determine output device by route. */ dev = ((dir == DPVS_CONN_DIR_INBOUND) ? conn->in_dev : conn->out_dev); + if (unlikely(!dev)) { - if (AF_INET == conn->af) { + /* dir is mbuf to revieve, route/af is mbuf to send + * their in/out may be reversed */ + af = ((dir == DPVS_CONN_DIR_INBOUND) ? \ + tuplehash_out(conn).af : tuplehash_in(conn).af); + if (AF_INET == af) { struct route_entry *rt = NULL; struct flow4 fl4; memset(&fl4, 0, sizeof(struct flow4)); @@ -1025,13 +986,14 @@ struct rte_mempool *get_mbuf_pool(const struct dp_vs_conn *conn, int dir) return dev->mbuf_pool; } -static int tcp_send_rst(struct dp_vs_proto *proto, +static int tcp_send_rst(struct dp_vs_proto *proto, struct dp_vs_conn *conn, int dir) { struct rte_mempool *pool; struct rte_mbuf *mbuf = NULL; struct tcphdr *th; - struct ipv4_hdr *iph; + struct ipv4_hdr *ip4h; + struct ip6_hdr *ip6h; if (conn->state != DPVS_TCP_S_ESTABLISHED) { /* RTE_LOG(WARNING, IPVS, "%s: only RST in ESTABLISHED.\n", __func__); */ @@ -1047,9 +1009,9 @@ static int tcp_send_rst(struct dp_vs_proto *proto, return EDPVS_NOMEM; mbuf->userdata = NULL; /* make sure "no route info" */ - /* + /* * reserve head room ? - * mbuf has alreay configured header room + * mbuf has alreay configured header room * RTE_PKTMBUF_HEADROOM for lower layer headers. */ assert(rte_pktmbuf_headroom(mbuf) >= 128); /* how to reserve. 
>_< */ @@ -1079,45 +1041,107 @@ static int tcp_send_rst(struct dp_vs_proto *proto, th->rst = 1; /* IP header (before translation) */ - iph = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(struct ipv4_hdr)); - if (!iph) { - rte_pktmbuf_free(mbuf); - return EDPVS_NOROOM; - } - - iph->version_ihl = 0x45; - iph->total_length = htons(mbuf->pkt_len); - iph->packet_id = 0; - iph->fragment_offset = htons(IPV4_HDR_DF_FLAG); - iph->time_to_live = 64; - iph->next_proto_id = IPPROTO_TCP; if (dir == DPVS_CONN_DIR_INBOUND) { - iph->src_addr = conn->caddr.in.s_addr; - iph->dst_addr = conn->vaddr.in.s_addr; - } else { - iph->src_addr = conn->daddr.in.s_addr; - iph->dst_addr = conn->laddr.in.s_addr; - } + if (tuplehash_in(conn).af == AF_INET) { + ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct ipv4_hdr)); + if (!ip4h) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROOM; + } + ip4h->version_ihl = 0x45; + ip4h->total_length = htons(mbuf->pkt_len); + ip4h->packet_id = 0; + ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->time_to_live = 64; + ip4h->next_proto_id = IPPROTO_TCP; + ip4h->src_addr = conn->caddr.in.s_addr; + ip4h->dst_addr = conn->vaddr.in.s_addr; + + mbuf->l3_len = sizeof(*ip4h); + + ip4h->hdr_checksum = 0; + tcp4_send_csum(ip4h, th); + ip4_send_csum(ip4h); + + } else { + int plen = mbuf->pkt_len; + ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct ip6_hdr)); + if (!ip6h) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROOM; + } + ip6h->ip6_vfc = 0x60; + ip6h->ip6_plen = htons(plen); + ip6h->ip6_hlim = 64; + ip6h->ip6_nxt = IPPROTO_TCP; + ip6h->ip6_src = conn->caddr.in6; + ip6h->ip6_dst = conn->vaddr.in6; - iph->hdr_checksum = 0; - tcp4_send_csum(iph, th); - ip4_send_csum(iph); + mbuf->l3_len = sizeof(*ip6h); + + tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + } - if (dir == DPVS_CONN_DIR_INBOUND) conn->packet_xmit(proto, conn, mbuf); - else + + } else { + if (tuplehash_out(conn).af == AF_INET) { + ip4h = (struct ipv4_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct ipv4_hdr)); + if (!ip4h) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROOM; + } + ip4h->version_ihl = 0x45; + ip4h->total_length = htons(mbuf->pkt_len); + ip4h->packet_id = 0; + ip4h->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ip4h->time_to_live = 64; + ip4h->next_proto_id = IPPROTO_TCP; + ip4h->src_addr = conn->daddr.in.s_addr; + ip4h->dst_addr = conn->laddr.in.s_addr; + + mbuf->l3_len = sizeof(*ip4h); + + ip4h->hdr_checksum = 0; + tcp4_send_csum(ip4h, th); + ip4_send_csum(ip4h); + + } else { + int plen = mbuf->pkt_len; + ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(mbuf, + sizeof(struct ip6_hdr)); + if (!ip6h) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOROOM; + } + ip6h->ip6_vfc = 0x60; + ip6h->ip6_plen = htons(plen); + ip6h->ip6_hlim = 64; + ip6h->ip6_nxt = IPPROTO_TCP; + ip6h->ip6_src = conn->daddr.in6; + ip6h->ip6_dst = conn->laddr.in6; + + mbuf->l3_len = sizeof(*ip6h); + + tcp6_send_csum((struct ipv6_hdr *)ip6h, th); + } + conn->packet_out_xmit(proto, conn, mbuf); + } return EDPVS_OK; } -static int tcp_conn_expire(struct dp_vs_proto *proto, +static int tcp_conn_expire(struct dp_vs_proto *proto, struct dp_vs_conn *conn) { int err; assert(proto && conn && conn->dest); - if (conn->dest->fwdmode == DPVS_FWD_MODE_NAT + if (conn->dest->fwdmode == DPVS_FWD_MODE_NAT || conn->dest->fwdmode == DPVS_FWD_MODE_FNAT) { /* send RST to RS and client */ err = tcp_send_rst(proto, conn, DPVS_CONN_DIR_INBOUND); diff --git a/src/ipvs/ip_vs_proto_udp.c b/src/ipvs/ip_vs_proto_udp.c index 
e8c9dcf2d..5a8c5dd7a 100644 --- a/src/ipvs/ip_vs_proto_udp.c +++ b/src/ipvs/ip_vs_proto_udp.c @@ -29,6 +29,7 @@ #include "ipvs/conn.h" #include "ipvs/service.h" #include "ipvs/blklst.h" +#include "ipvs/redirect.h" #include "parser/parser.h" #include "uoa.h" #include "neigh.h" @@ -61,19 +62,86 @@ static int udp_timeouts[DPVS_UDP_S_LAST + 1] = { [DPVS_UDP_S_LAST] = 2, }; -inline void udp4_send_csum(struct ipv4_hdr *iph, struct udphdr *uh) +inline void udp4_send_csum(struct ipv4_hdr *iph, struct udp_hdr *uh) { - uh->check = 0; - uh->check = rte_ipv4_udptcp_cksum(iph, uh); + uh->dgram_cksum = 0; + uh->dgram_cksum = ip4_udptcp_cksum(iph, uh); } -inline void udp6_send_csum(struct ipv6_hdr *iph, struct udphdr *uh) +inline void udp6_send_csum(struct ipv6_hdr *iph, struct udp_hdr *uh) { - uh->check = 0; - uh->check = ip6_udptcp_cksum((struct ip6_hdr *)iph, uh, + uh->dgram_cksum = 0; + uh->dgram_cksum = ip6_udptcp_cksum((struct ip6_hdr *)iph, (struct udphdr *)uh, (void *)uh - (void *)iph, IPPROTO_UDP); } +static inline int udp_send_csum(int af, int iphdrlen, struct udp_hdr *uh, + const struct dp_vs_conn *conn, + struct rte_mbuf *mbuf, const struct opphdr *opp) +{ + /* leverage HW TX UDP csum offload if possible */ + + struct netif_port *dev = NULL; + + if (AF_INET6 == af) { + /* UDP checksum is mandatory for IPv6.[RFC 2460] */ + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + if (unlikely(opp != NULL)) { + udp6_send_csum((struct ipv6_hdr*)ip6h, uh); + } else { + struct route6 *rt6 = mbuf->userdata; + if (rt6 && rt6->rt6_dev) + dev = rt6->rt6_dev; + else if (conn->out_dev) + dev = conn->out_dev; + if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD))) { + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr) -iphdrlen; + mbuf->ol_flags |= (PKT_TX_UDP_CKSUM | PKT_TX_IPV6); + uh->dgram_cksum = ip6_phdr_cksum(ip6h, mbuf->ol_flags, + iphdrlen, IPPROTO_UDP); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return EDPVS_INVPKT; + udp6_send_csum((struct ipv6_hdr*)ip6h, uh); + } + } + } else { /* AF_INET */ + /* UDP checksum is not mandatory for IPv4. */ + struct ipv4_hdr *iph = ip4_hdr(mbuf); + if (unlikely(opp != NULL)) { + /* + * XXX: UDP pseudo header need UDP length, but the common helper function + * rte_ipv4_udptcp_cksum() use (IP.tot_len - IP.header_len), it's not + * correct if OPP header insterted between IP header and UDP header. + * We can modify the function, or change IP.tot_len before use + * rte_ipv4_udptcp_cksum() and restore it after. + * + * However, UDP checksum is not mandatory, to make things easier, when OPP + * header exist, we just not calc UDP checksum. 
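
In short: the IPv6 UDP checksum is mandatory (RFC 2460), so it is always filled in, falling back to software whenever an OPP header is present or the TX device lacks UDP checksum offload, whereas for IPv4 the checksum is optional and is simply left at zero when an OPP header has been inserted. A compact summary of the policy implemented by udp_send_csum() above (the enum and function name are illustrative only):

    #include <stdbool.h>
    #include <sys/socket.h>     /* AF_INET6 */

    enum csum_action { CSUM_NONE, CSUM_SW, CSUM_HW_PARTIAL };

    /* mirrors the branches of udp_send_csum(); CSUM_HW_PARTIAL means the
     * pseudo-header checksum is filled in and the NIC finishes the rest */
    static enum csum_action udp_csum_policy(int af, bool opp_inserted, bool tx_offload)
    {
        if (af == AF_INET6) {
            if (opp_inserted || !tx_offload)
                return CSUM_SW;          /* mandatory, compute in software   */
            return CSUM_HW_PARTIAL;      /* PKT_TX_UDP_CKSUM | PKT_TX_IPV6   */
        }
        if (opp_inserted)
            return CSUM_NONE;            /* IPv4 allows dgram_cksum == 0     */
        return tx_offload ? CSUM_HW_PARTIAL : CSUM_SW;
    }
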
+ */ + uh->dgram_cksum = 0; + } else { + struct route_entry *rt = mbuf->userdata; + if (rt && rt->port) + dev = rt->port; + else if (conn->out_dev) + dev = conn->out_dev; + if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD))) { + mbuf->l3_len = iphdrlen; + mbuf->l4_len = ntohs(iph->total_length) - iphdrlen; + mbuf->ol_flags |= (PKT_TX_UDP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4); + uh->dgram_cksum = ip4_phdr_cksum(iph, mbuf->ol_flags); + } else { + if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) + return EDPVS_INVPKT; + udp4_send_csum(iph, uh); + } + } + } + return EDPVS_OK; +} + static int udp_conn_sched(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, struct rte_mbuf *mbuf, @@ -129,7 +197,7 @@ static struct dp_vs_conn * udp_conn_lookup(struct dp_vs_proto *proto, const struct dp_vs_iphdr *iph, struct rte_mbuf *mbuf, int *direct, - bool reverse, bool *drop) + bool reverse, bool *drop, lcoreid_t *peer_cid) { struct udp_hdr *uh, _udph; struct dp_vs_conn *conn; @@ -143,11 +211,11 @@ udp_conn_lookup(struct dp_vs_proto *proto, &iph->saddr)) { *drop = true; return NULL; - } + } - conn = dp_vs_conn_get(iph->af, iph->proto, - &iph->saddr, &iph->daddr, - uh->src_port, uh->dst_port, + conn = dp_vs_conn_get(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + uh->src_port, uh->dst_port, direct, reverse); /* @@ -155,18 +223,19 @@ udp_conn_lookup(struct dp_vs_proto *proto, * UDP has no ack, we don't know pkt from client is response or not * UDP can only confirm neighbour to RS */ - int af = iph->af; if (conn != NULL) { - if (AF_INET6 == af) { - if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev - && !ipv6_addr_any(&conn->in_nexthop.in6)) { - neigh_confirm(AF_INET6, &conn->in_nexthop, conn->in_dev); - } - } else { - if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev - && (conn->in_nexthop.in.s_addr != htonl(INADDR_ANY))) { - neigh_confirm(AF_INET, &conn->in_nexthop, conn->in_dev); - } + if ((*direct == DPVS_CONN_DIR_OUTBOUND) && conn->in_dev + && (!inet_is_addr_any(tuplehash_out(conn).af, &conn->in_nexthop))) { + neigh_confirm(tuplehash_out(conn).af, &conn->in_nexthop, conn->in_dev); + } + } else { + struct dp_vs_redirect *r; + + r = dp_vs_redirect_get(iph->af, iph->proto, + &iph->saddr, &iph->daddr, + uh->src_port, uh->dst_port); + if (r) { + *peer_cid = r->cid; } } @@ -200,7 +269,8 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, struct udphdr *uh; struct ipopt_uoa *uoa = NULL; struct opphdr *opp; - int af = conn->af; + int iaf = tuplehash_in(conn).af; + int oaf = tuplehash_out(conn).af; assert(conn && ombuf && oiph && ouh && ombuf->userdata); @@ -212,21 +282,19 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, if (unlikely(!mbuf)) return EDPVS_NOMEM; - int ipolen_uoa = (AF_INET6 == af) ? IPOLEN_UOA_IPV6 : IPOLEN_UOA_IPV4; + int ipolen_uoa = (AF_INET6 == iaf) ? IPOLEN_UOA_IPV6 : IPOLEN_UOA_IPV4; /* don't copy any ip options from oiph, is it ok ? 
*/ - if (AF_INET6 == af) { + if (AF_INET6 == oaf) { iph = (void *)rte_pktmbuf_append(mbuf, sizeof(struct ip6_hdr)); if (unlikely(!iph)) goto no_room; ((struct ip6_hdr *)iph)->ip6_ctlun = ((struct ip6_hdr *)oiph)->ip6_ctlun; - memcpy(&((struct ip6_hdr *)iph)->ip6_src, - &((struct ip6_hdr *)oiph)->ip6_src, - IPV6_ADDR_LEN_IN_BYTES); - memcpy(&((struct ip6_hdr *)iph)->ip6_dst, - &((struct ip6_hdr *)oiph)->ip6_dst, - IPV6_ADDR_LEN_IN_BYTES); + memcpy(&((struct ip6_hdr *)iph)->ip6_src, &conn->laddr.in6, + sizeof(struct in6_addr)); + memcpy(&((struct ip6_hdr *)iph)->ip6_dst, &conn->daddr.in6, + sizeof(struct in6_addr)); } else { iph = (void *)rte_pktmbuf_append(mbuf, sizeof(struct iphdr)); if (unlikely(!iph)) @@ -242,6 +310,10 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, if (mode == UOA_M_IPO) { /* only ipv4 support and use this ip option mode */ + if (iaf != AF_INET || oaf != AF_INET) { + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; + } ((struct iphdr *)iph)->ihl = (sizeof(struct iphdr) + IPOLEN_UOA_IPV4) / 4; ((struct iphdr *)iph)->tot_len = @@ -251,14 +323,14 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, uoa = (void *)rte_pktmbuf_append(mbuf, ipolen_uoa); } else { /* UOA_M_OPP */ - if (AF_INET6 == af) { + if (AF_INET6 == oaf) { ((struct ip6_hdr *)iph)->ip6_plen = - sizeof(*opp) + sizeof(*uoa) + sizeof(*uh); + htons(sizeof(*opp) + ipolen_uoa + sizeof(*uh)); ((struct ip6_hdr *)iph)->ip6_nxt = IPPROTO_OPT; } else { ((struct iphdr *)iph)->ihl = sizeof(struct iphdr) / 4; ((struct iphdr *)iph)->tot_len = htons(sizeof(struct iphdr) + - sizeof(*opp) + sizeof(*uoa) + sizeof(*uh)); + sizeof(*opp) + ipolen_uoa + sizeof(*uh)); ((struct iphdr *)iph)->protocol = IPPROTO_OPT; } @@ -268,13 +340,8 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, goto no_room; memset(opp, 0, sizeof(*opp)); - if (AF_INET6 == af) { - opp->version = OPPHDR_IPV6; - opp->protocol = IPPROTO_UDP; /* set to IPPROTO_UDP */ - } else { - opp->version = OPPHDR_IPV4; - opp->protocol = IPPROTO_UDP; - } + opp->version = (AF_INET6 == iaf) ? OPPHDR_IPV6 : OPPHDR_IPV4; + opp->protocol = IPPROTO_UDP; /* set to IPPROTO_UDP */ opp->length = htons(sizeof(*opp) + ipolen_uoa); uoa = (void *)rte_pktmbuf_append(mbuf, ipolen_uoa); @@ -289,7 +356,7 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, uoa->op_len = ipolen_uoa; uoa->op_port = ouh->source; /* fix uoa->op_addr */ - if (AF_INET6 == af) { + if (AF_INET6 == iaf) { memcpy(&uoa->op_addr, &((struct ip6_hdr *)oiph)->ip6_src, IPV6_ADDR_LEN_IN_BYTES); } else { @@ -307,18 +374,22 @@ static int send_standalone_uoa(const struct dp_vs_conn *conn, uh->dest = conn->dport; uh->len = htons(sizeof(struct udphdr)); /* empty payload */ - /* udp checksum */ - uh->check = 0; /* rte_ipv4_udptcp_cksum fails if opp inserted. */ - /* ip checksum will calc later */ - if (AF_INET6 == af) { + if (AF_INET6 == oaf) { struct route6 *rt6; + /* + * IPv6 UDP checksum is a must, packets with OPP header also need checksum. + * if udp checksum error here, may cause tcpdump & uoa moudule parse packets + * correctly, however socket can not receive L4 data. + */ + udp6_send_csum((struct ipv6_hdr *)iph, (struct udp_hdr*)uh); mbuf->userdata = rt6 = (struct route6*)ombuf->userdata; route6_get(rt6); return ip6_local_out(mbuf); } else { /* IPv4 */ struct route_entry *rt; + uh->check = 0; /* rte_ipv4_udptcp_cksum fails if opp inserted. 
*/ mbuf->userdata = rt = (struct route_entry *)ombuf->userdata; route4_get(rt); return ipv4_local_out(mbuf); @@ -336,6 +407,7 @@ static int insert_ipopt_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, struct iphdr *niph = NULL; struct ipopt_uoa *optuoa; + assert(AF_INET == tuplehash_in(conn).af && AF_INET == tuplehash_out(conn).af); if ((ip4_hdrlen(mbuf) + sizeof(struct ipopt_uoa) > sizeof(struct iphdr) + MAX_IPOPTLEN) || (mbuf->pkt_len + sizeof(struct ipopt_uoa) > mtu)) @@ -401,16 +473,22 @@ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, void *niph; struct opphdr *opph = NULL; struct ipopt_uoa *uoa = NULL; - int af = conn->af; int iphdrlen = 0, iptot_len = 0, ipolen_uoa = 0; + + /* the current af of mbuf before possible nat64, + * i.e. the "tuplehash_in(conn).af" for FullNAT */ + int af = conn->af; + if (AF_INET6 == af) { /* - * iphdrlen: ipv6 total header length = basic header length (40 B) + - * ext header length - * iptot_len: ipv6 total length = basic header length (40 B) + - * payload length(including ext header) + * iphdrlen: ipv6 total header length = + * basic header length (40 B) + ext header length + * iptot_len: ipv6 total length = + * basic header length (40 B) + payload length(including ext header) */ iphdrlen = ip6_hdrlen(mbuf); + if (iphdrlen != sizeof(struct ipv6_hdr)) + goto standalone_uoa; iptot_len = sizeof(struct ip6_hdr) + ntohs(((struct ip6_hdr *)iph)->ip6_plen); ipolen_uoa = IPOLEN_UOA_IPV6; @@ -471,13 +549,19 @@ static int insert_opp_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, opph->length = htons(sizeof(*opph) + ipolen_uoa); uoa = (void *)opph->options; - memset(uoa, 0, sizeof(struct ipopt_uoa)); + memset(uoa, 0, ipolen_uoa); uoa->op_code = IPOPT_UOA; uoa->op_len = ipolen_uoa; uoa->op_port = uh->source; if (AF_INET6 == af) { - memcpy(&uoa->op_addr, &((struct ip6_hdr *)niph)->ip6_src, + memcpy(&uoa->op_addr, &((struct ip6_hdr *)niph)->ip6_src, IPV6_ADDR_LEN_IN_BYTES); + /* + * we should set the 'nexthdr' of the last ext header to IPPROTO_OPT here + * but seems no efficient method to set that one + * ip6_skip_exthdr was only used to get the value + * so we send_standalone_uoa when has ip ext headers + */ ((struct ip6_hdr *)niph)->ip6_nxt = IPPROTO_OPT; /* Update ipv6 payload length */ ((struct ip6_hdr *)niph)->ip6_plen = @@ -526,7 +610,7 @@ static int udp_insert_uoa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf, return EDPVS_INVPKT; } - if (AF_INET6 == af) { + if (AF_INET6 == tuplehash_out(conn).af) { mtu = ((struct route6*)rt)->rt6_mtu; iph = ip6_hdr(mbuf); iphdrlen = ip6_hdrlen(mbuf); @@ -582,7 +666,8 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, struct udp_hdr *uh = NULL; struct opphdr *opp = NULL; void *iph = NULL; - int af = conn->af; + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ + int af = tuplehash_out(conn).af; int iphdrlen = 0; uint8_t nxt_proto; @@ -616,29 +701,7 @@ static int udp_fnat_in_handler(struct dp_vs_proto *proto, uh->src_port = conn->lport; uh->dst_port = conn->dport; - /* - * XXX: UDP pseudo header need UDP length, but the common helper function - * rte_ipv4_udptcp_cksum() use (IP.tot_len - IP.header_len), it's not - * correct if OPP header insterted between IP header and UDP header. - * We can modify the function, or change IP.tot_len before use - * rte_ipv4_udptcp_cksum() and restore it after. - * - * However, UDP checksum is not mandatory, to make things easier, when OPP - * header exist, we just not calc UDP checksum. 
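From here on the UDP handlers distinguish tuplehash_in(conn).af from tuplehash_out(conn).af because NAT64 FullNAT keeps IPv6 toward the client while speaking IPv4 toward the real server. The one-liner below only restates the convention the handlers above follow; fnat_mbuf_af() is an illustrative name, not an existing DPVS helper.

/*
 * Both FullNAT L4 handlers run after the L3 translation, so each one sees
 * the address family of the side the packet is heading toward:
 *   fnat_in_handler  (client -> RS): tuplehash_out(conn).af
 *   fnat_out_handler (RS -> client): tuplehash_in(conn).af
 * For plain IPv4 or IPv6 FNAT both equal conn->af; they differ only for NAT64.
 */
static inline int fnat_mbuf_af(const struct dp_vs_conn *conn, bool to_rs)
{
    return to_rs ? tuplehash_out(conn).af : tuplehash_in(conn).af;
}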
- */ - if (!opp) { - if (AF_INET6 == af) { - udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); - } else { - udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); - } - } - /* FIXME: - * 1. IPv6 UDP checksum is a must, packets with OPP header also need checksum. - * 2. UDP checksum offload is to be supported. - */ - - return EDPVS_OK; + return udp_send_csum(af, iphdrlen, uh, conn, mbuf, opp); } static int udp_fnat_out_handler(struct dp_vs_proto *proto, @@ -646,7 +709,8 @@ static int udp_fnat_out_handler(struct dp_vs_proto *proto, struct rte_mbuf *mbuf) { struct udp_hdr *uh; - int af = conn->af; + /* af/mbuf may be changed for nat64 which in af is ipv6 and out is ipv4 */ + int af = tuplehash_in(conn).af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf): ip4_hdrlen(mbuf)); /* cannot use mbuf_header_pointer() */ @@ -659,13 +723,7 @@ static int udp_fnat_out_handler(struct dp_vs_proto *proto, uh->src_port = conn->vport; uh->dst_port = conn->cport; - if (AF_INET6 == af) { - udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); - } else { - udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); - } - - return EDPVS_OK; + return udp_send_csum(af, iphdrlen, uh, conn, mbuf, NULL); } static int udp_fnat_in_pre_handler(struct dp_vs_proto *proto, @@ -695,15 +753,9 @@ static int udp_snat_in_handler(struct dp_vs_proto *proto, if (unlikely(!uh)) return EDPVS_INVPKT; - uh->dst_port = conn->dport; - - if (AF_INET6 == af) { - udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); - } else { - udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); - } + uh->dst_port = conn->dport; - return EDPVS_OK; + return udp_send_csum(af, iphdrlen, uh, conn, mbuf, NULL); } static int udp_snat_out_handler(struct dp_vs_proto *proto, @@ -721,15 +773,9 @@ static int udp_snat_out_handler(struct dp_vs_proto *proto, if (unlikely(!uh)) return EDPVS_INVPKT; - uh->src_port = conn->vport; - - if (AF_INET6 == af) { - udp6_send_csum((struct ipv6_hdr *)ip6_hdr(mbuf), (struct udphdr*)uh); - } else { - udp4_send_csum(ip4_hdr(mbuf), (struct udphdr *)uh); - } + uh->src_port = conn->vport; - return EDPVS_OK; + return udp_send_csum(af, iphdrlen, uh, conn, mbuf, NULL); } struct dp_vs_proto dp_vs_proto_udp = { diff --git a/src/ipvs/ip_vs_redirect.c b/src/ipvs/ip_vs_redirect.c new file mode 100644 index 000000000..93a692241 --- /dev/null +++ b/src/ipvs/ip_vs_redirect.c @@ -0,0 +1,429 @@ +/* + * DPVS is a software load balancer (Virtual Server) based on DPDK. + * + * Copyright (C) 2017 iQIYI (www.iqiyi.com). + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
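The new file below leans on a struct dp_vs_redirect declared in ipvs/redirect.h, which is not part of this diff. The sketch reconstructs roughly what the code requires from it, with field names taken from their uses (r->af, r->saddr, r->cid, r->redirect_pool, ...); the authoritative layout in the header may differ.

/* Approximate shape only; the real definition lives in ipvs/redirect.h. */
struct dp_vs_redirect {
    struct list_head    list;           /* bucket chain in dp_vs_cr_tbl[]   */
    int                 af;             /* AF_INET or AF_INET6              */
    uint16_t            proto;          /* IPPROTO_TCP / IPPROTO_UDP        */
    union inet_addr     saddr, daddr;   /* tuple copied from the connection */
    uint16_t            sport, dport;
    lcoreid_t           cid;            /* lcore that owns the connection   */
    struct rte_mempool *redirect_pool;  /* per-socket cache it came from    */
};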
+ * + */ +#include "ipvs/redirect.h" + +#define DPVS_REDIRECT_RING_SIZE 4096 + +#define DPVS_CR_TBL_BITS 22 +#define DPVS_CR_TBL_SIZE (1 << DPVS_CR_TBL_BITS) +#define DPVS_CR_TBL_MASK (DPVS_CR_TBL_SIZE - 1) + +static struct list_head *dp_vs_cr_tbl; +static rte_spinlock_t dp_vs_cr_lock[DPVS_CR_TBL_SIZE]; +static struct rte_mempool *dp_vs_cr_cache[DPVS_MAX_SOCKET]; +#define this_cr_cache (dp_vs_cr_cache[rte_socket_id()]) + +static struct rte_ring *dp_vs_redirect_ring[DPVS_MAX_LCORE][DPVS_MAX_LCORE]; + +#ifdef CONFIG_DPVS_IPVS_DEBUG +static inline void +dp_vs_redirect_show(struct dp_vs_redirect *r, const char *action) +{ + char sbuf[64], dbuf[64]; + + RTE_LOG(DEBUG, IPVS, "[%d] redirect %s: [%d] %s %s/%d -> %s/%d\n", + rte_lcore_id(), action, r->cid, + inet_proto_name(r->proto), + inet_ntop(r->af, &r->saddr, sbuf, sizeof(sbuf)) ? sbuf : "::", + ntohs(r->sport), + inet_ntop(r->af, &r->daddr, dbuf, sizeof(dbuf)) ? dbuf : "::", + ntohs(r->dport)); +} +#endif + +struct dp_vs_redirect * +dp_vs_redirect_alloc(enum dpvs_fwd_mode fwdmode) +{ + struct dp_vs_redirect *r; + + if (dp_vs_redirect_disable) { + return NULL; + } + + /* + * Currently, IPv6 support has the below issues. + * a) Fdir IPv6 rules fail to be created with "perfect" mode, but can be + * created with "signature" mode. + * + * b) In full-nat mode, the packets from incoming direction and outgoing + * direction are dispatched to the different cores so the service is + * broken. + * + * The solutuion is to use decentralized packet dispatch for the symemtric + * service modes, full-nat/snat/nat before issue a) is fixed. + */ + if (fwdmode != DPVS_FWD_MODE_FNAT + && fwdmode != DPVS_FWD_MODE_SNAT + && fwdmode != DPVS_FWD_MODE_NAT) { + return NULL; + } + + if (unlikely(rte_mempool_get(this_cr_cache, (void **)&r) != 0)) { + RTE_LOG(WARNING, IPVS, + "%s: no memory for redirect\n", __func__); + return NULL; + } + + memset(r, 0, sizeof(struct dp_vs_redirect)); + r->redirect_pool = this_cr_cache; + + return r; +} + +void dp_vs_redirect_free(struct dp_vs_conn *conn) +{ + if (conn->redirect) { +#ifdef CONFIG_DPVS_IPVS_DEBUG + dp_vs_redirect_show(conn->redirect, "free"); +#endif + rte_mempool_put(this_cr_cache, conn->redirect); + conn->redirect = NULL; + } +} + +void dp_vs_redirect_hash(struct dp_vs_conn *conn) +{ + uint32_t hash; + struct dp_vs_redirect *r = conn->redirect; + + if (!r || unlikely(dp_vs_conn_is_redirect_hashed(conn))) { + return; + } + + hash = dp_vs_conn_hashkey(r->af, + &tuplehash_out(conn).saddr, tuplehash_out(conn).sport, + &tuplehash_out(conn).daddr, tuplehash_out(conn).dport, + DPVS_CR_TBL_MASK); + + rte_spinlock_lock(&dp_vs_cr_lock[hash]); + list_add(&r->list, &dp_vs_cr_tbl[hash]); + rte_spinlock_unlock(&dp_vs_cr_lock[hash]); + + dp_vs_conn_set_redirect_hashed(conn); +} + +void dp_vs_redirect_unhash(struct dp_vs_conn *conn) +{ + uint32_t hash; + struct dp_vs_redirect *r = conn->redirect; + + if (r && likely(dp_vs_conn_is_redirect_hashed(conn))) { + hash = dp_vs_conn_hashkey(r->af, + &r->saddr, r->sport, + &r->daddr, r->dport, + DPVS_CR_TBL_MASK); + + rte_spinlock_lock(&dp_vs_cr_lock[hash]); + list_del(&r->list); + rte_spinlock_unlock(&dp_vs_cr_lock[hash]); + + dp_vs_conn_clear_redirect_hashed(conn); + } +} + +void dp_vs_redirect_init(struct dp_vs_conn *conn) +{ + enum dpvs_fwd_mode fm = conn->dest->fwdmode; + struct conn_tuple_hash *t = &tuplehash_out(conn); + struct dp_vs_redirect *r = conn->redirect; + + if (!r) { + return; + } + + switch (fm) { + case DPVS_FWD_MODE_FNAT: + case DPVS_FWD_MODE_NAT: + t = &tuplehash_out(conn); 
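A hedged sketch of how the connection code is expected to drive the allocation and hashing helpers in this file: the only ordering taken from the functions here is alloc, then init, then hash on creation, and unhash before free on teardown (dp_vs_redirect_free() clears conn->redirect, so unhashing must come first). The wrapper names and their exact call sites in ip_vs_conn.c are assumptions.

/* Illustrative wrappers; the real call sites live in ip_vs_conn.c. */
static int conn_attach_redirect(struct dp_vs_conn *conn,
                                enum dpvs_fwd_mode fwdmode)
{
    /* NULL is fine here: redirects are disabled, or the forwarding mode
     * already dispatches both directions to the same lcore.  The calls
     * below are no-ops when conn->redirect is NULL. */
    conn->redirect = dp_vs_redirect_alloc(fwdmode);

    dp_vs_redirect_init(conn);      /* copy the tuple and record the owner */
    dp_vs_redirect_hash(conn);      /* make it visible to the other lcores */
    return EDPVS_OK;
}

static void conn_detach_redirect(struct dp_vs_conn *conn)
{
    dp_vs_redirect_unhash(conn);
    dp_vs_redirect_free(conn);
}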
+ break; + + case DPVS_FWD_MODE_SNAT: + t = &tuplehash_in(conn); + break; + + default: + RTE_LOG(ERR, IPVS, + "%s: no redirect created for fwd mode %d\n", + __func__, fm); + return; + } + + r->af = t->af; + r->proto = t->proto; + r->saddr = t->saddr; + r->daddr = t->daddr; + r->sport = t->sport; + r->dport = t->dport; + r->cid = rte_lcore_id(); + +#ifdef CONFIG_DPVS_IPVS_DEBUG + dp_vs_redirect_show(r, "init"); +#endif +} + +/** + * try lookup dp_vs_cr_tbl{} by packet tuple + * + * . + * + * return r if found or NULL if not exist. + */ +struct dp_vs_redirect * +dp_vs_redirect_get(int af, uint16_t proto, + const union inet_addr *saddr, const union inet_addr *daddr, + uint16_t sport, uint16_t dport) +{ + uint32_t hash; + struct dp_vs_redirect *r; + + if (dp_vs_redirect_disable) { + return NULL; + } + + hash = dp_vs_conn_hashkey(af, saddr, sport, daddr, dport, DPVS_CR_TBL_MASK); + + rte_spinlock_lock(&dp_vs_cr_lock[hash]); + list_for_each_entry(r, &dp_vs_cr_tbl[hash], list) { + if (r->af == af + && r->proto == proto + && r->sport == sport + && r->dport == dport + && inet_addr_equal(af, &r->saddr, saddr) + && inet_addr_equal(af, &r->daddr, daddr)) { + goto found; + } + } + rte_spinlock_unlock(&dp_vs_cr_lock[hash]); + + return NULL; + +found: + rte_spinlock_unlock(&dp_vs_cr_lock[hash]); + +#ifdef CONFIG_DPVS_IPVS_DEBUG + dp_vs_redirect_show(r, "get"); +#endif + + return r; +} + +/** + * Forward the packet to the found redirect owner core. + */ +int dp_vs_redirect_pkt(struct rte_mbuf *mbuf, lcoreid_t peer_cid) +{ + lcoreid_t cid = rte_lcore_id(); + int ret; + + ret = rte_ring_enqueue(dp_vs_redirect_ring[peer_cid][cid], mbuf); + if (ret < 0) { + RTE_LOG(ERR, IPVS, + "%s: [%d] failed to enqueue mbuf to redirect_ring[%d][%d]\n", + __func__, cid, peer_cid, cid); + return INET_DROP; + } + +#ifdef CONFIG_DPVS_IPVS_DEBUG + RTE_LOG(DEBUG, IPVS, + "%s: [%d] enqueued mbuf to redirect_ring[%d][%d]\n", + __func__, cid, peer_cid, cid); +#endif + + return INET_STOLEN; +} + +void dp_vs_redirect_ring_proc(struct netif_queue_conf *qconf, lcoreid_t cid) +{ + struct rte_mbuf *mbufs[NETIF_MAX_PKT_BURST]; + uint16_t nb_rb; + lcoreid_t peer_cid; + + if (dp_vs_redirect_disable) { + return; + } + + cid = rte_lcore_id(); + + for (peer_cid = 0; peer_cid < DPVS_MAX_LCORE; peer_cid++) { + if (dp_vs_redirect_ring[cid][peer_cid]) { + nb_rb = rte_ring_dequeue_burst(dp_vs_redirect_ring[cid][peer_cid], + (void**)mbufs, + NETIF_MAX_PKT_BURST, NULL); + if (nb_rb > 0) { + lcore_process_packets(qconf, mbufs, cid, nb_rb, 1); + } + } + } +} + +/* + * allocate redirect cache on each NUMA socket and its size is + * same as conn_pool_size + */ +static int dp_vs_redirect_cache_alloc(void) +{ + int i; + char pool_name[32]; + + for (i = 0; i < get_numa_nodes(); i++) { + snprintf(pool_name, sizeof(pool_name), "dp_vs_redirect_%d", i); + + dp_vs_cr_cache[i] = + rte_mempool_create(pool_name, + dp_vs_conn_pool_size(), + sizeof(struct dp_vs_redirect), + dp_vs_conn_pool_cache_size(), + 0, NULL, NULL, NULL, NULL, + i, 0); + + if (!dp_vs_cr_cache[i]) { + return EDPVS_NOMEM; + } + } + + return EDPVS_OK; +} + +static void dp_vs_redirect_cache_free(void) +{ + int i; + + for (i = 0; i < get_numa_nodes(); i++) { + rte_mempool_free(dp_vs_cr_cache[i]); + } +} + +static int dp_vs_redirect_table_create(void) +{ + int i; + + if (dp_vs_redirect_cache_alloc() != EDPVS_OK) { + goto cache_free; + } + + /* allocate the global redirect hash table, per socket? 
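dp_vs_redirect_pkt() and dp_vs_redirect_ring_proc() above fix the indexing convention for the ring matrix: dp_vs_redirect_ring[owner][sender] is a single-producer, single-consumer ring that the lcore receiving the packet enqueues to and the owning lcore drains in its polling loop. The toy program below models only that indexing, with plain counters standing in for rte_ring objects, to make the [peer_cid][cid] versus [cid][peer_cid] direction explicit.

#include <stdio.h>

#define MAX_LCORE 4

/* ring[owner][sender]: one queue per ordered lcore pair, mirroring
 * dp_vs_redirect_ring[][]; a counter stands in for the rte_ring here. */
static unsigned ring[MAX_LCORE][MAX_LCORE];

/* sender side, as in dp_vs_redirect_pkt(): enqueue for the owner */
static void redirect_to(unsigned owner, unsigned sender)
{
    ring[owner][sender]++;
}

/* owner side, as in dp_vs_redirect_ring_proc(): drain every sender's ring */
static unsigned drain(unsigned owner)
{
    unsigned sender, n = 0;

    for (sender = 0; sender < MAX_LCORE; sender++) {
        n += ring[owner][sender];
        ring[owner][sender] = 0;
    }
    return n;
}

int main(void)
{
    redirect_to(2, 1);   /* lcore 1 received a packet whose conn lives on 2 */
    redirect_to(2, 3);   /* so did lcore 3 */
    printf("lcore 2 drained %u redirected packets\n", drain(2));
    return 0;
}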
*/ + dp_vs_cr_tbl = + rte_malloc_socket(NULL, sizeof(struct list_head ) * DPVS_CR_TBL_SIZE, + RTE_CACHE_LINE_SIZE, rte_socket_id()); + if (!dp_vs_cr_tbl) { + goto cache_free; + } + + /* init the global redirect hash table */ + for (i = 0; i < DPVS_CR_TBL_SIZE; i++) { + INIT_LIST_HEAD(&dp_vs_cr_tbl[i]); + rte_spinlock_init(&dp_vs_cr_lock[i]); + } + + return EDPVS_OK; + +cache_free: + dp_vs_redirect_cache_free(); + return EDPVS_NOMEM; +} + +static void dp_vs_redirect_table_free(void) +{ + dp_vs_redirect_cache_free(); + + /* release the global redirect hash table */ + if (dp_vs_cr_tbl) { + rte_free(dp_vs_cr_tbl); + } +} + +/* + * Each lcore allocates redirect rings with the other lcores espectively. + */ +static int dp_vs_redirect_ring_create(void) +{ + char name_buf[RTE_RING_NAMESIZE]; + int socket_id; + lcoreid_t cid, peer_cid; + + socket_id = rte_socket_id(); + + for (cid = 0; cid < DPVS_MAX_LCORE; cid++) { + if (cid == rte_get_master_lcore() || !rte_lcore_is_enabled(cid)) { + continue; + } + + for (peer_cid = 0; peer_cid < DPVS_MAX_LCORE; peer_cid++) { + if (!rte_lcore_is_enabled(peer_cid) + || peer_cid == rte_get_master_lcore() + || cid == peer_cid) { + continue; + } + + snprintf(name_buf, RTE_RING_NAMESIZE, + "dp_vs_redirect_ring[%d[%d]", cid, peer_cid); + + dp_vs_redirect_ring[cid][peer_cid] = + rte_ring_create(name_buf, DPVS_REDIRECT_RING_SIZE, socket_id, + RING_F_SP_ENQ | RING_F_SC_DEQ); + + if (!dp_vs_redirect_ring[cid][peer_cid]) { + RTE_LOG(ERR, IPVS, + "%s: failed to create redirect_ring[%d][%d]\n", + __func__, cid, peer_cid); + return EDPVS_NOMEM; + } + } + } + + return EDPVS_OK; +} + +static void dp_vs_redirect_ring_free(void) +{ + lcoreid_t cid, peer_cid; + + for (cid = 0; cid < DPVS_MAX_LCORE; cid++) { + for (peer_cid = 0; peer_cid < DPVS_MAX_LCORE; peer_cid++) { + rte_ring_free(dp_vs_redirect_ring[cid][peer_cid]); + } + } +} + +int dp_vs_redirects_init(void) +{ + int err; + + if (dp_vs_redirect_disable) { + return EDPVS_OK; + } + + err = dp_vs_redirect_ring_create(); + if (err != EDPVS_OK) { + return err; + } + + return dp_vs_redirect_table_create(); +} + +int dp_vs_redirects_term(void) +{ + if (dp_vs_redirect_disable) { + return EDPVS_OK; + } + + dp_vs_redirect_ring_free(); + dp_vs_redirect_table_free(); + + return EDPVS_OK; +} diff --git a/src/ipvs/ip_vs_rr.c b/src/ipvs/ip_vs_rr.c index 962feeb4f..234f859f4 100644 --- a/src/ipvs/ip_vs_rr.c +++ b/src/ipvs/ip_vs_rr.c @@ -24,7 +24,8 @@ static int dp_vs_rr_init_svc(struct dp_vs_service *svc) return EDPVS_OK; } -static int dp_vs_rr_update_svc(struct dp_vs_service *svc) +static int dp_vs_rr_update_svc(struct dp_vs_service *svc, + struct dp_vs_dest *dest __rte_unused, sockoptid_t opt __rte_unused) { svc->sched_data = &svc->dests; return EDPVS_OK; @@ -53,9 +54,7 @@ static struct dp_vs_dest *dp_vs_rr_schedule(struct dp_vs_service *svc, } dest = list_entry(q, struct dp_vs_dest, n_list); - if (!(dest->flags & DPVS_DEST_F_OVERLOAD) && - (dest->flags & DPVS_DEST_F_AVAILABLE) && - rte_atomic16_read(&dest->weight) > 0) + if (dp_vs_dest_is_valid(dest)) /* HIT */ goto out; q = q->next; diff --git a/src/ipvs/ip_vs_sched.c b/src/ipvs/ip_vs_sched.c index 6ea1193ae..759acc3e0 100644 --- a/src/ipvs/ip_vs_sched.c +++ b/src/ipvs/ip_vs_sched.c @@ -23,6 +23,7 @@ #include "ipvs/wrr.h" #include "ipvs/wlc.h" #include "ipvs/conhash.h" +#include "ipvs/fo.h" /* * IPVS scheduler list @@ -101,8 +102,8 @@ struct dp_vs_scheduler *dp_vs_scheduler_get(const char *sched_name) /* HIT */ rte_rwlock_read_unlock(&__dp_vs_sched_lock); return sched; - } - } + } + } 
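The rr scheduler above (and wlc/wrr further down) swaps the open-coded three-flag test for dp_vs_dest_is_valid(), whose definition sits in a header outside this diff. Judging from the removed lines it is roughly the check below; treat this as a reconstruction, not the authoritative helper.

/* Reconstructed from the conditions the old rr/wlc/wrr code tested. */
static inline bool dp_vs_dest_is_valid(struct dp_vs_dest *dest)
{
    return !(dest->flags & DPVS_DEST_F_OVERLOAD)     /* not overloaded    */
        &&  (dest->flags & DPVS_DEST_F_AVAILABLE)    /* still available   */
        &&  rte_atomic16_read(&dest->weight) > 0;    /* weight above zero */
}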
rte_rwlock_read_unlock(&__dp_vs_sched_lock); return NULL; @@ -183,6 +184,7 @@ int dp_vs_sched_init(void) dp_vs_wrr_init(); dp_vs_wlc_init(); dp_vs_conhash_init(); + dp_vs_fo_init(); return EDPVS_OK; } @@ -193,6 +195,7 @@ int dp_vs_sched_term(void) dp_vs_wrr_term(); dp_vs_wlc_term(); dp_vs_conhash_term(); + dp_vs_fo_term(); return EDPVS_OK; } diff --git a/src/ipvs/ip_vs_service.c b/src/ipvs/ip_vs_service.c index 7156d574c..789e40b3c 100644 --- a/src/ipvs/ip_vs_service.c +++ b/src/ipvs/ip_vs_service.c @@ -112,8 +112,8 @@ static int dp_vs_svc_unhash(struct dp_vs_service *svc) return EDPVS_OK; } -struct dp_vs_service *__dp_vs_service_get(int af, uint16_t protocol, - const union inet_addr *vaddr, +struct dp_vs_service *__dp_vs_service_get(int af, uint16_t protocol, + const union inet_addr *vaddr, uint16_t vport) { unsigned hash; @@ -156,7 +156,7 @@ static inline bool __svc_in_range(int af, const union inet_addr *addr, __be16 port, const struct inet_addr_range *range) { - if (unlikely((af == AF_INET) && + if (unlikely((af == AF_INET) && (ntohl(range->min_addr.in.s_addr) > ntohl(range->max_addr.in.s_addr)))) return false; @@ -381,7 +381,7 @@ __dp_vs_svc_match_find(int af, uint8_t proto, const struct dp_vs_match *match) } struct dp_vs_service *dp_vs_service_lookup(int af, uint16_t protocol, - const union inet_addr *vaddr, + const union inet_addr *vaddr, uint16_t vport, uint32_t fwmark, const struct rte_mbuf *mbuf, const struct dp_vs_match *match) @@ -456,7 +456,7 @@ void __dp_vs_unbind_svc(struct dp_vs_dest *dest) } } -int dp_vs_add_service(struct dp_vs_service_conf *u, +int dp_vs_add_service(struct dp_vs_service_conf *u, struct dp_vs_service **svc_p) { int ret = 0; @@ -464,7 +464,7 @@ int dp_vs_add_service(struct dp_vs_service_conf *u, struct dp_vs_scheduler *sched = NULL; struct dp_vs_service *svc = NULL; - if (!u->fwmark && inet_is_addr_any(u->af, &u->addr) + if (!u->fwmark && inet_is_addr_any(u->af, &u->addr) && !u->port && is_empty_match(&u->match)) { RTE_LOG(ERR, SERVICE, "%s: adding empty servive\n", __func__); return EDPVS_INVAL; @@ -523,7 +523,7 @@ int dp_vs_add_service(struct dp_vs_service_conf *u, ret = dp_vs_new_stats(&(svc->stats)); if(ret) goto out_err; - + dp_vs_num_services++; rte_rwlock_write_lock(&__dp_vs_svc_lock); @@ -714,7 +714,7 @@ dp_vs_copy_service(struct dp_vs_service_entry *dst, struct dp_vs_service *src) return err; } -int dp_vs_get_service_entries(const struct dp_vs_get_services *get, +int dp_vs_get_service_entries(const struct dp_vs_get_services *get, struct dp_vs_get_services *uptr) { int idx, count = 0; @@ -897,7 +897,7 @@ static void dp_vs_copy_udest_compat(struct dp_vs_dest_conf *udest, udest->addr = udest_compat->addr; udest->port = udest_compat->port; udest->fwdmode = udest_compat->conn_flags;//make sure fwdmode and conn_flags are the same - udest->conn_flags = udest_compat->conn_flags; + udest->conn_flags = udest_compat->conn_flags; udest->weight = udest_compat->weight; udest->max_conn = udest_compat->max_conn; udest->min_conn = udest_compat->min_conn; @@ -937,13 +937,13 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) memcpy(arg, user, len); usvc_compat = (struct dp_vs_service_user *)arg; udest_compat = (struct dp_vs_dest_user *)(usvc_compat + 1); - + ret = dp_vs_copy_usvc_compat(&usvc, usvc_compat); if (ret != EDPVS_OK) return ret; - + if (opt == DPVS_SO_SET_ZERO) { - if(!inet_is_addr_any(usvc.af, &usvc.addr) && + if(!inet_is_addr_any(usvc.af, &usvc.addr) && !usvc.fwmark && !usvc.port && is_empty_match(&usvc.match) ) { @@ -958,7 
+958,7 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) } if (!inet_is_addr_any(usvc.af, &usvc.addr) || usvc.port) - svc = __dp_vs_service_get(usvc.af, usvc.protocol, + svc = __dp_vs_service_get(usvc.af, usvc.protocol, &usvc.addr, usvc.port); else if (usvc.fwmark) svc = __dp_vs_svc_fwm_get(usvc.af, usvc.fwmark); @@ -969,7 +969,7 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) return EDPVS_INVAL; } - if(opt != DPVS_SO_SET_ADD && + if(opt != DPVS_SO_SET_ADD && (svc == NULL || svc->proto != usvc.protocol)){ if (svc) dp_vs_service_put(svc); @@ -980,7 +980,7 @@ static int dp_vs_set_svc(sockoptid_t opt, const void *user, size_t len) case DPVS_SO_SET_ADD: if(svc != NULL) ret = EDPVS_EXIST; - else + else ret = dp_vs_add_service(&usvc, &svc); break; case DPVS_SO_SET_EDIT: @@ -1048,7 +1048,7 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o size = sizeof(*get) + \ sizeof(struct dp_vs_service_entry) * (get->num_services); if(len != sizeof(*get)){ - *outlen = 0; + *outlen = 0; return EDPVS_INVAL; } output = rte_zmalloc("get_services", size, 0); @@ -1076,8 +1076,8 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o else { struct dp_vs_match match; - ret = dp_vs_match_parse(entry->srange, entry->drange, - entry->iifname, entry->oifname, + ret = dp_vs_match_parse(entry->srange, entry->drange, + entry->iifname, entry->oifname, &match); if (ret != EDPVS_OK) return ret; @@ -1090,8 +1090,12 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o output = rte_zmalloc("get_service", sizeof(struct dp_vs_service_entry), 0); - if (unlikely(NULL == output)) + if (unlikely(NULL == output)) { + if (svc) { + dp_vs_service_put(svc); + } return EDPVS_NOMEM; + } memcpy(output, entry, sizeof(struct dp_vs_service_entry)); if(svc) { ret = dp_vs_copy_service(output, svc); @@ -1100,6 +1104,9 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o *outlen = sizeof(struct dp_vs_service_entry); }else{ *outlen = 0; + if (output) { + rte_free(output); + } ret = EDPVS_NOTEXIST; } } @@ -1130,20 +1137,25 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o else { struct dp_vs_match match; - ret = dp_vs_match_parse(get->srange, get->drange, - get->iifname, get->oifname, + ret = dp_vs_match_parse(get->srange, get->drange, + get->iifname, get->oifname, &match); - if (ret != EDPVS_OK) + if (ret != EDPVS_OK) { + rte_free(output); return ret; - + } if (!is_empty_match(&match)) { svc = __dp_vs_svc_match_find(match.af, get->proto, &match); } } - if (!svc) + if (!svc) { + if (output) { + rte_free(output); + } ret = EDPVS_NOTEXIST; + } else { ret = dp_vs_get_dest_entries(svc, get, output); dp_vs_service_put(svc); @@ -1156,6 +1168,11 @@ static int dp_vs_get_svc(sockoptid_t opt, const void *user, size_t len, void **o return EDPVS_INVAL; } + if (ret != EDPVS_OK) { + if (*out) + rte_free(*out); + } + return ret; } @@ -1175,7 +1192,7 @@ int dp_vs_service_init(void) for (idx = 0; idx < DP_VS_SVC_TAB_SIZE; idx++) { INIT_LIST_HEAD(&dp_vs_svc_table[idx]); INIT_LIST_HEAD(&dp_vs_svc_fwm_table[idx]); - } + } INIT_LIST_HEAD(&dp_vs_svc_match_list); rte_rwlock_init(&__dp_vs_svc_lock); dp_vs_dest_init(); diff --git a/src/ipvs/ip_vs_stats.c b/src/ipvs/ip_vs_stats.c index 916793309..fad617753 100644 --- a/src/ipvs/ip_vs_stats.c +++ b/src/ipvs/ip_vs_stats.c @@ -93,8 +93,8 @@ static struct dp_vs_stats* alloc_percpu_stats(void) if (!(lcore_mask & (1L<dest; - 
lcoreid_t cid; + lcoreid_t cid; cid = rte_lcore_id(); if (dest && (dest->flags & DPVS_DEST_F_AVAILABLE)) { /*limit rate*/ if ((dest->limit_proportion < 100) && (dest->limit_proportion > 0)) { - return (rand()%100) > dest->limit_proportion + return (rand()%100) > dest->limit_proportion ? EDPVS_OVERLOAD : EDPVS_OK; } @@ -239,10 +239,10 @@ int dp_vs_stats_out(struct dp_vs_conn *conn, struct rte_mbuf *mbuf) if (dest && (dest->flags & DPVS_DEST_F_AVAILABLE)) { /*limit rate*/ - if ((dest->limit_proportion < 100) && + if ((dest->limit_proportion < 100) && (dest->limit_proportion > 0)) { - return (rand()%100) > dest->limit_proportion - ? EDPVS_OVERLOAD : EDPVS_OK; + return (rand()%100) > dest->limit_proportion + ? EDPVS_OVERLOAD : EDPVS_OK; } dest->stats[cid].outpkts++; @@ -264,7 +264,7 @@ void dp_vs_stats_conn(struct dp_vs_conn *conn) assert(conn && conn->dest); lcoreid_t cid; - cid = rte_lcore_id(); + cid = rte_lcore_id(); conn->dest->stats[cid].conns++; this_dpvs_stats.conns++; } diff --git a/src/ipvs/ip_vs_synproxy.c b/src/ipvs/ip_vs_synproxy.c index b1c9b1bb6..9c570120a 100644 --- a/src/ipvs/ip_vs_synproxy.c +++ b/src/ipvs/ip_vs_synproxy.c @@ -75,7 +75,7 @@ rte_atomic64_t sp_ack_refused; static struct dpvs_timer g_second_timer; #endif -/* +/* * syncookies using digest function from openssl libray, * a little difference from kernel, which uses md5_transform * */ @@ -86,7 +86,7 @@ static rte_atomic32_t g_minute_count; static int minute_timer_expire( void *priv) { struct timeval tv; - + rte_atomic32_inc(&g_minute_count); tv.tv_sec = 60; /* one minute timer */ @@ -362,7 +362,7 @@ syn_proxy_cookie_v4_init_sequence(struct rte_mbuf *mbuf, data |= opts->sack_ok << DP_VS_SYNPROXY_SACKOK_BIT; data |= opts->tstamp_ok << DP_VS_SYNPROXY_TSOK_BIT; data |= ((opts->snd_wscale & 0xf) << DP_VS_SYNPROXY_SND_WSCALE_BITS); - + return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), rte_atomic32_read(&g_minute_count), data); @@ -472,7 +472,7 @@ syn_proxy_v6_cookie_check(struct rte_mbuf *mbuf, uint32_t cookie, return 0; } -/* +/* * Synproxy implementation */ @@ -559,7 +559,7 @@ static void syn_proxy_parse_set_opts(struct rte_mbuf *mbuf, struct tcphdr *th, opt->sack_ok = 1; } else { memset(tmp_opcode, TCPOPT_NOP, TCPOLEN_SACK_PERMITTED); - } + } } break; } @@ -609,7 +609,8 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, tmpport = th->dest; th->dest = th->source; th->source = tmpport; - + /* set window size to zero */ + th->window = 0; /* set seq(cookie) and ack_seq */ th->ack_seq = htonl(ntohl(th->seq) + 1); th->seq = htonl(isn); @@ -647,7 +648,7 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, if (likely(mbuf->ol_flags & PKT_TX_TCP_CKSUM)) { mbuf->l3_len = iphlen; mbuf->l4_len = ntohs(iph->tot_len) - iphlen; - th->check = rte_ipv4_phdr_cksum((struct ipv4_hdr*)iph, mbuf->ol_flags); + th->check = ip4_phdr_cksum((struct ipv4_hdr*)iph, mbuf->ol_flags); } else { if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0) return; @@ -668,7 +669,7 @@ static void syn_proxy_reuse_mbuf(int af, struct rte_mbuf *mbuf, * 1) mbuf is a syn packet, * 2) and the service is synproxy-enable, * 3) and ip_vs_todrop return fasle (not supported now) - * + * * @return 0 means the caller should return at once and use * verdict as return value, return 1 for nothing. 
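syn_proxy_cookie_v4_init_sequence() above packs the negotiated TCP options into the cookie payload with a few shifts. For illustration, the reverse operation a cookie check performs is sketched below; the DP_VS_SYNPROXY_* bit positions are the ones used above (their values live in the synproxy header), and the options struct here is a local stand-in for the real one, so treat the snippet as a sketch rather than the in-tree decoder.

#include <stdint.h>

struct synproxy_opts_sketch {   /* stand-in for the real options struct */
    uint8_t sack_ok;
    uint8_t tstamp_ok;
    uint8_t snd_wscale;
};

/* Mirror of the packing above:
 *   data |= opts->sack_ok   << DP_VS_SYNPROXY_SACKOK_BIT;
 *   data |= opts->tstamp_ok << DP_VS_SYNPROXY_TSOK_BIT;
 *   data |= (opts->snd_wscale & 0xf) << DP_VS_SYNPROXY_SND_WSCALE_BITS;
 */
static inline void synproxy_cookie_unpack(uint32_t data,
                                          struct synproxy_opts_sketch *opts)
{
    opts->sack_ok    = (data >> DP_VS_SYNPROXY_SACKOK_BIT)      & 0x1;
    opts->tstamp_ok  = (data >> DP_VS_SYNPROXY_TSOK_BIT)        & 0x1;
    opts->snd_wscale = (data >> DP_VS_SYNPROXY_SND_WSCALE_BITS) & 0xf;
}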
*/ @@ -688,7 +689,7 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, goto syn_rcv_out; if (th->syn && !th->ack && !th->rst && !th->fin && - (svc = dp_vs_service_lookup(af, iph->proto, + (svc = dp_vs_service_lookup(af, iph->proto, &iph->daddr, th->dest, 0, NULL, NULL)) && (svc->flags & DP_VS_SVC_F_SYNPROXY)) { /* if service's weight is zero (non-active realserver), @@ -752,9 +753,9 @@ int dp_vs_synproxy_syn_rcv(int af, struct rte_mbuf *mbuf, if (unlikely(EDPVS_OK != (ret = netif_xmit(mbuf, dev)))) { RTE_LOG(ERR, IPVS, "%s: netif_xmit failed -- %s\n", __func__, dpvs_strerror(ret)); - /* should not set verdict to INET_DROP since netif_xmit - * always consume the mbuf while INET_DROP means mbuf'll - * be free in INET_HOOK.*/ + /* should not set verdict to INET_DROP since netif_xmit + * always consume the mbuf while INET_DROP means mbuf'll + * be free in INET_HOOK.*/ } *verdict = INET_STOLEN; @@ -820,7 +821,7 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, int tcp_hdr_size; struct rte_mbuf *syn_mbuf, *syn_mbuf_cloned; struct rte_mempool *pool; - struct tcphdr *syn_th; + struct tcphdr *syn_th; if (!cp->packet_xmit) { RTE_LOG(WARNING, IPVS, "%s: packet_xmit is null\n", __func__); @@ -889,6 +890,8 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, syn_ip6h->ip6_plen = htons(tcp_hdr_size); syn_ip6h->ip6_nxt = NEXTHDR_TCP; syn_ip6h->ip6_hlim = IPV6_DEFAULT_HOPLIMIT; + + syn_mbuf->l3_len = sizeof(*syn_ip6h); } else { struct iphdr *ack_iph; struct iphdr *syn_iph; @@ -910,13 +913,15 @@ static int syn_proxy_send_rs_syn(int af, const struct tcphdr *th, syn_iph->saddr = ack_iph->saddr; syn_iph->daddr = ack_iph->daddr; + syn_mbuf->l3_len = sizeof(*syn_iph); + /* checksum is done by fnat_in_handler */ syn_iph->check = 0; } /* Save syn_mbuf if syn retransmission is on */ if (dp_vs_synproxy_ctrl_syn_retry > 0) { - syn_mbuf_cloned = rte_pktmbuf_clone(syn_mbuf, pool); + syn_mbuf_cloned = mbuf_copy(syn_mbuf, pool); if (unlikely(!syn_mbuf_cloned)) { rte_pktmbuf_free(syn_mbuf); //RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOMEM)); @@ -999,7 +1004,7 @@ int dp_vs_synproxy_ack_rcv(int af, struct rte_mbuf *mbuf, *verdict = INET_DROP; return 0; } - + /* Release the service, we do not need it any more */ dp_vs_service_put(svc); @@ -1109,6 +1114,87 @@ void dp_vs_synproxy_dnat_handler(struct tcphdr *tcph, struct dp_vs_seq *sp_seq) } } +static int syn_proxy_send_window_update(int af, struct rte_mbuf *mbuf, struct dp_vs_conn *conn, + struct dp_vs_proto *pp, struct tcphdr *th) +{ + struct rte_mbuf *ack_mbuf; + struct rte_mempool *pool; + struct tcphdr *ack_th; + + if (!conn->packet_out_xmit) { + return EDPVS_INVAL; + } + + pool = get_mbuf_pool(conn, DPVS_CONN_DIR_OUTBOUND); + if (unlikely(!pool)) { + RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOROUTE)); + return EDPVS_NOROUTE; + } + + ack_mbuf = rte_pktmbuf_alloc(pool); + if (unlikely(!ack_mbuf)) { + RTE_LOG(WARNING, IPVS, "%s: %s\n", __func__, dpvs_strerror(EDPVS_NOMEM)); + return EDPVS_NOMEM; + } + + ack_th = (struct tcphdr *)rte_pktmbuf_prepend(ack_mbuf, sizeof(struct tcphdr)); + if (!ack_th) { + rte_pktmbuf_free(ack_mbuf); + RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); + return EDPVS_NOROOM; + } + + /* Set up tcp header */ + memcpy(ack_th, th, sizeof(struct tcphdr)); + /* clear SYN flag */ + ack_th->syn = 0; + /* add one to seq and seq will be adjust later */ + ack_th->seq = htonl(ntohl(ack_th->seq)+1); + ack_th->doff = sizeof(struct tcphdr) >> 2; + 
+ if (AF_INET6 == af) { + struct ip6_hdr *ack_ip6h; + struct ip6_hdr *reuse_ip6h = (struct ip6_hdr *)ip6_hdr(mbuf); + /* Reserve space for ipv6 header */ + ack_ip6h = (struct ip6_hdr *)rte_pktmbuf_prepend(ack_mbuf, + sizeof(struct ip6_hdr)); + if (!ack_ip6h) { + rte_pktmbuf_free(ack_mbuf); + RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); + return EDPVS_NOROOM; + } + + memcpy(ack_ip6h, reuse_ip6h, sizeof(struct ip6_hdr)); + ack_ip6h->ip6_vfc = 0x60; /* IPv6 */ + ack_ip6h->ip6_plen = htons(sizeof(struct tcphdr)); + ack_ip6h->ip6_nxt = NEXTHDR_TCP; + ack_mbuf->l3_len = sizeof(*ack_ip6h); + } else { + struct ipv4_hdr *ack_iph; + struct ipv4_hdr *reuse_iph = ip4_hdr(mbuf); + int pkt_ack_len = sizeof(struct tcphdr) + sizeof(struct iphdr); + /* Reserve space for ipv4 header */ + ack_iph = (struct ipv4_hdr *)rte_pktmbuf_prepend(ack_mbuf, sizeof(struct ipv4_hdr)); + if (!ack_iph) { + rte_pktmbuf_free(ack_mbuf); + RTE_LOG(WARNING, IPVS, "%s:%s\n", __func__, dpvs_strerror(EDPVS_NOROOM)); + return EDPVS_NOROOM; + } + + memcpy(ack_iph, reuse_iph, sizeof(struct ipv4_hdr)); + /* version and ip header length */ + ack_iph->version_ihl = 0x45; + ack_iph->type_of_service = 0; + ack_iph->fragment_offset = htons(IPV4_HDR_DF_FLAG); + ack_iph->total_length = htons(pkt_ack_len); + ack_mbuf->l3_len = sizeof(*ack_iph); + } + + conn->packet_out_xmit(pp, conn, ack_mbuf); + + return EDPVS_OK; +} + /* Syn-proxy step 3 logic: receive rs's Syn/Ack. * Update syn_proxy_seq.delta and send stored ack mbufs to rs. */ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, @@ -1187,6 +1273,15 @@ int dp_vs_synproxy_synack_rcv(struct rte_mbuf *mbuf, struct dp_vs_conn *cp, return 0; } + /* Window size has been set to zero in the syn-ack packet to Client. + * If get more than one ack packet here, + * it means client has sent a window probe after one RTO. + * The probe will be forward to RS and RS will respond a window update. + * So DPVS has no need to send a window update. 
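The window-update packet built above is the real server's SYN/ACK header copied, with SYN cleared, the sequence advanced by one and all options dropped; it is then handed to conn->packet_out_xmit() so the normal output path rewrites addresses, adjusts sequence numbers and fills in the checksum. A standalone restatement of that header surgery, using the plain struct tcphdr from netinet/tcp.h:

#include <string.h>
#include <arpa/inet.h>    /* htonl(), ntohl() */
#include <netinet/tcp.h>  /* struct tcphdr    */

/* Derive the bare window-update ACK from a copy of the RS's SYN/ACK header,
 * the way syn_proxy_send_window_update() does.  Address rewriting, sequence
 * adjustment and the checksum are left to the output path. */
static void make_window_update(struct tcphdr *ack, const struct tcphdr *synack)
{
    memcpy(ack, synack, sizeof(*ack));
    ack->syn  = 0;                              /* plain ACK, no SYN      */
    ack->seq  = htonl(ntohl(synack->seq) + 1);  /* consume the RS's SYN   */
    ack->doff = sizeof(struct tcphdr) >> 2;     /* 20 bytes, options gone */
}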
+ */ + if (cp->ack_num == 1) + syn_proxy_send_window_update(tuplehash_out(cp).af, mbuf, cp, pp, th); + list_for_each_entry_safe(tmbuf, tmbuf2, &cp->ack_mbuf, list) { list_del_init(&tmbuf->list); cp->ack_num--; diff --git a/src/ipvs/ip_vs_wlc.c b/src/ipvs/ip_vs_wlc.c index e5751f7d7..ea1f861d7 100644 --- a/src/ipvs/ip_vs_wlc.c +++ b/src/ipvs/ip_vs_wlc.c @@ -38,9 +38,7 @@ static struct dp_vs_dest *dp_vs_wlc_schedule(struct dp_vs_service *svc, */ list_for_each_entry(dest, &svc->dests, n_list) { - if (!(dest->flags & DPVS_DEST_F_OVERLOAD) && - (dest->flags & DPVS_DEST_F_AVAILABLE) && - rte_atomic16_read(&dest->weight) > 0) { + if (dp_vs_dest_is_valid(dest)) { least = dest; loh = dp_vs_wlc_dest_overhead(least); goto nextstage; diff --git a/src/ipvs/ip_vs_wrr.c b/src/ipvs/ip_vs_wrr.c index bf4395856..0a76a8d22 100644 --- a/src/ipvs/ip_vs_wrr.c +++ b/src/ipvs/ip_vs_wrr.c @@ -105,7 +105,8 @@ static int dp_vs_wrr_done_svc(struct dp_vs_service *svc) return EDPVS_OK; } -static int dp_vs_wrr_update_svc(struct dp_vs_service *svc) +static int dp_vs_wrr_update_svc(struct dp_vs_service *svc, + struct dp_vs_dest *dest __rte_unused, sockoptid_t opt __rte_unused) { struct dp_vs_wrr_mark *mark = svc->sched_data; @@ -162,8 +163,7 @@ static struct dp_vs_dest *dp_vs_wrr_schedule(struct dp_vs_service *svc, if (mark->cl != &svc->dests) { /* not at the head of the list */ dest = list_entry(mark->cl, struct dp_vs_dest, n_list); - if (!(dest->flags & DPVS_DEST_F_OVERLOAD) && - (dest->flags & DPVS_DEST_F_AVAILABLE) && + if (dp_vs_dest_is_valid(dest) && rte_atomic16_read(&dest->weight) >= mark->cw) { /* got it */ break; @@ -202,5 +202,3 @@ int dp_vs_wrr_term(void) { return unregister_dp_vs_scheduler(&dp_vs_wrr_scheduler); } - - diff --git a/src/ipvs/ip_vs_xmit.c b/src/ipvs/ip_vs_xmit.c index be2282ce6..fd08d2bca 100644 --- a/src/ipvs/ip_vs_xmit.c +++ b/src/ipvs/ip_vs_xmit.c @@ -27,6 +27,7 @@ #include "icmp6.h" #include "neigh.h" #include "ipvs/xmit.h" +#include "ipvs/nat64.h" #include "parser/parser.h" static bool fast_xmit_close = false; @@ -114,9 +115,9 @@ static int __dp_vs_fast_xmit_fnat6(struct dp_vs_proto *proto, if (err != EDPVS_OK) return err; - /* + /* * re-fetch IP header - * the offset may changed during pre-handler + * the offset may changed during pre-handler */ ip6h = ip6_hdr(mbuf); } @@ -236,9 +237,9 @@ static int __dp_vs_fast_outxmit_fnat6(struct dp_vs_proto *proto, if (err != EDPVS_OK) return err; - /* + /* * re-fetch IP header - * the offset may changed during pre-handler + * the offset may changed during pre-handler */ ip6h = ip6_hdr(mbuf); } @@ -532,7 +533,7 @@ static int __dp_vs_xmit_fnat6(struct dp_vs_proto *proto, * didn't cache the pointer to rt6 * or route can't be deleted when there is conn ref * this is for neighbour confirm. - */ + */ dp_vs_conn_cache_rt6(conn, rt6, true); // check mtu @@ -592,14 +593,124 @@ static int __dp_vs_xmit_fnat6(struct dp_vs_proto *proto, return err; } +static int __dp_vs_xmit_fnat64(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow4 fl4; + struct ip6_hdr *ip6h = ip6_hdr(mbuf); + struct ipv4_hdr *ip4h; + uint32_t pkt_len; + struct route_entry *rt; + int err, mtu; + + /* + * drop old route. just for safe, because + * FNAT is PRE_ROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", + __func__, mbuf->userdata); + route6_put((struct route6 *)mbuf->userdata); + } + + memset(&fl4, 0, sizeof(struct flow4)); + fl4.fl4_daddr = conn->daddr.in; + fl4.fl4_saddr = conn->laddr.in; + rt = route4_output(&fl4); + if (!rt) { + err = EDPVS_NOROUTE; + goto errout; + } + + /* + * didn't cache the pointer to rt + * or route can't be deleted when there is conn ref + * this is for neighbour confirm + */ + dp_vs_conn_cache_rt(conn, rt, true); + + /* + * mbuf is from IPv6, icmp should send by icmp6 + * ext_hdr and + */ + mtu = rt->mtu; + pkt_len = mbuf_nat6to4_len(mbuf); + if (pkt_len > mtu) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, mtu); + + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt; + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip6h->ip6_hops <= 1)) { + icmp6_send(mbuf, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); + err = EDPVS_DROP; + goto errout; + } + ip6h->ip6_hops--; + } + + /* pre-handler before translation */ + if (proto->fnat_in_pre_handler) { + err = proto->fnat_in_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + /* L3 translation before l4 re-csum */ + err = mbuf_6to4(mbuf, &conn->laddr.in, &conn->daddr.in); + if (err) + goto errout; + ip4h = ip4_hdr(mbuf); + ip4h->hdr_checksum = 0; + + /* L4 FNAT translation */ + if (proto->fnat_in_handler) { + err = proto->fnat_in_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) { + ip4h->hdr_checksum = 0; + } else { + ip4_send_csum(ip4h); + } + + return INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt->port, ipv4_output); + +errout: + if (rt) + route4_put(rt); + rte_pktmbuf_free(mbuf); + return err; +} + int dp_vs_xmit_fnat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { int af = conn->af; assert(af == AF_INET || af == AF_INET6); - return af == AF_INET ? __dp_vs_xmit_fnat4(proto, conn, mbuf) - : __dp_vs_xmit_fnat6(proto, conn, mbuf); + + if (tuplehash_in(conn).af == AF_INET && + tuplehash_out(conn).af == AF_INET) + return __dp_vs_xmit_fnat4(proto, conn, mbuf); + if (tuplehash_in(conn).af == AF_INET6 && + tuplehash_out(conn).af == AF_INET6) + return __dp_vs_xmit_fnat6(proto, conn, mbuf); + if (tuplehash_in(conn).af == AF_INET6 && + tuplehash_out(conn).af == AF_INET) + return __dp_vs_xmit_fnat64(proto, conn, mbuf); + + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; } static int __dp_vs_out_xmit_fnat4(struct dp_vs_proto *proto, @@ -799,14 +910,115 @@ static int __dp_vs_out_xmit_fnat6(struct dp_vs_proto *proto, return err; } +static int __dp_vs_out_xmit_fnat46(struct dp_vs_proto *proto, + struct dp_vs_conn *conn, + struct rte_mbuf *mbuf) +{ + struct flow6 fl6; + struct ipv4_hdr *ip4h = ip4_hdr(mbuf); + uint32_t pkt_len; + struct route6 *rt6; + int err, mtu; + + /* + * drop old route. just for safe, because + * FNAT is PRE_ROUTING, should not have route. 
+ */ + if (unlikely(mbuf->userdata != NULL)) { + RTE_LOG(WARNING, IPVS, "%s: FNAT have route %p ?\n", + __func__, mbuf->userdata); + route4_put((struct route_entry *)mbuf->userdata); + } + + memset(&fl6, 0, sizeof(struct flow6)); + fl6.fl6_daddr = conn->caddr.in6; + fl6.fl6_saddr = conn->vaddr.in6; + rt6 = route6_output(mbuf, &fl6); + if (!rt6) { + err = EDPVS_NOROUTE; + goto errout; + } + + /* + * didn't cache the pointer to rt + * or route can't be deleted when there is conn ref + * this is for neighbour confirm + */ + dp_vs_conn_cache_rt6(conn, rt6, false); + + /* + * mbuf is from IPv6, icmp should send by icmp6 + * ext_hdr and + */ + mtu = rt6->rt6_mtu; + pkt_len = mbuf_nat4to6_len(mbuf); + if (pkt_len > mtu + && (ip4h->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { + RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__); + icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu)); + err = EDPVS_FRAG; + goto errout; + } + + mbuf->userdata = rt6; + /* after route lookup and before translation */ + if (xmit_ttl) { + if (unlikely(ip4h->time_to_live <= 1)) { + icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); + err = EDPVS_DROP; + goto errout; + } + ip4h->time_to_live--; + } + + /* pre-handler before translation */ + if (proto->fnat_out_pre_handler) { + err = proto->fnat_out_pre_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + /* L3 translation before l4 re-csum */ + err = mbuf_4to6(mbuf, &conn->vaddr.in6, &conn->caddr.in6); + if (err) + goto errout; + + /* L4 FNAT translation */ + if (proto->fnat_out_handler) { + err = proto->fnat_out_handler(proto, conn, mbuf); + if (err != EDPVS_OK) + goto errout; + } + + return INET_HOOK(AF_INET6, INET_HOOK_LOCAL_OUT, mbuf, + NULL, rt6->rt6_dev, ip6_output); + +errout: + if (rt6) + route6_put(rt6); + rte_pktmbuf_free(mbuf); + return err; +} + int dp_vs_out_xmit_fnat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { int af = conn->af; assert(af == AF_INET || af == AF_INET6); - return af == AF_INET ? __dp_vs_out_xmit_fnat4(proto, conn, mbuf) - : __dp_vs_out_xmit_fnat6(proto, conn, mbuf); + + if (tuplehash_in(conn).af == AF_INET && + tuplehash_out(conn).af == AF_INET) + return __dp_vs_out_xmit_fnat4(proto, conn, mbuf); + if (tuplehash_in(conn).af == AF_INET6 && + tuplehash_out(conn).af == AF_INET6) + return __dp_vs_out_xmit_fnat6(proto, conn, mbuf); + if (tuplehash_in(conn).af == AF_INET6 && + tuplehash_out(conn).af == AF_INET) + return __dp_vs_out_xmit_fnat46(proto, conn, mbuf); + + rte_pktmbuf_free(mbuf); + return EDPVS_NOTSUPP; } /* mbuf's data should pointer to outer IP packet. */ @@ -821,7 +1033,7 @@ static void __dp_vs_xmit_icmp4(struct rte_mbuf *mbuf, int fullnat = (conn->dest->fwdmode == DPVS_FWD_MODE_FNAT); uint16_t csum; - /* + /* * outer/inner L3 translation. */ if (fullnat) { @@ -846,7 +1058,7 @@ static void __dp_vs_xmit_icmp4(struct rte_mbuf *mbuf, ip4_send_csum(ciph); } - /* + /* * inner L4 translation. * * note it's no way to recalc inner csum to lack of data, @@ -982,7 +1194,7 @@ static void __dp_vs_xmit_icmp6(struct rte_mbuf *mbuf, } } - /* + /* * ICMP recalc csum. */ icmp6h->icmp6_cksum = 0; @@ -1215,7 +1427,7 @@ static int __dp_vs_xmit_snat6(struct dp_vs_proto *proto, struct route6 *rt6; int err, mtu; - /* + /* * drop old route. just for safe, because * inbound SNAT traffic is hooked at PRE_ROUTING, * should not have route. 
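Both NAT64 paths above perform the same MTU dance: compute the post-translation length (mbuf_nat6to4_len()/mbuf_nat4to6_len()), compare it against the egress route MTU, and signal the error back in the family of the packet as it currently stands, before translation. The helper below only condenses that decision; nat64_frag_error() is an illustrative name and the snippet reuses the icmp_send()/icmp6_send() calls exactly as they appear above.

/* Error signalling when a translated packet would exceed the egress MTU.
 * __dp_vs_xmit_fnat64() has an IPv6 mbuf in hand, so it answers with an
 * ICMPv6 Packet Too Big; __dp_vs_out_xmit_fnat46() holds an IPv4 mbuf and
 * only complains when the sender set DF, since IPv4 may still fragment. */
static void nat64_frag_error(int cur_af, struct rte_mbuf *mbuf, int mtu,
                             uint16_t ip4_frag_off /* IPv4 only, wire order */)
{
    if (cur_af == AF_INET6)
        icmp6_send(mbuf, ICMP6_PACKET_TOO_BIG, 0, mtu);
    else if (ip4_frag_off & htons(IPV4_HDR_DF_FLAG))
        icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(mtu));
}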
@@ -1226,9 +1438,9 @@ static int __dp_vs_xmit_snat6(struct dp_vs_proto *proto, route6_put((struct route6 *)mbuf->userdata); } - /* + /* * hosts inside SNAT may belongs to diff net, - * let's route it. + * let's route it. */ memset(&fl6, 0, sizeof(struct flow6)); fl6.fl6_daddr = conn->daddr.in6; @@ -1627,7 +1839,7 @@ static int __dp_vs_xmit_nat6(struct dp_vs_proto *proto, struct route6 *rt6; int err, mtu; - /* + /* * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ @@ -1673,7 +1885,7 @@ static int __dp_vs_xmit_nat6(struct dp_vs_proto *proto, ip6h->ip6_dst = conn->daddr.in6; /* L4 NAT translation */ - if (proto->fnat_in_handler) { + if (proto->nat_in_handler) { err = proto->nat_in_handler(proto, conn, mbuf); if (err != EDPVS_OK) goto errout; @@ -1799,7 +2011,7 @@ static int __dp_vs_out_xmit_nat6(struct dp_vs_proto *proto, struct route6 *rt6; int err, mtu; - /* + /* * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ diff --git a/src/ipvs/libconhash/conhash.c b/src/ipvs/libconhash/conhash.c index bbf6e8af8..3e6b14869 100644 --- a/src/ipvs/libconhash/conhash.c +++ b/src/ipvs/libconhash/conhash.c @@ -42,7 +42,7 @@ void conhash_fini(struct conhash_s *conhash,void (*node_fini)(struct node_s*)) struct node_s *node; util_rbtree_node_t *rbnode = conhash->vnode_tree.root; util_rbtree_delete(&(conhash->vnode_tree), rbnode); - if (rbnode && rbnode->data) + if (rbnode && rbnode->data) { node = ((struct virtual_node_s*)rbnode->data)->node; if (--(node->replicas) == 0) @@ -63,7 +63,7 @@ void conhash_set_node(struct node_s *node, const char *iden, u_int replica) int conhash_add_node(struct conhash_s *conhash, struct node_s *node) { - if((conhash==NULL) || (node==NULL)) + if((conhash==NULL) || (node==NULL)) { return -1; } @@ -75,13 +75,13 @@ int conhash_add_node(struct conhash_s *conhash, struct node_s *node) node->flag |= NODE_FLAG_IN; /* add replicas of server */ __conhash_add_replicas(conhash, node); - + return 0; } int conhash_del_node(struct conhash_s *conhash, struct node_s *node) { - if((conhash==NULL) || (node==NULL)) + if((conhash==NULL) || (node==NULL)) { return -1; } @@ -101,13 +101,13 @@ const struct node_s* conhash_lookup(const struct conhash_s *conhash, const char { long hash; const util_rbtree_node_t *rbnode; - if((conhash==NULL) || (conhash->ivnodes==0) || (object==NULL)) + if((conhash==NULL) || (conhash->ivnodes==0) || (object==NULL)) { return NULL; } /* calc hash value */ hash = conhash->cb_hashfunc(object); - + rbnode = util_rbtree_lookup((util_rbtree_t *)&(conhash->vnode_tree), hash); if(rbnode != NULL) { diff --git a/src/ipvs/libconhash/conhash.h b/src/ipvs/libconhash/conhash.h index 31ff13018..95d46c711 100644 --- a/src/ipvs/libconhash/conhash.h +++ b/src/ipvs/libconhash/conhash.h @@ -13,10 +13,10 @@ #define CONHASH_API __declspec(dllexport) #else #define CONHASH_API __declspec(dllimport) -#endif +#endif #else /* Linux, or static lib */ -#define CONHASH_API +#define CONHASH_API #endif #define NODE_FLAG_INIT 0x01 /* node is initialized */ @@ -31,8 +31,8 @@ struct node_s void *data;/*real data for consistent hash*/ }; -/* - * callback function to calculate hash value +/* + * callback function to calculate hash value * @instr: input string */ typedef long (*conhash_cb_hashfunc)(const char *instr); @@ -43,11 +43,11 @@ struct conhash_s; #ifdef __cplusplus extern "C" { #endif - /* initialize conhash library - * @pfhash : hash function, NULL to use default MD5 method - * return a conhash_s instance - */ - CONHASH_API 
struct conhash_s* conhash_init(conhash_cb_hashfunc pfhash); + /* initialize conhash library + * @pfhash : hash function, NULL to use default MD5 method + * return a conhash_s instance + */ + CONHASH_API struct conhash_s* conhash_init(conhash_cb_hashfunc pfhash); /* finalize lib */ CONHASH_API void conhash_fini(struct conhash_s *conhash, void (*node_fini)(struct node_s*)); @@ -55,36 +55,36 @@ extern "C" { /* set node */ CONHASH_API void conhash_set_node(struct node_s *node, const char *iden, u_int replica); - /* - * add a new node + /* + * add a new node * @node: the node to add */ - CONHASH_API int conhash_add_node(struct conhash_s *conhash, struct node_s *node); + CONHASH_API int conhash_add_node(struct conhash_s *conhash, struct node_s *node); /* remove a node */ - CONHASH_API int conhash_del_node(struct conhash_s *conhash, struct node_s *node); - - /* - * update a node's virtual nodes - * @replica: new replica of server - * return 0 success, -1 failed - */ - CONHASH_API int conhash_update_node(struct conhash_s *conhash, struct node_s *node, u_int replica); + CONHASH_API int conhash_del_node(struct conhash_s *conhash, struct node_s *node); - /* - * lookup a server which object belongs to + /* + * update a node's virtual nodes + * @replica: new replica of server + * return 0 success, -1 failed + */ + CONHASH_API int conhash_update_node(struct conhash_s *conhash, struct node_s *node, u_int replica); + + /* + * lookup a server which object belongs to * @object: the input string which indicates an object * return the server_s structure, do not modify the value, or it will cause a disaster */ - CONHASH_API const struct node_s* conhash_lookup(const struct conhash_s *conhash, const char *object); + CONHASH_API const struct node_s* conhash_lookup(const struct conhash_s *conhash, const char *object); - /* some utility functions export*/ - CONHASH_API void conhash_md5_digest(const u_char *instr, u_char digest[16]); + /* some utility functions export*/ + CONHASH_API void conhash_md5_digest(const u_char *instr, u_char digest[16]); /* get virtual node number in the hash */ - CONHASH_API u_int conhash_get_vnodes_num(const struct conhash_s *conhash); + CONHASH_API u_int conhash_get_vnodes_num(const struct conhash_s *conhash); /* * get virtual nodes in ascending oder - * @values, pointer to an array, stores all the nodes's hash value + * @values, pointer to an array, stores all the nodes's hash value * @size, how many nodes to get, can't be less than the array size */ CONHASH_API void conhash_get_vnodes(const struct conhash_s *conhash, long *values, int size); diff --git a/src/ipvs/libconhash/conhash_inter.c b/src/ipvs/libconhash/conhash_inter.c index 6777ecbcc..1519b416e 100644 --- a/src/ipvs/libconhash/conhash_inter.c +++ b/src/ipvs/libconhash/conhash_inter.c @@ -4,7 +4,7 @@ #include "conhash_inter.h" #include "conhash.h" -/* +/* * the default hash function, using md5 algorithm * @instr: input string */ @@ -25,7 +25,7 @@ unsigned long __conhash_hash_def(const char *instr) | ((long)(digest[i*4 + 1]&0xFF) << 8) | ((long)(digest[i*4 + 0]&0xFF)); } - + a = hash; a = (a+0x7ed55d16) + (a<<12); a = (a^0xc761c23c) ^ (a>>19); @@ -37,7 +37,7 @@ unsigned long __conhash_hash_def(const char *instr) /* ensure values are better spread all around the tree by multiplying * by a large prime close to 3/4 of the tree. 
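conhash.h above is the whole consistent-hashing API that ip_vs_conhash.c builds on. A minimal usage sketch of that API follows: initialize with the built-in MD5 hash, register two weighted nodes, and look an object up; the replica count and the address strings are arbitrary example values.

#include <stdio.h>
#include "conhash.h"

static void node_done(struct node_s *node) { (void)node; }

int main(void)
{
    struct node_s rs[2];
    const struct node_s *hit;
    struct conhash_s *ch = conhash_init(NULL);   /* NULL = default MD5 hash */

    if (!ch)
        return 1;

    /* each real server contributes 'replica' virtual nodes on the ring */
    conhash_set_node(&rs[0], "192.168.1.10:80", 32);
    conhash_set_node(&rs[1], "192.168.1.11:80", 32);
    conhash_add_node(ch, &rs[0]);
    conhash_add_node(ch, &rs[1]);

    printf("%u virtual nodes on the ring\n", conhash_get_vnodes_num(ch));

    /* the same object maps to the same node while membership is unchanged,
     * which is the property the conhash scheduler relies on */
    hit = conhash_lookup(ch, "10.0.0.1:12345");
    printf("object -> %s\n",
           hit == &rs[0] ? "rs[0]" : hit == &rs[1] ? "rs[1]" : "(none)");

    conhash_fini(ch, node_done);
    return 0;
}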
*/ - a = a * 3221225473U; + a = a * 3221225473U; return a; } diff --git a/src/ipvs/libconhash/md5.c b/src/ipvs/libconhash/md5.c index c35d96c5e..04c25625b 100644 --- a/src/ipvs/libconhash/md5.c +++ b/src/ipvs/libconhash/md5.c @@ -27,7 +27,7 @@ This code implements the MD5 Algorithm defined in RFC 1321, whose text is available at - http://www.ietf.org/rfc/rfc1321.txt + http://www.ietf.org/rfc/rfc1321.txt The code is derived from the text of the RFC, including the test suite (section A.5) but excluding the rest of Appendix A. It does not include any code or documentation that is identified in the RFC as being @@ -38,14 +38,14 @@ that follows (in reverse chronological order): 2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order - either statically or dynamically; added missing #include - in library. + either statically or dynamically; added missing #include + in library. 2002-03-11 lpd Corrected argument list for main(), and added int return - type, in test program and T value program. + type, in test program and T value program. 2002-02-21 lpd Added missing #include in test program. 2000-07-03 lpd Patched to eliminate warnings about "constant is - unsigned in ANSI C, signed in traditional"; made test program - self-checking. + unsigned in ANSI C, signed in traditional"; made test program + self-checking. 1999-11-04 lpd Edited comments slightly for automatic TOC extraction. 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5). 1999-05-03 lpd Original version. @@ -54,7 +54,7 @@ #include "md5.h" #include -#undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */ +#undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */ #ifdef ARCH_IS_BIG_ENDIAN # define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1) #else @@ -132,8 +132,8 @@ static void md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/) { md5_word_t - a = pms->abcd[0], b = pms->abcd[1], - c = pms->abcd[2], d = pms->abcd[3]; + a = pms->abcd[0], b = pms->abcd[1], + c = pms->abcd[2], d = pms->abcd[3]; md5_word_t t; #if BYTE_ORDER > 0 /* Define storage only for big-endian CPUs. */ @@ -146,51 +146,51 @@ md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/) { #if BYTE_ORDER == 0 - /* - * Determine dynamically whether this is a big-endian or - * little-endian machine, since we can use a more efficient - * algorithm on the latter. - */ - static const int w = 1; - - if (*((const md5_byte_t *)&w)) /* dynamic little-endian */ + /* + * Determine dynamically whether this is a big-endian or + * little-endian machine, since we can use a more efficient + * algorithm on the latter. + */ + static const int w = 1; + + if (*((const md5_byte_t *)&w)) /* dynamic little-endian */ #endif -#if BYTE_ORDER <= 0 /* little-endian */ - { - /* - * On little-endian machines, we can process properly aligned - * data without copying it. - */ - if (!((data - (const md5_byte_t *)0) & 3)) { - /* data are properly aligned */ - X = (const md5_word_t *)data; - } else { - /* not aligned */ - memcpy(xbuf, data, 64); - X = xbuf; - } - } +#if BYTE_ORDER <= 0 /* little-endian */ + { + /* + * On little-endian machines, we can process properly aligned + * data without copying it. 
+ */ + if (!((data - (const md5_byte_t *)0) & 3)) { + /* data are properly aligned */ + X = (const md5_word_t *)data; + } else { + /* not aligned */ + memcpy(xbuf, data, 64); + X = xbuf; + } + } #endif #if BYTE_ORDER == 0 - else /* dynamic big-endian */ + else /* dynamic big-endian */ #endif -#if BYTE_ORDER >= 0 /* big-endian */ - { - /* - * On big-endian machines, we must arrange the bytes in the - * right order. - */ - const md5_byte_t *xp = data; - int i; +#if BYTE_ORDER >= 0 /* big-endian */ + { + /* + * On big-endian machines, we must arrange the bytes in the + * right order. + */ + const md5_byte_t *xp = data; + int i; # if BYTE_ORDER == 0 - X = xbuf; /* (dynamic only) */ + X = xbuf; /* (dynamic only) */ # else -# define xbuf X /* (static only) */ +# define xbuf X /* (static only) */ # endif - for (i = 0; i < 16; ++i, xp += 4) - xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24); - } + for (i = 0; i < 16; ++i, xp += 4) + xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24); + } #endif } @@ -328,54 +328,54 @@ md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes) md5_word_t nbits = (md5_word_t)(nbytes << 3); if (nbytes <= 0) - return; + return; /* Update the message length. */ pms->count[1] += nbytes >> 29; pms->count[0] += nbits; if (pms->count[0] < nbits) - pms->count[1]++; + pms->count[1]++; /* Process an initial partial block. */ if (offset) { - int copy = (offset + nbytes > 64 ? 64 - offset : nbytes); - - memcpy(pms->buf + offset, p, copy); - if (offset + copy < 64) - return; - p += copy; - left -= copy; - md5_process(pms, pms->buf); + int copy = (offset + nbytes > 64 ? 64 - offset : nbytes); + + memcpy(pms->buf + offset, p, copy); + if (offset + copy < 64) + return; + p += copy; + left -= copy; + md5_process(pms, pms->buf); } /* Process full blocks. */ for (; left >= 64; p += 64, left -= 64) - md5_process(pms, p); + md5_process(pms, p); /* Process a final partial block. */ if (left) - memcpy(pms->buf, p, left); + memcpy(pms->buf, p, left); } void md5_finish(md5_state_t *pms, md5_byte_t digest[16]) { static const md5_byte_t pad[64] = { - 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; md5_byte_t data[8]; int i; /* Save the length before padding. */ for (i = 0; i < 8; ++i) - data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3)); + data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3)); /* Pad to 56 bytes mod 64. */ md5_append(pms, pad, ((55 - (pms->count[0] >> 3)) & 63) + 1); /* Append the length. */ md5_append(pms, data, 8); for (i = 0; i < 16; ++i) - digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3)); + digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3)); } diff --git a/src/ipvs/libconhash/md5.h b/src/ipvs/libconhash/md5.h index d9a45cb2e..b49711bc1 100644 --- a/src/ipvs/libconhash/md5.h +++ b/src/ipvs/libconhash/md5.h @@ -27,7 +27,7 @@ This code implements the MD5 Algorithm defined in RFC 1321, whose text is available at - http://www.ietf.org/rfc/rfc1321.txt + http://www.ietf.org/rfc/rfc1321.txt The code is derived from the text of the RFC, including the test suite (section A.5) but excluding the rest of Appendix A. 
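md5.c/md5.h above are the reference RFC 1321 implementation that the consistent-hash code reaches through conhash_md5_digest(). The three-call API is used as below; md5_init() is declared in md5.h alongside the md5_append()/md5_finish() shown here, but its prototype falls outside this hunk, so its exact name is taken on that assumption.

#include <stdio.h>
#include <string.h>
#include "md5.h"

int main(void)
{
    md5_state_t st;
    md5_byte_t digest[16];
    const char *msg = "dpvs";
    int i;

    md5_init(&st);                      /* reset the digest state       */
    md5_append(&st, (const md5_byte_t *)msg, (int)strlen(msg));
    md5_finish(&st, digest);            /* pad, append length, finalize */

    for (i = 0; i < 16; i++)
        printf("%02x", digest[i]);      /* print the 128-bit digest as hex */
    printf("\n");
    return 0;
}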
It does not include any code or documentation that is identified in the RFC as being @@ -38,12 +38,12 @@ that follows (in reverse chronological order): 2002-04-13 lpd Removed support for non-ANSI compilers; removed - references to Ghostscript; clarified derivation from RFC 1321; - now handles byte order either statically or dynamically. + references to Ghostscript; clarified derivation from RFC 1321; + now handles byte order either statically or dynamically. 1999-11-04 lpd Edited comments slightly for automatic TOC extraction. 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5); - added conditionalization for C++ compilation from Martin - Purschke . + added conditionalization for C++ compilation from Martin + Purschke . 1999-05-03 lpd Original version. */ @@ -65,13 +65,13 @@ typedef unsigned int md5_word_t; /* 32-bit word */ /* Define the state of the MD5 Algorithm. */ typedef struct md5_state_s { - md5_word_t count[2]; /* message length in bits, lsw first */ - md5_word_t abcd[4]; /* digest buffer */ - md5_byte_t buf[64]; /* accumulate block */ + md5_word_t count[2]; /* message length in bits, lsw first */ + md5_word_t abcd[4]; /* digest buffer */ + md5_byte_t buf[64]; /* accumulate block */ } md5_state_t; #ifdef __cplusplus -extern "C" +extern "C" { #endif diff --git a/src/ipvs/libconhash/util_rbtree.c b/src/ipvs/libconhash/util_rbtree.c index c1a42a546..0c0782dac 100644 --- a/src/ipvs/libconhash/util_rbtree.c +++ b/src/ipvs/libconhash/util_rbtree.c @@ -24,12 +24,12 @@ static void rbtree_right_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node) void util_rbtree_init(util_rbtree_t *rbtree) { - if(rbtree != NULL) - { - util_rbt_black(_NULL(rbtree)); /* null MUST be black */ - rbtree->root = _NULL(rbtree); - rbtree->size = 0; - } + if(rbtree != NULL) + { + util_rbt_black(_NULL(rbtree)); /* null MUST be black */ + rbtree->root = _NULL(rbtree); + rbtree->size = 0; + } } util_rbtree_node_t* util_rbsubtree_min(util_rbtree_node_t *node, util_rbtree_node_t *sentinel) @@ -49,10 +49,10 @@ util_rbtree_node_t* util_rbsubtree_max(util_rbtree_node_t *node, util_rbtree_nod void util_rbtree_insert(util_rbtree_t *rbtree, util_rbtree_node_t *node) { util_rbtree_node_t *x, *y; - if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree))) - { - return; - } + if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree))) + { + return; + } /* the tree is empty */ if(rbtree->root == _NULL(rbtree)) { @@ -80,7 +80,7 @@ void util_rbtree_insert(util_rbtree_t *rbtree, util_rbtree_node_t *node) util_rbt_red(node); /* fix up insert */ rbtree_insert_fixup(rbtree, node); - rbtree->size++; + rbtree->size++; } /* insert may violate the rbtree properties, fix up the tree */ @@ -105,7 +105,7 @@ void rbtree_insert_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node) if(node == p->right) /* case 2: p:read, u:black, node is right child */ { node = p; - rbtree_left_rotate(rbtree, node); + rbtree_left_rotate(rbtree, node); p = node->parent; } /* case 3: p:read, u:black, node is left child */ @@ -117,25 +117,25 @@ void rbtree_insert_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node) else /* parent is the right child */ { u = p->parent->left; - if(util_rbt_isred(u)) - { - util_rbt_black(u); - util_rbt_black(p); - util_rbt_red(p->parent); + if(util_rbt_isred(u)) + { + util_rbt_black(u); + util_rbt_black(p); + util_rbt_red(p->parent); node = p->parent; - } - else - { - if(p->left == node) - { - node = p; - rbtree_right_rotate(rbtree, node); - p = node->parent; - } - util_rbt_black(p); - 
util_rbt_red(p->parent); - rbtree_left_rotate(rbtree, p->parent); - } + } + else + { + if(p->left == node) + { + node = p; + rbtree_right_rotate(rbtree, node); + p = node->parent; + } + util_rbt_black(p); + util_rbt_red(p->parent); + rbtree_left_rotate(rbtree, p->parent); + } } } /* mark root to black */ @@ -145,45 +145,45 @@ void rbtree_insert_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node) void util_rbtree_delete(util_rbtree_t *rbtree, util_rbtree_node_t *node) { - int isblack; - util_rbtree_node_t *temp, *subst; - if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree))) - { - return; - } - rbtree->size--; - /* find deleted position, indicated by temp */ - if(node->left == _NULL(rbtree)) - { - temp = node; - subst = node->right; - } - else if(node->right == _NULL(rbtree)) - { - temp = node; - subst = node->left; - } - else /* right & left aren't null */ - { - temp = util_rbsubtree_min(node->right, _NULL(rbtree)); - if(temp->left != _NULL(rbtree)) - { - subst = temp->left; - } - else - { - subst = temp->right; - } - } - if(temp == rbtree->root) /* temp is root */ - { - rbtree->root = subst; - util_rbt_black(subst); - rbt_clear_node(temp); - return; - } - isblack = util_rbt_isblack(temp); - /* temp will be removed from it's position, rebuild links + int isblack; + util_rbtree_node_t *temp, *subst; + if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree))) + { + return; + } + rbtree->size--; + /* find deleted position, indicated by temp */ + if(node->left == _NULL(rbtree)) + { + temp = node; + subst = node->right; + } + else if(node->right == _NULL(rbtree)) + { + temp = node; + subst = node->left; + } + else /* right & left aren't null */ + { + temp = util_rbsubtree_min(node->right, _NULL(rbtree)); + if(temp->left != _NULL(rbtree)) + { + subst = temp->left; + } + else + { + subst = temp->right; + } + } + if(temp == rbtree->root) /* temp is root */ + { + rbtree->root = subst; + util_rbt_black(subst); + rbt_clear_node(temp); + return; + } + isblack = util_rbt_isblack(temp); + /* temp will be removed from it's position, rebuild links * NOTE: if temp->parent = node, then subst->parent is node * while node is the one to be delete, so relink subst's parent to temp * because temp will replace node's in the tree @@ -197,55 +197,55 @@ void util_rbtree_delete(util_rbtree_t *rbtree, util_rbtree_node_t *node) subst->parent = temp->parent; } - if(temp == temp->parent->left) - { - temp->parent->left = subst; - } - else - { - temp->parent->right = subst; - } - /* - * now temp is removed from the tree. - * so we will make temp to replace node in the tree. - */ - if(temp != node) - { - temp->parent = node->parent; - if(node == rbtree->root) /* node maybe root */ - { - rbtree->root = temp; - } - else + if(temp == temp->parent->left) + { + temp->parent->left = subst; + } + else + { + temp->parent->right = subst; + } + /* + * now temp is removed from the tree. + * so we will make temp to replace node in the tree. 
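The libconhash tree is exercised through a small API (init/insert/search/lookup/delete). A usage sketch based only on the declarations visible in util_rbtree.h; the tree manages the node's left/right/parent/color fields itself, the caller only fills key and data:

    #include "util_rbtree.h"

    static void conhash_demo(void)
    {
        util_rbtree_t tree;
        util_rbtree_node_t node;
        util_rbtree_node_t *hit;

        util_rbtree_init(&tree);

        node.key  = 1000;              /* e.g. a point on the consistent-hash ring */
        node.data = NULL;              /* would normally point at a virtual node */
        util_rbtree_insert(&tree, &node);

        /* exact-key search ... */
        hit = util_rbtree_search(&tree, 1000);

        /* ... or ring lookup: first node with key >= 700, else the minimum node */
        hit = util_rbtree_lookup(&tree, 700);

        util_rbtree_delete(&tree, &node);
        (void)hit;
    }
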
+ */ + if(temp != node) + { + temp->parent = node->parent; + if(node == rbtree->root) /* node maybe root */ { - if(node->parent->left == node) - { - node->parent->left = temp; - } - else - { - node->parent->right = temp; - } - } - temp->right = node->right; - temp->left = node->left; - if(temp->left != _NULL(rbtree)) - { - temp->left->parent = temp; - } - if(temp->right != _NULL(rbtree)) - { - temp->right->parent = temp; - } - temp->color = node->color; - } - rbt_clear_node(node); + rbtree->root = temp; + } + else + { + if(node->parent->left == node) + { + node->parent->left = temp; + } + else + { + node->parent->right = temp; + } + } + temp->right = node->right; + temp->left = node->left; + if(temp->left != _NULL(rbtree)) + { + temp->left->parent = temp; + } + if(temp->right != _NULL(rbtree)) + { + temp->right->parent = temp; + } + temp->color = node->color; + } + rbt_clear_node(node); - if(isblack) - { - /* temp is black, fix up delete */ - rbtree_delete_fixup(rbtree, subst); - } + if(isblack) + { + /* temp is black, fix up delete */ + rbtree_delete_fixup(rbtree, subst); + } } /* delete may violate the rbtree properties, fix up the tree */ @@ -253,8 +253,8 @@ void rbtree_delete_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node) { int h = 0; util_rbtree_node_t *w; - while((node != rbtree->root) && util_rbt_isblack(node)) - { + while((node != rbtree->root) && util_rbt_isblack(node)) + { h++; if(node == node->parent->left) /* node is left child */ { @@ -322,116 +322,116 @@ void rbtree_delete_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node) node = rbtree->root; /* to break loop */ } } - } + } util_rbt_black(node); } void rbtree_left_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node) { - util_rbtree_node_t *rc = node->right; - util_rbtree_node_t *rclc = rc->left; - /* make rc to replace node's position */ - rc->parent = node->parent; - if(node == rbtree->root) - { - rbtree->root = rc; - } - else - { - if(node->parent->left == node) /* node is left child */ - { - node->parent->left = rc; - } - else - { - node->parent->right = rc; - } - } - /* make node to be rc's left child */ - node->parent = rc; - rc->left = node; - /* rc's left child to be node's right child */ - node->right = rclc; - if(rclc != _NULL(rbtree)) - { - rclc->parent = node; - } + util_rbtree_node_t *rc = node->right; + util_rbtree_node_t *rclc = rc->left; + /* make rc to replace node's position */ + rc->parent = node->parent; + if(node == rbtree->root) + { + rbtree->root = rc; + } + else + { + if(node->parent->left == node) /* node is left child */ + { + node->parent->left = rc; + } + else + { + node->parent->right = rc; + } + } + /* make node to be rc's left child */ + node->parent = rc; + rc->left = node; + /* rc's left child to be node's right child */ + node->right = rclc; + if(rclc != _NULL(rbtree)) + { + rclc->parent = node; + } } void rbtree_right_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node) { - util_rbtree_node_t *lc = node->left; - util_rbtree_node_t *lcrc = lc->right; - /* make lc to replace node's position */ - lc->parent = node->parent; - if(node == rbtree->root) - { - rbtree->root = lc; - } - else - { - if(node->parent->left == node) /* node is left child */ - { - node->parent->left = lc; - } - else - { - node->parent->right = lc; - } - } - /* make node to be lc's right child */ - lc->right = node; - node->parent = lc; - /* lc's right child to be node's left child */ - node->left = lcrc; - if(lcrc != _NULL(rbtree)) - { - lcrc->parent = node; - } + util_rbtree_node_t *lc = node->left; + 
util_rbtree_node_t *lcrc = lc->right; + /* make lc to replace node's position */ + lc->parent = node->parent; + if(node == rbtree->root) + { + rbtree->root = lc; + } + else + { + if(node->parent->left == node) /* node is left child */ + { + node->parent->left = lc; + } + else + { + node->parent->right = lc; + } + } + /* make node to be lc's right child */ + lc->right = node; + node->parent = lc; + /* lc's right child to be node's left child */ + node->left = lcrc; + if(lcrc != _NULL(rbtree)) + { + lcrc->parent = node; + } } util_rbtree_node_t* util_rbtree_search(util_rbtree_t *rbtree, long key) { - if(rbtree != NULL) - { - util_rbtree_node_t *node = rbtree->root; - util_rbtree_node_t *null = _NULL(rbtree); - while(node != null) - { - if(key < node->key) node = node->left; - else if(key > node->key) node = node->right; - else if(node->key == key) return node; - } - } + if(rbtree != NULL) + { + util_rbtree_node_t *node = rbtree->root; + util_rbtree_node_t *null = _NULL(rbtree); + while(node != null) + { + if(key < node->key) node = node->left; + else if(key > node->key) node = node->right; + else if(node->key == key) return node; + } + } return NULL; } util_rbtree_node_t* util_rbtree_lookup(util_rbtree_t *rbtree, long key) { - if((rbtree != NULL) && !util_rbtree_isempty(rbtree)) - { - util_rbtree_node_t *node = NULL; + if((rbtree != NULL) && !util_rbtree_isempty(rbtree)) + { + util_rbtree_node_t *node = NULL; util_rbtree_node_t *temp = rbtree->root; - util_rbtree_node_t *null = _NULL(rbtree); - while(temp != null) - { - if(key <= temp->key) + util_rbtree_node_t *null = _NULL(rbtree); + while(temp != null) + { + if(key <= temp->key) { node = temp; /* update node */ temp = temp->left; } - else if(key > temp->key) + else if(key > temp->key) { temp = temp->right; } - } + } /* if node==NULL return the minimum node */ return ((node != NULL) ? 
node : util_rbtree_min(rbtree)); - } + } return NULL; } -static void rbtree_check_subtree(const util_rbtree_node_t *node, rbtree_check_t *check, +static void rbtree_check_subtree(const util_rbtree_node_t *node, rbtree_check_t *check, int level, int curheight) { if(check->fini) /* already failed */ @@ -527,7 +527,7 @@ int util_rbtree_check(const util_rbtree_t *rbtree, int *blackheight, int *maxdep return check.fini; } -static void rbtree_mid_travel(util_rbtree_node_t *node, util_rbtree_node_t *sentinel, +static void rbtree_mid_travel(util_rbtree_node_t *node, util_rbtree_node_t *sentinel, void(*opera)(util_rbtree_node_t *, void *), void *data) { if(node->left != sentinel) @@ -541,7 +541,7 @@ static void rbtree_mid_travel(util_rbtree_node_t *node, util_rbtree_node_t *sent } } -void util_rbtree_mid_travel(util_rbtree_t *rbtree, +void util_rbtree_mid_travel(util_rbtree_t *rbtree, void(*opera)(util_rbtree_node_t *, void *), void *data) { if((rbtree!=NULL) && !util_rbtree_isempty(rbtree)) diff --git a/src/ipvs/libconhash/util_rbtree.h b/src/ipvs/libconhash/util_rbtree.h index 8c8c41985..6366f8cab 100644 --- a/src/ipvs/libconhash/util_rbtree.h +++ b/src/ipvs/libconhash/util_rbtree.h @@ -17,7 +17,7 @@ struct util_rbtree_node_s util_rbtree_node_t *right; util_rbtree_node_t *left; int color; - void *data; + void *data; }; struct util_rbtree_s @@ -35,22 +35,22 @@ struct util_rbtree_s /* clear a node's link */ #define rbt_clear_node(node) do{ \ - node->left = NULL; \ - node->right = NULL; \ - node->parent = NULL; \ - }while(0) + node->left = NULL; \ + node->right = NULL; \ + node->parent = NULL; \ + }while(0) /* is the tree empty */ #define util_rbtree_isempty(rbtree) ((rbtree)->root == &(rbtree)->null) -/* - * find the min node of tree +/* + * find the min node of tree * return NULL is tree is empty */ #define util_rbtree_min(rbtree) util_rbsubtree_min((rbtree)->root, &(rbtree)->null) -/* - * find the max node of tree +/* + * find the max node of tree * return NULL is tree is empty */ #define util_rbtree_max(rbtree) util_rbsubtree_max((rbtree)->root, &(rbtree)->null) @@ -59,7 +59,7 @@ void util_rbtree_init(util_rbtree_t *rbtree); void util_rbtree_insert(util_rbtree_t *rbtree, util_rbtree_node_t *node); void util_rbtree_delete(util_rbtree_t *rbtree, util_rbtree_node_t *node); -/* +/* * search node with key = @key in the tree * if no such node exist, return NULL */ @@ -73,16 +73,16 @@ util_rbtree_node_t* util_rbtree_search(util_rbtree_t *rbtree, long key); */ util_rbtree_node_t* util_rbtree_lookup(util_rbtree_t *rbtree, long key); -/* - * find the min node of subtree +/* + * find the min node of subtree * @rbnode: root of the subtree - * @sentinel : the sentinel node + * @sentinel : the sentinel node * return NULL if subtree is empty */ util_rbtree_node_t* util_rbsubtree_min(util_rbtree_node_t *node, util_rbtree_node_t *sentinel); -/* - * find the max node of subtree +/* + * find the max node of subtree * @rbnode: root of the subtree * @sentinel : the sentinel node * return NULL if subtree is empty diff --git a/src/kni.c b/src/kni.c index 56d84a4a1..77f9ed2e0 100644 --- a/src/kni.c +++ b/src/kni.c @@ -232,7 +232,7 @@ static int kni_rtnl_check(void *arg) /* try to handle more events once, because we're not really * event-driven, the polling speed may not fast enough. - * there may not so may events in real world ? but when + * there may not so may events in real world ? but when * performan strength test, it's really found kni_rtnl_timer * is too slow, so that more and more events queued. 
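The comment above explains why kni_rtnl_check() drains several events per poll instead of one. A generic sketch of that pattern on a non-blocking netlink fd; the function name, buffer handling and burst cap are illustrative, not the kni.c implementation:

    #include <errno.h>
    #include <sys/socket.h>

    #define RTNL_BURST_MAX 32

    /* read until the socket is empty or the burst cap is hit, so a slow
     * polling timer does not let events pile up in the kernel queue */
    static int drain_rtnl(int fd, char *buf, size_t len)
    {
        int handled = 0;
        ssize_t n;

        while (handled < RTNL_BURST_MAX) {
            n = recv(fd, buf, len, MSG_DONTWAIT);
            if (n < 0) {
                if (errno == EAGAIN || errno == EWOULDBLOCK)
                    break;          /* queue drained */
                return -1;          /* real error */
            }
            if (n == 0)
                break;
            /* ... parse the netlink message in buf[0..n) ... */
            handled++;
        }
        return handled;
    }
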
*/ @@ -312,7 +312,7 @@ static int kni_rtnl_init(struct netif_port *dev) return err; } -/* +/* * @dev - real device kni attach to. * @kniname - optional, kni device name or auto generate. */ diff --git a/src/main.c b/src/main.c index fccde36fd..61a0d4358 100644 --- a/src/main.c +++ b/src/main.c @@ -56,6 +56,7 @@ static int set_all_thread_affinity(void) s = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpuset); if (s != 0) { + errno = s; perror("fail to set thread affinty"); return -1; } @@ -63,6 +64,7 @@ static int set_all_thread_affinity(void) CPU_ZERO(&cpuset); s = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpuset); if (s != 0) { + errno = s; perror("fail to get thread affinity"); return -2; } @@ -206,7 +208,7 @@ int main(int argc, char *argv[]) if ((err = netif_init(NULL)) != EDPVS_OK) rte_exit(EXIT_FAILURE, "Fail to init netif: %s\n", dpvs_strerror(err)); - /* Default lcore conf and port conf are used and may be changed here + /* Default lcore conf and port conf are used and may be changed here * with "netif_port_conf_update" and "netif_lcore_conf_set" */ if ((err = ctrl_init()) != EDPVS_OK) @@ -286,7 +288,7 @@ int main(int argc, char *argv[]) /* process mac ring on master */ neigh_process_ring(NULL); - + /* increase loop counts */ netif_update_master_loop_cnt(); } diff --git a/src/mbuf.c b/src/mbuf.c index 87f4f8ebe..62ee2441a 100644 --- a/src/mbuf.c +++ b/src/mbuf.c @@ -38,56 +38,56 @@ */ int mbuf_may_pull(struct rte_mbuf *mbuf, unsigned int len) { - int delta, eat; - struct rte_mbuf *seg, *next; - - if (likely(len <= mbuf->data_len)) - return 0; - - if (unlikely(len > mbuf->pkt_len)) - return -1; - - delta = len - mbuf->data_len; - - /* different from skb, there's no way to expand mbuf's tail room, - * because mbuf size is determined when init mbuf pool */ - if (rte_pktmbuf_tailroom(mbuf) < delta) { - RTE_LOG(ERR, EMBUF, "%s: no tail room.", __func__); - return -1; - } - - /* pull bits needed from segments to tail room of heading mbuf */ - if (mbuf_copy_bits(mbuf, mbuf->data_len, - mbuf_tail_point(mbuf), delta) != 0) - return -1; - - /* free fully eaten segments and leave left segs attached, - * points need be reload if partial bits was eaten for a seg. */ - eat = delta; - mbuf_foreach_seg_safe(mbuf, next, seg) { - if (eat <= 0) - break; - - if (seg->data_len <= eat) { - assert(mbuf->next == seg); - eat -= seg->data_len; - rte_pktmbuf_free_seg(seg); - mbuf->next = next; - mbuf->nb_segs--; - } else { - rte_pktmbuf_adj(seg, eat); - eat = 0; - break; - } - } - - assert(!eat && - mbuf->data_off + mbuf->data_len + delta <= mbuf->buf_len); - - /* mbuf points must be updated */ - mbuf->data_len += delta; - - return 0; + int delta, eat; + struct rte_mbuf *seg, *next; + + if (likely(len <= mbuf->data_len)) + return 0; + + if (unlikely(len > mbuf->pkt_len)) + return -1; + + delta = len - mbuf->data_len; + + /* different from skb, there's no way to expand mbuf's tail room, + * because mbuf size is determined when init mbuf pool */ + if (rte_pktmbuf_tailroom(mbuf) < delta) { + RTE_LOG(ERR, EMBUF, "%s: no tail room.", __func__); + return -1; + } + + /* pull bits needed from segments to tail room of heading mbuf */ + if (mbuf_copy_bits(mbuf, mbuf->data_len, + mbuf_tail_point(mbuf), delta) != 0) + return -1; + + /* free fully eaten segments and leave left segs attached, + * points need be reload if partial bits was eaten for a seg. 
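A hypothetical caller of mbuf_may_pull(), to show what the contract above buys: after a successful pull the requested bytes are contiguous in the first segment, so fixed-size headers can be read directly. Header struct names follow the DPDK release this tree builds against, and the L2 header is assumed to be stripped already:

    #include <rte_mbuf.h>
    #include <rte_ip.h>
    #include <rte_tcp.h>
    #include "mbuf.h"              /* DPVS header declaring mbuf_may_pull() (assumed) */

    static int parse_tcp(struct rte_mbuf *mbuf)
    {
        struct ipv4_hdr *iph;
        struct tcp_hdr *th;

        /* linearize IPv4 + TCP headers into the first segment, or give up */
        if (mbuf_may_pull(mbuf, sizeof(*iph) + sizeof(*th)) != 0)
            return -1;

        iph = rte_pktmbuf_mtod(mbuf, struct ipv4_hdr *);
        th  = (struct tcp_hdr *)((char *)iph + sizeof(*iph));
        /* ... iph/th fields are now safe to read ... */
        (void)th;
        return 0;
    }
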
*/ + eat = delta; + mbuf_foreach_seg_safe(mbuf, next, seg) { + if (eat <= 0) + break; + + if (seg->data_len <= eat) { + assert(mbuf->next == seg); + eat -= seg->data_len; + rte_pktmbuf_free_seg(seg); + mbuf->next = next; + mbuf->nb_segs--; + } else { + rte_pktmbuf_adj(seg, eat); + eat = 0; + break; + } + } + + assert(!eat && + mbuf->data_off + mbuf->data_len + delta <= mbuf->buf_len); + + /* mbuf points must be updated */ + mbuf->data_len += delta; + + return 0; } void mbuf_copy_metadata(struct rte_mbuf *mi, struct rte_mbuf *m) @@ -109,7 +109,7 @@ void mbuf_copy_metadata(struct rte_mbuf *mi, struct rte_mbuf *m) mi->packet_type = m->packet_type; __rte_mbuf_sanity_check(mi, 1); - __rte_mbuf_sanity_check(m,0); + __rte_mbuf_sanity_check(m, 0); } struct rte_mbuf *mbuf_copy(struct rte_mbuf *md, struct rte_mempool *mp) @@ -131,8 +131,7 @@ struct rte_mbuf *mbuf_copy(struct rte_mbuf *md, struct rte_mempool *mp) mbuf_copy_metadata(mi, md); *prev = mi; prev = &mi->next; - rte_memcpy(rte_pktmbuf_mtod(mi, void *), rte_pktmbuf_mtod(md, void *) -, md->data_len); + rte_memcpy(rte_pktmbuf_mtod(mi, void *), rte_pktmbuf_mtod(md, void *), md->data_len); } while ((md = md->next) != NULL && (mi = rte_pktmbuf_alloc(mp)) != NULL); *prev = NULL; diff --git a/src/neigh.c b/src/neigh.c index e91036968..fbbb5a3be 100644 --- a/src/neigh.c +++ b/src/neigh.c @@ -144,14 +144,14 @@ static lcoreid_t master_cid = 0; static struct list_head neigh_table[DPVS_MAX_LCORE][NEIGH_TAB_SIZE]; -static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, +static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* neighbour, bool add); static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip); static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_t size) { - snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", + snprintf(dst, size, "%02x:%02x:%02x:%02x:%02x:%02x", src->addr_bytes[0], src->addr_bytes[1], src->addr_bytes[2], @@ -159,7 +159,7 @@ static inline char *eth_addr_itoa(const struct ether_addr *src, char *dst, size_ src->addr_bytes[4], src->addr_bytes[5]); return dst; -} +} #ifdef CONFIG_DPVS_NEIGH_DEBUG @@ -221,9 +221,9 @@ static inline int neigh_unhash(struct neighbour_entry *neighbour) static inline bool neigh_key_cmp(int af, const struct neighbour_entry *neighbour, const union inet_addr *key, const struct netif_port* port) { - + return (inet_addr_equal(af, key, &neighbour->ip_addr)) && - (neighbour->port == port) && + (neighbour->port == port) && (neighbour->af == af); } @@ -290,8 +290,8 @@ static int neighbour_timer_event(void *data) return DTIMER_OK; } -struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, - const struct netif_port* port, +struct neighbour_entry *neigh_lookup_entry(int af, const union inet_addr *key, + const struct netif_port* port, unsigned int hashkey) { struct neighbour_entry *neighbour; @@ -312,9 +312,9 @@ int neigh_edit(struct neighbour_entry *neighbour, struct ether_addr *eth_addr) return EDPVS_OK; } -struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, +struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, const struct ether_addr *eth_addr, - struct netif_port *port, + struct netif_port *port, unsigned int hashkey, int flag) { struct neighbour_entry *new_neighbour=NULL; @@ -358,9 +358,9 @@ struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, } /***********************fill mac hdr before send 
pkt************************************/ -static void neigh_fill_mac(struct neighbour_entry *neighbour, - struct rte_mbuf *m, - const struct in6_addr *target, +static void neigh_fill_mac(struct neighbour_entry *neighbour, + struct rte_mbuf *m, + const struct in6_addr *target, struct netif_port *port) { struct ether_hdr *eth; @@ -386,7 +386,7 @@ void neigh_send_mbuf_cach(struct neighbour_entry *neighbour) { struct neighbour_mbuf_entry *mbuf, *mbuf_next; struct rte_mbuf *m; - + list_for_each_entry_safe(mbuf, mbuf_next, &neighbour->queue_list,neigh_mbuf_list){ list_del(&mbuf->neigh_mbuf_list); @@ -409,8 +409,8 @@ void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port) if (neigh_key_cmp(af, neighbour, nexthop, port) && !(neighbour->flag & NEIGHBOUR_STATIC)) { neigh_entry_state_trans(neighbour, 2); - } - } + } + } } static void neigh_state_confirm(struct neighbour_entry *neighbour) @@ -424,22 +424,22 @@ static void neigh_state_confirm(struct neighbour_entry *neighbour) inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr); if (!saddr.in.s_addr) { RTE_LOG(ERR, NEIGHBOUR, "[%s]no source ip\n", __func__); - } + } if (neigh_send_arp(neighbour->port, saddr.in.s_addr, daddr.in.s_addr) != EDPVS_OK) { RTE_LOG(ERR, NEIGHBOUR, "[%s] send arp failed\n", __func__); - } + } } else if (neighbour->af == AF_INET6) { /*to be continue*/ - ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6); + ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6); inet_addr_select(AF_INET6, neighbour->port, &daddr, 0, &saddr); if (ipv6_addr_any(&saddr.in6)) RTE_LOG(ERR, NEIGHBOUR, "[%s]no source ip\n", __func__); - + ndisc_solicit(neighbour, &saddr.in6); - } + } } /*arp*/ @@ -447,17 +447,15 @@ int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) { struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); struct ether_hdr *eth; - uint32_t ipaddr; struct neighbour_entry *neighbour = NULL; unsigned int hashkey; - struct route_entry *rt = NULL; + struct inet_ifaddr *ifa; - rt = route4_local(arp->arp_data.arp_tip, port); - if(!rt){ - return EDPVS_KNICONTINUE; - } - route4_put(rt); + ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr*)&arp->arp_data.arp_tip); + if (!ifa) + return EDPVS_KNICONTINUE; + inet_addr_ifa_put(ifa); eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct ether_hdr)); @@ -475,20 +473,20 @@ int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) arp->arp_data.arp_tip = ipaddr; m->l2_len = sizeof(struct ether_hdr); m->l3_len = sizeof(struct arp_hdr); - + netif_xmit(m, port); return EDPVS_OK; } else if(arp->arp_op == htons(ARP_OP_REPLY)) { ipaddr = arp->arp_data.arp_sip; hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port); - neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr, + neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr, port, hashkey); if (neighbour && !(neighbour->flag & NEIGHBOUR_STATIC)) { neigh_edit(neighbour, &arp->arp_data.arp_sha); neigh_entry_state_trans(neighbour, 1); } else { - neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr, + neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr, &arp->arp_data.arp_sha, port, hashkey, 0); if(!neighbour){ RTE_LOG(ERR, NEIGHBOUR, "[%s] add neighbour wrong\n", __func__); @@ -510,7 +508,7 @@ static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst struct rte_mbuf *m; struct ether_hdr *eth; struct arp_hdr *arp; - + uint32_t addr; m = rte_pktmbuf_alloc(port->mbuf_pool); @@ -551,7 +549,7 @@ 
static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst return EDPVS_OK; } -int neigh_output(int af, union inet_addr *nexhop, +int neigh_output(int af, union inet_addr *nexhop, struct rte_mbuf *m, struct netif_port *port) { struct neighbour_entry *neighbour; @@ -574,7 +572,7 @@ int neigh_output(int af, union inet_addr *nexhop, (neighbour->state == DPVS_NUD_S_SEND)) { if (neighbour->que_num > arp_unres_qlen) { /* - * don't need arp request now, + * don't need arp request now, * since neighbour will not be confirmed * and it will be released late */ @@ -669,7 +667,7 @@ static int neigh_ring_init(void) socket_id = rte_socket_id(); for (cid = 0; cid < DPVS_MAX_LCORE; cid++) { snprintf(name_buf, RTE_RING_NAMESIZE, "neigh_ring_c%d", cid); - neigh_ring[cid] = rte_ring_create(name_buf, MAC_RING_SIZE, + neigh_ring[cid] = rte_ring_create(name_buf, MAC_RING_SIZE, socket_id, RING_F_SC_DEQ); if (neigh_ring[cid] == NULL) rte_panic("create ring:%s failed!\n", name_buf); @@ -694,7 +692,7 @@ static struct raw_neigh* neigh_ring_clone_entry(const struct neighbour_entry* ne return mac_param; } -static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, +static struct raw_neigh* neigh_ring_clone_param(const struct dp_vs_neigh_conf *param, bool add) { struct netif_port *port; @@ -724,14 +722,14 @@ void neigh_process_ring(void *arg) struct neighbour_entry *neigh; struct raw_neigh *param; lcoreid_t cid = rte_lcore_id(); - nb_rb = rte_ring_dequeue_burst(neigh_ring[cid], (void **)params, + nb_rb = rte_ring_dequeue_burst(neigh_ring[cid], (void **)params, NETIF_MAX_PKT_BURST, NULL); if (nb_rb > 0) { int i; for (i = 0; i < nb_rb; i++) { param = params[i]; hash = neigh_hashkey(param->af, ¶m->ip_addr, param->port); - neigh = neigh_lookup_entry(param->af, ¶m->ip_addr, + neigh = neigh_lookup_entry(param->af, ¶m->ip_addr, param->port, hash); if (param->add) { if (neigh) { @@ -740,8 +738,8 @@ void neigh_process_ring(void *arg) neigh_entry_state_trans(neigh, 1); } else { - neigh = neigh_add_table(param->af, ¶m->ip_addr, - ¶m->eth_addr, param->port, + neigh = neigh_add_table(param->af, ¶m->ip_addr, + ¶m->eth_addr, param->port, hash, param->flag); if (!(param->flag & NEIGHBOUR_STATIC)) neigh_entry_state_trans(neigh, 1); @@ -789,7 +787,7 @@ static void neigh_fill_param(struct dp_vs_neigh_conf *param, memcpy(¶m->ifname, entry->port->name, IFNAMSIZ); } -static void neigh_fill_array(struct netif_port *dev, lcoreid_t cid, +static void neigh_fill_array(struct netif_port *dev, lcoreid_t cid, struct dp_vs_neigh_conf_array *array, size_t neigh_nums) { @@ -837,7 +835,7 @@ static int get_neigh_uc_cb(struct dpvs_msg *msg) if (msg->len) dev = netif_port_get_by_name((char *)msg->data); - len = sizeof(struct dp_vs_neigh_conf_array) + + len = sizeof(struct dp_vs_neigh_conf_array) + sizeof(struct dp_vs_neigh_conf) * neigh_nums[cid]; array = rte_zmalloc("neigh_array", len, RTE_CACHE_LINE_SIZE); @@ -848,7 +846,7 @@ static int get_neigh_uc_cb(struct dpvs_msg *msg) return EDPVS_OK; } -static int neigh_sockopt_get(sockoptid_t opt, const void *conf, +static int neigh_sockopt_get(sockoptid_t opt, const void *conf, size_t size, void **out, size_t *outsize) { const struct dp_vs_neigh_conf *cf; @@ -903,13 +901,13 @@ static int neigh_sockopt_get(sockoptid_t opt, const void *conf, array->neigh_nums = neigh_nums_g; list_for_each_entry(cur, &reply->mq, mq_node) { array_msg = (struct dp_vs_neigh_conf_array *)(cur->data); - memcpy(&array->addrs[off], &array_msg->addrs, + memcpy(&array->addrs[off], 
&array_msg->addrs, array_msg->neigh_nums * sizeof(struct dp_vs_neigh_conf)); off += array_msg->neigh_nums; } msg_destroy(&msg); - + return EDPVS_OK; } diff --git a/src/netif.c b/src/netif.c index 20ed57d26..b1520a1b4 100644 --- a/src/netif.c +++ b/src/netif.c @@ -41,7 +41,7 @@ #include #include #include - +#include #define NETIF_PKTPOOL_NB_MBUF_DEF 65535 #define NETIF_PKTPOOL_NB_MBUF_MIN 1023 @@ -320,8 +320,9 @@ static void rss_handler(vector_t tokens) struct port_conf_stream, port_list_node); assert(str); - if (!strcmp(str, "all") || !strcmp(str, "ip") || !strcmp(str, "tcp") || !strcmp(str, "udp") - || !strcmp(str, "sctp") || !strcmp(str, "ether") || !strcmp(str, "port") || !strcmp(str, "tunnel")) { + if (!strcmp(str, "all") || !strcmp(str, "ip") || !strcmp(str, "tcp") || !strcmp(str, "udp") + || !strcmp(str, "sctp") || !strcmp(str, "ether") || !strcmp(str, "port") || !strcmp(str, "tunnel") + || (strstr(str, "|") && str[0] != '|')) { RTE_LOG(INFO, NETIF, "%s:rss = %s\n", current_device->name, str); strncpy(current_device->rss, str, sizeof(current_device->rss)); } else { @@ -1061,7 +1062,7 @@ static struct pkt_type *pkt_type_get(uint16_t type, struct netif_port *port) } /****************************************** lcore job *********************************************/ -/* Note: lockless, lcore_job can only be register on initialization stage and +/* Note: lockless, lcore_job can only be register on initialization stage and * unregistered on cleanup stage. */ struct list_head netif_lcore_jobs[NETIF_LCORE_JOB_TYPE_MAX]; @@ -1224,6 +1225,11 @@ static void config_lcores(struct list_head *worker_list) lcoreid_t lcore2index[DPVS_MAX_LCORE]; portid_t port2index[DPVS_MAX_LCORE][NETIF_MAX_PORTS]; +bool netif_lcore_is_idle(lcoreid_t cid) +{ + return (lcore_conf[lcore2index[cid]].nports == 0) ? 
true : false; +} + static void lcore_index_init(void) { lcoreid_t ii; @@ -2109,7 +2115,7 @@ int netif_hard_xmit(struct rte_mbuf *mbuf, struct netif_port *dev) (lcore_conf[lcore2index[cid]].pqs[port2index[cid][pid]].ntxq); //RTE_LOG(DEBUG, NETIF, "tx-queue hash(%x) = %d\n", ((uint32_t)mbuf->buf_physaddr) >> 8, qindex); txq = &lcore_conf[lcore2index[cid]].pqs[port2index[cid][pid]].txqs[qindex]; - + if (unlikely(txq->len == NETIF_MAX_PKT_BURST)) { netif_tx_burst(cid, pid, qindex); txq->len = 0; @@ -2160,7 +2166,7 @@ static inline eth_type_t eth_type_parse(const struct ether_hdr *eth_hdr, else return ETH_PKT_MULTICAST; } - + return ETH_PKT_OTHERHOST; } @@ -2286,7 +2292,7 @@ static int netif_arp_ring_init(void) return EDPVS_OK; } -static void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs, +void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs, lcoreid_t cid, uint16_t count, bool pkts_from_ring) { int i, t; @@ -2389,6 +2395,11 @@ static void lcore_process_arp_ring(struct netif_queue_conf *qconf, lcoreid_t cid } } +static void lcore_process_redirect_ring(struct netif_queue_conf *qconf, lcoreid_t cid) +{ + dp_vs_redirect_ring_proc(qconf, cid); +} + static void lcore_job_recv_fwd(void *arg) { int i, j; @@ -2407,6 +2418,7 @@ static void lcore_job_recv_fwd(void *arg) qconf = &lcore_conf[lcore2index[cid]].pqs[i].rxqs[j]; lcore_process_arp_ring(qconf, cid); + lcore_process_redirect_ring(qconf, cid); qconf->len = netif_rx_burst(pid, qconf); lcore_stats_burst(&lcore_stats[cid], qconf->len); @@ -2505,7 +2517,7 @@ static void netif_lcore_init(void) for (ii = 0; ii < NETIF_JOB_COUNT; ii++) { res = netif_lcore_loop_job_register(&netif_jobs[ii]); if (res < 0) { - rte_exit(EXIT_FAILURE, + rte_exit(EXIT_FAILURE, "[%s] Fail to register netif lcore jobs, exiting ...\n", __func__); break; } @@ -2551,10 +2563,10 @@ static inline void free_mbufs(struct rte_mbuf **pkts, unsigned num) for (i = 0; i < num; i++) { rte_pktmbuf_free(pkts[i]); pkts[i] = NULL; - } + } } -static void kni_ingress(struct rte_mbuf *mbuf, struct netif_port *dev, +static void kni_ingress(struct rte_mbuf *mbuf, struct netif_port *dev, struct netif_queue_conf *qconf) { unsigned pkt_num; @@ -2572,8 +2584,8 @@ static void kni_ingress(struct rte_mbuf *mbuf, struct netif_port *dev, } /* VLAN device cannot be scheduled by kni_send2kern_loop */ - if ((dev->type == PORT_TYPE_VLAN && qconf->kni_len > 0)|| - unlikely(qconf->kni_len == NETIF_MAX_PKT_BURST)) { + if ((dev->type == PORT_TYPE_VLAN && qconf->kni_len > 0) || + unlikely(qconf->kni_len == NETIF_MAX_PKT_BURST)) { rte_spinlock_lock(&kni_lock); pkt_num = rte_kni_tx_burst(dev->kni.kni, qconf->kni_mbufs, qconf->kni_len); rte_spinlock_unlock(&kni_lock); @@ -2591,7 +2603,7 @@ static void kni_send2kern_loop(uint8_t port_id, struct netif_queue_conf *qconf) { struct netif_port *dev; unsigned pkt_num; - + dev = netif_port_get(port_id); if (qconf->kni_len > 0) { @@ -2930,7 +2942,7 @@ void netif_mask_fdir_filter(int af, const struct netif_port *port, static int dpdk_set_fdir_filt(struct netif_port *dev, enum rte_filter_op op, const struct rte_eth_fdir_filter *filt) { - if (rte_eth_dev_filter_ctrl((uint8_t)dev->id, RTE_ETH_FILTER_FDIR, + if (rte_eth_dev_filter_ctrl(dev->id, RTE_ETH_FILTER_FDIR, op, (void *)filt) < 0) return EDPVS_DPDKAPIFAIL; @@ -3039,7 +3051,7 @@ static struct netif_port* netif_rte_port_alloc(portid_t id, int nrxq, port->hw_header_len = sizeof(struct ether_hdr); if (port->socket == SOCKET_ID_ANY) port->socket = rte_socket_id(); - 
port->mbuf_pool = pktmbuf_pool[port->socket]; + port->mbuf_pool = pktmbuf_pool[port->socket]; rte_eth_macaddr_get((uint8_t)id, &port->addr); rte_eth_dev_get_mtu((uint8_t)id, &port->mtu); rte_eth_dev_info_get((uint8_t)id, &port->dev_info); @@ -3124,7 +3136,7 @@ int netif_get_queue(struct netif_port *port, lcoreid_t cid, queueid_t *qid) if (++idx > IDX_MAX) idx = 0; - *qid = qconf->rxqs[idx % qconf->nrxq].id; + *qid = qconf->rxqs[idx % qconf->nrxq].id; return EDPVS_OK; } @@ -3165,7 +3177,7 @@ int netif_get_stats(struct netif_port *dev, struct rte_eth_stats *stats) return EDPVS_OK; } -int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, +int netif_fdir_filter_set(struct netif_port *port, enum rte_filter_op opcode, const struct rte_eth_fdir_filter *fdir_flt) { assert(port && port->netif_ops); @@ -3249,12 +3261,39 @@ inline static int netif_port_fdir_dstport_mask_set(struct netif_port *port) return EDPVS_OK; } +static int rss_resolve_proc(char *rss) +{ + int rss_value = 0; + + if (!strcmp(rss, "all")) + rss_value = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP; + else if (!strcmp(rss, "ip")) + rss_value = ETH_RSS_IP; + else if (!strcmp(rss, "tcp")) + rss_value = ETH_RSS_TCP; + else if (!strcmp(rss, "udp")) + rss_value = ETH_RSS_UDP; + else if (!strcmp(rss, "sctp")) + rss_value = ETH_RSS_SCTP; + else if (!strcmp(rss, "ether")) + rss_value = ETH_RSS_L2_PAYLOAD; + else if (!strcmp(rss, "port")) + rss_value = ETH_RSS_PORT; + else if (!strcmp(rss, "tunnel")) + rss_value = ETH_RSS_TUNNEL; + + return rss_value; +} + /* fill in rx/tx queue configurations, including queue number, * decriptor number, bonding device's rss */ static void fill_port_config(struct netif_port *port, char *promisc_on) { assert(port); + char rss[256] = {0}; + int index = 0; + int rss_index = 0; struct port_conf_stream *cfg_stream; if (port->type == PORT_TYPE_BOND_SLAVE) { @@ -3269,22 +3308,23 @@ static void fill_port_config(struct netif_port *port, char *promisc_on) cfg_stream = get_port_conf_stream(port->name); if (cfg_stream) { /* device specific configurations from cfgfile */ - if (!strcmp(cfg_stream->rss, "all")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ETH_RSS_TCP | ETH_RSS_UDP; - else if (!strcmp(cfg_stream->rss, "ip")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP; - else if (!strcmp(cfg_stream->rss, "tcp")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_TCP; - else if (!strcmp(cfg_stream->rss, "udp")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_UDP; - else if (!strcmp(cfg_stream->rss, "sctp")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_SCTP; - else if (!strcmp(cfg_stream->rss, "ether")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_L2_PAYLOAD; - else if (!strcmp(cfg_stream->rss, "port")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PORT; - else if (!strcmp(cfg_stream->rss, "tunnel")) - port->dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_TUNNEL; + port->dev_conf.rx_adv_conf.rss_conf.rss_hf = 0; + for (index = 0; index < strlen(cfg_stream->rss); index++) { + if (cfg_stream->rss[index] == ' ') { + continue; + } else if (cfg_stream->rss[index] != '|') { + rss[rss_index++] = cfg_stream->rss[index]; + } else { + rss[rss_index] = '\0'; + rss_index = 0; + port->dev_conf.rx_adv_conf.rss_conf.rss_hf |= rss_resolve_proc(rss); + memset(rss, 0, sizeof(rss)); + } + } + + if (rss[0]) { + port->dev_conf.rx_adv_conf.rss_conf.rss_hf |= rss_resolve_proc(rss); + } port->dev_conf.fdir_conf.mode = cfg_stream->fdir_mode; 
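With the token walk above, the rss option in a device block can now carry several hash types at once (e.g. "rss ip|tcp|udp"). A compact equivalent of that walk built on the rss_resolve_proc() helper added by this patch; sketch only, the in-tree code splits on '|' and skips blanks inline instead of using strtok_r:

    #include <stdint.h>
    #include <string.h>

    static uint64_t rss_parse(const char *cfg)
    {
        char buf[256], *tok, *sp = NULL;
        uint64_t hf = 0;

        strncpy(buf, cfg, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';

        /* split on '|' (and stray blanks) and OR the per-token RSS flags */
        for (tok = strtok_r(buf, " |", &sp); tok; tok = strtok_r(NULL, " |", &sp))
            hf |= rss_resolve_proc(tok);

        return hf;
    }
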
port->dev_conf.fdir_conf.pballoc = cfg_stream->fdir_pballoc; @@ -3458,7 +3498,7 @@ int netif_port_start(struct netif_port *port) if (port->ntxq > 0) { for (qid = 0; qid < port->ntxq; qid++) { memcpy(&txconf, &port->dev_info.default_txconf, sizeof(struct rte_eth_txconf)); - if (port->dev_conf.rxmode.jumbo_frame + if (port->dev_conf.rxmode.jumbo_frame || (port->flag & NETIF_PORT_FLAG_TX_IP_CSUM_OFFLOAD) || (port->flag & NETIF_PORT_FLAG_TX_UDP_CSUM_OFFLOAD) || (port->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD)) @@ -3486,20 +3526,20 @@ int netif_port_start(struct netif_port *port) // build port-queue-lcore mapping array build_port_queue_lcore_map(); - // start the device + // start the device ret = rte_eth_dev_start(port->id); if (ret < 0) { RTE_LOG(ERR, NETIF, "%s: fail to start %s\n", __func__, port->name); return EDPVS_DPDKAPIFAIL; } - // wait the device link up + // wait the device link up RTE_LOG(INFO, NETIF, "Waiting for %s link up, be patient ...\n", port->name); for (ii = 0; ii < wait_link_up_msecs; ii++) { rte_eth_link_get_nowait(port->id, &link); if (link.link_status) { RTE_LOG(INFO, NETIF, ">> %s: link up - speed %u Mbps - %s\n", - port->name, (unsigned)link.link_speed, + port->name, (unsigned)link.link_speed, (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? "full-duplex" : "half-duplex"); break; @@ -3958,6 +3998,11 @@ static int netif_loop(void *dummy) uint64_t loop_start, loop_end; #endif +#ifdef DPVS_MAX_LCORE + if (cid >= DPVS_MAX_LCORE) + return EDPVS_IDLE; +#endif + assert(LCORE_ID_ANY != cid); try_isol_rxq_lcore_loop(); diff --git a/src/parser.c b/src/parser.c index a2bb1fcc6..b145384c0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -32,8 +32,8 @@ static vector_t g_current_keywords; static char *g_current_conf_file; static int g_sublevel = 0; -/* - * keyword operations +/* + * keyword operations * */ /* allocate and set a keyword in current level */ diff --git a/src/route.c b/src/route.c index e247292b7..ce3afd148 100644 --- a/src/route.c +++ b/src/route.c @@ -71,13 +71,6 @@ static int route_local_hash(struct route_entry *route) return EDPVS_OK; } -static int route_local_unhash(struct route_entry *route) -{ - list_del(&route->list); - rte_atomic32_dec(&route->refcnt); - return EDPVS_OK; -} - static struct route_entry *route_new_entry(struct in_addr* dest, uint8_t netmask, uint32_t flag, struct in_addr* gw, struct netif_port *port, @@ -147,18 +140,6 @@ static int route_net_add(struct in_addr *dest, uint8_t netmask, uint32_t flag, return EDPVS_OK; } -static int route_net_del(struct route_entry *route) -{ - if (route){ - DPVS_WAIT_WHILE(rte_atomic32_read(&route->refcnt) > 2); - list_del(&route->list); - rte_free(route); - rte_atomic32_dec(&this_num_routes); - return EDPVS_OK; - } - return EDPVS_NOTEXIST; -} - static struct route_entry *route_local_lookup(uint32_t dest, const struct netif_port *port) { unsigned hashkey; @@ -251,18 +232,6 @@ static int route_local_add(struct in_addr* dest, uint8_t netmask, uint32_t flag, return EDPVS_OK; } -static int route_local_del(struct route_entry *route) -{ - if(route){ - DPVS_WAIT_WHILE(rte_atomic32_read(&route->refcnt) > 2); - route_local_unhash(route); - rte_free(route); - rte_atomic32_dec(&this_num_routes); - return EDPVS_OK; - } - return EDPVS_NOTEXIST; -} - static int route_add_lcore(struct in_addr* dest,uint8_t netmask, uint32_t flag, struct in_addr* gw, struct netif_port *port, struct in_addr* src, unsigned long mtu,short metric) @@ -277,22 +246,42 @@ static int route_add_lcore(struct in_addr* dest,uint8_t netmask, uint32_t flag, return 
EDPVS_INVAL; } +/* del route node in list, then mbuf next will never find it; + * route4_put will delete route when refcnt is 0. + * refcnt: + * 1, new route is set to 0; + * 2, add list will be 1; + * 3, find route and ref it will +1; + * 4, put route will -1; + */ static int route_del_lcore(struct in_addr* dest,uint8_t netmask, uint32_t flag, struct in_addr* gw, struct netif_port *port, struct in_addr* src, unsigned long mtu,short metric) { struct route_entry *route = NULL; - int error; + if(flag & RTF_LOCALIN || (flag & RTF_KNI)){ route = route_local_lookup(dest->s_addr, port); - error = route_local_del(route); - return error; + if (!route) + return EDPVS_NOTEXIST; + list_del(&route->list); + rte_atomic32_dec(&route->refcnt); + rte_atomic32_dec(&this_num_routes); + route4_put(route); + return EDPVS_OK; } + if(flag & RTF_FORWARD || (flag & RTF_DEFAULT)){ route = route_net_lookup(port, dest, netmask); - error = route_net_del(route); - return error; + if (!route) + return EDPVS_NOTEXIST; + list_del(&route->list); + rte_atomic32_dec(&route->refcnt); + rte_atomic32_dec(&this_num_routes); + route4_put(route); + return EDPVS_OK; } + return EDPVS_INVAL; } @@ -318,7 +307,7 @@ static int route_add_del(bool add, struct in_addr* dest, else err = route_del_lcore(dest, netmask, flag, gw, port, src, mtu, metric); - if (err != EDPVS_OK) { + if (err != EDPVS_OK && err != EDPVS_EXIST && err != EDPVS_NOTEXIST) { RTE_LOG(INFO, ROUTE, "[%s] fail to set route\n", __func__); return err; } @@ -346,9 +335,11 @@ static int route_add_del(bool add, struct in_addr* dest, err = multicast_msg_send(msg, 0/*DPVS_MSG_F_ASYNC*/, NULL); if (err != EDPVS_OK) { - msg_destroy(&msg); - RTE_LOG(INFO, ROUTE, "[%s] fail to send multicast message\n", __func__); - return err; + /* ignore timeout for msg, or keepalived will cause a lot bug. 
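The refcnt rules spelled out above reduce to: the hash list owns one reference, every lookup takes another, and the entry is freed only by the final put after it has been unlinked. A self-contained illustration of that scheme in plain C; the names are illustrative, not DPVS APIs:

    #include <stdlib.h>

    struct entry {
        int refcnt;              /* list reference + per-user references */
        struct entry *next;      /* stand-in for the hash-list linkage */
    };

    static void entry_put(struct entry *e)
    {
        if (--e->refcnt == 0)
            free(e);             /* only the last holder frees the entry */
    }

    static void entry_unlink(struct entry **head, struct entry *e)
    {
        /* remove from the list, then drop the reference the list held;
         * any reader still holding a reference keeps the entry alive */
        for (struct entry **p = head; *p; p = &(*p)->next) {
            if (*p == e) {
                *p = e->next;
                entry_put(e);
                break;
            }
        }
    }
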
+ * Timeout error is ok because route can still be set, + * no mem is another possible err, but problem will not just be here */ + RTE_LOG(INFO, ROUTE, "[%s] fail to send multicast message, error code = %d\n", + __func__, err); } msg_destroy(&msg); @@ -423,7 +414,7 @@ struct route_entry *route4_output(const struct flow4 *fl4) if(route){ return route; } - + route = route_out_net_lookup(&fl4->fl4_daddr); if(route){ return route; @@ -432,24 +423,34 @@ struct route_entry *route4_output(const struct flow4 *fl4) return NULL; } -int route_flush(void) +static int route_lcore_flush(void) { int i = 0; struct route_entry *route_node; for (i = 0; i < LOCAL_ROUTE_TAB_SIZE; i++){ list_for_each_entry(route_node, &this_local_route_table[i], list){ - route_local_del(route_node); + list_del(&route_node->list); + rte_atomic32_dec(&this_num_routes); + route4_put(route_node); } } list_for_each_entry(route_node, &this_net_route_table, list){ - route_net_del(route_node); + list_del(&route_node->list); + rte_atomic32_dec(&this_num_routes); + route4_put(route_node); } return EDPVS_OK; } +int route_flush(void) +{ + return EDPVS_OK; +} + + /** * control plane */ @@ -673,7 +674,7 @@ static int route_lcore_term(void *arg) if (!rte_lcore_is_enabled(rte_lcore_id())) return EDPVS_DISABLED; - return route_flush(); + return route_lcore_flush(); } int route_init(void) diff --git a/src/sys_time.c b/src/sys_time.c index 38f5f240d..781a00aa0 100644 --- a/src/sys_time.c +++ b/src/sys_time.c @@ -24,7 +24,7 @@ char* cycles_to_stime(uint64_t cycles) { time_t ts; static char time_str[SYS_TIME_STR_LEN]; - + memset(time_str, 0, SYS_TIME_STR_LEN); ts = (cycles - g_start_cycles) / rte_get_timer_hz(); ts += g_dpvs_timer; diff --git a/src/tc/cls.c b/src/tc/cls.c index 8d7c38753..76fc69948 100644 --- a/src/tc/cls.c +++ b/src/tc/cls.c @@ -37,7 +37,7 @@ static inline tc_handle_t cls_alloc_handle(struct Qsch *sch) autohandle = TC_H_MAKE(0x80000000U, 0); if (!tc_cls_lookup(sch, autohandle)) return autohandle; - } while (--i > 0); + } while (--i > 0); return 0; } @@ -142,7 +142,7 @@ void tc_cls_destroy(struct tc_cls *cls) if (ops->destroy) ops->destroy(cls); - + tc_cls_ops_put(ops); cls_free(cls); } diff --git a/src/tc/sch_generic.c b/src/tc/sch_generic.c index 8dcc6e524..636d6b25d 100644 --- a/src/tc/sch_generic.c +++ b/src/tc/sch_generic.c @@ -128,7 +128,7 @@ static inline tc_handle_t sch_alloc_handle(struct netif_port *dev) autohandle = TC_H_MAKE(0x80000000U, 0); if (!qsch_lookup_noref(&dev->tc, autohandle)) return autohandle; - } while (--i > 0); + } while (--i > 0); return 0; } diff --git a/src/tc/sch_pfifo_fast.c b/src/tc/sch_pfifo_fast.c index 1417e3fca..71cedf6f0 100644 --- a/src/tc/sch_pfifo_fast.c +++ b/src/tc/sch_pfifo_fast.c @@ -31,7 +31,7 @@ #define TC_PRIO_MAX 15 static const uint8_t prio2band[TC_PRIO_MAX + 1] = { - 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; #define PFIFO_FAST_BANDS 3 @@ -47,7 +47,7 @@ struct pfifo_fast_priv { }; static inline struct tc_mbuf_head *band2list(struct pfifo_fast_priv *priv, - int band) + int band) { assert(band >= 0 && band < PFIFO_FAST_BANDS); @@ -55,7 +55,7 @@ static inline struct tc_mbuf_head *band2list(struct pfifo_fast_priv *priv, } static inline struct tc_mbuf_head *band2list_cpu(struct pfifo_fast_priv *priv, - int band, lcoreid_t cid) + int band, lcoreid_t cid) { assert(band >= 0 && band < PFIFO_FAST_BANDS); @@ -84,7 +84,7 @@ static int pfifo_fast_enqueue(struct Qsch *sch, struct rte_mbuf *mbuf) band = prio2band[prio]; priv = 
qsch_priv(sch); qh = band2list(priv, band); - + err = __qsch_enqueue_tail(sch, mbuf, qh); if (err == EDPVS_OK) { priv->this_bitmap |= (1 << band); diff --git a/src/timer.c b/src/timer.c index ce5bed1a9..665be8698 100644 --- a/src/timer.c +++ b/src/timer.c @@ -182,7 +182,7 @@ static void timer_expire(struct timer_scheduler *sched, struct dpvs_timer *timer struct timeval delay; assert(timer && timer->handler); - /* remove from hash table first, since timer may + /* remove from hash table first, since timer may * set by handler, could not remove it after it. */ handler = timer->handler; priv = timer->priv; @@ -216,16 +216,16 @@ static inline void deviation_measure(void) timersub(&tv_now, &tv_prev[rte_lcore_id()], &tv_elapse); tv_prev[rte_lcore_id()] = tv_now; - printf("[%d] %s: round %u elapse %6lu.%06lu\n", + printf("[%d] %s: round %u elapse %6lu.%06lu\n", rte_lcore_id(), __func__, count[rte_lcore_id()] - 1, tv_elapse.tv_sec, tv_elapse.tv_usec); } } #endif -/* +/* * it takes exactly one tick between invokations, - * except system (including time handles) takes more then + * except system (including time handles) takes more then * one tick to get rte_timer_manage() called. * we needn't calculate ticks elapsed by ourself. */ @@ -379,7 +379,7 @@ int dpvs_timer_init(void) RTE_LCORE_FOREACH_SLAVE(cid) { err = rte_eal_wait_lcore(cid); if (err < 0) { - RTE_LOG(ERR, DTIMER, "%s: lcore %d: %s.\n", + RTE_LOG(ERR, DTIMER, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); return err; } @@ -399,7 +399,7 @@ int dpvs_timer_term(void) RTE_LCORE_FOREACH_SLAVE(cid) { err = rte_eal_wait_lcore(cid); if (err < 0) { - RTE_LOG(WARNING, DTIMER, "%s: lcore %d: %s.\n", + RTE_LOG(WARNING, DTIMER, "%s: lcore %d: %s.\n", __func__, cid, dpvs_strerror(err)); } } @@ -419,13 +419,13 @@ static inline struct timer_scheduler *this_lcore_sched(bool global) return global ? &g_timer_sched : &RTE_PER_LCORE(timer_sched); } -int dpvs_timer_sched(struct dpvs_timer *timer, struct timeval *delay, +int dpvs_timer_sched(struct dpvs_timer *timer, struct timeval *delay, dpvs_timer_cb_t handler, void *arg, bool global) { struct timer_scheduler *sched = this_lcore_sched(global); int err; - if (!sched || !timer || !delay || !handler + if (!sched || !timer || !delay || !handler || delay->tv_sec >= TIMER_MAX_SECS) return EDPVS_INVAL; @@ -436,7 +436,7 @@ int dpvs_timer_sched(struct dpvs_timer *timer, struct timeval *delay, return err; } -int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, +int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, dpvs_timer_cb_t handler, void *arg, bool global) { struct timer_scheduler *sched = this_lcore_sched(global); @@ -450,7 +450,7 @@ int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, __time_now(sched, &now); if (!timercmp(expire, &now, >)) { /* consider the diff between user call dpvs_time_now() and NOW, - * it's possible timer already expired although rarely. + * it's possible timer already expired although rarely. * to schedule an 1-tick timer ? no, let's trigger it now. * note we cannot call timer_expire() direcly. 
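For reference, the scheduling entry points touched above are used as in the sketch below; the callback signature and DTIMER_OK return follow neighbour_timer_event() earlier in this patch, and "timer.h" stands for the DPVS timer header (assumed name):

    #include <sys/time.h>
    #include "timer.h"

    static struct dpvs_timer g_demo_timer;

    static int demo_timeout(void *arg)
    {
        /* ... one-shot or periodic work ... */
        return DTIMER_OK;
    }

    static void demo_arm(void)
    {
        struct timeval delay = { .tv_sec = 2, .tv_usec = 0 };

        /* global=false: arm the timer on this lcore's own timer wheel */
        dpvs_timer_sched(&g_demo_timer, &delay, demo_timeout, NULL, false);
    }
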
*/ handler(arg); @@ -470,7 +470,7 @@ int dpvs_timer_sched_abs(struct dpvs_timer *timer, struct timeval *expire, return err; } -int dpvs_timer_sched_period(struct dpvs_timer *timer, struct timeval *intv, +int dpvs_timer_sched_period(struct dpvs_timer *timer, struct timeval *intv, dpvs_timer_cb_t handler, void *arg, bool global) { struct timer_scheduler *sched = this_lcore_sched(global); @@ -530,7 +530,7 @@ int dpvs_timer_update(struct dpvs_timer *timer, struct timeval *delay, bool glob rte_spinlock_lock(&sched->lock); if (timer_pending(timer)) list_del(&timer->list); - err = __dpvs_timer_sched(sched, timer, delay, + err = __dpvs_timer_sched(sched, timer, delay, timer->handler, timer->priv, timer->is_period); rte_spinlock_unlock(&sched->lock); return err; diff --git a/src/vlan.c b/src/vlan.c index 44a7a7e37..78172bfc4 100644 --- a/src/vlan.c +++ b/src/vlan.c @@ -423,10 +423,10 @@ int vlan_rcv(struct rte_mbuf *mbuf, struct netif_port *real_dev) mbuf->port = dev->id; if (unlikely(mbuf->packet_type == ETH_PKT_OTHERHOST)) { - /* as comments in linux:vlan_do_receive(). + /* as comments in linux:vlan_do_receive(). * "Our lower layer thinks this is not local, let's make sure. - * This allows the VLAN to have a different MAC than the - * underlying device, and still route correctly." */ + * This allows the VLAN to have a different MAC than the + * underlying device, and still route correctly." */ if (eth_addr_equal(&ehdr->d_addr, &dev->addr)) mbuf->packet_type = ETH_PKT_HOST; } diff --git a/test/checklist-v1.7.2.md b/test/checklist-v1.7.2.md new file mode 100644 index 000000000..1221c320d --- /dev/null +++ b/test/checklist-v1.7.2.md @@ -0,0 +1,101 @@ +DPVS v1.7.2 功能测试结果 +------------------------ +- IPv4 协议栈测试 + * [Y] DPVS IPv4 地址添加、查询、删除 + * [Y] DPVS IPv4 路由添加、查询、删除 + * [Y] DPVS IPv4 ARP 缓存表正常 + * [Y] DPVS IPv4 地址能 ping 通 + +- IPv6 协议栈测试 + * [Y] DPVS IPv6 地址添加、查询、删除 + * [Y] DPVS IPv6 路由添加、查询、删除 + * [Y] DPVS IPv6 ARP 缓存表正常 + * [Y] DPVS IPv6 地址能 ping 通 + +- DPVS FullNAT 转发测试 + * [Y] ipvsadm 业务添加、查询、修改、删除 + * [Y] keepalive 业务添加、查询、修改、删除 + - TCP 协议数据转发 + * [Y] 4to4 转发 + * [Y] 6to6 转发 + * [Y] 6to4 转发 + - TCP synproxy 功能 + * [Y] 4to4 转发 + * [Y] 6to6 转发 + * [Y] 6to4 转发 + - TOA Centos 7 + * [Y] 源 IP、端口获取功能(4to4, 6to6, 6to4) -- 6to4 apache可以直接获取源IP,nginx需要patch + * [Y] toa.ko 加载测试 + * [Y] toa.ko 卸载测试(有流量时) + * [Y] toa.ko 版本前向兼容测试 -- nginx兼容,apache关闭IPv6后不兼容 + - TOA Centos 6 + * [Y] 源 IP、端口 获取功能(4to4, 6to6, 6to4) -- 6to6无环境未测试,6to4 nginx需要patch + * [Y] toa.ko 加载测试 + * [Y] toa.ko 卸载测试(有流量时) + * [Y] toa.ko 版本前向兼容测试 + - UDP 协议数据转发(无 UOA 数据) + * [Y] 4to4 转发 + * [Y] 6to6 转发 + * [Y] 6to4 转发 + - UOA Centos 7 + * [Y] 源 IP、端口获取功能(4to4, 6to6, 6to4) + * [Y] uoa.ko 加载测试 + * [Y] uoa.ko 卸载测试(有流量时) + * [N] uoa.ko 版本前向兼容测试 -- 不兼容 v1.6 版本 uoa.ko,无法获取源 IP、带 UOA 的 UDP 包被丢弃(4to4 未丢) + - UOA Centos 6 + * [Y] 源 IP、端口获取功能(4to4, 6to4) + * [Y] uoa.ko 加载测试 + * [Y] uoa.ko 卸载测试(有流量时) + * [N] uoa.ko 版本前向兼容测试 -- 不兼容 v1.6 版本 uoa.ko,无法获取源 IP、带 UOA 的 UDP 包被丢弃(4to4 未丢) + - Flow Director 测试 + * [Y] Perfect 模式、一个或多个 Local IP (4to4, 6to4) + * [N] Perfect 模式、一个或多个 Local IP (6to6) -- 网卡不支持 + * [Y] Signature 模式、一个 Local IP (4to4, 6to6, 6to4) + * [N] Signature 模式、多个 Local IP (4to4, 6to6, 6to4) -- 网卡不支持 + * [Y] Signature 模式、多个 Local IP、打开 packet redirect (4to4, 6to6, 6to4) + + +- DPVS SNAT 转发测试 + * [Y] ipvsadm 业务添加、查询、修改、删除 + * [Y] keepalived 业务添加、查询、修改、删除 + * [Y] ICMP 转发 + * [Y] TCP 转发 + * [Y] UDP 转发 + * [Y] ICMP 隧道上网 + * [Y] TCP 隧道上网 + * [Y] UDP 隧道上网 + +- DPVS DR 转发测试 + - TCP 协议数据转发 + * [Y] 4to4 转发 + * [Y] 6to6 转发 + * [N] 6to4 转发 -- 原理上不支持 + - UDP 
protocol data forwarding
+    * [Y] 4to4 forwarding
+    * [Y] 6to6 forwarding
+    * [N] 6to4 forwarding -- not supported by design
+
+- DPVS Tunnel forwarding tests
+  - TCP protocol data forwarding
+    * [Y] 4to4 forwarding
+    * [Y] 6to6 forwarding
+    * [N] 6to4 forwarding -- not supported by design
+  - UDP protocol data forwarding
+    * [Y] 4to4 forwarding
+    * [Y] 6to6 forwarding
+    * [N] 6to4 forwarding -- not supported by design
+
+- DPVS NAT forwarding tests (single core, or multiple cores with redirect enabled)
+  - TCP protocol data forwarding
+    * [Y] 4to4 forwarding
+    * [Y] 6to6 forwarding
+    * [N] 6to4 forwarding -- not supported by design
+  - UDP protocol data forwarding
+    * [Y] 4to4 forwarding
+    * [Y] 6to6 forwarding
+    * [N] 6to4 forwarding -- not supported by design
+
+-------------------
+Note:
+  * [Y]: Test Passed
+  * [N]: Test Failed or Not Supported
diff --git a/test/checklist_setup.sh b/test/checklist_setup.sh
new file mode 100755
index 000000000..f796188a4
--- /dev/null
+++ b/test/checklist_setup.sh
@@ -0,0 +1,111 @@
+#!/bin/sh
+
+### IP/Route ###
+dpip addr add 2001::12/64 dev dpdk0 # host IP
+dpip addr add 2001::1/128 dev dpdk0 # FullNAT vip
+dpip addr add 2001::2/128 dev dpdk0 # DR vip
+dpip addr add 2001::3/128 dev dpdk0 # Tunnel vip
+dpip addr add 2002::1/64 dev dpdk0 # NAT vip
+
+dpip addr add 192.168.88.12/24 dev dpdk0 # host IP
+dpip addr add 192.168.88.1/32 dev dpdk0 # FullNAT vip
+dpip addr add 192.168.88.2/32 dev dpdk0 # DR vip
+dpip addr add 192.168.88.3/32 dev dpdk0 # Tunnel vip
+dpip addr add 172.27.88.1/24 dev dpdk0 # NAT vip
+
+dpip addr add 2001::4/128 dev dpdk0 # NAT64 vip
+
+### FullNAT ###
+ipvsadm -A -t [2001::1]:8080 -j enable
+ipvsadm -at [2001::1]:8080 -r [2001::51]:80 -b
+ipvsadm -at [2001::1]:8080 -r [2001::52]:80 -b
+ipvsadm -at [2001::1]:8080 -r [2001::53]:80 -b
+ipvsadm -at [2001::1]:8080 -r [2001::54]:80 -b
+ipvsadm -Pt [2001::1]:8080 -z 2001::1:11 -F dpdk0
+ipvsadm -Pt [2001::1]:8080 -z 2001::1:12 -F dpdk0
+ipvsadm -Pt [2001::1]:8080 -z 2001::1:13 -F dpdk0
+
+ipvsadm -A -u [2001::1]:80
+ipvsadm -au [2001::1]:80 -r [2001::51]:6000 -b
+ipvsadm -au [2001::1]:80 -r [2001::54]:6000 -b
+ipvsadm -Pu [2001::1]:80 -z 2001::1:12 -F dpdk0
+
+ipvsadm -A -t 192.168.88.1:8080 -j enable
+ipvsadm -at 192.168.88.1:8080 -r 192.168.88.151:80 -b
+ipvsadm -at 192.168.88.1:8080 -r 192.168.88.152:80 -b
+ipvsadm -at 192.168.88.1:8080 -r 192.168.88.153:80 -b
+ipvsadm -at 192.168.88.1:8080 -r 192.168.88.154:80 -b
+ipvsadm -Pt 192.168.88.1:8080 -z 192.168.88.241 -F dpdk0
+ipvsadm -Pt 192.168.88.1:8080 -z 192.168.88.242 -F dpdk0
+ipvsadm -Pt 192.168.88.1:8080 -z 192.168.88.243 -F dpdk0
+
+ipvsadm -A -u 192.168.88.1:80 -j enable
+ipvsadm -au 192.168.88.1:80 -r 192.168.88.151:6000 -b
+ipvsadm -au 192.168.88.1:80 -r 192.168.88.154:6000 -b
+ipvsadm -Pu 192.168.88.1:80 -z 192.168.88.241 -F dpdk0
+
+### NAT64 ###
+ipvsadm -A -t [2001::4]:8080 -j enable
+ipvsadm -at [2001::4]:8080 -r 192.168.88.151:80 -b
+ipvsadm -at [2001::4]:8080 -r 192.168.88.152:80 -b
+ipvsadm -at [2001::4]:8080 -r 192.168.88.153:80 -b
+ipvsadm -at [2001::4]:8080 -r 192.168.88.154:80 -b
+ipvsadm -Pt [2001::4]:8080 -z 192.168.88.241 -F dpdk0
+ipvsadm -Pt [2001::4]:8080 -z 192.168.88.242 -F dpdk0
+ipvsadm -Pt [2001::4]:8080 -z 192.168.88.243 -F dpdk0
+
+ipvsadm -A -u [2001::4]:80
+ipvsadm -au [2001::4]:80 -r 192.168.88.151:6000 -b
+ipvsadm -Pu [2001::4]:80 -z 192.168.88.241 -F dpdk0
+
+### DR ###
+ipvsadm -A -t [2001::2]:80 -s wlc
+ipvsadm -at [2001::2]:80 -r [2001::51]:80 -g -w 100
+ipvsadm -at [2001::2]:80 -r [2001::52]:80 -g -w 200
+
+ipvsadm -A -u [2001::2]:6000 -s wlc
+ipvsadm -au [2001::2]:6000 -r [2001::51]:6000 -g -w 50
+ipvsadm -au [2001::2]:6000 -r [2001::52]:6000 -g -w 50
+
+ipvsadm -A -t 192.168.88.2:80 -s rr
+ipvsadm -at 192.168.88.2:80 -r 192.168.88.151:80 -g -w 10
+ipvsadm -at 192.168.88.2:80 -r 192.168.88.152:80 -g -w 10
+
+ipvsadm -A -u 192.168.88.2:6000 -s wrr
+ipvsadm -au 192.168.88.2:6000 -r 192.168.88.151:6000 -g -w 10
+ipvsadm -au 192.168.88.2:6000 -r 192.168.88.152:6000 -g -w 20
+
+### Tunnel ###
+ipvsadm -A -t [2001::3]:80
+ipvsadm -at [2001::3]:80 -r [2001::51]:80 -i
+ipvsadm -at [2001::3]:80 -r [2001::52]:80 -i
+
+ipvsadm -A -u [2001::3]:6000
+ipvsadm -au [2001::3]:6000 -r [2001::51]:6000 -i
+ipvsadm -au [2001::3]:6000 -r [2001::52]:6000 -i
+
+ipvsadm -A -t 192.168.88.3:80
+ipvsadm -at 192.168.88.3:80 -r 192.168.88.151:80 -i
+ipvsadm -at 192.168.88.3:80 -r 192.168.88.152:80 -i
+
+ipvsadm -A -u 192.168.88.3:6000
+ipvsadm -au 192.168.88.3:6000 -r 192.168.88.151:6000 -i
+ipvsadm -au 192.168.88.3:6000 -r 192.168.88.152:6000 -i
+
+### NAT ###
+ipvsadm -A -t [2002::1]:8080
+ipvsadm -at [2002::1]:8080 -r [2001::51]:80 -m
+ipvsadm -at [2002::1]:8080 -r [2001::52]:80 -m
+
+ipvsadm -A -u [2002::1]:80
+ipvsadm -au [2002::1]:80 -r [2001::51]:6000 -m
+ipvsadm -au [2002::1]:80 -r [2001::52]:6000 -m
+
+ipvsadm -A -t 172.27.88.1:8080
+ipvsadm -at 172.27.88.1:8080 -r 192.168.88.151:80 -m
+ipvsadm -at 172.27.88.1:8080 -r 192.168.88.152:80 -m
+
+ipvsadm -A -u 172.27.88.1:80
+ipvsadm -au 172.27.88.1:80 -r 192.168.88.151:6000 -m
+ipvsadm -au 172.27.88.1:80 -r 192.168.88.152:6000 -m
+
diff --git a/tools/dpip/addr.c b/tools/dpip/addr.c index 361bc966d..441bbbfa2 100644 --- a/tools/dpip/addr.c +++ b/tools/dpip/addr.c @@ -26,7 +26,7 @@ static void addr_help(void) { - fprintf(stderr, + fprintf(stderr, "Usage:\n" " dpip addr show [ dev STRING ]\n" " dpip -6 addr show [ dev STRING ]\n" @@ -88,7 +88,7 @@ static void addr_dump(const struct inet_addr_param *param) bcast[0] = '\0'; if (!inet_is_addr_any(param->af, &param->bcast)) { snprintf(bcast, sizeof(bcast), "broadcast "); - if (inet_ntop(param->af, &param->bcast, bcast + strlen(bcast), + if (inet_ntop(param->af, &param->bcast, bcast + strlen(bcast), sizeof(bcast) - strlen(bcast)) == NULL) bcast[0] = '\0'; } @@ -133,7 +133,7 @@ static int addr_parse_args(struct dpip_conf *conf, param->scope = IFA_SCOPE_LINK; else if (strcmp(conf->argv[0], "global") == 0) param->scope = IFA_SCOPE_GLOBAL; - else + else param->scope = atoi(conf->argv[0]); } else if (strcmp(conf->argv[0], "broadcast") == 0) { NEXTARG_CHECK(conf, "broadcast"); @@ -167,7 +167,7 @@ static int addr_parse_args(struct dpip_conf *conf, return -1; } - if (conf->cmd == DPIP_CMD_ADD || conf->cmd == DPIP_CMD_DEL + if (conf->cmd == DPIP_CMD_ADD || conf->cmd == DPIP_CMD_DEL || conf->cmd == DPIP_CMD_SET) { if (!prefix) { fprintf(stderr, "missing IFADDR\n"); @@ -261,7 +261,7 @@ struct dpip_obj dpip_addr = { static void __init addr_init(void) { dpip_register_obj(&dpip_addr); -} +} static void __exit addr_exit(void) { diff --git a/tools/dpip/link.c b/tools/dpip/link.c index aa2593d0c..e9e966612 100644 --- a/tools/dpip/link.c +++ b/tools/dpip/link.c @@ -80,7 +80,7 @@ struct link_param char dev_name[LINK_DEV_NAME_MAXLEN]; char item[LINK_ARG_ITEM_MAXLEN]; /* for SET cmd */ char value[LINK_ARG_VALUE_MAXLEN]; /* for SET cmd */ -}; +}; bool g_color = false; netif_nic_list_get_t *g_nic_list = NULL; @@ -113,7 +113,7 @@ static inline int get_netif_port_list(void) static void link_help(void) { - fprintf(stderr, + fprintf(stderr, "Usage:\n" " dpip link show [ NIC-NAME ]\n" " dpip link show BOND-NAME status\n" @@ -225,7 +225,7 @@ static int dump_nic_basic(char *name, int namelen) (void **)&p_get, &len); if (err != EDPVS_OK || !p_get || !len) return err; - get = *p_get; + get = *p_get; dpvs_sockopt_msg_free(p_get); printf("%d: %s: socket %d mtu %d rx-queue %d tx-queue %d\n", @@ -281,7
+281,7 @@ static int dump_nic_stats(char *name, int namelen) (void **)&p_get, &len); if (err != EDPVS_OK || !p_get || !len) return err; - get = *p_get; + get = *p_get; dpvs_sockopt_msg_free(p_get); assert(len == sizeof(netif_nic_stats_get_t)); @@ -405,27 +405,27 @@ static int dump_nic_stats_velocity(char *name, int namelen, int interval, int co (void **)&p_get, &len); if (err != EDPVS_OK || !p_get || !len) return err; - get1 = *p_get; + get1 = *p_get; dpvs_sockopt_msg_free(p_get); - + sleep(interval); - + err = dpvs_getsockopt(SOCKOPT_NETIF_GET_PORT_STATS, name, namelen, (void **)&p_get, &len); if (err != EDPVS_OK || !p_get || !len) return err; - get2 = *p_get; + get2 = *p_get; dpvs_sockopt_msg_free(p_get); - + calc_nic_stats_velocity(interval, &get1, &get2, &velocity); - + if (g_color) { if (tk % 2) printf(BLUE); else printf(GREEN); } - + printf(" %-16s%-16s%-16s%-16s\n", "ipackets/pps", "opackets/pps", "ibytes/Bps", "obytes/Bps"); printf(" %-16lu%-16lu%-16lu%-16lu\n", @@ -434,7 +434,7 @@ static int dump_nic_stats_velocity(char *name, int namelen, int interval, int co "ierrors/pps", "oerrors/pps", "imissed/pps", "rx_nombuf/pps"); printf(" %-16lu%-16lu%-16lu%-16lu\n", velocity.ierrors, velocity.oerrors, velocity.imissed, velocity.rx_nombuf); - + ++tk; if (count > 0 && tk > count) break; @@ -585,7 +585,7 @@ static int dump_cpu_stats_velocity(lcoreid_t cid, int interval, int count) assert(len == sizeof(netif_lcore_stats_get_t)); get1 = *p_get; dpvs_sockopt_msg_free(p_get); - + sleep(interval); err = dpvs_getsockopt(SOCKOPT_NETIF_GET_LCORE_STATS, &cid, sizeof(cid), @@ -597,7 +597,7 @@ static int dump_cpu_stats_velocity(lcoreid_t cid, int interval, int count) dpvs_sockopt_msg_free(p_get); calc_cpu_stats_velocity(interval, &get1, &get2, &velocity); - + if (g_color) { if (tk % 2) printf(BLUE); @@ -614,7 +614,7 @@ static int dump_cpu_stats_velocity(lcoreid_t cid, int interval, int count) "z2hpktburst/nps", "h2fpktburst/nps", "dropped/nps"); printf(" %-16lu%-16lu%-16lu\n", velocity.z2hpktburst, velocity.h2fpktburst, velocity.dropped); - + printf(" %-16s%-16s%-16s%-16s\n", "ipackets/pps", "ibytes/Bps", "opackets/pps", "obytes/Bps"); printf(" %-16lu%-16lu%-16lu%-16lu\n", @@ -1180,7 +1180,7 @@ struct dpip_obj dpip_link = { static void __init addr_init(void) { dpip_register_obj(&dpip_link); -} +} static void __exit addr_exit(void) { diff --git a/tools/dpip/neigh.c b/tools/dpip/neigh.c index 5532fb588..2159b103c 100644 --- a/tools/dpip/neigh.c +++ b/tools/dpip/neigh.c @@ -107,20 +107,20 @@ static void neigh_dump(struct dp_vs_neigh_conf *neigh) neigh->eth_addr.ether_addr_octet[3], neigh->eth_addr.ether_addr_octet[4], neigh->eth_addr.ether_addr_octet[5], - nud_state_names[neigh->state], neigh->ifname, neigh->cid, + nud_state_names[neigh->state], neigh->ifname, neigh->cid, (neigh->flag & NEIGHBOUR_STATIC) ? "static" : ""); else printf("ip: %-48s mac:incomplate state: %-12s dev: %s core: %d %s\n", inet_ntop(neigh->af, &neigh->ip_addr, ipaddr, sizeof(ipaddr)) ? ipaddr : "::", - nud_state_names[neigh->state], neigh->ifname, neigh->cid, + nud_state_names[neigh->state], neigh->ifname, neigh->cid, (neigh->flag & NEIGHBOUR_STATIC) ? 
"static" : ""); - return; + return; } static inline bool is_mac_valid(const struct ether_addr *ea) { - return (ea->ether_addr_octet[0] || ea->ether_addr_octet[1] || - ea->ether_addr_octet[2] || ea->ether_addr_octet[3] || + return (ea->ether_addr_octet[0] || ea->ether_addr_octet[1] || + ea->ether_addr_octet[2] || ea->ether_addr_octet[3] || ea->ether_addr_octet[4] || ea->ether_addr_octet[5]); } @@ -132,7 +132,7 @@ static int neigh_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, struct dp_vs_neigh_conf_array *array; size_t size, i; int err; - + if (neigh_parse_args(conf, &neigh) != 0) return EDPVS_INVAL; @@ -142,7 +142,7 @@ static int neigh_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, (void **)&array, &size); if (err != 0) return err; - if (size < sizeof(*array) || + if (size < sizeof(*array) || size != sizeof(*array) + \ array->neigh_nums * sizeof(struct dp_vs_neigh_conf)) { fprintf(stderr, "corrupted response.\n"); @@ -152,7 +152,7 @@ static int neigh_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, for (i = 0; i < array->neigh_nums; i++) neigh_dump(&array->addrs[i]); dpvs_sockopt_msg_free(array); - return EDPVS_OK; + return EDPVS_OK; case DPIP_CMD_ADD: if (!is_mac_valid(&neigh.eth_addr)) { @@ -166,7 +166,7 @@ static int neigh_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, return dpvs_setsockopt(SOCKOPT_SET_NEIGH_DEL, &neigh, sizeof(neigh)); default: - return EDPVS_NOTSUPP; + return EDPVS_NOTSUPP; } } diff --git a/tools/dpip/route.c b/tools/dpip/route.c index 8a8bd3ffd..87a7e8e01 100644 --- a/tools/dpip/route.c +++ b/tools/dpip/route.c @@ -28,7 +28,7 @@ static void route_help(void) { - fprintf(stderr, + fprintf(stderr, "Usage:\n" " dpip route { show | flush | help }\n" " dpip route { add | del | set } ROUTE\n" @@ -121,8 +121,8 @@ static void route4_dump(const struct dp_vs_route_conf *route) printf("%s %s/%d via %s src %s dev %s" " mtu %d tos %d scope %s metric %d proto %s %s\n", - af_itoa(route->af), - inet_ntop(route->af, &route->dst, dst, sizeof(dst)) ? dst : "::", + af_itoa(route->af), + inet_ntop(route->af, &route->dst, dst, sizeof(dst)) ? dst : "::", route->plen, inet_ntop(route->af, &route->via, via, sizeof(via)) ? via : "::", inet_ntop(route->af, &route->src, src, sizeof(src)) ? 
src : "::", @@ -172,7 +172,7 @@ static void route6_dump(const struct dp_vs_route6_conf *rt6_cfg) printf("\n"); } -static int route4_parse_args(struct dpip_conf *conf, +static int route4_parse_args(struct dpip_conf *conf, struct dp_vs_route_conf *route) { char *prefix = NULL; @@ -206,7 +206,7 @@ static int route4_parse_args(struct dpip_conf *conf, route->scope = ROUTE_CF_SCOPE_LINK; else if (strcmp(conf->argv[0], "global") == 0) route->scope = ROUTE_CF_SCOPE_GLOBAL; - else + else route->scope = atoi(conf->argv[0]); } else if (strcmp(conf->argv[0], "src") == 0) { NEXTARG_CHECK(conf, "src"); @@ -226,7 +226,7 @@ static int route4_parse_args(struct dpip_conf *conf, route->proto = ROUTE_CF_PROTO_STATIC; else if (strcmp(conf->argv[0], "ra") == 0) route->proto = ROUTE_CF_PROTO_RA; - else + else route->proto = atoi(conf->argv[0]); } else if (strcmp(conf->argv[0], "onlink") == 0) { ;/* on-link is output only */ @@ -307,7 +307,7 @@ static int route4_parse_args(struct dpip_conf *conf, return 0; } -static int route6_parse_args(struct dpip_conf *conf, +static int route6_parse_args(struct dpip_conf *conf, struct dp_vs_route6_conf *rt6_cfg) { int af; @@ -433,7 +433,7 @@ static int route4_do_cmd(struct dpip_obj *obj, dpip_cmd_t cmd, if (err != 0) return err; - if (size < sizeof(*array) + if (size < sizeof(*array) || size != sizeof(*array) + \ array->nroute * sizeof(struct dp_vs_route_conf)) { fprintf(stderr, "corrupted response.\n"); @@ -522,7 +522,7 @@ struct dpip_obj dpip_route = { static void __init route_init(void) { dpip_register_obj(&dpip_route); -} +} static void __exit route_exit(void) { diff --git a/tools/ipvsadm/ipvsadm.c b/tools/ipvsadm/ipvsadm.c index 5a215b01d..aeafa89ba 100644 --- a/tools/ipvsadm/ipvsadm.c +++ b/tools/ipvsadm/ipvsadm.c @@ -162,8 +162,8 @@ static const char* cmdnames[] = { "restore", "save", "zero", - "add-laddr" , - "del-laddr" , + "add-laddr" , + "del-laddr" , "get-laddr" , "add-blklst", "del-blklst", @@ -193,11 +193,11 @@ static const char* optnames[] = { "syncid", "exact", "ops", - "pe" , + "pe" , "local-address" , - "blklst-address", - "synproxy" , - "ifname" , + "blklst-address", + "synproxy" , + "ifname" , "sockpair" , "hash-target", }; @@ -585,7 +585,7 @@ parse_options(int argc, char **argv, struct ipvs_command_entry *ce, strncpy(ce->svc.sched_name, optarg, IP_VS_SCHEDNAME_MAXLEN); if (!memcmp(ce->svc.sched_name, "conhash", strlen("conhash"))) - ce->svc.flags = ce->svc.flags | IP_VS_SVC_F_SIP_HASH; + ce->svc.flags = ce->svc.flags | IP_VS_SVC_F_SIP_HASH; break; case 'p': set_option(options, OPT_PERSISTENT); @@ -1171,15 +1171,15 @@ parse_service(char *buf, ipvs_service_t *svc) * Get sockpair from the arguments. * sockpair := PROTO:SIP:SPORT:TIP:TPORT * PROTO := [tcp|udp] - * SIP,TIP := dotted-decimal ip address + * SIP,TIP := dotted-decimal ip address or square-blacketed ip6 address * SPORT,TPORT := range(0, 65535) */ static int parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) { char *pos = buf, *end; - int af = AF_INET; - struct in_addr sip, tip; + int af = (strchr(pos, '[') == NULL ? 
AF_INET : AF_INET6); + union inet_addr sip, tip; unsigned short proto, sport, tport; long portn; @@ -1196,13 +1196,26 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) else return 0; - pos = end; - end = strchr(pos, ':'); - if (!end) - return 0; - *end++ = '\0'; - if (inet_pton(af, pos, &sip) != 1) - return 0; + if (af == AF_INET) { + pos = end; + end = strchr(pos, ':'); + if (!end) + return 0; + *end++ = '\0'; + if (inet_pton(af, pos, &sip) != 1) + return 0; + } else { + if (*end != '[') + return 0; + pos = end + 1; + end = strchr(pos, ']'); + if (!end || *(end+1) != ':') + return 0; + *end++ = '\0'; + *end++ = '\0'; + if (inet_pton(af, pos, &sip.in6) != 1) + return 0; + } pos = end; end = strchr(pos, ':'); @@ -1213,13 +1226,26 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) return 0; sport = portn; - pos = end; - end = strchr(pos, ':'); - if (!end) - return 0; - *end++ = '\0'; - if (inet_pton(af, pos, &tip) != 1) - return 0; + if (af == AF_INET) { + pos = end; + end = strchr(pos, ':'); + if (!end) + return 0; + *end++ = '\0'; + if (inet_pton(af, pos, &tip.in) != 1) + return 0; + } else { + if (*end !='[') + return 0; + pos = end + 1; + end = strchr(pos, ']'); + if (!end || *(end+1) != ':') + return 0; + *end++ = '\0'; + *end++ = '\0'; + if (inet_pton(af, pos, &tip.in6) != 1) + return 0; + } pos = end; if ((portn = string_to_number(pos, 0, 65535)) == -1) @@ -1228,9 +1254,9 @@ parse_sockpair(char *buf, ipvs_sockpair_t *sockpair) sockpair->af = af; sockpair->proto = proto; - memcpy(&sockpair->sip, &sip, sizeof(sockpair->sip)); + sockpair->sip = sip; sockpair->sport = ntohs(sport); - memcpy(&sockpair->tip, &tip, sizeof(sockpair->tip)); + sockpair->tip = tip; sockpair->tport = ntohs(tport); return 1; @@ -1525,16 +1551,16 @@ static void print_conn_entry(const ipvs_conn_entry_t *conn_entry, snprintf(time_str, sizeof(time_str), "%ds", conn_entry->timeout); - if (!(cname = addrport_to_anyname(conn_entry->af, &conn_entry->caddr, + if (!(cname = addrport_to_anyname(conn_entry->in_af, &conn_entry->caddr, ntohs(conn_entry->cport), conn_entry->proto, format))) goto exit; - if (!(vname = addrport_to_anyname(conn_entry->af, &conn_entry->vaddr, + if (!(vname = addrport_to_anyname(conn_entry->in_af, &conn_entry->vaddr, ntohs(conn_entry->vport), conn_entry->proto, format))) goto exit; - if (!(lname = addrport_to_anyname(conn_entry->af, &conn_entry->laddr, + if (!(lname = addrport_to_anyname(conn_entry->out_af, &conn_entry->laddr, ntohs(conn_entry->lport), conn_entry->proto, format))) goto exit; - if (!(dname = addrport_to_anyname(conn_entry->af, &conn_entry->daddr, + if (!(dname = addrport_to_anyname(conn_entry->out_af, &conn_entry->daddr, ntohs(conn_entry->dport), conn_entry->proto, format))) goto exit; @@ -1667,7 +1693,7 @@ static void print_largenum(unsigned long long i, unsigned int format) printf("%*llu", len <= 8 ? 
9 : len + 1, i); return; } - + if (i < 100000000) /* less than 100 million */ printf("%9llu", i); else if (i < 1000000000) /* less than 1 billion */ @@ -1754,7 +1780,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) proto = "UDP"; else if (se->protocol == IPPROTO_ICMP) proto = "ICMP"; - else + else proto = "ICMPv6"; sprintf(svc_name, "%s %s", proto, vname); @@ -1773,7 +1799,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) proto = "icmp"; else proto = "icmpv6"; - + if (format & FMT_RULE) { snprintf(svc_name, sizeof(svc_name), "-H proto=%s,src-range=%s,dst-range=%s,iif=%s,oif=%s", @@ -1785,7 +1811,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) left -= snprintf(svc_name + strlen(svc_name), left, "MATCH %s", proto); - + if (strcmp(se->srange, "[::-::]:0-0") != 0 && strcmp(se->srange, "0.0.0.0-0.0.0.0:0-0") != 0) left -= snprintf(svc_name + strlen(svc_name), left, @@ -1883,7 +1909,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) char *dname; ipvs_dest_entry_t *e = &d->entrytable[i]; - if (!(dname = addrport_to_anyname(se->af, &(e->addr), ntohs(e->port), + if (!(dname = addrport_to_anyname(e->af, &(e->addr), ntohs(e->port), se->protocol, format))) { fprintf(stderr, "addrport_to_anyname fails\n"); exit(1); @@ -1929,7 +1955,7 @@ print_service_entry(ipvs_service_entry_t *se, unsigned int format) static void list_laddrs_print_title(void) { - printf("%-20s %-8s %-20s %-10s %-10s\n" , + printf("%-20s %-8s %-20s %-10s %-10s\n" , "VIP:VPORT" , "TOTAL" , "SNAT_IP", @@ -1943,7 +1969,7 @@ static void list_laddrs_print_service(struct ip_vs_get_laddrs *d) if (!(vname = addrport_to_anyname(d->af, &d->addr, ntohs(d->port), d->protocol, FMT_NUMERIC))) - fail(2, "addrport_to_anyname: %s", strerror(errno)); + fail(2, "addrport_to_anyname: %s", strerror(errno)); printf("%-20s %-8u \n" , vname , d->num_laddrs); free(vname); @@ -1957,10 +1983,10 @@ static void list_laddrs_print_service(struct ip_vs_get_laddrs *d) static void list_laddrs_print_laddr(struct ip_vs_laddr_entry * entry) { - char pbuf[40]; + char pbuf[INET6_ADDRSTRLEN]; inet_ntop(entry->af, (char *)&entry->addr, pbuf, sizeof(pbuf)); - + printf("%-20s %-8s %-20s %-10lu %-10u\n", "", "", @@ -1972,13 +1998,13 @@ static void list_laddrs_print_laddr(struct ip_vs_laddr_entry * entry) static void print_service_and_laddrs(struct ip_vs_get_laddrs* d, int with_title) { int i = 0; - + if(with_title) list_laddrs_print_title(); list_laddrs_print_service(d); for(i = 0 ; i < d->num_laddrs ; i ++){ - list_laddrs_print_laddr(d->entrytable + i); + list_laddrs_print_laddr(d->entrytable + i); } return; @@ -1999,7 +2025,7 @@ static int list_laddrs(ipvs_service_t *svc , int with_title) fprintf(stderr, "%s\n", ipvs_strerror(errno)); free(entry); return -1; - } + } print_service_and_laddrs(d, with_title); @@ -2028,14 +2054,14 @@ static int list_all_laddrs(void) fprintf(stderr, "%s\n", ipvs_strerror(errno)); return -1; } - + if(i != 0) title_enable = 0; print_service_and_laddrs(d, title_enable); free(d); } - + free(get); return 0; @@ -2075,8 +2101,8 @@ static int list_blklst(uint32_t addr_v4, uint16_t port, uint16_t protocol) } for (i = 0; i < get->naddr; i++) { - if ( addr_v4== get->blklsts[i].vaddr.in.s_addr && - port == get->blklsts[i].vport&& + if ( addr_v4== get->blklsts[i].vaddr.in.s_addr && + port == get->blklsts[i].vport&& protocol == get->blklsts[i].proto) { print_service_and_blklsts(&get->blklsts[i]); } @@ -2097,9 +2123,9 @@ static int list_all_blklsts(void) 
list_blklsts_print_title(); for (i = 0; i < get->num_services; i++) - list_blklst(get->entrytable[i].__addr_v4, get->entrytable[i].port, + list_blklst(get->entrytable[i].__addr_v4, get->entrytable[i].port, get->entrytable[i].protocol); - + free(get); return 0; } @@ -2273,7 +2299,7 @@ static char * addrport_to_anyname(int af, const void *addr, unsigned short port, unsigned short proto, unsigned int format) { - char *buf, pbuf[INET6_ADDRSTRLEN]; + char *buf, pbuf[INET6_ADDRSTRLEN]; if (!(buf=malloc(60))) return NULL; diff --git a/tools/ipvsadm/ipvsadm.sh b/tools/ipvsadm/ipvsadm.sh index 805bba2aa..3fa360fc7 100644 --- a/tools/ipvsadm/ipvsadm.sh +++ b/tools/ipvsadm/ipvsadm.sh @@ -12,7 +12,7 @@ # Based on init script for ipchains by Joshua Jensen # # Changes: -# Wenzhuo Zhang : fixed the typo of failure function +# Wenzhuo Zhang : fixed the typo of failure function # # config: /etc/sysconfig/ipvsadm # config: /etc/ipvsadm.rules @@ -47,7 +47,7 @@ fi # Check for ipvsadm in both /sbin and /usr/sbin # The default install puts it in /sbin, as it is analogos to commands such -# as route and ipchains that live in /sbin. Some vendors, most notibly +# as route and ipchains that live in /sbin. Some vendors, most notibly # Red Hat insist on moving it to /usr/sbin if [ ! -x /sbin/ipvsadm -a ! -x /usr/sbin/ipvsadm ]; then exit 0 @@ -77,7 +77,7 @@ case "$1" in ;; panic) - # I'm not sure what panic does but in the case of IPVS + # I'm not sure what panic does but in the case of IPVS # it makes sense just to clear everything action "Clearing the current IPVS table:" ipvsadm -C ;; diff --git a/tools/keepalived/keepalived/check/check_daemon.c b/tools/keepalived/keepalived/check/check_daemon.c index c77b79113..6e070e78e 100644 --- a/tools/keepalived/keepalived/check/check_daemon.c +++ b/tools/keepalived/keepalived/check/check_daemon.c @@ -97,7 +97,6 @@ start_check(void) init_checkers_queue(); #ifdef _WITH_VRRP_ init_interface_queue(); - kernel_netlink_init(); #endif #ifdef _WITH_SNMP_ if (!reload && snmp) diff --git a/tools/keepalived/keepalived/check/ipvswrapper.c b/tools/keepalived/keepalived/check/ipvswrapper.c old mode 100644 new mode 100755 index 7456b754d..795059151 --- a/tools/keepalived/keepalived/check/ipvswrapper.c +++ b/tools/keepalived/keepalived/check/ipvswrapper.c @@ -407,7 +407,7 @@ ipvs_stop(void) } /* Send user rules to IPVS module */ -static void +static int ipvs_talk(int cmd) { int result = -1; @@ -462,8 +462,14 @@ ipvs_talk(int cmd) break; } - if (result) + if (result) { + if (result == EDPVS_EXIST && (cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_ADDDEST)) + result = 0; + else if (result == EDPVS_NOTEXIST && (cmd == IP_VS_SO_SET_DEL || cmd == IP_VS_SO_SET_DELDEST)) + result = 0; log_message(LOG_INFO, "IPVS: %s", ipvs_strerror(errno)); + } + return result? 
IPVS_ERROR:IPVS_SUCCESS; } int @@ -517,7 +523,7 @@ ipvs_group_range_cmd(int cmd, virtual_server_group_entry_t *vsg_entry) } /* set IPVS group rules */ -static void +static int ipvs_group_cmd(int cmd, list vs_group, real_server_t * rs, virtual_server_t * vs) { virtual_server_group_t *vsg = ipvs_get_group_by_name(vs->vsgname, vs_group); @@ -526,7 +532,7 @@ ipvs_group_cmd(int cmd, list vs_group, real_server_t * rs, virtual_server_t * vs element e; /* return if jointure fails */ - if (!vsg) return; + if (!vsg) return IPVS_ERROR; /* visit addr_ip list */ l = vsg->addr_ip; @@ -543,7 +549,8 @@ ipvs_group_cmd(int cmd, list vs_group, real_server_t * rs, virtual_server_t * vs /* Talk to the IPVS channel */ if (IPVS_ALIVE(cmd, vsg_entry, rs)) { - ipvs_talk(cmd); + if (ipvs_talk(cmd) != IPVS_SUCCESS) + return IPVS_ERROR; IPVS_SET_ALIVE(cmd, vsg_entry); } } @@ -566,7 +573,8 @@ ipvs_group_cmd(int cmd, list vs_group, real_server_t * rs, virtual_server_t * vs /* Talk to the IPVS channel */ if (IPVS_ALIVE(cmd, vsg_entry, rs)) { - ipvs_talk(cmd); + if (ipvs_talk(cmd) != IPVS_SUCCESS) + return IPVS_ERROR; IPVS_SET_ALIVE(cmd, vsg_entry); } } @@ -583,6 +591,7 @@ ipvs_group_cmd(int cmd, list vs_group, real_server_t * rs, virtual_server_t * vs IPVS_SET_ALIVE(cmd, vsg_entry); } } + return IPVS_SUCCESS; } /* Fill IPVS rule with root vs infos */ @@ -1003,7 +1012,7 @@ ipvs_cmd(int cmd, list vs_group, virtual_server_t * vs, real_server_t * rs) /* Set vs rule and send to kernel */ if (vs->vsgname) { - ipvs_group_cmd(cmd, vs_group, rs, vs); + return ipvs_group_cmd(cmd, vs_group, rs, vs); } else { if (vs->vfwmark) { srule->af = AF_INET; @@ -1028,7 +1037,7 @@ ipvs_cmd(int cmd, list vs_group, virtual_server_t * vs, real_server_t * rs) } /* Talk to the IPVS channel */ - ipvs_talk(cmd); + return ipvs_talk(cmd); } return IPVS_SUCCESS; diff --git a/tools/keepalived/keepalived/check/ipwrapper.c b/tools/keepalived/keepalived/check/ipwrapper.c old mode 100644 new mode 100755 index 1fdecfe6b..74c8b1cc8 --- a/tools/keepalived/keepalived/check/ipwrapper.c +++ b/tools/keepalived/keepalived/check/ipwrapper.c @@ -30,7 +30,6 @@ #ifdef _WITH_SNMP_ #include "check_snmp.h" #endif - /* out-of-order functions declarations */ static void update_quorum_state(virtual_server_t * vs); @@ -197,16 +196,20 @@ init_service_vs(virtual_server_t * vs) SET_ALIVE(vs); } - /*Set local ip address in "FNAT" mode of IPVS */ - if ((vs->loadbalancing_kind == IP_VS_CONN_F_FULLNAT) && vs->local_addr_gname) { + /* Set local ip address in "FNAT" mode of IPVS */ + if (vs->local_addr_gname && + (vs->loadbalancing_kind == IP_VS_CONN_F_FULLNAT || + vs->loadbalancing_kind == IP_VS_CONN_F_SNAT)) { if (!ipvs_cmd(LVS_CMD_ADD_LADDR, check_data->vs_group, vs, NULL)) return 0; } - /*Set blacklist ip address */ - if (vs->blklst_addr_gname) { - if (!ipvs_cmd(LVS_CMD_ADD_BLKLST, check_data->vs_group, vs, NULL)) - return 0; - } + + /*Set blacklist ip address */ + if (vs->blklst_addr_gname) { + if (!ipvs_cmd(LVS_CMD_ADD_BLKLST, check_data->vs_group, vs, NULL)) + return 0; + } + /* Processing real server queue */ if (!LIST_ISEMPTY(vs->rs)) { if (vs->alpha && ! 
vs->reloaded) @@ -385,7 +388,7 @@ update_quorum_state(virtual_server_t * vs) } /* manipulate add/remove rs according to alive state */ -void +bool perform_svr_state(int alive, virtual_server_t * vs, real_server_t * rs) { /* @@ -402,7 +405,10 @@ perform_svr_state(int alive, virtual_server_t * vs, real_server_t * rs) , FMT_VS(vs)); /* Add only if we have quorum or no sorry server */ if (vs->quorum_state == UP || !vs->s_svr || !ISALIVE(vs->s_svr)) { - ipvs_cmd(LVS_CMD_ADD_DEST, check_data->vs_group, vs, rs); + if (ipvs_cmd(LVS_CMD_ADD_DEST, check_data->vs_group, vs, rs) != IPVS_SUCCESS) { + log_message(LOG_INFO, "LVS cmd add dest fail!"); + return false; + } } rs->alive = alive; if (rs->notify_up) { @@ -430,7 +436,10 @@ perform_svr_state(int alive, virtual_server_t * vs, real_server_t * rs) * Remove only if we have quorum or no sorry server */ if (vs->quorum_state == UP || !vs->s_svr || !ISALIVE(vs->s_svr)) { - ipvs_cmd(LVS_CMD_DEL_DEST, check_data->vs_group, vs, rs); + if (ipvs_cmd(LVS_CMD_DEL_DEST, check_data->vs_group, vs, rs) != IPVS_SUCCESS) { + log_message(LOG_INFO, "LVS cmd del dest fail!"); + return false; + } } rs->alive = alive; if (rs->notify_down) { @@ -447,6 +456,7 @@ perform_svr_state(int alive, virtual_server_t * vs, real_server_t * rs) /* We may have lost quorum */ update_quorum_state(vs); } + return true; } /* Store new weight in real_server struct and then update kernel. */ @@ -496,40 +506,62 @@ svr_checker_up(checker_id_t cid, real_server_t *rs) return 1; } -/* Update checker's state */ -void -update_svr_checker_state(int alive, checker_id_t cid, virtual_server_t *vs, real_server_t *rs) +static int +remove_failed_checker_list(checker_id_t cid, real_server_t *rs) { element e; list l = rs->failed_checkers; checker_id_t *id; + for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { + id = ELEMENT_DATA(e); + if (*id == cid) { + free_list_element(l, e); + /* If we don't break, the next iteration will trigger + * a SIGSEGV. + */ + break; + } + } + return 0; +} + +static int +add_failed_checker_list(checker_id_t cid, real_server_t *rs) +{ + list l = rs->failed_checkers; + checker_id_t *id; + + id = (checker_id_t *) MALLOC(sizeof(checker_id_t)); + *id = cid; + list_add(l, id); + + return 0; +} + +/* Update checker's state */ +void +update_svr_checker_state(int alive, checker_id_t cid, virtual_server_t *vs, real_server_t *rs) +{ /* Handle alive state. Depopulate failed_checkers and call * perform_svr_state() independently, letting the latter sort * things out itself. */ if (alive) { /* Remove the succeeded check from failed_checkers list. */ - for (e = LIST_HEAD(l); e; ELEMENT_NEXT(e)) { - id = ELEMENT_DATA(e); - if (*id == cid) { - free_list_element(l, e); - /* If we don't break, the next iteration will trigger - * a SIGSEGV. 
- */ - break; - } + remove_failed_checker_list(cid, rs); + if (LIST_SIZE(rs->failed_checkers) == 0) { + if (!perform_svr_state(alive, vs, rs)) + add_failed_checker_list(cid, rs); } - if (LIST_SIZE(l) == 0) - perform_svr_state(alive, vs, rs); } /* Handle not alive state */ else { - id = (checker_id_t *) MALLOC(sizeof(checker_id_t)); - *id = cid; - list_add(l, id); - if (LIST_SIZE(l) == 1) - perform_svr_state(alive, vs, rs); + add_failed_checker_list(cid, rs); + if (LIST_SIZE(rs->failed_checkers) == 1) { + if (!perform_svr_state(alive, vs, rs)) + remove_failed_checker_list(cid, rs); + } } } @@ -718,6 +750,7 @@ clear_diff_rs(list old_vs_group, virtual_server_t * old_vs) log_message(LOG_INFO, "service %s no longer exist" , FMT_RS(rs)); rs->inhibit = 0; + SET_ALIVE(rs); list_add (rs_to_remove, rs); } } diff --git a/tools/keepalived/keepalived/include/ipwrapper.h b/tools/keepalived/keepalived/include/ipwrapper.h old mode 100644 new mode 100755 index 2816f2594..612e782a7 --- a/tools/keepalived/keepalived/include/ipwrapper.h +++ b/tools/keepalived/keepalived/include/ipwrapper.h @@ -55,7 +55,7 @@ #define LVS_CMD_DEL_TUNNEL IP_VS_SO_SET_DELTUNNEL /* prototypes */ -extern void perform_svr_state(int, virtual_server_t *, real_server_t *); +extern bool perform_svr_state(int, virtual_server_t *, real_server_t *); extern void update_svr_wgt(int, virtual_server_t *, real_server_t *); extern int svr_checker_up(checker_id_t, real_server_t *); extern void update_svr_checker_state(int, checker_id_t, virtual_server_t *, real_server_t *); diff --git a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c index fce9a716e..4f0a5cf2c 100644 --- a/tools/keepalived/keepalived/libipvs-2.6/libipvs.c +++ b/tools/keepalived/keepalived/libipvs-2.6/libipvs.c @@ -321,10 +321,11 @@ static void ipvs_fill_laddr_conf(ipvs_service_t *svc, ipvs_laddr_t *laddr, struct dp_vs_laddr_conf *conf) { memset(conf, 0, sizeof(*conf)); - conf->af = laddr->af; + conf->af_s = svc->af; conf->proto = svc->protocol; conf->vport = svc->port; conf->fwmark = svc->fwmark; + conf->af_l = laddr->af; if (strlen(laddr->ifname)) snprintf(conf->ifname, sizeof(conf->ifname), "%s", laddr->ifname); @@ -615,7 +616,7 @@ struct ip_vs_get_laddrs *ipvs_get_laddrs(ipvs_service_entry_t *svc) size_t res_size, i; memset(&conf, 0, sizeof(struct dp_vs_laddr_conf)); - conf.af = svc->af; + conf.af_s = svc->af; conf.proto = svc->protocol; if (svc->af == AF_INET) conf.vaddr.in = svc->addr.in; @@ -644,8 +645,8 @@ struct ip_vs_get_laddrs *ipvs_get_laddrs(ipvs_service_entry_t *svc) laddrs->port = result->vport; laddrs->fwmark = result->fwmark; laddrs->num_laddrs = result->nladdrs; - laddrs->af = result->af; - if (result->af == AF_INET) + laddrs->af = result->af_s; + if (result->af_s == AF_INET) laddrs->addr.in = result->vaddr.in; else laddrs->addr.in6 = result->vaddr.in6; @@ -654,8 +655,8 @@ struct ip_vs_get_laddrs *ipvs_get_laddrs(ipvs_service_entry_t *svc) laddrs->entrytable[i].__addr_v4 = result->laddrs[i].addr.in.s_addr; laddrs->entrytable[i].port_conflict = result->laddrs[i].nport_conflict; laddrs->entrytable[i].conn_counts = result->laddrs[i].nconns; - laddrs->entrytable[i].af = result->af; - if (result->af == AF_INET) + laddrs->entrytable[i].af = result->laddrs[i].af; + if (result->laddrs[i].af == AF_INET) laddrs->entrytable[i].addr.in = result->laddrs[i].addr.in; else laddrs->entrytable[i].addr.in6 = result->laddrs[i].addr.in6; diff --git a/tools/keepalived/keepalived/vrrp/vrrp_daemon.c 
b/tools/keepalived/keepalived/vrrp/vrrp_daemon.c index f615de13b..b10645dc7 100644 --- a/tools/keepalived/keepalived/vrrp/vrrp_daemon.c +++ b/tools/keepalived/keepalived/vrrp/vrrp_daemon.c @@ -83,7 +83,6 @@ stop_vrrp(void) free_vrrp_data(vrrp_data); free_vrrp_buffer(); free_interface_queue(); - kernel_netlink_close(); thread_destroy_master(master); gratuitous_arp_close(); ndisc_close(); @@ -106,7 +105,6 @@ start_vrrp(void) { /* Initialize sub-system */ init_interface_queue(); - kernel_netlink_init(); gratuitous_arp_init(); ndisc_init(); #ifdef _WITH_SNMP_ @@ -211,7 +209,6 @@ reload_vrrp_thread(thread_t * thread) /* Destroy master thread */ vrrp_dispatcher_release(vrrp_data); - kernel_netlink_close(); thread_destroy_master(master); master = thread_make_master(); free_global_data(global_data); diff --git a/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c b/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c index f7f554b23..3aa976dc1 100644 --- a/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c +++ b/tools/keepalived/keepalived/vrrp/vrrp_ipaddress.c @@ -43,7 +43,6 @@ static void dpvs_fill_addrconf(ip_address_t *ipaddress, char *dpdk_port, struct param->addr.in = ipaddress->u.sin.sin_addr; else param->addr.in6 = ipaddress->u.sin6_addr; - strcpy(param->ifname, dpdk_port); param->plen = ipaddress->ifa.ifa_prefixlen; param->flags &= ~IFA_F_SAPOOL; } @@ -51,13 +50,21 @@ static int netlink_ipaddress(ip_address_t *ipaddress, char *dpdk_port, int cmd) { - char *tmp_ip; struct inet_addr_param param; - tmp_ip = ipaddresstos(ipaddress); + int err; memset(&param, 0, sizeof(param)); dpvs_fill_addrconf(ipaddress, dpdk_port, &param); - ipvs_set_ipaddr(&param, cmd); - FREE(tmp_ip); + err = ipvs_set_ipaddr(&param, cmd); + + if (err) { + char addr_str[64]; + void *addr = (IP_IS6(ipaddress)) ? (void *) &ipaddress->u.sin6_addr : + (void *) &ipaddress->u.sin.sin_addr; + inet_ntop(IP_FAMILY(ipaddress), addr, addr_str, 41); + log_message(LOG_INFO, "ip address %s cmd %s failed\n", addr_str, \ + cmd == IPADDRESS_DEL ? "del" : "add"); + return -1; + } return 1; } diff --git a/tools/keepalived/lib/parser.c b/tools/keepalived/lib/parser.c index 99511dbfb..2c4ab9b7a 100644 --- a/tools/keepalived/lib/parser.c +++ b/tools/keepalived/lib/parser.c @@ -223,6 +223,7 @@ void read_conf_file(char *conf_file) log_message(LOG_INFO, "chdir(%s) error (%s)" , confpath, strerror(errno)); } + free(confpath); process_stream(current_keywords); fclose(stream);