| author | Srikant Patnaik | 2015-01-13 15:08:24 +0530 |
|---|---|---|
| committer | Srikant Patnaik | 2015-01-13 15:08:24 +0530 |
| commit | 97327692361306d1e6259021bc425e32832fdb50 (patch) | |
| tree | fe9088f3248ec61e24f404f21b9793cb644b7f01 /net/netfilter | |
| parent | 2d05a8f663478a44e088d122e0d62109bbc801d0 (diff) | |
| parent | a3a8b90b61e21be3dde9101c4e86c881e0f06210 (diff) | |
dirty fix to merging
Diffstat (limited to 'net/netfilter')
160 files changed, 72137 insertions, 0 deletions
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
new file mode 100644
index 00000000..ce2976c0
--- /dev/null
+++ b/net/netfilter/Kconfig
@@ -0,0 +1,1195 @@
+menu "Core Netfilter Configuration"
+	depends on NET && INET && NETFILTER
+
+config NETFILTER_NETLINK
+	tristate
+
+config NETFILTER_NETLINK_ACCT
+	tristate "Netfilter NFACCT over NFNETLINK interface"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for extended accounting via NFNETLINK.
+
+config NETFILTER_NETLINK_QUEUE
+	tristate "Netfilter NFQUEUE over NFNETLINK interface"
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for queueing packets via NFNETLINK.
+
+config NETFILTER_NETLINK_LOG
+	tristate "Netfilter LOG over NFNETLINK interface"
+	default m if NETFILTER_ADVANCED=n
+	select NETFILTER_NETLINK
+	help
+	  If this option is enabled, the kernel will include support
+	  for logging packets via NFNETLINK.
+
+	  This obsoletes the existing ipt_ULOG and ebt_ulog mechanisms,
+	  and is also scheduled to replace the old syslog-based ipt_LOG
+	  and ip6t_LOG modules.
+
+config NF_CONNTRACK
+	tristate "Netfilter connection tracking support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  This is required to do Masquerading or other kinds of Network
+	  Address Translation. It can also be used to enhance packet
+	  filtering (see `Connection state match support' below).
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+if NF_CONNTRACK
+
+config NF_CONNTRACK_MARK
+	bool 'Connection mark tracking support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option enables support for connection marks, used by the
+	  `CONNMARK' target and `connmark' match. Similar to the mark value
+	  of packets, but this mark value is kept in the conntrack session
+	  instead of the individual packets.
+
+config NF_CONNTRACK_SECMARK
+	bool 'Connection tracking security mark support'
+	depends on NETWORK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  This option enables security markings to be applied to
+	  connections. Typically they are copied to connections from
+	  packets using the CONNSECMARK target and copied back from
+	  connections to packets with the same target, with the packets
+	  being originally labeled via SECMARK.
+
+	  If unsure, say 'N'.
+
+config NF_CONNTRACK_ZONES
+	bool 'Connection tracking zones'
+	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_XT_TARGET_CT
+	help
+	  This option enables support for connection tracking zones.
+	  Normally, each connection needs to have a unique system wide
+	  identity. Connection tracking zones allow to have multiple
+	  connections using the same identity, as long as they are
+	  contained in different zones.
+
+	  If unsure, say `N'.
+
+config NF_CONNTRACK_PROCFS
+	bool "Supply CT list in procfs (OBSOLETE)"
+	default y
+	depends on PROC_FS
+	---help---
+	  This option enables for the list of known conntrack entries
+	  to be shown in procfs under net/netfilter/nf_conntrack. This
+	  is considered obsolete in favor of using the conntrack(8)
+	  tool which uses Netlink.
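With NF_CONNTRACK_PROCFS enabled, the conntrack table is plain text and can be read without any netfilter library. A minimal userspace sketch — the path is the one named in the help text above, and the output format varies by kernel:

```c
/* Dump the table exposed by NF_CONNTRACK_PROCFS; one tracked
 * connection per line. Path is the one named in the help text. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/nf_conntrack", "r");
	char line[1024];

	if (!f) {
		perror("/proc/net/nf_conntrack");	/* option off, or not root */
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
```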
+ +config NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on NETFILTER_ADVANCED + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + If unsure, say `N'. + +config NF_CONNTRACK_TIMEOUT + bool 'Connection tracking timeout' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + extension. This allows you to attach timeout policies to flow + via the CT target. + + If unsure, say `N'. + +config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timestamping. + This allows you to store the flow start-time and to obtain + the flow-stop time (once it has been destroyed) via Connection + tracking events. + + If unsure, say `N'. + +config NF_CT_PROTO_DCCP + tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_DCCP + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on DCCP connections. + + If unsure, say 'N'. + +config NF_CT_PROTO_GRE + tristate + +config NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_SCTP + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NF_CT_PROTO_UDPLITE + tristate 'UDP-Lite protocol connection tracking support' + depends on NETFILTER_ADVANCED + help + With this option enabled, the layer 3 independent connection + tracking code will be able to do state tracking on UDP-Lite + connections. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_AMANDA + tristate "Amanda backup protocol support" + depends on NETFILTER_ADVANCED + select TEXTSEARCH + select TEXTSEARCH_KMP + help + If you are running the Amanda backup package <http://www.amanda.org/> + on this machine or machines that will be MASQUERADED through this + machine, then you may want to enable this feature. This allows the + connection tracking and natting code to allow the sub-channels that + Amanda requires for communication of the backup data, messages and + index. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_FTP + tristate "FTP protocol support" + default m if NETFILTER_ADVANCED=n + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + This is FTP support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_H323 + tristate "H.323 protocol support" + depends on (IPV6 || IPV6=n) + depends on NETFILTER_ADVANCED + help + H.323 is a VoIP signalling protocol from ITU-T. 
As one of the most
+	  important VoIP protocols, it is widely used by voice hardware and
+	  software including voice gateways, IP phones, Netmeeting, OpenPhone,
+	  Gnomemeeting, etc.
+
+	  With this module you can support H.323 on a connection tracking/NAT
+	  firewall.
+
+	  This module supports RAS, Fast Start, H.245 Tunnelling, Call
+	  Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat,
+	  whiteboard, file transfer, etc. For more information, please
+	  visit http://nath323.sourceforge.net/.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_IRC
+	tristate "IRC protocol support"
+	default m if NETFILTER_ADVANCED=n
+	help
+	  There is a commonly-used extension to IRC called
+	  Direct Client-to-Client Protocol (DCC). This enables users to send
+	  files to each other, and also chat to each other without the need
+	  of a server. DCC Sending is used anywhere you send files over IRC,
+	  and DCC Chat is most commonly used by Eggdrop bots. If you are
+	  using NAT, this extension will enable you to send files and initiate
+	  chats. Note that you do NOT need this extension to get files or
+	  have others initiate chats, or for anything else in IRC.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_BROADCAST
+	tristate
+
+config NF_CONNTRACK_NETBIOS_NS
+	tristate "NetBIOS name service protocol support"
+	select NF_CONNTRACK_BROADCAST
+	help
+	  NetBIOS name service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This makes them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating NetBIOS name service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address. When properly configured, the output
+	  of "ip address show" should look similar to this:
+
+	  $ ip -4 address show eth0
+	  4: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc pfifo_fast qlen 1000
+	      inet 172.16.2.252/24 brd 172.16.2.255 scope global eth0
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_SNMP
+	tristate "SNMP service protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
+	help
+	  SNMP service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This makes them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating SNMP service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NF_CONNTRACK_PPTP
+	tristate "PPtP protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CT_PROTO_GRE
+	help
+	  This module adds support for PPTP (Point to Point Tunnelling
+	  Protocol, RFC2637) connection tracking and NAT.
+
+	  If you are running PPTP sessions over a stateful firewall or NAT
+	  box, you may want to enable this feature.
+
+	  Please note that not all PPTP modes of operation are supported yet.
+	  Specifically these limitations exist:
+	    - Blindly assumes that control connections are always established
+	      in PNS->PAC direction. This is a violation of RFC2637.
+	    - Only supports a single call within each session
+
+	  To compile it as a module, choose M here. If unsure, say N.
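All of the helpers above follow one registration pattern against nf_conntrack_helper.h. A heavily simplified sketch for this 3.x-era tree — the module name, port, and empty help callback are placeholders, and a real helper would parse the payload and register expectations:

```c
/* Skeleton of a conntrack helper module in the style of the helpers
 * above (nf_conntrack_snmp.c etc.). Name, port and the empty help
 * callback are placeholders only. */
#include <linux/module.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>

static int example_help(struct sk_buff *skb, unsigned int protoff,
			struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
	/* Inspect the payload here; call nf_ct_expect_related() to let
	 * the expected secondary connection through the firewall. */
	return NF_ACCEPT;
}

static const struct nf_conntrack_expect_policy example_policy = {
	.max_expected	= 1,
	.timeout	= 5 * 60,	/* seconds */
};

static struct nf_conntrack_helper example_helper __read_mostly = {
	.name			= "example",
	.me			= THIS_MODULE,
	.help			= example_help,
	.expect_policy		= &example_policy,
	.tuple.src.l3num	= NFPROTO_IPV4,
	.tuple.dst.protonum	= IPPROTO_TCP,
	.tuple.src.u.tcp.port	= cpu_to_be16(12345),	/* made-up port */
};

static int __init example_init(void)
{
	return nf_conntrack_helper_register(&example_helper);
}

static void __exit example_exit(void)
{
	nf_conntrack_helper_unregister(&example_helper);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
```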
+ +config NF_CONNTRACK_SANE + tristate "SANE protocol support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + help + SANE is a protocol for remote access to scanners as implemented + by the 'saned' daemon. Like FTP, it uses separate control and + data connections. + + With this module you can support SANE on a connection tracking + firewall. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_SIP + tristate "SIP protocol support" + default m if NETFILTER_ADVANCED=n + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the nf_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CONNTRACK_TFTP + tristate "TFTP protocol support" + depends on NETFILTER_ADVANCED + help + TFTP connection tracking helper, this is required depending + on how restrictive your ruleset is. + If you are using a tftp client behind -j SNAT or -j MASQUERADING + you will need this. + + To compile it as a module, choose M here. If unsure, say N. + +config NF_CT_NETLINK + tristate 'Connection tracking netlink interface' + select NETFILTER_NETLINK + default m if NETFILTER_ADVANCED=n + help + This option enables support for a netlink-based userspace interface + +config NF_CT_NETLINK_TIMEOUT + tristate 'Connection tracking timeout tuning via Netlink' + select NETFILTER_NETLINK + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timeout + fine-grain tuning. This allows you to attach specific timeout + policies to flows, instead of using the global timeout policy. + + If unsure, say `N'. + +endif # NF_CONNTRACK + +# transparent proxy support +config NETFILTER_TPROXY + tristate "Transparent proxying support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option enables transparent proxying support, that is, + support for handling non-locally bound IPv4 TCP and UDP sockets. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XTABLES + tristate "Netfilter Xtables support (required for ip_tables)" + default m if NETFILTER_ADVANCED=n + help + This is required if you intend to use any of ip_tables, + ip6_tables or arp_tables. + +if NETFILTER_XTABLES + +comment "Xtables combined modules" + +config NETFILTER_XT_MARK + tristate 'nfmark target and match support' + default m if NETFILTER_ADVANCED=n + ---help--- + This option adds the "MARK" target and "mark" match. + + Netfilter mark matching allows you to match packets based on the + "nfmark" value in the packet. + The target allows you to create rules in the "mangle" table which alter + the netfilter mark (nfmark) field associated with the packet. + + Prior to routing, the nfmark can influence the routing method (see + "Use netfilter MARK value as routing key") and can also be used by + other subsystems to change their behavior. + +config NETFILTER_XT_CONNMARK + tristate 'ctmark target and match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_MARK + ---help--- + This option adds the "CONNMARK" target and "connmark" match. 
+
+	  Netfilter allows you to store a mark value per connection (a.k.a.
+	  ctmark), similarly to the packet mark (nfmark). Using this
+	  target and match, you can set and match on this mark.
+
+config NETFILTER_XT_SET
+	tristate 'set target and match support'
+	depends on IP_SET
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds the "SET" target and "set" match.
+
+	  Using this target and match, you can add/delete and match
+	  elements in the sets created by ipset(8).
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+# alphabetically ordered list of targets
+
+comment "Xtables targets"
+
+config NETFILTER_XT_TARGET_AUDIT
+	tristate "AUDIT target support"
+	depends on AUDIT
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds an 'AUDIT' target, which can be used to create
+	  audit records for packets dropped/accepted.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CHECKSUM
+	tristate "CHECKSUM target support"
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	---help---
+	  This option adds a `CHECKSUM' target, which can be used in the
+	  iptables mangle table.
+
+	  You can use this target to compute and fill in the checksum in
+	  a packet that lacks a checksum. This is particularly useful
+	  if you need to work around old applications such as dhcp clients
+	  that do not work well with checksum offloads, but don't want to
+	  disable checksum offload in your device.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CLASSIFY
+	tristate '"CLASSIFY" target support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `CLASSIFY' target, which enables the user to set
+	  the priority of a packet. Some qdiscs can use this value for
+	  classification, among these are:
+
+	  atm, cbq, dsmark, pfifo_fast, htb, prio
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CONNMARK
+	tristate '"CONNMARK" target support'
+	depends on NF_CONNTRACK
+	depends on NETFILTER_ADVANCED
+	select NETFILTER_XT_CONNMARK
+	---help---
+	  This is a backwards-compat option for the user's convenience
+	  (e.g. when running oldconfig). It selects
+	  CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module).
+
+config NETFILTER_XT_TARGET_CONNSECMARK
+	tristate '"CONNSECMARK" target support'
+	depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK
+	default m if NETFILTER_ADVANCED=n
+	help
+	  The CONNSECMARK target copies security markings from packets
+	  to connections, and restores security markings from connections
+	  to packets (if the packets are not already marked). This would
+	  normally be used in conjunction with the SECMARK target.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_CT
+	tristate '"CT" target support'
+	depends on NF_CONNTRACK
+	depends on IP_NF_RAW || IP6_NF_RAW
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `CT' target, which allows to specify initial
+	  connection tracking parameters like events to be delivered and
+	  the helper to be used.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config NETFILTER_XT_TARGET_DSCP
+	tristate '"DSCP" and "TOS" target support'
+	depends on IP_NF_MANGLE || IP6_NF_MANGLE
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `DSCP' target, which allows you to manipulate
+	  the IPv4/IPv6 header DSCP field (differentiated services codepoint).
+
+	  The DSCP field can have any value between 0x0 and 0x3f inclusive.
+ + It also adds the "TOS" target, which allows you to create rules in + the "mangle" table which alter the Type Of Service field of an IPv4 + or the Priority field of an IPv6 packet, prior to routing. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_HL + tristate '"HL" hoplimit target support' + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HL" (for IPv6) and "TTL" (for IPv4) + targets, which enable the user to change the + hoplimit/time-to-live value of the IP header. + + While it is safe to decrement the hoplimit/TTL value, the + modules also allow to increment and set the hoplimit value of + the header to arbitrary values. This is EXTREMELY DANGEROUS + since you can easily create immortal packets that loop + forever on the network. + +config NETFILTER_XT_TARGET_IDLETIMER + tristate "IDLETIMER target support" + depends on NETFILTER_ADVANCED + help + + This option adds the `IDLETIMER' target. Each matching packet + resets the timer associated with label specified when the rule is + added. When the timer expires, it triggers a sysfs notification. + The remaining time for expiration can be read via sysfs. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_LED + tristate '"LED" target support' + depends on LEDS_CLASS && LEDS_TRIGGERS + depends on NETFILTER_ADVANCED + help + This option adds a `LED' target, which allows you to blink LEDs in + response to particular packets passing through your machine. + + This can be used to turn a spare LED into a network activity LED, + which only flashes in response to FTP transfers, for example. Or + you could have an LED which lights up for a minute or two every time + somebody connects to your machine via SSH. + + You will need support for the "led" class to make this work. + + To create an LED trigger for incoming SSH traffic: + iptables -A INPUT -p tcp --dport 22 -j LED --led-trigger-id ssh --led-delay 1000 + + Then attach the new trigger to an LED on your system: + echo netfilter-ssh > /sys/class/leds/<ledname>/trigger + + For more information on the LEDs available on your system, see + Documentation/leds/leds-class.txt + +config NETFILTER_XT_TARGET_LOG + tristate "LOG target support" + default m if NETFILTER_ADVANCED=n + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_MARK + tristate '"MARK" target support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). + +config NETFILTER_XT_TARGET_NFLOG + tristate '"NFLOG" target support' + default m if NETFILTER_ADVANCED=n + select NETFILTER_NETLINK_LOG + help + This option enables the NFLOG target, which allows to LOG + messages through nfnetlink_log. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_NFQUEUE + tristate '"NFQUEUE" target Support' + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_QUEUE + help + This target replaced the old obsolete QUEUE target. + + As opposed to QUEUE, it supports 65535 different queues, + not just one. + + To compile it as a module, choose M here. If unsure, say N. 
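The NFQUEUE target above hands packets to userspace through nfnetlink_queue. A minimal consumer sketch using the userspace libnetfilter_queue library (not part of this tree; link with -lnetfilter_queue), paired with a rule such as `iptables -A INPUT -j NFQUEUE --queue-num 0`:

```c
/* Minimal NFQUEUE consumer: accept every packet queued to queue 0.
 * Assumes libnetfilter_queue is installed on the system. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/netfilter.h>		/* NF_ACCEPT */
#include <libnetfilter_queue/libnetfilter_queue.h>

static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
	      struct nfq_data *nfa, void *data)
{
	struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
	uint32_t id = ph ? ntohl(ph->packet_id) : 0;

	/* A real program would inspect the payload before deciding. */
	return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
}

int main(void)
{
	struct nfq_handle *h = nfq_open();
	struct nfq_q_handle *qh;
	char buf[4096];
	int fd, rv;

	if (!h)
		exit(1);
	nfq_unbind_pf(h, AF_INET);
	nfq_bind_pf(h, AF_INET);
	qh = nfq_create_queue(h, 0, &cb, NULL);	/* queue number 0 */
	if (!qh)
		exit(1);
	nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
	fd = nfq_fd(h);
	while ((rv = recv(fd, buf, sizeof(buf), 0)) >= 0)
		nfq_handle_packet(h, buf, rv);	/* dispatches to cb() */
	nfq_destroy_queue(qh);
	nfq_close(h);
	return 0;
}
```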
+ +config NETFILTER_XT_TARGET_NOTRACK + tristate '"NOTRACK" target support' + depends on IP_NF_RAW || IP6_NF_RAW + depends on NF_CONNTRACK + help + The NOTRACK target allows a select rule to specify + which packets *not* to enter the conntrack/NAT + subsystem with all the consequences (no ICMP error tracking, + no protocol helpers for the selected packets). + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_TARGET_RATEEST + tristate '"RATEEST" target support' + depends on NETFILTER_ADVANCED + help + This option adds a `RATEEST' target, which allows to measure + rates similar to TC estimators. The `rateest' match can be + used to match on the measured rates. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TEE + tristate '"TEE" - packet cloning to alternate destination' + depends on NETFILTER_ADVANCED + depends on (IPV6 || IPV6=n) + depends on !NF_CONNTRACK || NF_CONNTRACK + ---help--- + This option adds a "TEE" target with which a packet can be cloned and + this clone be rerouted to another nexthop. + +config NETFILTER_XT_TARGET_TPROXY + tristate '"TPROXY" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `TPROXY' target, which is somewhat similar to + REDIRECT. It can only be used in the mangle table and is useful + to redirect traffic to a transparent proxy. It does _not_ depend + on Netfilter connection tracking and NAT, unlike REDIRECT. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TRACE + tristate '"TRACE" target support' + depends on IP_NF_RAW || IP6_NF_RAW + depends on NETFILTER_ADVANCED + help + The TRACE target allows you to mark packets so that the kernel + will log every rule which match the packets as those traverse + the tables, chains, rules. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_TARGET_SECMARK + tristate '"SECMARK" target support' + depends on NETWORK_SECMARK + default m if NETFILTER_ADVANCED=n + help + The SECMARK target allows security marking of network + packets, for use with security subsystems. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_TARGET_TCPMSS + tristate '"TCPMSS" target support' + depends on (IPV6 || IPV6=n) + default m if NETFILTER_ADVANCED=n + ---help--- + This option adds a `TCPMSS' target, which allows you to alter the + MSS value of TCP SYN packets, to control the maximum size for that + connection (usually limiting it to your outgoing interface's MTU + minus 40). + + This is used to overcome criminally braindead ISPs or servers which + block ICMP Fragmentation Needed packets. The symptoms of this + problem are that everything works fine from your Linux + firewall/router, but machines behind it can never exchange large + packets: + 1) Web browsers connect, then hang with no data received. + 2) Small mail works fine, but large emails hang. + 3) ssh works fine, but scp hangs after initial handshaking. + + Workaround: activate this option and add a rule to your firewall + configuration like: + + iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \ + -j TCPMSS --clamp-mss-to-pmtu + + To compile it as a module, choose M here. 
If unsure, say N. + +config NETFILTER_XT_TARGET_TCPOPTSTRIP + tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option adds a "TCPOPTSTRIP" target, which allows you to strip + TCP options from TCP packets. + +# alphabetically ordered list of matches + +comment "Xtables matches" + +config NETFILTER_XT_MATCH_ADDRTYPE + tristate '"addrtype" address type match support' + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_CLUSTER + tristate '"cluster" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to build work-load-sharing clusters of + network servers/stateful firewalls without having a dedicated + load-balancing router/server/switch. Basically, this match returns + true when the packet must be handled by this cluster node. Thus, + all nodes see all packets and this match decides which node handles + what packets. The work-load sharing algorithm is based on source + address hashing. + + If you say Y or M here, try `iptables -m cluster --help` for + more information. + +config NETFILTER_XT_MATCH_COMMENT + tristate '"comment" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNBYTES + tristate '"connbytes" per-connection counter match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_CONNLIMIT + tristate '"connlimit" match support"' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + ---help--- + This match allows you to match against the number of parallel + connections to a server per client IP address (or address block). + +config NETFILTER_XT_MATCH_CONNMARK + tristate '"connmark" connection mark match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + select NETFILTER_XT_CONNMARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_CONNMARK (combined connmark/CONNMARK module). + +config NETFILTER_XT_MATCH_CONNTRACK + tristate '"conntrack" connection tracking match support' + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_CPU + tristate '"cpu" match support' + depends on NETFILTER_ADVANCED + help + CPU matching allows you to match packets based on the CPU + currently handling the packet. 
+ + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_DCCP + tristate '"dccp" protocol match support' + depends on NETFILTER_ADVANCED + default IP_DCCP + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_DEVGROUP + tristate '"devgroup" match support' + depends on NETFILTER_ADVANCED + help + This options adds a `devgroup' match, which allows to match on the + device group a network device is assigned to. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_DSCP + tristate '"dscp" and "tos" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `DSCP' match, which allows you to match against + the IPv4/IPv6 header DSCP field (differentiated services codepoint). + + The DSCP field can have any value between 0x0 and 0x3f inclusive. + + It will also add a "tos" match, which allows you to match packets + based on the Type Of Service fields of the IPv4 packet (which share + the same bits as DSCP). + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_ECN + tristate '"ecn" match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds an "ECN" match, which allows you to match against + the IPv4 and TCP header ECN fields. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_ESP + tristate '"esp" match support' + depends on NETFILTER_ADVANCED + help + This match extension allows you to match a range of SPIs + inside ESP header of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_HASHLIMIT + tristate '"hashlimit" match support' + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + help + This option adds a `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination address' or `500pps from any given source address' + with a single rule. + +config NETFILTER_XT_MATCH_HELPER + tristate '"helper" match support' + depends on NF_CONNTRACK + depends on NETFILTER_ADVANCED + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config NETFILTER_XT_MATCH_HL + tristate '"hl" hoplimit/TTL match support' + depends on NETFILTER_ADVANCED + ---help--- + HL matching allows you to match packets based on the hoplimit + in the IPv6 header, or the time-to-live field in the IPv4 + header of the packet. + +config NETFILTER_XT_MATCH_IPRANGE + tristate '"iprange" address range match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "iprange" match, which allows you to match based on + an IP address range. (Normal iptables only matches on single addresses + with an optional mask.) + + If unsure, say M. + +config NETFILTER_XT_MATCH_IPVS + tristate '"ipvs" match support' + depends on IP_VS + depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK + help + This option allows you to match against IPVS properties of a packet. 
+ + If unsure, say N. + +config NETFILTER_XT_MATCH_LENGTH + tristate '"length" match support' + depends on NETFILTER_ADVANCED + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_LIMIT + tristate '"limit" match support' + depends on NETFILTER_ADVANCED + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_MAC + tristate '"mac" address match support' + depends on NETFILTER_ADVANCED + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_MARK + tristate '"mark" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MARK + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MARK (combined mark/MARK module). + +config NETFILTER_XT_MATCH_MULTIPORT + tristate '"multiport" Multiple port match support' + depends on NETFILTER_ADVANCED + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_NFACCT + tristate '"nfacct" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_ACCT + help + This option allows you to use the extended accounting through + nfnetlink_acct. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_OSF + tristate '"osf" Passive OS fingerprint match' + depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + help + This option selects the Passive OS Fingerprinting match module + that allows to passively match the remote operating system by + analyzing incoming TCP SYN packets. + + Rules and loading software can be downloaded from + http://www.ioremap.net/projects/osf + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_OWNER + tristate '"owner" match support' + depends on NETFILTER_ADVANCED + ---help--- + Socket owner matching allows you to match locally-generated packets + based on who created the socket: the user or group. It is also + possible to check whether a socket actually exists. + + Conflicts with '"quota, tag, uid" match' + +config NETFILTER_XT_MATCH_POLICY + tristate 'IPsec "policy" match support' + depends on XFRM + default m if NETFILTER_ADVANCED=n + help + Policy matching allows you to match packets based on the + IPsec policy that was used during decapsulation/will + be used during encapsulation. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_PHYSDEV + tristate '"physdev" match support' + depends on BRIDGE && BRIDGE_NETFILTER + depends on NETFILTER_ADVANCED + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. 
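As a complement to the mark/MARK options above: locally generated traffic can also be stamped with an nfmark from userspace via SO_MARK (Linux 2.6.25+, CAP_NET_ADMIN required), which the "mark" match and fwmark policy routing then see. A short sketch with an arbitrary mark value:

```c
/* Stamp an nfmark on locally generated traffic via SO_MARK so the
 * "mark" match (or "ip rule fwmark") can classify it. Requires
 * CAP_NET_ADMIN; the value 0x2a is arbitrary. */
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_MARK
#define SO_MARK 36	/* from asm-generic/socket.h */
#endif

int main(void)
{
	unsigned int mark = 0x2a;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_MARK,
				 &mark, sizeof(mark)) < 0) {
		perror("SO_MARK");
		return 1;
	}
	/* Traffic sent on fd now matches:
	 *   iptables -A OUTPUT -m mark --mark 0x2a ... */
	return 0;
}
```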
+ +config NETFILTER_XT_MATCH_PKTTYPE + tristate '"pkttype" packet type match support' + depends on NETFILTER_ADVANCED + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_QTAGUID + bool '"quota, tag, owner" match and stats support' + depends on NETFILTER_XT_MATCH_SOCKET + depends on NETFILTER_XT_MATCH_OWNER=n + help + This option replaces the `owner' match. In addition to matching + on uid, it keeps stats based on a tag assigned to a socket. + The full tag is comprised of a UID and an accounting tag. + The tags are assignable to sockets from user space (e.g. a download + manager can assign the socket to another UID for accounting). + Stats and control are done via /proc/net/xt_qtaguid/. + It replaces owner as it takes the same arguments, but should + really be recognized by the iptables tool. + + If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA + tristate '"quota" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `quota' match, which allows to match on a + byte counter. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA2 + tristate '"quota2" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `quota2' match, which allows to match on a + byte counter correctly and not per CPU. + It allows naming the quotas. + This is based on http://xtables-addons.git.sourceforge.net + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_QUOTA2_LOG + bool '"quota2" Netfilter LOG support' + depends on NETFILTER_XT_MATCH_QUOTA2 + depends on IP_NF_TARGET_ULOG=n # not yes, not module, just no + default n + help + This option allows `quota2' to log ONCE when a quota limit + is passed. It logs via NETLINK using the NETLINK_NFLOG family. + It logs similarly to how ipt_ULOG would without data. + + If unsure, say `N'. + +config NETFILTER_XT_MATCH_RATEEST + tristate '"rateest" match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_TARGET_RATEEST + help + This option adds a `rateest' match, which allows to match on the + rate estimated by the RATEEST target. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_REALM + tristate '"realm" match support' + depends on NETFILTER_ADVANCED + select IP_ROUTE_CLASSID + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_RECENT + tristate '"recent" match support' + depends on NETFILTER_ADVANCED + ---help--- + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). 
+ + Short options are available by using 'iptables -m recent -h' + Official Website: <http://snowman.net/projects/ipt_recent/> + +config NETFILTER_XT_MATCH_SCTP + tristate '"sctp" protocol match support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_ADVANCED + default IP_SCTP + help + With this option enabled, you will be able to use the + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +config NETFILTER_XT_MATCH_SOCKET + tristate '"socket" match support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DEFRAG_IPV4 + select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES + help + This option adds a `socket' match, which can be used to match + packets for which a TCP or UDP socket lookup finds a valid socket. + It can be used in combination with the MARK target and policy + routing to implement full featured non-locally bound sockets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STATE + tristate '"state" match support' + depends on NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `statistic' match, which allows you to match + on packets periodically or randomly with a given percentage. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_STRING + tristate '"string" match support' + depends on NETFILTER_ADVANCED + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_BM + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_TCPMSS + tristate '"tcpmss" match support' + depends on NETFILTER_ADVANCED + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. + +config NETFILTER_XT_MATCH_TIME + tristate '"time" match support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds a "time" match, which allows you to match based on + the packet arrival time (at the machine which netfilter is running) + on) or departure time/date (for locally generated packets). + + If you say Y here, try `iptables -m time --help` for + more information. + + If you want to compile it as a module, say M here. + If unsure, say N. + +config NETFILTER_XT_MATCH_U32 + tristate '"u32" match support' + depends on NETFILTER_ADVANCED + ---help--- + u32 allows you to extract quantities of up to 4 bytes from a packet, + AND them with specified masks, shift them by specified amounts and + test whether the results are in any of a set of specified ranges. 
+ The specification of what to extract is general enough to skip over + headers with lengths stored in the packet, as in IP or TCP header + lengths. + + Details and examples are in the kernel module source. + +endif # NETFILTER_XTABLES + +endmenu + +source "net/netfilter/ipset/Kconfig" + +source "net/netfilter/ipvs/Kconfig" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile new file mode 100644 index 00000000..452e84de --- /dev/null +++ b/net/netfilter/Makefile @@ -0,0 +1,123 @@ +netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o + +nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o + +obj-$(CONFIG_NETFILTER) = netfilter.o + +obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o +obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o + +# connection tracking +obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o + +# SCTP protocol connection tracking +obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o +obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o +obj-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o +obj-$(CONFIG_NF_CT_PROTO_UDPLITE) += nf_conntrack_proto_udplite.o + +# netlink interface for nf_conntrack +obj-$(CONFIG_NF_CT_NETLINK) += nf_conntrack_netlink.o +obj-$(CONFIG_NF_CT_NETLINK_TIMEOUT) += nfnetlink_cttimeout.o + +# connection tracking helpers +nf_conntrack_h323-objs := nf_conntrack_h323_main.o nf_conntrack_h323_asn1.o + +obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o +obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o +obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o +obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o +obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o +obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o +obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o +obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o +obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o + +# transparent proxy support +obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o + +# generic X tables +obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o + +# combos +obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o +obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o +obj-$(CONFIG_NETFILTER_XT_SET) += xt_set.o + +# targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o +obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o +obj-$(CONFIG_NETFILTER_XT_TARGET_LOG) += xt_LOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o +obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o 
+obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o +obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o +obj-$(CONFIG_NETFILTER_XT_TARGET_IDLETIMER) += xt_IDLETIMER.o + +# matches +obj-$(CONFIG_NETFILTER_XT_MATCH_ADDRTYPE) += xt_addrtype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CLUSTER) += xt_cluster.o +obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DEVGROUP) += xt_devgroup.o +obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_ECN) += xt_ecn.o +obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o +obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o +obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o +obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o +obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o +obj-$(CONFIG_NETFILTER_XT_MATCH_NFACCT) += xt_nfacct.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OSF) += xt_osf.o +obj-$(CONFIG_NETFILTER_XT_MATCH_OWNER) += xt_owner.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o +obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o +obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o +obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o +obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# ipset +obj-$(CONFIG_IP_SET) += ipset/ + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c new file mode 100644 index 00000000..e1b7e051 --- /dev/null +++ b/net/netfilter/core.c @@ -0,0 +1,301 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. 
+ */ +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/proc_fs.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/sock.h> + +#include "nf_internals.h" + +static DEFINE_MUTEX(afinfo_mutex); + +const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; +EXPORT_SYMBOL(nf_afinfo); + +int nf_register_afinfo(const struct nf_afinfo *afinfo) +{ + int err; + + err = mutex_lock_interruptible(&afinfo_mutex); + if (err < 0) + return err; + RCU_INIT_POINTER(nf_afinfo[afinfo->family], afinfo); + mutex_unlock(&afinfo_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(nf_register_afinfo); + +void nf_unregister_afinfo(const struct nf_afinfo *afinfo) +{ + mutex_lock(&afinfo_mutex); + RCU_INIT_POINTER(nf_afinfo[afinfo->family], NULL); + mutex_unlock(&afinfo_mutex); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_unregister_afinfo); + +struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; +EXPORT_SYMBOL(nf_hooks); + +#if defined(CONFIG_JUMP_LABEL) +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +EXPORT_SYMBOL(nf_hooks_needed); +#endif + +static DEFINE_MUTEX(nf_hook_mutex); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct nf_hook_ops *elem; + int err; + + err = mutex_lock_interruptible(&nf_hook_mutex); + if (err < 0) + return err; + list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + if (reg->priority < elem->priority) + break; + } + list_add_rcu(®->list, elem->list.prev); + mutex_unlock(&nf_hook_mutex); +#if defined(CONFIG_JUMP_LABEL) + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif + return 0; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + mutex_lock(&nf_hook_mutex); + list_del_rcu(®->list); + mutex_unlock(&nf_hook_mutex); +#if defined(CONFIG_JUMP_LABEL) + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); +#endif + synchronize_net(); +} +EXPORT_SYMBOL(nf_unregister_hook); + +int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_hooks); + +void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(nf_unregister_hooks); + +unsigned int nf_iterate(struct list_head *head, + struct sk_buff *skb, + unsigned int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. 
--RR */ +repeat: + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((verdict & NF_VERDICT_MASK) + > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + goto repeat; + } + } + return NF_ACCEPT; +} + + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. */ +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { + kfree_skb(skb); + ret = NF_DROP_GETERR(verdict); + if (ret == 0) + ret = -EPERM; + } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + int err = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_hook_slow); + + +int skb_make_writable(struct sk_buff *skb, unsigned int writable_len) +{ + if (writable_len > skb->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (!skb_cloned(skb)) { + if (writable_len <= skb_headlen(skb)) + return 1; + } else if (skb_clone_writable(skb, writable_len)) + return 1; + + if (writable_len <= skb_headlen(skb)) + writable_len = 0; + else + writable_len -= skb_headlen(skb); + + return !!__pskb_pull_tail(skb, writable_len); +} +EXPORT_SYMBOL(skb_make_writable); + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. 
*/ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; +EXPORT_SYMBOL(ip_ct_attach); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct) { + rcu_read_lock(); + attach = rcu_dereference(ip_ct_attach); + if (attach) + attach(new, skb); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(nf_ct_attach); + +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; +EXPORT_SYMBOL(nf_ct_destroy); + +void nf_conntrack_destroy(struct nf_conntrack *nfct) +{ + void (*destroy)(struct nf_conntrack *); + + rcu_read_lock(); + destroy = rcu_dereference(nf_ct_destroy); + BUG_ON(destroy == NULL); + destroy(nfct); + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_conntrack_destroy); +#endif /* CONFIG_NF_CONNTRACK */ + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_net_netfilter; +EXPORT_SYMBOL(proc_net_netfilter); +#endif + +void __init netfilter_init(void) +{ + int i, h; + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } + +#ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net); + if (!proc_net_netfilter) + panic("cannot create netfilter proc entry"); +#endif + + if (netfilter_queue_init() < 0) + panic("cannot initialize nf_queue"); + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); +} + +#ifdef CONFIG_SYSCTL +struct ctl_path nf_net_netfilter_sysctl_path[] = { + { .procname = "net", }, + { .procname = "netfilter", }, + { } +}; +EXPORT_SYMBOL_GPL(nf_net_netfilter_sysctl_path); +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig new file mode 100644 index 00000000..ba36c283 --- /dev/null +++ b/net/netfilter/ipset/Kconfig @@ -0,0 +1,132 @@ +menuconfig IP_SET + tristate "IP set support" + depends on INET && NETFILTER + depends on NETFILTER_NETLINK + help + This option adds IP set support to the kernel. + In order to define and use the sets, you need the userspace utility + ipset(8). You can use the sets in netfilter via the "set" match + and "SET" target. + + To compile it as a module, choose M here. If unsure, say N. + +if IP_SET + +config IP_SET_MAX + int "Maximum number of IP sets" + default 256 + range 2 65534 + depends on IP_SET + help + You can define here default value of the maximum number + of IP sets for the kernel. + + The value can be overriden by the 'max_sets' module + parameter of the 'ip_set' module. + +config IP_SET_BITMAP_IP + tristate "bitmap:ip set support" + depends on IP_SET + help + This option adds the bitmap:ip set type support, by which one + can store IPv4 addresses (or network addresse) from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_IPMAC + tristate "bitmap:ip,mac set support" + depends on IP_SET + help + This option adds the bitmap:ip,mac set type support, by which one + can store IPv4 address and (source) MAC address pairs from a range. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_BITMAP_PORT + tristate "bitmap:port set support" + depends on IP_SET + help + This option adds the bitmap:port set type support, by which one + can store TCP/UDP port numbers from a range. + + To compile it as a module, choose M here. If unsure, say N. 
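The bitmap set types above store one bit per possible element, so a membership test is a single indexed bit operation. A userspace illustration of the index arithmetic — the kernel version is ip_to_id() in ip_set_bitmap_ip.c further down in this diff; the address range here is made up:

```c
/* Userspace illustration of the bitmap:ip index arithmetic; compare
 * with ip_to_id() in ip_set_bitmap_ip.c below. Range is made up. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* first_ip is in host byte order, as in struct bitmap_ip; hosts is 1
 * unless a netmask makes each bit stand for a whole subnet. */
static uint32_t ip_to_id(uint32_t ip, uint32_t first_ip, uint32_t hosts)
{
	return (ip - first_ip) / hosts;
}

int main(void)
{
	uint32_t first = ntohl(inet_addr("192.168.0.0"));
	uint32_t ip    = ntohl(inet_addr("192.168.0.17"));

	printf("bit index: %u\n", ip_to_id(ip, first, 1));	/* prints 17 */
	return 0;
}
```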
+ +config IP_SET_HASH_IP + tristate "hash:ip set support" + depends on IP_SET + help + This option adds the hash:ip set type support, by which one + can store arbitrary IPv4 or IPv6 addresses (or network addresses) + in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORT + tristate "hash:ip,port set support" + depends on IP_SET + help + This option adds the hash:ip,port set type support, by which one + can store IPv4/IPv6 address and protocol/port pairs. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORTIP + tristate "hash:ip,port,ip set support" + depends on IP_SET + help + This option adds the hash:ip,port,ip set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + address triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_IPPORTNET + tristate "hash:ip,port,net set support" + depends on IP_SET + help + This option adds the hash:ip,port,net set type support, by which + one can store IPv4/IPv6 address, protocol/port, and IPv4/IPv6 + network address/prefix triples in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NET + tristate "hash:net set support" + depends on IP_SET + help + This option adds the hash:net set type support, by which + one can store IPv4/IPv6 network address/prefix elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETPORT + tristate "hash:net,port set support" + depends on IP_SET + help + This option adds the hash:net,port set type support, by which + one can store IPv4/IPv6 network address/prefix and + protocol/port pairs as elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_HASH_NETIFACE + tristate "hash:net,iface set support" + depends on IP_SET + help + This option adds the hash:net,iface set type support, by which + one can store IPv4/IPv6 network address/prefix and + interface name pairs as elements in a set. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_SET_LIST_SET + tristate "list:set set support" + depends on IP_SET + help + This option adds the list:set set type support. In this + kind of set one can store the names of other sets, and it forms + an ordered union of the member sets. + + To compile it as a module, choose M here. If unsure, say N. 
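Unlike the bitmap and hash types, list:set stores the names of other sets, and a lookup consults those member sets in order, so the list behaves as an ordered union. A hedged toy sketch of that rule, with invented stand-ins for the kernel's set structures (not the actual ip_set API):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Toy stand-in for struct ip_set; purely illustrative. */
	struct toy_set {
		const char *name;
		bool (*contains)(unsigned int elem);
	};

	static bool even(unsigned int e) { return e % 2 == 0; }
	static bool small(unsigned int e) { return e < 10; }

	/* list:set semantics sketched: test the member sets in order,
	 * first match wins, i.e. an ordered union of the members. */
	static bool list_set_test(const struct toy_set *members, size_t n,
				  unsigned int elem)
	{
		size_t i;

		for (i = 0; i < n; i++)
			if (members[i].contains(elem))
				return true;
		return false;
	}

	int main(void)
	{
		const struct toy_set members[] = {
			{ "evens", even }, { "smalls", small },
		};

		printf("%d\n", list_set_test(members, 2, 7));  /* 1: in "smalls" */
		printf("%d\n", list_set_test(members, 2, 14)); /* 1: in "evens" */
		printf("%d\n", list_set_test(members, 2, 13)); /* 0: in neither */
		return 0;
	}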
+ +endif # IP_SET diff --git a/net/netfilter/ipset/Makefile b/net/netfilter/ipset/Makefile new file mode 100644 index 00000000..6e965ecd --- /dev/null +++ b/net/netfilter/ipset/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for the ipset modules +# + +ip_set-y := ip_set_core.o ip_set_getport.o pfxlen.o + +# ipset core +obj-$(CONFIG_IP_SET) += ip_set.o + +# bitmap types +obj-$(CONFIG_IP_SET_BITMAP_IP) += ip_set_bitmap_ip.o +obj-$(CONFIG_IP_SET_BITMAP_IPMAC) += ip_set_bitmap_ipmac.o +obj-$(CONFIG_IP_SET_BITMAP_PORT) += ip_set_bitmap_port.o + +# hash types +obj-$(CONFIG_IP_SET_HASH_IP) += ip_set_hash_ip.o +obj-$(CONFIG_IP_SET_HASH_IPPORT) += ip_set_hash_ipport.o +obj-$(CONFIG_IP_SET_HASH_IPPORTIP) += ip_set_hash_ipportip.o +obj-$(CONFIG_IP_SET_HASH_IPPORTNET) += ip_set_hash_ipportnet.o +obj-$(CONFIG_IP_SET_HASH_NET) += ip_set_hash_net.o +obj-$(CONFIG_IP_SET_HASH_NETPORT) += ip_set_hash_netport.o +obj-$(CONFIG_IP_SET_HASH_NETIFACE) += ip_set_hash_netiface.o + +# list types +obj-$(CONFIG_IP_SET_LIST_SET) += ip_set_list_set.o diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c new file mode 100644 index 00000000..a72a4dff --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -0,0 +1,587 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the bitmap:ip type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> +#define IP_SET_BITMAP_TIMEOUT +#include <linux/netfilter/ipset/ip_set_timeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("bitmap:ip type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:ip"); + +/* Type structure */ +struct bitmap_ip { + void *members; /* the set members */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 elements; /* number of max elements in the set */ + u32 hosts; /* number of hosts in a subnet */ + size_t memsize; /* members size */ + u8 netmask; /* subnet netmask */ + u32 timeout; /* timeout parameter */ + struct timer_list gc; /* garbage collection */ +}; + +/* Base variant */ + +static inline u32 +ip_to_id(const struct bitmap_ip *m, u32 ip) +{ + return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts; +} + +static int +bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_ip *map = set->data; + u16 id = *(u16 *)value; + + return !!test_bit(id, map->members); +} + +static int +bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ip *map = set->data; + u16 id = *(u16 *)value; + + if (test_and_set_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ip *map = 
set->data; + u16 id = *(u16 *)value; + + if (!test_and_clear_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_ip_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + if (!test_bit(id, map->members)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_ip *map = set->data; + const unsigned long *members = map->members; + u16 id = *(u16 *)value; + + return ip_set_timeout_test(members[id]); +} + +static int +bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ip *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + + if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + + members[id] = ip_set_timeout_set(timeout); + + return 0; +} + +static int +bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ip *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + int ret = -IPSET_ERR_EXIST; + + if (ip_set_timeout_test(members[id])) + ret = 0; + + members[id] = IPSET_ELEM_UNSET; + return ret; +} + +static int +bitmap_ip_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *adt, *nested; + u32 id, first = cb->args[2]; + const unsigned long *members = map->members; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] < map->elements; cb->args[2]++) { + id = cb->args[2]; + if (!ip_set_timeout_test(members[id])) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id * map->hosts)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(members[id]))); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 ip; + + ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (ip < map->first_ip || ip > map->last_ip) + 
return -IPSET_ERR_BITMAP_RANGE; + + ip = ip_to_id(map, ip); + + return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags); +} + +static int +bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct bitmap_ip *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 timeout = map->timeout; + u32 ip, ip_to, id; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + if (ip < map->first_ip || ip > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + id = ip_to_id(map, ip); + return adtfn(set, &id, timeout, flags); + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) { + swap(ip, ip_to); + if (ip < map->first_ip) + return -IPSET_ERR_BITMAP_RANGE; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } else + ip_to = ip; + + if (ip_to > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + for (; !before(ip_to, ip); ip += map->hosts) { + id = ip_to_id(map, ip); + ret = adtfn(set, &id, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static void +bitmap_ip_destroy(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_ip_flush(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +bitmap_ip_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_ip *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); + if (map->netmask != 32) + NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->memsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ip *x = a->data; + const struct bitmap_ip *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + x->netmask == y->netmask && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_ip = { + .kadt = bitmap_ip_kadt, + .uadt = bitmap_ip_uadt, + .adt = { + [IPSET_ADD] = bitmap_ip_add, + [IPSET_DEL] = bitmap_ip_del, + [IPSET_TEST] = bitmap_ip_test, + }, + .destroy = bitmap_ip_destroy, + .flush = bitmap_ip_flush, + .head = bitmap_ip_head, + .list = bitmap_ip_list, + .same_set = bitmap_ip_same_set, +}; + +static const 
struct ip_set_type_variant bitmap_tip = { + .kadt = bitmap_ip_kadt, + .uadt = bitmap_ip_uadt, + .adt = { + [IPSET_ADD] = bitmap_ip_tadd, + [IPSET_DEL] = bitmap_ip_tdel, + [IPSET_TEST] = bitmap_ip_ttest, + }, + .destroy = bitmap_ip_destroy, + .flush = bitmap_ip_flush, + .head = bitmap_ip_head, + .list = bitmap_ip_tlist, + .same_set = bitmap_ip_same_set, +}; + +static void +bitmap_ip_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_ip *map = set->data; + unsigned long *table = map->members; + u32 id; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id < map->elements; id++) + if (ip_set_timeout_expired(table[id])) + table[id] = IPSET_ELEM_UNSET; + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_ip_gc_init(struct ip_set *set) +{ + struct bitmap_ip *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_ip_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:ip type of sets */ + +static bool +init_map_ip(struct ip_set *set, struct bitmap_ip *map, + u32 first_ip, u32 last_ip, + u32 elements, u32 hosts, u8 netmask) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + map->first_ip = first_ip; + map->last_ip = last_ip; + map->elements = elements; + map->hosts = hosts; + map->netmask = netmask; + map->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_IPV4; + + return true; +} + +static int +bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct bitmap_ip *map; + u32 first_ip, last_ip, hosts, elements; + u8 netmask = 32; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(first_ip, last_ip, cidr); + } else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if (netmask > 32) + return -IPSET_ERR_INVALID_NETMASK; + + first_ip &= ip_set_hostmask(netmask); + last_ip |= ~ip_set_hostmask(netmask); + } + + if (netmask == 32) { + hosts = 1; + elements = last_ip - first_ip + 1; + } else { + u8 mask_bits; + u32 mask; + + mask = range_to_mask(first_ip, last_ip, &mask_bits); + + if ((!mask && (first_ip || last_ip != 0xFFFFFFFF)) || + netmask <= mask_bits) + return -IPSET_ERR_BITMAP_RANGE; + + pr_debug("mask_bits %u, netmask %u\n", mask_bits, netmask); + hosts = 2 << (32 - netmask - 1); + elements = 2 << (netmask - mask_bits - 1); + } + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + pr_debug("hosts %u, elements %u\n", hosts, elements); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->memsize = elements * sizeof(unsigned long); + + if (!init_map_ip(set, map, first_ip, last_ip, 
+ elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &bitmap_tip; + + bitmap_ip_gc_init(set); + } else { + map->memsize = bitmap_bytes(0, elements - 1); + + if (!init_map_ip(set, map, first_ip, last_ip, + elements, hosts, netmask)) { + kfree(map); + return -ENOMEM; + } + + set->variant = &bitmap_ip; + } + return 0; +} + +static struct ip_set_type bitmap_ip_type __read_mostly = { + .name = "bitmap:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_IPV4, + .revision_min = 0, + .revision_max = 0, + .create = bitmap_ip_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ip_init(void) +{ + return ip_set_type_register(&bitmap_ip_type); +} + +static void __exit +bitmap_ip_fini(void) +{ + ip_set_type_unregister(&bitmap_ip_type); +} + +module_init(bitmap_ip_init); +module_exit(bitmap_ip_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c new file mode 100644 index 00000000..81324c12 --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -0,0 +1,659 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:ip,mac type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/if_ether.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("bitmap:ip,mac type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:ip,mac"); + +enum { + MAC_EMPTY, /* element is not set */ + MAC_FILLED, /* element is set with MAC */ + MAC_UNSET, /* element is set, without MAC */ +}; + +/* Type structure */ +struct bitmap_ipmac { + void *members; /* the set members */ + u32 first_ip; /* host byte order, included in range */ + u32 last_ip; /* host byte order, included in range */ + u32 timeout; /* timeout value */ + struct timer_list gc; /* garbage collector */ + size_t dsize; /* size of element */ +}; + +/* ADT structure for generic function args */ +struct ipmac { + u32 id; /* id in array */ + unsigned char *ether; /* ethernet address */ +}; + +/* Member element without and with timeout */ + +struct ipmac_elem { + unsigned char ether[ETH_ALEN]; + unsigned char match; +} __attribute__ ((aligned)); + +struct ipmac_telem { + unsigned char ether[ETH_ALEN]; + unsigned char match; + unsigned long timeout; +} __attribute__ ((aligned)); + +static inline void * +bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id) +{ + return (void *)((char *)map->members + id * map->dsize); +} + +static inline bool +bitmap_timeout(const struct bitmap_ipmac *map, u32 id) +{ + const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); + + return ip_set_timeout_test(elem->timeout); +} + +static inline bool +bitmap_expired(const struct bitmap_ipmac *map, u32 id) +{ + const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id); + + return ip_set_timeout_expired(elem->timeout); +} + +static inline int +bitmap_ipmac_exist(const struct ipmac_telem *elem) +{ + return elem->match == MAC_UNSET || + (elem->match == MAC_FILLED && + !ip_set_timeout_expired(elem->timeout)); +} + +/* Base variant */ + +static int +bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; + case MAC_FILLED: + return data->ether == NULL || + compare_ether_addr(data->ether, elem->ether) == 0; + } + return 0; +} + +static int +bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + if (!data->ether) + /* Already added without ethernet address */ + return -IPSET_ERR_EXIST; + /* Fill the MAC address */ + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + break; + case MAC_FILLED: + return -IPSET_ERR_EXIST; + case MAC_EMPTY: + if (data->ether) { + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + } else + elem->match = MAC_UNSET; + } + + 
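+	/* At this point elem->match encodes the element's life cycle:
+	 * MAC_EMPTY marks a free slot, MAC_UNSET an entry whose IP is stored
+	 * but whose MAC is still to be supplied (typically filled in later by
+	 * the kernel from a matching packet), and MAC_FILLED a complete
+	 * IP,MAC pair. */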
return 0; +} + +static int +bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + if (elem->match == MAC_EMPTY) + return -IPSET_ERR_EXIST; + + elem->match = MAC_EMPTY; + + return 0; +} + +static int +bitmap_ipmac_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac_elem *elem; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + u32 last = map->last_ip - map->first_ip; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + elem = bitmap_ipmac_elem(map, id); + if (elem->match == MAC_EMPTY) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)); + if (elem->match == MAC_FILLED) + NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, + elem->ether); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id); + + switch (elem->match) { + case MAC_UNSET: + /* Trigger kernel to fill out the ethernet address */ + return -EAGAIN; + case MAC_FILLED: + return (data->ether == NULL || + compare_ether_addr(data->ether, elem->ether) == 0) && + !bitmap_expired(map, data->id); + } + return 0; +} + +static int +bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + bool flag_exist = flags & IPSET_FLAG_EXIST; + + switch (elem->match) { + case MAC_UNSET: + if (!(data->ether || flag_exist)) + /* Already added without ethernet address */ + return -IPSET_ERR_EXIST; + /* Fill the MAC address and activate the timer */ + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + if (timeout == map->timeout) + /* Timeout was not specified, get stored one */ + timeout = elem->timeout; + elem->timeout = ip_set_timeout_set(timeout); + break; + case MAC_FILLED: + if (!(bitmap_expired(map, data->id) || flag_exist)) + return -IPSET_ERR_EXIST; + /* Fall through */ + case MAC_EMPTY: + if (data->ether) { + memcpy(elem->ether, data->ether, ETH_ALEN); + elem->match = MAC_FILLED; + } else + elem->match = MAC_UNSET; + /* If MAC is unset yet, we store plain timeout value + * because the timer is not activated yet + * and we can reuse it later when MAC is filled out, + * possibly by the kernel */ + elem->timeout = data->ether ? 
ip_set_timeout_set(timeout) + : timeout; + break; + } + + return 0; +} + +static int +bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_ipmac *map = set->data; + const struct ipmac *data = value; + struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id); + + if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id)) + return -IPSET_ERR_EXIST; + + elem->match = MAC_EMPTY; + + return 0; +} + +static int +bitmap_ipmac_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_ipmac *map = set->data; + const struct ipmac_telem *elem; + struct nlattr *atd, *nested; + u32 id, first = cb->args[2]; + u32 timeout, last = map->last_ip - map->first_ip; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + elem = bitmap_ipmac_elem(map, id); + if (!bitmap_ipmac_exist(elem)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, + htonl(map->first_ip + id)); + if (elem->match == MAC_FILLED) + NLA_PUT(skb, IPSET_ATTR_ETHER, ETH_ALEN, + elem->ether); + timeout = elem->match == MAC_UNSET ? elem->timeout + : ip_set_timeout_get(elem->timeout); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + return -EMSGSIZE; +} + +static int +bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct ipmac data; + + /* MAC can be src only */ + if (!(opt->flags & IPSET_DIM_TWO_SRC)) + return 0; + + data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); + if (data.id < map->first_ip || data.id > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + /* Backward compatibility: we don't check the second flag */ + if (skb_mac_header(skb) < skb->head || + (skb_mac_header(skb) + ETH_HLEN) > skb->data) + return -EINVAL; + + data.id -= map->first_ip; + data.ether = eth_hdr(skb)->h_source; + + return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags); +} + +static int +bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct bitmap_ipmac *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct ipmac data; + u32 timeout = map->timeout; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id); + if (ret) + return ret; + + if (data.id < map->first_ip || data.id > map->last_ip) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_ETHER]) + data.ether = nla_data(tb[IPSET_ATTR_ETHER]); + else + data.ether = NULL; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + data.id -= map->first_ip; + + ret = adtfn(set, &data, timeout, flags); + + 
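+	/* ip_set_eexist() absorbs an -IPSET_ERR_EXIST result when userspace
+	 * passed the IPSET_FLAG_EXIST command flag, so with that flag adding
+	 * an already present element (or deleting an absent one) succeeds
+	 * silently. */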
return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static void +bitmap_ipmac_destroy(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_ipmac_flush(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + memset(map->members, 0, + (map->last_ip - map->first_ip + 1) * map->dsize); +} + +static int +bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_ipmac *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + + (map->last_ip - map->first_ip + 1) * map->dsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_ipmac *x = a->data; + const struct bitmap_ipmac *y = b->data; + + return x->first_ip == y->first_ip && + x->last_ip == y->last_ip && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_ipmac = { + .kadt = bitmap_ipmac_kadt, + .uadt = bitmap_ipmac_uadt, + .adt = { + [IPSET_ADD] = bitmap_ipmac_add, + [IPSET_DEL] = bitmap_ipmac_del, + [IPSET_TEST] = bitmap_ipmac_test, + }, + .destroy = bitmap_ipmac_destroy, + .flush = bitmap_ipmac_flush, + .head = bitmap_ipmac_head, + .list = bitmap_ipmac_list, + .same_set = bitmap_ipmac_same_set, +}; + +static const struct ip_set_type_variant bitmap_tipmac = { + .kadt = bitmap_ipmac_kadt, + .uadt = bitmap_ipmac_uadt, + .adt = { + [IPSET_ADD] = bitmap_ipmac_tadd, + [IPSET_DEL] = bitmap_ipmac_tdel, + [IPSET_TEST] = bitmap_ipmac_ttest, + }, + .destroy = bitmap_ipmac_destroy, + .flush = bitmap_ipmac_flush, + .head = bitmap_ipmac_head, + .list = bitmap_ipmac_tlist, + .same_set = bitmap_ipmac_same_set, +}; + +static void +bitmap_ipmac_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_ipmac *map = set->data; + struct ipmac_telem *elem; + u32 id, last = map->last_ip - map->first_ip; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id <= last; id++) { + elem = bitmap_ipmac_elem(map, id); + if (elem->match == MAC_FILLED && + ip_set_timeout_expired(elem->timeout)) + elem->match = MAC_EMPTY; + } + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_ipmac_gc_init(struct ip_set *set) +{ + struct bitmap_ipmac *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_ipmac_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:ip,mac type of sets */ + +static bool +init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, + u32 first_ip, u32 last_ip) +{ + map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize); + if (!map->members) + return false; + map->first_ip = first_ip; + map->last_ip = last_ip; + 
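+	/* Fresh maps start without a timeout; bitmap_ipmac_create() replaces
+	 * the IPSET_NO_TIMEOUT default below with the user-supplied value
+	 * when IPSET_ATTR_TIMEOUT is present. */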
map->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_IPV4; + + return true; +} + +static int +bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + u32 first_ip, last_ip, elements; + struct bitmap_ipmac *map; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); + if (ret) + return ret; + if (first_ip > last_ip) { + u32 tmp = first_ip; + + first_ip = last_ip; + last_ip = tmp; + } + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr >= 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(first_ip, last_ip, cidr); + } else + return -IPSET_ERR_PROTOCOL; + + elements = last_ip - first_ip + 1; + + if (elements > IPSET_BITMAP_MAX_RANGE + 1) + return -IPSET_ERR_BITMAP_RANGE_SIZE; + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->dsize = sizeof(struct ipmac_telem); + + if (!init_map_ipmac(set, map, first_ip, last_ip)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = &bitmap_tipmac; + + bitmap_ipmac_gc_init(set); + } else { + map->dsize = sizeof(struct ipmac_elem); + + if (!init_map_ipmac(set, map, first_ip, last_ip)) { + kfree(map); + return -ENOMEM; + } + set->variant = &bitmap_ipmac; + + } + return 0; +} + +static struct ip_set_type bitmap_ipmac_type = { + .name = "bitmap:ip,mac", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_IPV4, + .revision_min = 0, + .revision_max = 0, + .create = bitmap_ipmac_create, + .create_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, + .len = ETH_ALEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_ipmac_init(void) +{ + return ip_set_type_register(&bitmap_ipmac_type); +} + +static void __exit +bitmap_ipmac_fini(void) +{ + ip_set_type_unregister(&bitmap_ipmac_type); +} + +module_init(bitmap_ipmac_init); +module_exit(bitmap_ipmac_fini); diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c new file mode 100644 index 00000000..382ec28b --- /dev/null +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -0,0 +1,517 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the bitmap:port type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <net/netlink.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_bitmap.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#define IP_SET_BITMAP_TIMEOUT +#include <linux/netfilter/ipset/ip_set_timeout.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("bitmap:port type of IP sets"); +MODULE_ALIAS("ip_set_bitmap:port"); + +/* Type structure */ +struct bitmap_port { + void *members; /* the set members */ + u16 first_port; /* host byte order, included in range */ + u16 last_port; /* host byte order, included in range */ + size_t memsize; /* members size */ + u32 timeout; /* timeout parameter */ + struct timer_list gc; /* garbage collection */ +}; + +/* Base variant */ + +static int +bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + return !!test_bit(id, map->members); +} + +static int +bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + if (test_and_set_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_port *map = set->data; + u16 id = *(u16 *)value; + + if (!test_and_clear_bit(id, map->members)) + return -IPSET_ERR_EXIST; + + return 0; +} + +static int +bitmap_port_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *atd, *nested; + u16 id, first = cb->args[2]; + u16 last = map->last_port - map->first_port; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + if (!test_bit(id, map->members)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +/* Timeout variant */ + +static int +bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + const struct bitmap_port *map = set->data; + const unsigned long *members = map->members; + u16 id = *(u16 *)value; + + return ip_set_timeout_test(members[id]); +} + +static int +bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_port *map = set->data; + unsigned long *members = map->members; + u16 id = *(u16 *)value; + + if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST)) + return -IPSET_ERR_EXIST; + + members[id] = ip_set_timeout_set(timeout); + + return 0; +} + +static int +bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags) +{ + struct bitmap_port *map = set->data; + unsigned long *members = 
map->members; + u16 id = *(u16 *)value; + int ret = -IPSET_ERR_EXIST; + + if (ip_set_timeout_test(members[id])) + ret = 0; + + members[id] = IPSET_ELEM_UNSET; + return ret; +} + +static int +bitmap_port_tlist(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *adt, *nested; + u16 id, first = cb->args[2]; + u16 last = map->last_port - map->first_port; + const unsigned long *members = map->members; + + adt = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!adt) + return -EMSGSIZE; + for (; cb->args[2] <= last; cb->args[2]++) { + id = cb->args[2]; + if (!ip_set_timeout_test(members[id])) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (id == first) { + nla_nest_cancel(skb, adt); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, + htons(map->first_port + id)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(members[id]))); + ipset_nest_end(skb, nested); + } + ipset_nest_end(skb, adt); + + /* Set listing finished */ + cb->args[2] = 0; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, adt); + if (unlikely(id == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static int +bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + __be16 __port; + u16 port = 0; + + if (!ip_set_get_ip_port(skb, opt->family, + opt->flags & IPSET_DIM_ONE_SRC, &__port)) + return -EINVAL; + + port = ntohs(__port); + + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + port -= map->first_port; + + return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags); +} + +static int +bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct bitmap_port *map = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 timeout = map->timeout; + u32 port; /* wraparound */ + u16 id, port_to; + int ret = 0; + + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + if (port < map->first_port || port > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(map->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + id = port - map->first_port; + return adtfn(set, &id, timeout, flags); + } + + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) { + swap(port, port_to); + if (port < map->first_port) + return -IPSET_ERR_BITMAP_RANGE; + } + } else + port_to = port; + + if (port_to > map->last_port) + return -IPSET_ERR_BITMAP_RANGE; + + for (; port <= port_to; port++) { + id = port - map->first_port; + ret = adtfn(set, &id, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static void +bitmap_port_destroy(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + if 
(with_timeout(map->timeout)) + del_timer_sync(&map->gc); + + ip_set_free(map->members); + kfree(map); + + set->data = NULL; +} + +static void +bitmap_port_flush(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + memset(map->members, 0, map->memsize); +} + +static int +bitmap_port_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct bitmap_port *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port)); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->memsize)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static bool +bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct bitmap_port *x = a->data; + const struct bitmap_port *y = b->data; + + return x->first_port == y->first_port && + x->last_port == y->last_port && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant bitmap_port = { + .kadt = bitmap_port_kadt, + .uadt = bitmap_port_uadt, + .adt = { + [IPSET_ADD] = bitmap_port_add, + [IPSET_DEL] = bitmap_port_del, + [IPSET_TEST] = bitmap_port_test, + }, + .destroy = bitmap_port_destroy, + .flush = bitmap_port_flush, + .head = bitmap_port_head, + .list = bitmap_port_list, + .same_set = bitmap_port_same_set, +}; + +static const struct ip_set_type_variant bitmap_tport = { + .kadt = bitmap_port_kadt, + .uadt = bitmap_port_uadt, + .adt = { + [IPSET_ADD] = bitmap_port_tadd, + [IPSET_DEL] = bitmap_port_tdel, + [IPSET_TEST] = bitmap_port_ttest, + }, + .destroy = bitmap_port_destroy, + .flush = bitmap_port_flush, + .head = bitmap_port_head, + .list = bitmap_port_tlist, + .same_set = bitmap_port_same_set, +}; + +static void +bitmap_port_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct bitmap_port *map = set->data; + unsigned long *table = map->members; + u32 id; /* wraparound */ + u16 last = map->last_port - map->first_port; + + /* We run parallel with other readers (test element) + * but adding/deleting new entries is locked out */ + read_lock_bh(&set->lock); + for (id = 0; id <= last; id++) + if (ip_set_timeout_expired(table[id])) + table[id] = IPSET_ELEM_UNSET; + read_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +bitmap_port_gc_init(struct ip_set *set) +{ + struct bitmap_port *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = bitmap_port_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create bitmap:port type of sets */ + +static bool +init_map_port(struct ip_set *set, struct bitmap_port *map, + u16 first_port, u16 last_port) +{ + map->members = ip_set_alloc(map->memsize); + if (!map->members) + return false; + map->first_port = first_port; + map->last_port = last_port; + map->timeout = IPSET_NO_TIMEOUT; + + set->data = map; + set->family = NFPROTO_UNSPEC; + + return true; +} + +static int +bitmap_port_create(struct ip_set *set, struct nlattr *tb[], + u32 flags) +{ + struct bitmap_port *map; + u16 first_port, last_port; + + if (unlikely(!ip_set_attr_netorder(tb, 
IPSET_ATTR_PORT) || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]); + last_port = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (first_port > last_port) { + u16 tmp = first_port; + + first_port = last_port; + last_port = tmp; + } + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + map->memsize = (last_port - first_port + 1) + * sizeof(unsigned long); + + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + set->variant = &bitmap_tport; + + bitmap_port_gc_init(set); + } else { + map->memsize = bitmap_bytes(0, last_port - first_port); + pr_debug("memsize: %zu\n", map->memsize); + if (!init_map_port(set, map, first_port, last_port)) { + kfree(map); + return -ENOMEM; + } + + set->variant = &bitmap_port; + } + return 0; +} + +static struct ip_set_type bitmap_port_type = { + .name = "bitmap:port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_PORT, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 0, + .create = bitmap_port_create, + .create_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +bitmap_port_init(void) +{ + return ip_set_type_register(&bitmap_port_type); +} + +static void __exit +bitmap_port_fini(void) +{ + ip_set_type_unregister(&bitmap_port_type); +} + +module_init(bitmap_port_init); +module_exit(bitmap_port_fini); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c new file mode 100644 index 00000000..e6c1c960 --- /dev/null +++ b/net/netfilter/ipset/ip_set_core.c @@ -0,0 +1,1755 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module for IP set management */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/ipset/ip_set.h> + +static LIST_HEAD(ip_set_type_list); /* all registered set types */ +static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ +static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ + +static struct ip_set **ip_set_list; /* all individual sets */ +static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ + +#define STREQ(a, b) (strncmp(a, b, IPSET_MAXNAMELEN) == 0) + +static unsigned int max_sets; + +module_param(max_sets, int, 0600); +MODULE_PARM_DESC(max_sets, "maximal number of sets"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("core IP set support"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET); + +/* + * The set types are implemented in modules and registered set types + * can be found in ip_set_type_list. Adding/deleting types is + * serialized by ip_set_type_mutex. + */ + +static inline void +ip_set_type_lock(void) +{ + mutex_lock(&ip_set_type_mutex); +} + +static inline void +ip_set_type_unlock(void) +{ + mutex_unlock(&ip_set_type_mutex); +} + +/* Register and deregister settype */ + +static struct ip_set_type * +find_set_type(const char *name, u8 family, u8 revision) +{ + struct ip_set_type *type; + + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || type->family == NFPROTO_UNSPEC) && + revision >= type->revision_min && + revision <= type->revision_max) + return type; + return NULL; +} + +/* Unlock, try to load a set type module and lock again */ +static bool +load_settype(const char *name) +{ + nfnl_unlock(); + pr_debug("try to load ip_set_%s\n", name); + if (request_module("ip_set_%s", name) < 0) { + pr_warning("Can't find ip_set type %s\n", name); + nfnl_lock(); + return false; + } + nfnl_lock(); + return true; +} + +/* Find a set type and reference it */ +#define find_set_type_get(name, family, revision, found) \ + __find_set_type_get(name, family, revision, found, false) + +static int +__find_set_type_get(const char *name, u8 family, u8 revision, + struct ip_set_type **found, bool retry) +{ + struct ip_set_type *type; + int err; + + if (retry && !load_settype(name)) + return -IPSET_ERR_FIND_TYPE; + + rcu_read_lock(); + *found = find_set_type(name, family, revision); + if (*found) { + err = !try_module_get((*found)->me) ? -EFAULT : 0; + goto unlock; + } + /* Make sure the type is already loaded + * but we don't support the revision */ + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name)) { + err = -IPSET_ERR_FIND_TYPE; + goto unlock; + } + rcu_read_unlock(); + + return retry ? -IPSET_ERR_FIND_TYPE : + __find_set_type_get(name, family, revision, found, true); + +unlock: + rcu_read_unlock(); + return err; +} + +/* Find a given set type by name and family. + * If we succeeded, the supported minimal and maximum revisions are + * filled out. 
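+ * If no matching type is loaded yet, load_settype() requests the
+ * module once and the lookup is retried.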
+ */ +#define find_set_type_minmax(name, family, min, max) \ + __find_set_type_minmax(name, family, min, max, false) + +static int +__find_set_type_minmax(const char *name, u8 family, u8 *min, u8 *max, + bool retry) +{ + struct ip_set_type *type; + bool found = false; + + if (retry && !load_settype(name)) + return -IPSET_ERR_FIND_TYPE; + + *min = 255; *max = 0; + rcu_read_lock(); + list_for_each_entry_rcu(type, &ip_set_type_list, list) + if (STREQ(type->name, name) && + (type->family == family || type->family == NFPROTO_UNSPEC)) { + found = true; + if (type->revision_min < *min) + *min = type->revision_min; + if (type->revision_max > *max) + *max = type->revision_max; + } + rcu_read_unlock(); + if (found) + return 0; + + return retry ? -IPSET_ERR_FIND_TYPE : + __find_set_type_minmax(name, family, min, max, true); +} + +#define family_name(f) ((f) == NFPROTO_IPV4 ? "inet" : \ + (f) == NFPROTO_IPV6 ? "inet6" : "any") + +/* Register a set type structure. The type is identified by + * the unique triple of name, family and revision. + */ +int +ip_set_type_register(struct ip_set_type *type) +{ + int ret = 0; + + if (type->protocol != IPSET_PROTOCOL) { + pr_warning("ip_set type %s, family %s, revision %u:%u uses " + "wrong protocol version %u (want %u)\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max, + type->protocol, IPSET_PROTOCOL); + return -EINVAL; + } + + ip_set_type_lock(); + if (find_set_type(type->name, type->family, type->revision_min)) { + /* Duplicate! */ + pr_warning("ip_set type %s, family %s with revision min %u " + "already registered!\n", type->name, + family_name(type->family), type->revision_min); + ret = -EINVAL; + goto unlock; + } + list_add_rcu(&type->list, &ip_set_type_list); + pr_debug("type %s, family %s, revision %u:%u registered.\n", + type->name, family_name(type->family), + type->revision_min, type->revision_max); +unlock: + ip_set_type_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(ip_set_type_register); + +/* Unregister a set type. There's a small race with ip_set_create */ +void +ip_set_type_unregister(struct ip_set_type *type) +{ + ip_set_type_lock(); + if (!find_set_type(type->name, type->family, type->revision_min)) { + pr_warning("ip_set type %s, family %s with revision min %u " + "not registered\n", type->name, + family_name(type->family), type->revision_min); + goto unlock; + } + list_del_rcu(&type->list); + pr_debug("type %s, family %s with revision min %u unregistered.\n", + type->name, family_name(type->family), type->revision_min); +unlock: + ip_set_type_unlock(); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ip_set_type_unregister); + +/* Utility functions */ +void * +ip_set_alloc(size_t size) +{ + void *members = NULL; + + if (size < KMALLOC_MAX_SIZE) + members = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + + if (members) { + pr_debug("%p: allocated with kmalloc\n", members); + return members; + } + + members = vzalloc(size); + if (!members) + return NULL; + pr_debug("%p: allocated with vmalloc\n", members); + + return members; +} +EXPORT_SYMBOL_GPL(ip_set_alloc); + +void +ip_set_free(void *members) +{ + pr_debug("%p: free with %s\n", members, + is_vmalloc_addr(members) ? 
"vfree" : "kfree"); + if (is_vmalloc_addr(members)) + vfree(members); + else + kfree(members); +} +EXPORT_SYMBOL_GPL(ip_set_free); + +static inline bool +flag_nested(const struct nlattr *nla) +{ + return nla->nla_type & NLA_F_NESTED; +} + +static const struct nla_policy ipaddr_policy[IPSET_ATTR_IPADDR_MAX + 1] = { + [IPSET_ATTR_IPADDR_IPV4] = { .type = NLA_U32 }, + [IPSET_ATTR_IPADDR_IPV6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, +}; + +int +ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) + return -IPSET_ERR_PROTOCOL; + + *ipaddr = nla_get_be32(tb[IPSET_ATTR_IPADDR_IPV4]); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr4); + +int +ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) +{ + struct nlattr *tb[IPSET_ATTR_IPADDR_MAX+1]; + + if (unlikely(!flag_nested(nla))) + return -IPSET_ERR_PROTOCOL; + + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + return -IPSET_ERR_PROTOCOL; + if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) + return -IPSET_ERR_PROTOCOL; + + memcpy(ipaddr, nla_data(tb[IPSET_ATTR_IPADDR_IPV6]), + sizeof(struct in6_addr)); + return 0; +} +EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6); + +/* + * Creating/destroying/renaming/swapping affect the existence and + * the properties of a set. All of these can be executed from userspace + * only and serialized by the nfnl mutex indirectly from nfnetlink. + * + * Sets are identified by their index in ip_set_list and the index + * is used by the external references (set/SET netfilter modules). + * + * The set behind an index may change by swapping only, from userspace. + */ + +static inline void +__ip_set_get(ip_set_id_t index) +{ + write_lock_bh(&ip_set_ref_lock); + ip_set_list[index]->ref++; + write_unlock_bh(&ip_set_ref_lock); +} + +static inline void +__ip_set_put(ip_set_id_t index) +{ + write_lock_bh(&ip_set_ref_lock); + BUG_ON(ip_set_list[index]->ref == 0); + ip_set_list[index]->ref--; + write_unlock_bh(&ip_set_ref_lock); +} + +/* + * Add, del and test set entries from kernel. + * + * The set behind the index must exist and must be referenced + * so it can't be destroyed (or changed) under our foot. + */ + +int +ip_set_test(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, + const struct ip_set_adt_opt *opt) +{ + struct ip_set *set = ip_set_list[index]; + int ret = 0; + + BUG_ON(set == NULL); + pr_debug("set %s, index %u\n", set->name, index); + + if (opt->dim < set->type->dimension || + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return 0; + + read_lock_bh(&set->lock); + ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); + read_unlock_bh(&set->lock); + + if (ret == -EAGAIN) { + /* Type requests element to be completed */ + pr_debug("element must be competed, ADD is triggered\n"); + write_lock_bh(&set->lock); + set->variant->kadt(set, skb, par, IPSET_ADD, opt); + write_unlock_bh(&set->lock); + ret = 1; + } + + /* Convert error codes to nomatch */ + return (ret < 0 ? 
0 : ret);
+}
+EXPORT_SYMBOL_GPL(ip_set_test);
+
+int
+ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par,
+	   const struct ip_set_adt_opt *opt)
+{
+	struct ip_set *set = ip_set_list[index];
+	int ret;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+		return 0;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_add);
+
+int
+ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
+	   const struct xt_action_param *par,
+	   const struct ip_set_adt_opt *opt)
+{
+	struct ip_set *set = ip_set_list[index];
+	int ret = 0;
+
+	BUG_ON(set == NULL);
+	pr_debug("set %s, index %u\n", set->name, index);
+
+	if (opt->dim < set->type->dimension ||
+	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+		return 0;
+
+	write_lock_bh(&set->lock);
+	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+	write_unlock_bh(&set->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_set_del);
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to does not go away under our feet.
+ *
+ */
+ip_set_id_t
+ip_set_get_byname(const char *name, struct ip_set **set)
+{
+	ip_set_id_t i, index = IPSET_INVALID_ID;
+	struct ip_set *s;
+
+	for (i = 0; i < ip_set_max; i++) {
+		s = ip_set_list[i];
+		if (s != NULL && STREQ(s->name, name)) {
+			__ip_set_get(i);
+			index = i;
+			*set = s;
+		}
+	}
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_byname);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1. The caller shall not assume the index
+ * to be valid, after calling this function.
+ *
+ */
+void
+ip_set_put_byindex(ip_set_id_t index)
+{
+	if (ip_set_list[index] != NULL)
+		__ip_set_put(index);
+}
+EXPORT_SYMBOL_GPL(ip_set_put_byindex);
+
+/*
+ * Get the name of a set behind a set index.
+ * We assume the set is referenced, so it does exist and
+ * can't be destroyed. The set cannot be renamed due to
+ * the referencing either.
+ *
+ */
+const char *
+ip_set_name_byindex(ip_set_id_t index)
+{
+	const struct ip_set *set = ip_set_list[index];
+
+	BUG_ON(set == NULL);
+	BUG_ON(set->ref == 0);
+
+	/* Referenced, so it's safe */
+	return set->name;
+}
+EXPORT_SYMBOL_GPL(ip_set_name_byindex);
+
+/*
+ * Routines to be called by external subsystems, which do not
+ * call nfnl_lock for us.
+ */
+
+/*
+ * Find set by name, reference it once. The reference makes sure the
+ * thing pointed to does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get(const char *name)
+{
+	struct ip_set *s;
+	ip_set_id_t index;
+
+	nfnl_lock();
+	index = ip_set_get_byname(name, &s);
+	nfnl_unlock();
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get);
+
+/*
+ * Find set by index, reference it once. The reference makes sure the
+ * thing pointed to does not go away under our feet.
+ *
+ * The nfnl mutex is used in the function.
+ */
+ip_set_id_t
+ip_set_nfnl_get_byindex(ip_set_id_t index)
+{
+	if (index >= ip_set_max)
+		return IPSET_INVALID_ID;
+
+	nfnl_lock();
+	if (ip_set_list[index])
+		__ip_set_get(index);
+	else
+		index = IPSET_INVALID_ID;
+	nfnl_unlock();
+
+	return index;
+}
+EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
+
+/*
+ * If the given set pointer points to a valid set, decrement
+ * reference count by 1.
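+ *
+ * A minimal usage sketch for a kernel-side caller (illustrative
+ * only; the set name "foo" and the surrounding flow are assumed,
+ * not taken from a real user of this API):
+ *
+ *	index = ip_set_nfnl_get("foo");
+ *	if (index != IPSET_INVALID_ID) {
+ *		matched = ip_set_test(index, skb, par, &opt);
+ *		ip_set_nfnl_put(index);
+ *	}
+ *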
The caller shall not assume the index + * to be valid, after calling this function. + * + * The nfnl mutex is used in the function. + */ +void +ip_set_nfnl_put(ip_set_id_t index) +{ + nfnl_lock(); + ip_set_put_byindex(index); + nfnl_unlock(); +} +EXPORT_SYMBOL_GPL(ip_set_nfnl_put); + +/* + * Communication protocol with userspace over netlink. + * + * The commands are serialized by the nfnl mutex. + */ + +static inline bool +protocol_failed(const struct nlattr * const tb[]) +{ + return !tb[IPSET_ATTR_PROTOCOL] || + nla_get_u8(tb[IPSET_ATTR_PROTOCOL]) != IPSET_PROTOCOL; +} + +static inline u32 +flag_exist(const struct nlmsghdr *nlh) +{ + return nlh->nlmsg_flags & NLM_F_EXCL ? 0 : IPSET_FLAG_EXIST; +} + +static struct nlmsghdr * +start_msg(struct sk_buff *skb, u32 pid, u32 seq, unsigned int flags, + enum ipset_cmd cmd) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + + nlh = nlmsg_put(skb, pid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + sizeof(*nfmsg), flags); + if (nlh == NULL) + return NULL; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = NFPROTO_IPV4; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + return nlh; +} + +/* Create a set */ + +static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_TYPENAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1}, + [IPSET_ATTR_REVISION] = { .type = NLA_U8 }, + [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, +}; + +static ip_set_id_t +find_set_id(const char *name) +{ + ip_set_id_t i, index = IPSET_INVALID_ID; + const struct ip_set *set; + + for (i = 0; index == IPSET_INVALID_ID && i < ip_set_max; i++) { + set = ip_set_list[i]; + if (set != NULL && STREQ(set->name, name)) + index = i; + } + return index; +} + +static inline struct ip_set * +find_set(const char *name) +{ + ip_set_id_t index = find_set_id(name); + + return index == IPSET_INVALID_ID ? 
NULL : ip_set_list[index]; +} + +static int +find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set) +{ + ip_set_id_t i; + + *index = IPSET_INVALID_ID; + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] == NULL) { + if (*index == IPSET_INVALID_ID) + *index = i; + } else if (STREQ(name, ip_set_list[i]->name)) { + /* Name clash */ + *set = ip_set_list[i]; + return -EEXIST; + } + } + if (*index == IPSET_INVALID_ID) + /* No free slot remained */ + return -IPSET_ERR_MAX_SETS; + return 0; +} + +static int +ip_set_create(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set, *clash = NULL; + ip_set_id_t index = IPSET_INVALID_ID; + struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {}; + const char *name, *typename; + u8 family, revision; + u32 flags = flag_exist(nlh); + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_TYPENAME] == NULL || + attr[IPSET_ATTR_REVISION] == NULL || + attr[IPSET_ATTR_FAMILY] == NULL || + (attr[IPSET_ATTR_DATA] != NULL && + !flag_nested(attr[IPSET_ATTR_DATA])))) + return -IPSET_ERR_PROTOCOL; + + name = nla_data(attr[IPSET_ATTR_SETNAME]); + typename = nla_data(attr[IPSET_ATTR_TYPENAME]); + family = nla_get_u8(attr[IPSET_ATTR_FAMILY]); + revision = nla_get_u8(attr[IPSET_ATTR_REVISION]); + pr_debug("setname: %s, typename: %s, family: %s, revision: %u\n", + name, typename, family_name(family), revision); + + /* + * First, and without any locks, allocate and initialize + * a normal base set structure. + */ + set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); + if (!set) + return -ENOMEM; + rwlock_init(&set->lock); + strlcpy(set->name, name, IPSET_MAXNAMELEN); + set->family = family; + set->revision = revision; + + /* + * Next, check that we know the type, and take + * a reference on the type, to make sure it stays available + * while constructing our new set. + * + * After referencing the type, we try to create the type + * specific part of the set without holding any locks. + */ + ret = find_set_type_get(typename, family, revision, &(set->type)); + if (ret) + goto out; + + /* + * Without holding any locks, create private part. + */ + if (attr[IPSET_ATTR_DATA] && + nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], + set->type->create_policy)) { + ret = -IPSET_ERR_PROTOCOL; + goto put_out; + } + + ret = set->type->create(set, tb, flags); + if (ret != 0) + goto put_out; + + /* BTW, ret==0 here. */ + + /* + * Here, we have a valid, constructed set and we are protected + * by the nfnl mutex. Find the first free index in ip_set_list + * and check clashing. + */ + if ((ret = find_free_id(set->name, &index, &clash)) != 0) { + /* If this is the same set and requested, ignore error */ + if (ret == -EEXIST && + (flags & IPSET_FLAG_EXIST) && + STREQ(set->type->name, clash->type->name) && + set->type->family == clash->type->family && + set->type->revision_min == clash->type->revision_min && + set->type->revision_max == clash->type->revision_max && + set->variant->same_set(set, clash)) + ret = 0; + goto cleanup; + } + + /* + * Finally! Add our shiny new set to the list, and be done. 
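+ * (A userspace command such as "ipset create foo hash:ip" arrives
+ * here as an IPSET_CMD_CREATE message over nfnetlink; the store
+ * below is the publication point, so kernel-side readers see either
+ * NULL or a fully constructed set.)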
+ */ + pr_debug("create: '%s' created with index %u!\n", set->name, index); + ip_set_list[index] = set; + + return ret; + +cleanup: + set->variant->destroy(set); +put_out: + module_put(set->type->me); +out: + kfree(set); + return ret; +} + +/* Destroy sets */ + +static const struct nla_policy +ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static void +ip_set_destroy_set(ip_set_id_t index) +{ + struct ip_set *set = ip_set_list[index]; + + pr_debug("set: %s\n", set->name); + ip_set_list[index] = NULL; + + /* Must call it without holding any lock */ + set->variant->destroy(set); + module_put(set->type->me); + kfree(set); +} + +static int +ip_set_destroy(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + /* Commands are serialized and references are + * protected by the ip_set_ref_lock. + * External systems (i.e. xt_set) must call + * ip_set_put|get_nfnl_* functions, that way we + * can safely check references here. + * + * list:set timer can only decrement the reference + * counter, so if it's already zero, we can proceed + * without holding the lock. + */ + read_lock_bh(&ip_set_ref_lock); + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + } + read_unlock_bh(&ip_set_ref_lock); + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL) + ip_set_destroy_set(i); + } + } else { + i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (i == IPSET_INVALID_ID) { + ret = -ENOENT; + goto out; + } else if (ip_set_list[i]->ref) { + ret = -IPSET_ERR_BUSY; + goto out; + } + read_unlock_bh(&ip_set_ref_lock); + + ip_set_destroy_set(i); + } + return 0; +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Flush sets */ + +static void +ip_set_flush_set(struct ip_set *set) +{ + pr_debug("set: %s\n", set->name); + + write_lock_bh(&set->lock); + set->variant->flush(set); + write_unlock_bh(&set->lock); +} + +static int +ip_set_flush(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + ip_set_id_t i; + + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + if (!attr[IPSET_ATTR_SETNAME]) { + for (i = 0; i < ip_set_max; i++) + if (ip_set_list[i] != NULL) + ip_set_flush_set(ip_set_list[i]); + } else { + i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (i == IPSET_INVALID_ID) + return -ENOENT; + + ip_set_flush_set(ip_set_list[i]); + } + + return 0; +} + +/* Rename a set */ + +static const struct nla_policy +ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_SETNAME2] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, +}; + +static int +ip_set_rename(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *set; + const char *name2; + ip_set_id_t i; + int ret = 0; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); + if (set 
== NULL) + return -ENOENT; + + read_lock_bh(&ip_set_ref_lock); + if (set->ref != 0) { + ret = -IPSET_ERR_REFERENCED; + goto out; + } + + name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); + for (i = 0; i < ip_set_max; i++) { + if (ip_set_list[i] != NULL && + STREQ(ip_set_list[i]->name, name2)) { + ret = -IPSET_ERR_EXIST_SETNAME2; + goto out; + } + } + strncpy(set->name, name2, IPSET_MAXNAMELEN); + +out: + read_unlock_bh(&ip_set_ref_lock); + return ret; +} + +/* Swap two sets so that name/index points to the other. + * References and set names are also swapped. + * + * The commands are serialized by the nfnl mutex and references are + * protected by the ip_set_ref_lock. The kernel interfaces + * do not hold the mutex but the pointer settings are atomic + * so the ip_set_list always contains valid pointers to the sets. + */ + +static int +ip_set_swap(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + struct ip_set *from, *to; + ip_set_id_t from_id, to_id; + char from_name[IPSET_MAXNAMELEN]; + + if (unlikely(protocol_failed(attr) || + attr[IPSET_ATTR_SETNAME] == NULL || + attr[IPSET_ATTR_SETNAME2] == NULL)) + return -IPSET_ERR_PROTOCOL; + + from_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); + if (from_id == IPSET_INVALID_ID) + return -ENOENT; + + to_id = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME2])); + if (to_id == IPSET_INVALID_ID) + return -IPSET_ERR_EXIST_SETNAME2; + + from = ip_set_list[from_id]; + to = ip_set_list[to_id]; + + /* Features must not change. + * Not an artificial restriction anymore, as we must prevent + * possible loops created by swapping in setlist type of sets. */ + if (!(from->type->features == to->type->features && + from->type->family == to->type->family)) + return -IPSET_ERR_TYPE_MISMATCH; + + strncpy(from_name, from->name, IPSET_MAXNAMELEN); + strncpy(from->name, to->name, IPSET_MAXNAMELEN); + strncpy(to->name, from_name, IPSET_MAXNAMELEN); + + write_lock_bh(&ip_set_ref_lock); + swap(from->ref, to->ref); + ip_set_list[from_id] = to; + ip_set_list[to_id] = from; + write_unlock_bh(&ip_set_ref_lock); + + return 0; +} + +/* List/save set data */ + +#define DUMP_INIT 0 +#define DUMP_ALL 1 +#define DUMP_ONE 2 +#define DUMP_LAST 3 + +#define DUMP_TYPE(arg) (((u32)(arg)) & 0x0000FFFF) +#define DUMP_FLAGS(arg) (((u32)(arg)) >> 16) + +static int +ip_set_dump_done(struct netlink_callback *cb) +{ + if (cb->args[2]) { + pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); + ip_set_put_byindex((ip_set_id_t) cb->args[1]); + } + return 0; +} + +static inline void +dump_attrs(struct nlmsghdr *nlh) +{ + const struct nlattr *attr; + int rem; + + pr_debug("dump nlmsg\n"); + nlmsg_for_each_attr(attr, nlh, sizeof(struct nfgenmsg), rem) { + pr_debug("type: %u, len %u\n", nla_type(attr), attr->nla_len); + } +} + +static int +dump_init(struct netlink_callback *cb) +{ + struct nlmsghdr *nlh = nlmsg_hdr(cb->skb); + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *attr = (void *)nlh + min_len; + u32 dump_type; + ip_set_id_t index; + + /* Second pass, so parser can't fail */ + nla_parse(cda, IPSET_ATTR_CMD_MAX, + attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); + + /* cb->args[0] : dump single set/all sets + * [1] : set index + * [..]: type specific + */ + + if (cda[IPSET_ATTR_SETNAME]) { + index = find_set_id(nla_data(cda[IPSET_ATTR_SETNAME])); + if (index == IPSET_INVALID_ID) + return -ENOENT; + + dump_type = DUMP_ONE; + cb->args[1] = index; + } 
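+	/* No IPSET_ATTR_SETNAME attribute given: dump all sets */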
else + dump_type = DUMP_ALL; + + if (cda[IPSET_ATTR_FLAGS]) { + u32 f = ip_set_get_h32(cda[IPSET_ATTR_FLAGS]); + dump_type |= (f << 16); + } + cb->args[0] = dump_type; + + return 0; +} + +static int +ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) +{ + ip_set_id_t index = IPSET_INVALID_ID, max; + struct ip_set *set = NULL; + struct nlmsghdr *nlh = NULL; + unsigned int flags = NETLINK_CB(cb->skb).pid ? NLM_F_MULTI : 0; + u32 dump_type, dump_flags; + int ret = 0; + + if (!cb->args[0]) { + ret = dump_init(cb); + if (ret < 0) { + nlh = nlmsg_hdr(cb->skb); + /* We have to create and send the error message + * manually :-( */ + if (nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(cb->skb, nlh, ret); + return ret; + } + } + + if (cb->args[1] >= ip_set_max) + goto out; + + dump_type = DUMP_TYPE(cb->args[0]); + dump_flags = DUMP_FLAGS(cb->args[0]); + max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max; +dump_last: + pr_debug("args[0]: %u %u args[1]: %ld\n", + dump_type, dump_flags, cb->args[1]); + for (; cb->args[1] < max; cb->args[1]++) { + index = (ip_set_id_t) cb->args[1]; + set = ip_set_list[index]; + if (set == NULL) { + if (dump_type == DUMP_ONE) { + ret = -ENOENT; + goto out; + } + continue; + } + /* When dumping all sets, we must dump "sorted" + * so that lists (unions of sets) are dumped last. + */ + if (dump_type != DUMP_ONE && + ((dump_type == DUMP_ALL) == + !!(set->type->features & IPSET_DUMP_LAST))) + continue; + pr_debug("List set: %s\n", set->name); + if (!cb->args[2]) { + /* Start listing: make sure set won't be destroyed */ + pr_debug("reference set\n"); + __ip_set_get(index); + } + nlh = start_msg(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, flags, + IPSET_CMD_LIST); + if (!nlh) { + ret = -EMSGSIZE; + goto release_refcount; + } + NLA_PUT_U8(skb, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + NLA_PUT_STRING(skb, IPSET_ATTR_SETNAME, set->name); + if (dump_flags & IPSET_FLAG_LIST_SETNAME) + goto next_set; + switch (cb->args[2]) { + case 0: + /* Core header data */ + NLA_PUT_STRING(skb, IPSET_ATTR_TYPENAME, + set->type->name); + NLA_PUT_U8(skb, IPSET_ATTR_FAMILY, + set->family); + NLA_PUT_U8(skb, IPSET_ATTR_REVISION, + set->revision); + ret = set->variant->head(set, skb); + if (ret < 0) + goto release_refcount; + if (dump_flags & IPSET_FLAG_LIST_HEADER) + goto next_set; + /* Fall through and add elements */ + default: + read_lock_bh(&set->lock); + ret = set->variant->list(set, skb, cb); + read_unlock_bh(&set->lock); + if (!cb->args[2]) + /* Set is done, proceed with next one */ + goto next_set; + goto release_refcount; + } + } + /* If we dump all sets, continue with dumping last ones */ + if (dump_type == DUMP_ALL) { + dump_type = DUMP_LAST; + cb->args[0] = dump_type | (dump_flags << 16); + cb->args[1] = 0; + goto dump_last; + } + goto out; + +nla_put_failure: + ret = -EFAULT; +next_set: + if (dump_type == DUMP_ONE) + cb->args[1] = IPSET_INVALID_ID; + else + cb->args[1]++; +release_refcount: + /* If there was an error or set is done, release set */ + if (ret || !cb->args[2]) { + pr_debug("release set %s\n", ip_set_list[index]->name); + ip_set_put_byindex(index); + cb->args[2] = 0; + } +out: + if (nlh) { + nlmsg_end(skb, nlh); + pr_debug("nlmsg_len: %u\n", nlh->nlmsg_len); + dump_attrs(nlh); + } + + return ret < 0 ? 
ret : skb->len; +} + +static int +ip_set_dump(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const attr[]) +{ + if (unlikely(protocol_failed(attr))) + return -IPSET_ERR_PROTOCOL; + + { + struct netlink_dump_control c = { + .dump = ip_set_dump_start, + .done = ip_set_dump_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } +} + +/* Add, del and test */ + +static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { + [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, + [IPSET_ATTR_SETNAME] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_DATA] = { .type = NLA_NESTED }, + [IPSET_ATTR_ADT] = { .type = NLA_NESTED }, +}; + +static int +call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, + struct nlattr *tb[], enum ipset_adt adt, + u32 flags, bool use_lineno) +{ + int ret; + u32 lineno = 0; + bool eexist = flags & IPSET_FLAG_EXIST, retried = false; + + do { + write_lock_bh(&set->lock); + ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); + write_unlock_bh(&set->lock); + retried = true; + } while (ret == -EAGAIN && + set->variant->resize && + (ret = set->variant->resize(set, retried)) == 0); + + if (!ret || (ret == -IPSET_ERR_EXIST && eexist)) + return 0; + if (lineno && use_lineno) { + /* Error in restore/batch mode: send back lineno */ + struct nlmsghdr *rep, *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + struct nlmsgerr *errmsg; + size_t payload = sizeof(*errmsg) + nlmsg_len(nlh); + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + struct nlattr *cda[IPSET_ATTR_CMD_MAX+1]; + struct nlattr *cmdattr; + u32 *errline; + + skb2 = nlmsg_new(payload, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + rep = __nlmsg_put(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); + errmsg = nlmsg_data(rep); + errmsg->error = ret; + memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); + cmdattr = (void *)&errmsg->msg + min_len; + + nla_parse(cda, IPSET_ATTR_CMD_MAX, + cmdattr, nlh->nlmsg_len - min_len, + ip_set_adt_policy); + + errline = nla_data(cda[IPSET_ATTR_LINENO]); + + *errline = lineno; + + netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + /* Signal netlink not to send its ACK/errmsg. 
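+		 * Returning -EINTR makes netlink_rcv_skb() skip the
+		 * automatic ack: the error and the failing line number
+		 * have already been reported to userspace above.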
		 */
+		return -EINTR;
+	}
+
+	return ret;
+}
+
+static int
+ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_ADD,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+static int
+ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	const struct nlattr *nla;
+	u32 flags = flag_exist(nlh);
+	bool use_lineno;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     !((attr[IPSET_ATTR_DATA] != NULL) ^
+		       (attr[IPSET_ATTR_ADT] != NULL)) ||
+		     (attr[IPSET_ATTR_DATA] != NULL &&
+		      !flag_nested(attr[IPSET_ATTR_DATA])) ||
+		     (attr[IPSET_ATTR_ADT] != NULL &&
+		      (!flag_nested(attr[IPSET_ATTR_ADT]) ||
+		       attr[IPSET_ATTR_LINENO] == NULL))))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	use_lineno = !!attr[IPSET_ATTR_LINENO];
+	if (attr[IPSET_ATTR_DATA]) {
+		if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX,
+				     attr[IPSET_ATTR_DATA],
+				     set->type->adt_policy))
+			return -IPSET_ERR_PROTOCOL;
+		ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags,
+			      use_lineno);
+	} else {
+		int nla_rem;
+
+		nla_for_each_nested(nla, attr[IPSET_ATTR_ADT], nla_rem) {
+			memset(tb, 0, sizeof(tb));
+			if (nla_type(nla) != IPSET_ATTR_DATA ||
+			    !flag_nested(nla) ||
+			    nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla,
+					     set->type->adt_policy))
+				return -IPSET_ERR_PROTOCOL;
+			ret = call_ad(ctnl, skb, set, tb, IPSET_DEL,
+				      flags, use_lineno);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return ret;
+}
+
+static int
+ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh,
+	     const struct nlattr * const attr[])
+{
+	struct ip_set *set;
+	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL ||
+		     attr[IPSET_ATTR_DATA] == NULL ||
+		     !flag_nested(attr[IPSET_ATTR_DATA])))
+		return -IPSET_ERR_PROTOCOL;
+
+	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (set == NULL)
+		return -ENOENT;
+
+	if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA],
+			     set->type->adt_policy))
+		return -IPSET_ERR_PROTOCOL;
+
+	read_lock_bh(&set->lock);
+	ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0);
+	read_unlock_bh(&set->lock);
+	/* Userspace can't trigger element to be re-added */
+	if (ret == -EAGAIN)
+		ret = 1;
+
+	return ret < 0 ? ret : ret > 0 ? 0 : -IPSET_ERR_EXIST;
+}
+
+/* Get the header data of a set */
+
+static int
+ip_set_header(struct sock *ctnl, struct sk_buff *skb,
+	      const struct nlmsghdr *nlh,
+	      const struct nlattr * const attr[])
+{
+	const struct ip_set *set;
+	struct sk_buff *skb2;
+	struct nlmsghdr *nlh2;
+	ip_set_id_t index;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_SETNAME] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	index = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
+	if (index == IPSET_INVALID_ID)
+		return -ENOENT;
+	set = ip_set_list[index];
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_HEADER);
+	if (!nlh2)
+		goto nlmsg_failure;
+	NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(skb2, IPSET_ATTR_SETNAME, set->name);
+	NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, set->type->name);
+	NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, set->family);
+	NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, set->revision);
+	nlmsg_end(skb2, nlh2);
+
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+	kfree_skb(skb2);
+	return -EMSGSIZE;
+}
+
+/* Get type data */
+
+static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+	[IPSET_ATTR_TYPENAME]	= { .type = NLA_NUL_STRING,
+				    .len = IPSET_MAXNAMELEN - 1 },
+	[IPSET_ATTR_FAMILY]	= { .type = NLA_U8 },
+};
+
+static int
+ip_set_type(struct sock *ctnl, struct sk_buff *skb,
+	    const struct nlmsghdr *nlh,
+	    const struct nlattr * const attr[])
+{
+	struct sk_buff *skb2;
+	struct nlmsghdr *nlh2;
+	u8 family, min, max;
+	const char *typename;
+	int ret = 0;
+
+	if (unlikely(protocol_failed(attr) ||
+		     attr[IPSET_ATTR_TYPENAME] == NULL ||
+		     attr[IPSET_ATTR_FAMILY] == NULL))
+		return -IPSET_ERR_PROTOCOL;
+
+	family = nla_get_u8(attr[IPSET_ATTR_FAMILY]);
+	typename = nla_data(attr[IPSET_ATTR_TYPENAME]);
+	ret = find_set_type_minmax(typename, family, &min, &max);
+	if (ret)
+		return ret;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0,
+			 IPSET_CMD_TYPE);
+	if (!nlh2)
+		goto nlmsg_failure;
+	NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL);
+	NLA_PUT_STRING(skb2, IPSET_ATTR_TYPENAME, typename);
+	NLA_PUT_U8(skb2, IPSET_ATTR_FAMILY, family);
+	NLA_PUT_U8(skb2, IPSET_ATTR_REVISION, max);
+	NLA_PUT_U8(skb2, IPSET_ATTR_REVISION_MIN, min);
+	nlmsg_end(skb2, nlh2);
+
+	pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len);
+	ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+
+nla_put_failure:
+	nlmsg_cancel(skb2, nlh2);
+nlmsg_failure:
+	kfree_skb(skb2);
+	return -EMSGSIZE;
+}
+
+/* Get protocol version */
+
+static const struct nla_policy
+ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
+	[IPSET_ATTR_PROTOCOL]	= { .type = NLA_U8 },
+};
+
+static int
+ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh,
+		const struct nlattr * const attr[])
+{
+	struct sk_buff *skb2;
struct nlmsghdr *nlh2; + int ret = 0; + + if (unlikely(attr[IPSET_ATTR_PROTOCOL] == NULL)) + return -IPSET_ERR_PROTOCOL; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + nlh2 = start_msg(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, 0, + IPSET_CMD_PROTOCOL); + if (!nlh2) + goto nlmsg_failure; + NLA_PUT_U8(skb2, IPSET_ATTR_PROTOCOL, IPSET_PROTOCOL); + nlmsg_end(skb2, nlh2); + + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (ret < 0) + return ret; + + return 0; + +nla_put_failure: + nlmsg_cancel(skb2, nlh2); +nlmsg_failure: + kfree_skb(skb2); + return -EMSGSIZE; +} + +static const struct nfnl_callback ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { + [IPSET_CMD_CREATE] = { + .call = ip_set_create, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_create_policy, + }, + [IPSET_CMD_DESTROY] = { + .call = ip_set_destroy, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_FLUSH] = { + .call = ip_set_flush, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_RENAME] = { + .call = ip_set_rename, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_SWAP] = { + .call = ip_set_swap, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname2_policy, + }, + [IPSET_CMD_LIST] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_SAVE] = { + .call = ip_set_dump, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_ADD] = { + .call = ip_set_uadd, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_DEL] = { + .call = ip_set_udel, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_TEST] = { + .call = ip_set_utest, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_adt_policy, + }, + [IPSET_CMD_HEADER] = { + .call = ip_set_header, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_setname_policy, + }, + [IPSET_CMD_TYPE] = { + .call = ip_set_type, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_type_policy, + }, + [IPSET_CMD_PROTOCOL] = { + .call = ip_set_protocol, + .attr_count = IPSET_ATTR_CMD_MAX, + .policy = ip_set_protocol_policy, + }, +}; + +static struct nfnetlink_subsystem ip_set_netlink_subsys __read_mostly = { + .name = "ip_set", + .subsys_id = NFNL_SUBSYS_IPSET, + .cb_count = IPSET_MSG_MAX, + .cb = ip_set_netlink_subsys_cb, +}; + +/* Interface to iptables/ip6tables */ + +static int +ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) +{ + unsigned *op; + void *data; + int copylen = *len, ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optval != SO_IP_SET) + return -EBADF; + if (*len < sizeof(unsigned)) + return -EINVAL; + + data = vmalloc(*len); + if (!data) + return -ENOMEM; + if (copy_from_user(data, user, *len) != 0) { + ret = -EFAULT; + goto done; + } + op = (unsigned *) data; + + if (*op < IP_SET_OP_VERSION) { + /* Check the version at the beginning of operations */ + struct ip_set_req_version *req_version = data; + if (req_version->version != IPSET_PROTOCOL) { + ret = -EPROTO; + goto done; + } + } + + switch (*op) { + case IP_SET_OP_VERSION: { + struct ip_set_req_version *req_version = data; + + if (*len != sizeof(struct ip_set_req_version)) { + ret = -EINVAL; + goto done; + } + + req_version->version = IPSET_PROTOCOL; + ret = copy_to_user(user, req_version, + sizeof(struct 
ip_set_req_version)); + goto done; + } + case IP_SET_OP_GET_BYNAME: { + struct ip_set_req_get_set *req_get = data; + + if (*len != sizeof(struct ip_set_req_get_set)) { + ret = -EINVAL; + goto done; + } + req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0'; + nfnl_lock(); + req_get->set.index = find_set_id(req_get->set.name); + nfnl_unlock(); + goto copy; + } + case IP_SET_OP_GET_BYINDEX: { + struct ip_set_req_get_set *req_get = data; + + if (*len != sizeof(struct ip_set_req_get_set) || + req_get->set.index >= ip_set_max) { + ret = -EINVAL; + goto done; + } + nfnl_lock(); + strncpy(req_get->set.name, + ip_set_list[req_get->set.index] + ? ip_set_list[req_get->set.index]->name : "", + IPSET_MAXNAMELEN); + nfnl_unlock(); + goto copy; + } + default: + ret = -EBADMSG; + goto done; + } /* end of switch(op) */ + +copy: + ret = copy_to_user(user, data, copylen); + +done: + vfree(data); + if (ret > 0) + ret = 0; + return ret; +} + +static struct nf_sockopt_ops so_set __read_mostly = { + .pf = PF_INET, + .get_optmin = SO_IP_SET, + .get_optmax = SO_IP_SET + 1, + .get = &ip_set_sockfn_get, + .owner = THIS_MODULE, +}; + +static int __init +ip_set_init(void) +{ + int ret; + + if (max_sets) + ip_set_max = max_sets; + if (ip_set_max >= IPSET_INVALID_ID) + ip_set_max = IPSET_INVALID_ID - 1; + + ip_set_list = kzalloc(sizeof(struct ip_set *) * ip_set_max, + GFP_KERNEL); + if (!ip_set_list) + return -ENOMEM; + + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + if (ret != 0) { + pr_err("ip_set: cannot register with nfnetlink.\n"); + kfree(ip_set_list); + return ret; + } + ret = nf_register_sockopt(&so_set); + if (ret != 0) { + pr_err("SO_SET registry failed: %d\n", ret); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + kfree(ip_set_list); + return ret; + } + + pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL); + return 0; +} + +static void __exit +ip_set_fini(void) +{ + /* There can't be any existing set */ + nf_unregister_sockopt(&so_set); + nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + kfree(ip_set_list); + pr_debug("these are the famous last words\n"); +} + +module_init(ip_set_init); +module_exit(ip_set_fini); diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c new file mode 100644 index 00000000..6fdf88ae --- /dev/null +++ b/net/netfilter/ipset/ip_set_getport.c @@ -0,0 +1,158 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Get Layer-4 data from the packets */ + +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/sctp.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/export.h> + +/* We must handle non-linear skbs */ +static bool +get_port(const struct sk_buff *skb, int protocol, unsigned int protooff, + bool src, __be16 *port, u8 *proto) +{ + switch (protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph; + const struct tcphdr *th; + + th = skb_header_pointer(skb, protooff, sizeof(_tcph), &_tcph); + if (th == NULL) + /* No choice either */ + return false; + + *port = src ? 
th->source : th->dest; + break; + } + case IPPROTO_SCTP: { + sctp_sctphdr_t _sh; + const sctp_sctphdr_t *sh; + + sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh); + if (sh == NULL) + /* No choice either */ + return false; + + *port = src ? sh->source : sh->dest; + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + struct udphdr _udph; + const struct udphdr *uh; + + uh = skb_header_pointer(skb, protooff, sizeof(_udph), &_udph); + if (uh == NULL) + /* No choice either */ + return false; + + *port = src ? uh->source : uh->dest; + break; + } + case IPPROTO_ICMP: { + struct icmphdr _ich; + const struct icmphdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16)htons((ic->type << 8) | ic->code); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _ich; + const struct icmp6hdr *ic; + + ic = skb_header_pointer(skb, protooff, sizeof(_ich), &_ich); + if (ic == NULL) + return false; + + *port = (__force __be16) + htons((ic->icmp6_type << 8) | ic->icmp6_code); + break; + } + default: + break; + } + *proto = protocol; + + return true; +} + +bool +ip_set_get_ip4_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + const struct iphdr *iph = ip_hdr(skb); + unsigned int protooff = ip_hdrlen(skb); + int protocol = iph->protocol; + + /* See comments at tcp_match in ip_tables.c */ + if (protocol <= 0 || (ntohs(iph->frag_off) & IP_OFFSET)) + return false; + + return get_port(skb, protocol, protooff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip4_port); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +bool +ip_set_get_ip6_port(const struct sk_buff *skb, bool src, + __be16 *port, u8 *proto) +{ + int protoff; + u8 nexthdr; + __be16 frag_off; + + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (protoff < 0) + return false; + + return get_port(skb, nexthdr, protoff, src, port, proto); +} +EXPORT_SYMBOL_GPL(ip_set_get_ip6_port); +#endif + +bool +ip_set_get_ip_port(const struct sk_buff *skb, u8 pf, bool src, __be16 *port) +{ + bool ret; + u8 proto; + + switch (pf) { + case NFPROTO_IPV4: + ret = ip_set_get_ip4_port(skb, src, port, &proto); + break; + case NFPROTO_IPV6: + ret = ip_set_get_ip6_port(skb, src, port, &proto); + break; + default: + return false; + } + if (!ret) + return ret; + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + return true; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(ip_set_get_ip_port); diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c new file mode 100644 index 00000000..828ce46c --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -0,0 +1,485 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:ip type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip"); + +/* Type specific function prefix */ +#define TYPE hash_ip + +static bool +hash_ip_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ip4_same_set hash_ip_same_set +#define hash_ip6_same_set hash_ip_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ip4_elem { + __be32 ip; +}; + +/* Member elements with timeout support */ +struct hash_ip4_telem { + __be32 ip; + unsigned long timeout; +}; + +static inline bool +hash_ip4_data_equal(const struct hash_ip4_elem *ip1, + const struct hash_ip4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip; +} + +static inline bool +hash_ip4_data_isnull(const struct hash_ip4_elem *elem) +{ + return elem->ip == 0; +} + +static inline void +hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src) +{ + dst->ip = src->ip; +} + +/* Zero valued IP addresses cannot be stored */ +static inline void +hash_ip4_data_zero_out(struct hash_ip4_elem *elem) +{ + elem->ip = 0; +} + +static inline bool +hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data) +{ + const struct hash_ip4_telem *tdata = + (const struct hash_ip4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_NETMASK +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d) +{ + h->next.ip = ntohl(d->ip); +} + +static int +hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + __be32 ip; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); + ip &= ip_set_netmask(h->netmask); + if (ip == 0) + return -EINVAL; + + return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + u32 ip, ip_to, hosts, timeout = h->timeout; + __be32 nip; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); 
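+	/* "ip" is in host byte order from here on; it is converted
+	 * back to network order with htonl() before insertion */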
+ if (ret) + return ret; + + ip &= ip_set_hostmask(h->netmask); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST) { + nip = htonl(ip); + if (nip == 0) + return -IPSET_ERR_HASH_ELEM; + return adtfn(set, &nip, timeout, flags); + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } else + ip_to = ip; + + hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + + if (retried) + ip = h->next.ip; + for (; !before(ip_to, ip); ip += hosts) { + nip = htonl(ip); + if (nip == 0) + return -IPSET_ERR_HASH_ELEM; + ret = adtfn(set, &nip, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +static bool +hash_ip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout && + x->netmask == y->netmask; +} + +/* The type variant functions: IPv6 */ + +struct hash_ip6_elem { + union nf_inet_addr ip; +}; + +struct hash_ip6_telem { + union nf_inet_addr ip; + unsigned long timeout; +}; + +static inline bool +hash_ip6_data_equal(const struct hash_ip6_elem *ip1, + const struct hash_ip6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0; +} + +static inline bool +hash_ip6_data_isnull(const struct hash_ip6_elem *elem) +{ + return ipv6_addr_any(&elem->ip.in6); +} + +static inline void +hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src) +{ + dst->ip.in6 = src->ip.in6; +} + +static inline void +hash_ip6_data_zero_out(struct hash_ip6_elem *elem) +{ + ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0); +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static bool +hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data) +{ + const struct hash_ip6_telem *e = + (const struct hash_ip6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d) +{ +} + +static int +hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + union nf_inet_addr ip; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6); + ip6_netmask(&ip, h->netmask); + if (ipv6_addr_any(&ip.in6)) + 
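		/* An all-zero address cannot be stored in the set */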
return -EINVAL; + + return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags); +} + +static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, +}; + +static int +hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + union nf_inet_addr ip; + u32 timeout = h->timeout; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ip6_netmask(&ip, h->netmask); + if (ipv6_addr_any(&ip.in6)) + return -IPSET_ERR_HASH_ELEM; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + ret = adtfn(set, &ip, timeout, flags); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 netmask, hbits; + size_t hsize; + struct ip_set_hash *h; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + if (tb[IPSET_ATTR_NETMASK]) { + netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); + + if ((set->family == NFPROTO_IPV4 && netmask > 32) || + (set->family == NFPROTO_IPV6 && netmask > 128) || + netmask == 0) + return -IPSET_ERR_INVALID_NETMASK; + } + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + h->netmask = netmask; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_ip4_tvariant : &hash_ip6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_ip4_gc_init(set); + else + hash_ip6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
&hash_ip4_variant : &hash_ip6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ip_type __read_mostly = { + .name = "hash:ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 0, + .create = hash_ip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ip_init(void) +{ + return ip_set_type_register(&hash_ip_type); +} + +static void __exit +hash_ip_fini(void) +{ + ip_set_type_unregister(&hash_ip_type); +} + +module_init(hash_ip_init); +module_exit(hash_ip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c new file mode 100644 index 00000000..e8dbb498 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -0,0 +1,559 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:ip,port type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port"); + +/* Type specific function prefix */ +#define TYPE hash_ipport + +static bool +hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipport4_same_set hash_ipport_same_set +#define hash_ipport6_same_set hash_ipport_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ipport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Member elements with timeout support */ +struct hash_ipport4_telem { + __be32 ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1, + const struct hash_ipport4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipport4_data_copy(struct hash_ipport4_elem *dst, + const struct hash_ipport4_elem *src) +{ + dst->ip = src->ip; + dst->port = src->port; + dst->proto = src->proto; +} + +static inline void +hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipport4_data_list(struct sk_buff *skb, + const struct hash_ipport4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipport4_data_tlist(struct sk_buff *skb, + const struct hash_ipport4_elem *data) +{ + const struct hash_ipport4_telem *tdata = + (const struct hash_ipport4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipport4_data_next(struct ip_set_hash *h, + const struct hash_ipport4_elem *d) +{ + h->next.ip = ntohl(d->ip); + h->next.port = ntohs(d->port); +} + +static int +hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem data = { }; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + + return adtfn(set, 
&data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport4_elem data = { }; + u32 ip, ip_to = 0, p = 0, port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip = ntohl(data.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } else + ip_to = ip; + + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + if (retried) + ip = h->next.ip; + for (; !before(ip_to, ip); ip++) { + p = retried && ip == h->next.ip ? 
h->next.port : port; + for (; p <= port_to; p++) { + data.ip = htonl(ip); + data.port = htons(p); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + } + return ret; +} + +static bool +hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; +}; + +struct hash_ipport6_telem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1, + const struct hash_ipport6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipport6_data_copy(struct hash_ipport6_elem *dst, + const struct hash_ipport6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipport6_data_list(struct sk_buff *skb, + const struct hash_ipport6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipport6_data_tlist(struct sk_buff *skb, + const struct hash_ipport6_elem *data) +{ + const struct hash_ipport6_telem *e = + (const struct hash_ipport6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipport6_data_next(struct ip_set_hash *h, + const struct hash_ipport6_elem *d) +{ + h->next.port = ntohs(d->port); +} + +static int +hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport6_elem data = { }; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipport6_elem data = { }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, 
IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = h->next.port; + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_ipport4_tvariant : &hash_ipport6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_ipport4_gc_init(set); + else + hash_ipport6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
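+ /* Descriptive note (added): no default timeout was given, so
+ * pick the plain variant without per-element timeouts, and no
+ * garbage-collection timer is started. */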
&hash_ipport4_variant : &hash_ipport6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipport_type __read_mostly = { + .name = "hash:ip,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 1, /* SCTP and UDPLITE support added */ + .create = hash_ipport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipport_init(void) +{ + return ip_set_type_register(&hash_ipport_type); +} + +static void __exit +hash_ipport_fini(void) +{ + ip_set_type_unregister(&hash_ipport_type); +} + +module_init(hash_ipport_init); +module_exit(hash_ipport_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c new file mode 100644 index 00000000..52f79d8e --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -0,0 +1,577 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:ip,port,ip type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:ip,port,ip type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port,ip"); + +/* Type specific function prefix */ +#define TYPE hash_ipportip + +static bool +hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipportip4_same_set hash_ipportip_same_set +#define hash_ipportip6_same_set hash_ipportip_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_ipportip4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +/* Member elements with timeout support */ +struct hash_ipportip4_telem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, + const struct hash_ipportip4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst, + const struct hash_ipportip4_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportip4_data_list(struct sk_buff *skb, + const struct hash_ipportip4_elem *data) +{ + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportip4_data_tlist(struct sk_buff *skb, + const struct hash_ipportip4_elem *data) +{ + const struct hash_ipportip4_telem *tdata = + (const struct hash_ipportip4_telem *)data; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipportip4_data_next(struct ip_set_hash *h, + const struct hash_ipportip4_elem *d) +{ + h->next.ip = ntohl(d->ip); + h->next.port = ntohs(d->port); +} + +static int +hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem data = { }; + + if 
(!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip4_elem data = { }; + u32 ip, ip_to = 0, p = 0, port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || + tb[IPSET_ATTR_PORT_TO])) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + ip = ntohl(data.ip); + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } else + ip_to = ip; + + port_to = port = ntohs(data.port); + if (with_ports && tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + + if (retried) + ip = h->next.ip; + for (; !before(ip_to, ip); ip++) { + p = retried && ip == h->next.ip ? 
h->next.port : port; + for (; p <= port_to; p++) { + data.ip = htonl(ip); + data.port = htons(p); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + } + return ret; +} + +static bool +hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipportip6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; +}; + +struct hash_ipportip6_telem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 proto; + u8 padding; + unsigned long timeout; +}; + +static inline bool +hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, + const struct hash_ipportip6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst, + const struct hash_ipportip6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportip6_data_list(struct sk_buff *skb, + const struct hash_ipportip6_elem *data) +{ + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportip6_data_tlist(struct sk_buff *skb, + const struct hash_ipportip6_elem *data) +{ + const struct hash_ipportip6_telem *e = + (const struct hash_ipportip6_telem *)data; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipportip6_data_next(struct ip_set_hash *h, + const struct hash_ipportip6_elem *d) +{ + h->next.port = ntohs(d->port); +} + +static int +hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportip6_elem data = { }; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = 
set->variant->adt[adt]; + struct hash_ipportip6_elem data = { }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = h->next.port; + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_ipportip4_gc_init(set); + else + hash_ipportip6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
&hash_ipportip4_variant : &hash_ipportip6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipportip_type __read_mostly = { + .name = "hash:ip,port,ip", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 1, /* SCTP and UDPLITE support added */ + .create = hash_ipportip_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportip_init(void) +{ + return ip_set_type_register(&hash_ipportip_type); +} + +static void __exit +hash_ipportip_fini(void) +{ + ip_set_type_unregister(&hash_ipportip_type); +} + +module_init(hash_ipportip_init); +module_exit(hash_ipportip_fini); diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c new file mode 100644 index 00000000..97583f5a --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -0,0 +1,734 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:ip,port,net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:ip,port,net type of IP sets"); +MODULE_ALIAS("ip_set_hash:ip,port,net"); + +/* Type specific function prefix */ +#define TYPE hash_ipportnet + +static bool +hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_ipportnet4_same_set hash_ipportnet_same_set +#define hash_ipportnet6_same_set hash_ipportnet_same_set + +/* The type variant functions: IPv4 */ + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
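+ * For example, adding a /24 network stores cidr = 23 in the
+ * element and lists it back to userspace as 24 again.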
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED + +/* Member elements without timeout */ +struct hash_ipportnet4_elem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; + +/* Member elements with timeout support */ +struct hash_ipportnet4_telem { + __be32 ip; + __be32 ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + unsigned long timeout; +}; + +static inline bool +hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, + const struct hash_ipportnet4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->ip2 == ip2->ip2 && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst, + const struct hash_ipportnet4_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportnet4_data_flags(struct hash_ipportnet4_elem *dst, u32 flags) +{ + dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); +} + +static inline bool +hash_ipportnet4_data_match(const struct hash_ipportnet4_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) +{ + elem->ip2 &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; +} + +static inline void +hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_ipportnet4_data_list(struct sk_buff *skb, + const struct hash_ipportnet4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportnet4_data_tlist(struct sk_buff *skb, + const struct hash_ipportnet4_elem *data) +{ + const struct hash_ipportnet4_telem *tdata = + (const struct hash_ipportnet4_telem *)data; + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP2, tdata->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipportnet4_data_next(struct ip_set_hash *h, + const struct hash_ipportnet4_elem *d) +{ + h->next.ip = ntohl(d->ip); + h->next.port = ntohs(d->port); + h->next.ip2 = ntohl(d->ip2); +} + +static int +hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem data = { + .cidr = h->nets[0].cidr ? 
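+ /* Descriptive note (added): seed the lookup with the first
+ * stored prefix length, kept in the packed cidr - 1 form noted
+ * above; fall back to a host-width match while the set holds
+ * no networks yet. */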
h->nets[0].cidr - 1 : HOST_MASK - 1 + }; + + if (adt == IPSET_TEST) + data.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2); + data.ip2 &= ip_set_netmask(data.cidr + 1); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet4_elem data = { .cidr = HOST_MASK - 1 }; + u32 ip, ip_to = 0, p = 0, port, port_to; + u32 ip2_from = 0, ip2_to, ip2_last, ip2; + u32 timeout = h->timeout; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + data.cidr = cidr - 1; + } + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + if (adt == IPSET_TEST || + !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || + tb[IPSET_ATTR_IP2_TO])) { + data.ip = htonl(ip); + data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr + 1)); + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 
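+ /* Descriptive note (added): -IPSET_ERR_EXIST is treated as
+ * success when userspace passed the IPSET_FLAG_EXIST flag
+ * (the ipset -exist option). */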
0 : ret; + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip > ip_to) + swap(ip, ip_to); + } else if (tb[IPSET_ATTR_CIDR]) { + u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (cidr > 32) + return -IPSET_ERR_INVALID_CIDR; + ip_set_mask_from_to(ip, ip_to, cidr); + } + + port_to = port = ntohs(data.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + } + if (tb[IPSET_ATTR_IP2_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); + if (ret) + return ret; + if (ip2_from > ip2_to) + swap(ip2_from, ip2_to); + if (ip2_from + UINT_MAX == ip2_to) + return -IPSET_ERR_HASH_RANGE; + } else { + ip_set_mask_from_to(ip2_from, ip2_to, data.cidr + 1); + } + + if (retried) + ip = h->next.ip; + for (; !before(ip_to, ip); ip++) { + data.ip = htonl(ip); + p = retried && ip == h->next.ip ? h->next.port : port; + for (; p <= port_to; p++) { + data.port = htons(p); + ip2 = retried && ip == h->next.ip && p == h->next.port + ? h->next.ip2 : ip2_from; + while (!after(ip2, ip2_to)) { + data.ip2 = htonl(ip2); + ip2_last = ip_set_range_to_cidr(ip2, ip2_to, + &cidr); + data.cidr = cidr - 1; + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip2 = ip2_last + 1; + } + } + } + return ret; +} + +static bool +hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_ipportnet6_elem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; +}; + +struct hash_ipportnet6_telem { + union nf_inet_addr ip; + union nf_inet_addr ip2; + __be16 port; + u8 cidr:7; + u8 nomatch:1; + u8 proto; + unsigned long timeout; +}; + +static inline bool +hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, + const struct hash_ipportnet6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ipv6_addr_cmp(&ip1->ip2.in6, &ip2->ip2.in6) == 0 && + ip1->cidr == ip2->cidr && + ip1->port == ip2->port && + ip1->proto == ip2->proto; +} + +static inline bool +hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst, + const struct hash_ipportnet6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_ipportnet6_data_flags(struct hash_ipportnet6_elem *dst, u32 flags) +{ + dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); +} + +static inline bool +hash_ipportnet6_data_match(const struct hash_ipportnet6_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem) +{ + elem->proto = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip2, cidr); + elem->cidr = cidr - 1; +} + +static bool 
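+ /* Descriptive note (added): dump one IPv6 element; the packed
+ * cidr is re-inflated to cidr + 1 for userspace, mirroring the
+ * IPv4 variant above. */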
+hash_ipportnet6_data_list(struct sk_buff *skb, + const struct hash_ipportnet6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_ipportnet6_data_tlist(struct sk_buff *skb, + const struct hash_ipportnet6_elem *data) +{ + const struct hash_ipportnet6_telem *e = + (const struct hash_ipportnet6_telem *)data; + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP2, &data->ip2); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR2, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_ipportnet6_data_next(struct ip_set_hash *h, + const struct hash_ipportnet6_elem *d) +{ + h->next.port = ntohs(d->port); +} + +static int +hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem data = { + .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1 + }; + + if (adt == IPSET_TEST) + data.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6); + ip6_netmask(&data.ip2, data.cidr + 1); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_ipportnet6_elem data = { .cidr = HOST_MASK - 1 }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) || + tb[IPSET_ATTR_IP_TO] || + tb[IPSET_ATTR_CIDR])) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR2]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + data.cidr = cidr - 1; + } + + ip6_netmask(&data.ip2, data.cidr + 1); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = h->next.port; + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_ipportnet4_tvariant + : &hash_ipportnet6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_ipportnet4_gc_init(set); + else + hash_ipportnet6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
&hash_ipportnet4_variant : &hash_ipportnet6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_ipportnet_type __read_mostly = { + .name = "hash:ip,port,net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, + .dimension = IPSET_DIM_THREE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + /* 1 SCTP and UDPLITE support added */ + /* 2 Range as input support for IPv4 added */ + .revision_max = 3, /* nomatch flag support added */ + .create = hash_ipportnet_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_ipportnet_init(void) +{ + return ip_set_type_register(&hash_ipportnet_type); +} + +static void __exit +hash_ipportnet_fini(void) +{ + ip_set_type_unregister(&hash_ipportnet_type); +} + +module_init(hash_ipportnet_init); +module_exit(hash_ipportnet_fini); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c new file mode 100644 index 00000000..1721cdec --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -0,0 +1,569 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:net type of IP sets"); +MODULE_ALIAS("ip_set_hash:net"); + +/* Type specific function prefix */ +#define TYPE hash_net + +static bool +hash_net_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_net4_same_set hash_net_same_set +#define hash_net6_same_set hash_net_same_set + +/* The type variant functions: IPv4 */ + +/* Member elements without timeout */ +struct hash_net4_elem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; + +/* Member elements with timeout support */ +struct hash_net4_telem { + __be32 ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_net4_data_equal(const struct hash_net4_elem *ip1, + const struct hash_net4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_net4_data_isnull(const struct hash_net4_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_net4_data_copy(struct hash_net4_elem *dst, + const struct hash_net4_elem *src) +{ + dst->ip = src->ip; + dst->cidr = src->cidr; + dst->nomatch = src->nomatch; +} + +static inline void +hash_net4_data_flags(struct hash_net4_elem *dst, u32 flags) +{ + dst->nomatch = flags & IPSET_FLAG_NOMATCH; +} + +static inline bool +hash_net4_data_match(const struct hash_net4_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +/* Zero CIDR values cannot be stored */ +static inline void +hash_net4_data_zero_out(struct hash_net4_elem *elem) +{ + elem->cidr = 0; +} + +static bool +hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data) +{ + const struct hash_net4_telem *tdata = + (const struct hash_net4_telem *)data; + u32 flags = data->nomatch ? 
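+ /* Descriptive note (added): translate the element's nomatch
+ * bit back into a CADT flag so that listing and restoring a
+ * set round-trip it. */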
IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, tdata->cidr); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_net4_data_next(struct ip_set_hash *h, + const struct hash_net4_elem *d) +{ + h->next.ip = ntohl(d->ip); +} + +static int +hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + data.ip &= ip_set_netmask(data.cidr); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net4_elem data = { .cidr = HOST_MASK }; + u32 timeout = h->timeout; + u32 ip = 0, ip_to, last; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!data.cidr || data.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { + data.ip = htonl(ip & ip_set_hostmask(data.cidr)); + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + ip_to = ip; + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } + if (retried) + ip = h->next.ip; + while (!after(ip, ip_to)) { + data.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); + ret = adtfn(set, &data, timeout, flags); + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip = last + 1; + } + return ret; +} + +static bool +hash_net_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_net6_elem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; +}; + +struct hash_net6_telem { + union nf_inet_addr ip; + u16 padding0; + u8 nomatch; + u8 cidr; + unsigned long timeout; +}; + +static inline bool +hash_net6_data_equal(const struct hash_net6_elem *ip1, + const struct hash_net6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_net6_data_isnull(const struct hash_net6_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_net6_data_copy(struct hash_net6_elem *dst, + const struct hash_net6_elem *src) +{ + dst->ip.in6 = src->ip.in6; + dst->cidr = src->cidr; + dst->nomatch = src->nomatch; +} + +static inline void +hash_net6_data_flags(struct hash_net6_elem *dst, u32 flags) +{ + dst->nomatch = flags & IPSET_FLAG_NOMATCH; +} + +static inline bool +hash_net6_data_match(const struct hash_net6_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_net6_data_zero_out(struct hash_net6_elem *elem) +{ + elem->cidr = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_net6_data_netmask(struct hash_net6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_net6_data_list(struct sk_buff *skb, const struct hash_net6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data) +{ + const struct hash_net6_telem *e = + (const struct hash_net6_telem *)data; + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, e->cidr); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_net6_data_next(struct ip_set_hash *h, + const struct hash_net6_elem *d) +{ +} + +static int +hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6_netmask(&data.ip, data.cidr); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_net6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_net6_elem data = { .cidr = HOST_MASK }; + u32 timeout = h->timeout; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + + if (!data.cidr || data.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + + ip6_netmask(&data.ip, data.cidr); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + ret = adtfn(set, &data, timeout, flags); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + struct ip_set_hash *h; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == NFPROTO_IPV4 ? 
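+ /* Descriptive note (added): allocate one ip_set_hash_nets
+ * bookkeeping slot per possible prefix length: 32 for IPv4,
+ * 128 for IPv6. */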
32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_net4_tvariant : &hash_net6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_net4_gc_init(set); + else + hash_net6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? &hash_net4_variant : &hash_net6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_net_type __read_mostly = { + .name = "hash:net", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + /* = 1 Range as input support for IPv4 added */ + .revision_max = 2, /* nomatch flag support added */ + .create = hash_net_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_net_init(void) +{ + return ip_set_type_register(&hash_net_type); +} + +static void __exit +hash_net_fini(void) +{ + ip_set_type_unregister(&hash_net_type); +} + +module_init(hash_net_init); +module_exit(hash_net_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c new file mode 100644 index 00000000..33bafc97 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -0,0 +1,836 @@ +/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the hash:net,iface type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <linux/rbtree.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:net,iface type of IP sets"); +MODULE_ALIAS("ip_set_hash:net,iface"); + +/* Interface name rbtree */ + +struct iface_node { + struct rb_node node; + char iface[IFNAMSIZ]; +}; + +#define iface_data(n) (rb_entry(n, struct iface_node, node)->iface) + +static inline long +ifname_compare(const char *_a, const char *_b) +{ + const long *a = (const long *)_a; + const long *b = (const long *)_b; + + BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); + if (a[0] != b[0]) + return a[0] - b[0]; + if (IFNAMSIZ > sizeof(long)) { + if (a[1] != b[1]) + return a[1] - b[1]; + } + if (IFNAMSIZ > 2 * sizeof(long)) { + if (a[2] != b[2]) + return a[2] - b[2]; + } + if (IFNAMSIZ > 3 * sizeof(long)) { + if (a[3] != b[3]) + return a[3] - b[3]; + } + return 0; +} + +static void +rbtree_destroy(struct rb_root *root) +{ + struct rb_node *p, *n = root->rb_node; + struct iface_node *node; + + /* Non-recursive destroy, like in ext3 */ + while (n) { + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + p = rb_parent(n); + node = rb_entry(n, struct iface_node, node); + if (!p) + *root = RB_ROOT; + else if (p->rb_left == n) + p->rb_left = NULL; + else if (p->rb_right == n) + p->rb_right = NULL; + + kfree(node); + n = p; + } +} + +static int +iface_test(struct rb_root *root, const char **iface) +{ + struct rb_node *n = root->rb_node; + + while (n) { + const char *d = iface_data(n); + long res = ifname_compare(*iface, d); + + if (res < 0) + n = n->rb_left; + else if (res > 0) + n = n->rb_right; + else { + *iface = d; + return 1; + } + } + return 0; +} + +static int +iface_add(struct rb_root *root, const char **iface) +{ + struct rb_node **n = &(root->rb_node), *p = NULL; + struct iface_node *d; + + while (*n) { + char *ifname = iface_data(*n); + long res = ifname_compare(*iface, ifname); + + p = *n; + if (res < 0) + n = &((*n)->rb_left); + else if (res > 0) + n = &((*n)->rb_right); + else { + *iface = ifname; + return 0; + } + } + + d = kzalloc(sizeof(*d), GFP_ATOMIC); + if (!d) + return -ENOMEM; + strcpy(d->iface, *iface); + + rb_link_node(&d->node, p, n); + rb_insert_color(&d->node, root); + + *iface = d->iface; + return 0; +} + +/* Type specific function prefix */ +#define TYPE hash_netiface + +static bool +hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_netiface4_same_set hash_netiface_same_set +#define hash_netiface6_same_set hash_netiface_same_set + +#define STREQ(a, b) (strcmp(a, b) == 0) + +/* The type variant functions: IPv4 */ + +struct hash_netiface4_elem_hashed { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; +}; + +#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) + +/* Member elements without timeout */ +struct hash_netiface4_elem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; + const char *iface; +}; + +/* 
Member elements with timeout support */ +struct hash_netiface4_telem { + __be32 ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; + const char *iface; + unsigned long timeout; +}; + +static inline bool +hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, + const struct hash_netiface4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->cidr == ip2->cidr && + (++*multi) && + ip1->physdev == ip2->physdev && + ip1->iface == ip2->iface; +} + +static inline bool +hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_netiface4_data_copy(struct hash_netiface4_elem *dst, + const struct hash_netiface4_elem *src) +{ + dst->ip = src->ip; + dst->cidr = src->cidr; + dst->physdev = src->physdev; + dst->iface = src->iface; + dst->nomatch = src->nomatch; +} + +static inline void +hash_netiface4_data_flags(struct hash_netiface4_elem *dst, u32 flags) +{ + dst->nomatch = flags & IPSET_FLAG_NOMATCH; +} + +static inline bool +hash_netiface4_data_match(const struct hash_netiface4_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr; +} + +static inline void +hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem) +{ + elem->cidr = 0; +} + +static bool +hash_netiface4_data_list(struct sk_buff *skb, + const struct hash_netiface4_elem *data) +{ + u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netiface4_data_tlist(struct sk_buff *skb, + const struct hash_netiface4_elem *data) +{ + const struct hash_netiface4_telem *tdata = + (const struct hash_netiface4_telem *)data; + u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_RBTREE +#define IP_SET_HASH_WITH_MULTI + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_netiface4_data_next(struct ip_set_hash *h, + const struct hash_netiface4_elem *d) +{ + h->next.ip = ntohl(d->ip); +} + +static int +hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface4_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + int ret; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + data.ip &= ip_set_netmask(data.cidr); + +#define IFACE(dir) (par->dir ? par->dir->name : NULL) +#define PHYSDEV(dir) (nf_bridge->dir ? 
nf_bridge->dir->name : NULL) +#define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) + + if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER + const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (!nf_bridge) + return -EINVAL; + data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + data.physdev = 1; +#else + data.iface = NULL; +#endif + } else + data.iface = SRCDIR ? IFACE(in) : IFACE(out); + + if (!data.iface) + return -EINVAL; + ret = iface_test(&h->rbtree, &data.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &data.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface4_elem data = { .cidr = HOST_MASK }; + u32 ip = 0, ip_to, last; + u32 timeout = h->timeout; + char iface[IFNAMSIZ] = {}; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !tb[IPSET_ATTR_IFACE] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!data.cidr || data.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + } + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); + data.iface = iface; + ret = iface_test(&h->rbtree, &data.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &data.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) + data.physdev = 1; + if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) + flags |= (cadt_flags << 16); + } + + if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { + data.ip = htonl(ip & ip_set_hostmask(data.cidr)); + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } else { + ip_set_mask_from_to(ip, ip_to, data.cidr); + } + + if (retried) + ip = h->next.ip; + while (!after(ip, ip_to)) { + data.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &data.cidr); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + ip = last + 1; + } + return ret; +} + +static bool +hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_netiface6_elem_hashed { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; +}; + +#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) + +struct hash_netiface6_elem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; + const char *iface; +}; + +struct hash_netiface6_telem { + union nf_inet_addr ip; + u8 physdev; + u8 cidr; + u8 nomatch; + u8 padding; + const char *iface; + unsigned long timeout; +}; + +static inline bool +hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, + const struct hash_netiface6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->cidr == ip2->cidr && + (++*multi) && + ip1->physdev == ip2->physdev && + ip1->iface == ip2->iface; +} + +static inline bool +hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem) +{ + return elem->cidr == 0; +} + +static inline void +hash_netiface6_data_copy(struct hash_netiface6_elem *dst, + const struct hash_netiface6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_netiface6_data_flags(struct hash_netiface6_elem *dst, u32 flags) +{ + dst->nomatch = flags & IPSET_FLAG_NOMATCH; +} + +static inline bool +hash_netiface6_data_match(const struct hash_netiface6_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem) +{ + elem->cidr = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr; +} + +static bool +hash_netiface6_data_list(struct sk_buff *skb, + const struct hash_netiface6_elem *data) +{ + u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netiface6_data_tlist(struct sk_buff *skb, + const struct hash_netiface6_elem *data) +{ + const struct hash_netiface6_telem *e = + (const struct hash_netiface6_telem *)data; + u32 flags = data->physdev ? 
IPSET_FLAG_PHYSDEV : 0; + + if (data->nomatch) + flags |= IPSET_FLAG_NOMATCH; + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr); + NLA_PUT_STRING(skb, IPSET_ATTR_IFACE, data->iface); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_netiface6_data_next(struct ip_set_hash *h, + const struct hash_netiface6_elem *d) +{ +} + +static int +hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK + }; + int ret; + + if (data.cidr == 0) + return -EINVAL; + if (adt == IPSET_TEST) + data.cidr = HOST_MASK; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6_netmask(&data.ip, data.cidr); + + if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { +#ifdef CONFIG_BRIDGE_NETFILTER + const struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (!nf_bridge) + return -EINVAL; + data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev); + data.physdev = 1; +#else + data.iface = NULL; +#endif + } else + data.iface = SRCDIR ? IFACE(in) : IFACE(out); + + if (!data.iface) + return -EINVAL; + ret = iface_test(&h->rbtree, &data.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &data.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netiface6_elem data = { .cidr = HOST_MASK }; + u32 timeout = h->timeout; + char iface[IFNAMSIZ] = {}; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !tb[IPSET_ATTR_IFACE] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) + data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!data.cidr || data.cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + ip6_netmask(&data.ip, data.cidr); + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE])); + data.iface = iface; + ret = iface_test(&h->rbtree, &data.iface); + if (adt == IPSET_ADD) { + if (!ret) { + ret = iface_add(&h->rbtree, &data.iface); + if (ret) + return ret; + } + } else if (!ret) + return ret; + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_PHYSDEV) + data.physdev = 1; + if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH)) + flags |= (cadt_flags << 16); + } + + ret 
= adtfn(set, &data, timeout, flags); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + h->ahash_max = AHASH_MAX_SIZE; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + h->rbtree = RB_ROOT; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_netiface4_tvariant : &hash_netiface6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_netiface4_gc_init(set); + else + hash_netiface6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
&hash_netiface4_variant : &hash_netiface6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_netiface_type __read_mostly = { + .name = "hash:net,iface", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 1, /* nomatch flag support added */ + .create = hash_netiface_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING, + .len = IPSET_MAXNAMELEN - 1 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netiface_init(void) +{ + return ip_set_type_register(&hash_netiface_type); +} + +static void __exit +hash_netiface_fini(void) +{ + ip_set_type_unregister(&hash_netiface_type); +} + +module_init(hash_netiface_init); +module_exit(hash_netiface_fini); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c new file mode 100644 index 00000000..3a5e1986 --- /dev/null +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -0,0 +1,687 @@ +/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module implementing an IP set type: the hash:net,port type */ + +#include <linux/jhash.h> +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/random.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/netlink.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/ipset/pfxlen.h> +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_getport.h> +#include <linux/netfilter/ipset/ip_set_hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("hash:net,port type of IP sets"); +MODULE_ALIAS("ip_set_hash:net,port"); + +/* Type specific function prefix */ +#define TYPE hash_netport + +static bool +hash_netport_same_set(const struct ip_set *a, const struct ip_set *b); + +#define hash_netport4_same_set hash_netport_same_set +#define hash_netport6_same_set hash_netport_same_set + +/* The type variant functions: IPv4 */ + +/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 + * However this way we have to store internally cidr - 1, + * dancing back and forth. 
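+ * The stored cidr is therefore always the real prefix length minus
+ * one (a 7-bit field holds 0..127, so /1../128 still fits), and the
+ * list/tlist functions below add the one back before reporting an
+ * element to userspace.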
+ */ +#define IP_SET_HASH_WITH_NETS_PACKED + +/* Member elements without timeout */ +struct hash_netport4_elem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; + +/* Member elements with timeout support */ +struct hash_netport4_telem { + __be32 ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + unsigned long timeout; +}; + +static inline bool +hash_netport4_data_equal(const struct hash_netport4_elem *ip1, + const struct hash_netport4_elem *ip2, + u32 *multi) +{ + return ip1->ip == ip2->ip && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_netport4_data_isnull(const struct hash_netport4_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_netport4_data_copy(struct hash_netport4_elem *dst, + const struct hash_netport4_elem *src) +{ + dst->ip = src->ip; + dst->port = src->port; + dst->proto = src->proto; + dst->cidr = src->cidr; + dst->nomatch = src->nomatch; +} + +static inline void +hash_netport4_data_flags(struct hash_netport4_elem *dst, u32 flags) +{ + dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); +} + +static inline bool +hash_netport4_data_match(const struct hash_netport4_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr) +{ + elem->ip &= ip_set_netmask(cidr); + elem->cidr = cidr - 1; +} + +static inline void +hash_netport4_data_zero_out(struct hash_netport4_elem *elem) +{ + elem->proto = 0; +} + +static bool +hash_netport4_data_list(struct sk_buff *skb, + const struct hash_netport4_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netport4_data_tlist(struct sk_buff *skb, + const struct hash_netport4_elem *data) +{ + const struct hash_netport4_telem *tdata = + (const struct hash_netport4_telem *)data; + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, tdata->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, tdata->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(tdata->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + + return 0; + +nla_put_failure: + return 1; +} + +#define IP_SET_HASH_WITH_PROTO +#define IP_SET_HASH_WITH_NETS + +#define PF 4 +#define HOST_MASK 32 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_netport4_data_next(struct ip_set_hash *h, + const struct hash_netport4_elem *d) +{ + h->next.ip = ntohl(d->ip); + h->next.port = ntohs(d->port); +} + +static int +hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem data = { + .cidr = h->nets[0].cidr ? 
h->nets[0].cidr - 1 : HOST_MASK - 1 + }; + + if (adt == IPSET_TEST) + data.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip); + data.ip &= ip_set_netmask(data.cidr + 1); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport4_elem data = { .cidr = HOST_MASK - 1 }; + u32 port, port_to, p = 0, ip = 0, ip_to, last; + u32 timeout = h->timeout; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + data.cidr = cidr - 1; + } + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMP)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) { + data.ip = htonl(ip & ip_set_hostmask(data.cidr + 1)); + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 0 : ret; + } + + port = port_to = ntohs(data.port); + if (tb[IPSET_ATTR_PORT_TO]) { + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port_to < port) + swap(port, port_to); + } + if (tb[IPSET_ATTR_IP_TO]) { + ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); + if (ret) + return ret; + if (ip_to < ip) + swap(ip, ip_to); + if (ip + UINT_MAX == ip_to) + return -IPSET_ERR_HASH_RANGE; + } else { + ip_set_mask_from_to(ip, ip_to, data.cidr + 1); + } + + if (retried) + ip = h->next.ip; + while (!after(ip, ip_to)) { + data.ip = htonl(ip); + last = ip_set_range_to_cidr(ip, ip_to, &cidr); + data.cidr = cidr - 1; + p = retried && ip == h->next.ip ? 
h->next.port : port; + for (; p <= port_to; p++) { + data.port = htons(p); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + ip = last + 1; + } + return ret; +} + +static bool +hash_netport_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct ip_set_hash *x = a->data; + const struct ip_set_hash *y = b->data; + + /* Resizing changes htable_bits, so we ignore it */ + return x->maxelem == y->maxelem && + x->timeout == y->timeout; +} + +/* The type variant functions: IPv6 */ + +struct hash_netport6_elem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; +}; + +struct hash_netport6_telem { + union nf_inet_addr ip; + __be16 port; + u8 proto; + u8 cidr:7; + u8 nomatch:1; + unsigned long timeout; +}; + +static inline bool +hash_netport6_data_equal(const struct hash_netport6_elem *ip1, + const struct hash_netport6_elem *ip2, + u32 *multi) +{ + return ipv6_addr_cmp(&ip1->ip.in6, &ip2->ip.in6) == 0 && + ip1->port == ip2->port && + ip1->proto == ip2->proto && + ip1->cidr == ip2->cidr; +} + +static inline bool +hash_netport6_data_isnull(const struct hash_netport6_elem *elem) +{ + return elem->proto == 0; +} + +static inline void +hash_netport6_data_copy(struct hash_netport6_elem *dst, + const struct hash_netport6_elem *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + +static inline void +hash_netport6_data_flags(struct hash_netport6_elem *dst, u32 flags) +{ + dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH); +} + +static inline bool +hash_netport6_data_match(const struct hash_netport6_elem *elem) +{ + return !elem->nomatch; +} + +static inline void +hash_netport6_data_zero_out(struct hash_netport6_elem *elem) +{ + elem->proto = 0; +} + +static inline void +ip6_netmask(union nf_inet_addr *ip, u8 prefix) +{ + ip->ip6[0] &= ip_set_netmask6(prefix)[0]; + ip->ip6[1] &= ip_set_netmask6(prefix)[1]; + ip->ip6[2] &= ip_set_netmask6(prefix)[2]; + ip->ip6[3] &= ip_set_netmask6(prefix)[3]; +} + +static inline void +hash_netport6_data_netmask(struct hash_netport6_elem *elem, u8 cidr) +{ + ip6_netmask(&elem->ip, cidr); + elem->cidr = cidr - 1; +} + +static bool +hash_netport6_data_list(struct sk_buff *skb, + const struct hash_netport6_elem *data) +{ + u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &data->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +static bool +hash_netport6_data_tlist(struct sk_buff *skb, + const struct hash_netport6_elem *data) +{ + const struct hash_netport6_telem *e = + (const struct hash_netport6_telem *)data; + u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; + + NLA_PUT_IPADDR6(skb, IPSET_ATTR_IP, &e->ip); + NLA_PUT_NET16(skb, IPSET_ATTR_PORT, data->port); + NLA_PUT_U8(skb, IPSET_ATTR_CIDR, data->cidr + 1); + NLA_PUT_U8(skb, IPSET_ATTR_PROTO, data->proto); + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(e->timeout))); + if (flags) + NLA_PUT_NET32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)); + return 0; + +nla_put_failure: + return 1; +} + +#undef PF +#undef HOST_MASK + +#define PF 6 +#define HOST_MASK 128 +#include <linux/netfilter/ipset/ip_set_ahash.h> + +static inline void +hash_netport6_data_next(struct ip_set_hash *h, + const struct hash_netport6_elem *d) +{ + h->next.port = ntohs(d->port); +} + +static int +hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem data = { + .cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1, + }; + + if (adt == IPSET_TEST) + data.cidr = HOST_MASK - 1; + + if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, + &data.port, &data.proto)) + return -EINVAL; + + ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6); + ip6_netmask(&data.ip, data.cidr + 1); + + return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags); +} + +static int +hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + const struct ip_set_hash *h = set->data; + ipset_adtfn adtfn = set->variant->adt[adt]; + struct hash_netport6_elem data = { .cidr = HOST_MASK - 1 }; + u32 port, port_to; + u32 timeout = h->timeout; + bool with_ports = false; + u8 cidr; + int ret; + + if (unlikely(!tb[IPSET_ATTR_IP] || + !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + if (unlikely(tb[IPSET_ATTR_IP_TO])) + return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip); + if (ret) + return ret; + + if (tb[IPSET_ATTR_CIDR]) { + cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); + if (!cidr || cidr > HOST_MASK) + return -IPSET_ERR_INVALID_CIDR; + data.cidr = cidr - 1; + } + ip6_netmask(&data.ip, data.cidr + 1); + + if (tb[IPSET_ATTR_PORT]) + data.port = nla_get_be16(tb[IPSET_ATTR_PORT]); + else + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_PROTO]) { + data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); + with_ports = ip_set_proto_with_ports(data.proto); + + if (data.proto == 0) + return -IPSET_ERR_INVALID_PROTO; + } else + return -IPSET_ERR_MISSING_PROTO; + + if (!(with_ports || data.proto == IPPROTO_ICMPV6)) + data.port = 0; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout(h->timeout)) + return -IPSET_ERR_TIMEOUT; + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + + if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) { + u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + if (cadt_flags & IPSET_FLAG_NOMATCH) + flags |= (cadt_flags << 16); + } + + if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { + ret = adtfn(set, &data, timeout, flags); + return ip_set_eexist(ret, flags) ? 
0 : ret; + } + + port = ntohs(data.port); + port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); + if (port > port_to) + swap(port, port_to); + + if (retried) + port = h->next.port; + for (; port <= port_to; port++) { + data.port = htons(port); + ret = adtfn(set, &data, timeout, flags); + + if (ret && !ip_set_eexist(ret, flags)) + return ret; + else + ret = 0; + } + return ret; +} + +/* Create hash:ip type of sets */ + +static int +hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + struct ip_set_hash *h; + u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM; + u8 hbits; + size_t hsize; + + if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6)) + return -IPSET_ERR_INVALID_FAMILY; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_HASHSIZE]) { + hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]); + if (hashsize < IPSET_MIMINAL_HASHSIZE) + hashsize = IPSET_MIMINAL_HASHSIZE; + } + + if (tb[IPSET_ATTR_MAXELEM]) + maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]); + + h = kzalloc(sizeof(*h) + + sizeof(struct ip_set_hash_nets) + * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL); + if (!h) + return -ENOMEM; + + h->maxelem = maxelem; + get_random_bytes(&h->initval, sizeof(h->initval)); + h->timeout = IPSET_NO_TIMEOUT; + + hbits = htable_bits(hashsize); + hsize = htable_size(hbits); + if (hsize == 0) { + kfree(h); + return -ENOMEM; + } + h->table = ip_set_alloc(hsize); + if (!h->table) { + kfree(h); + return -ENOMEM; + } + h->table->htable_bits = hbits; + + set->data = h; + + if (tb[IPSET_ATTR_TIMEOUT]) { + h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + + set->variant = set->family == NFPROTO_IPV4 + ? &hash_netport4_tvariant : &hash_netport6_tvariant; + + if (set->family == NFPROTO_IPV4) + hash_netport4_gc_init(set); + else + hash_netport6_gc_init(set); + } else { + set->variant = set->family == NFPROTO_IPV4 + ? 
&hash_netport4_variant : &hash_netport6_variant; + } + + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", + set->name, jhash_size(h->table->htable_bits), + h->table->htable_bits, h->maxelem, set->data, h->table); + + return 0; +} + +static struct ip_set_type hash_netport_type __read_mostly = { + .name = "hash:net,port", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_IP | IPSET_TYPE_PORT, + .dimension = IPSET_DIM_TWO, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + /* 1 SCTP and UDPLITE support added */ + /* 2, Range as input support for IPv4 added */ + .revision_max = 3, /* nomatch flag support added */ + .create = hash_netport_create, + .create_policy = { + [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, + [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, + [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_IP] = { .type = NLA_NESTED }, + [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, + [IPSET_ATTR_PORT] = { .type = NLA_U16 }, + [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, + [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, + [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +hash_netport_init(void) +{ + return ip_set_type_register(&hash_netport_type); +} + +static void __exit +hash_netport_fini(void) +{ + ip_set_type_unregister(&hash_netport_type); +} + +module_init(hash_netport_init); +module_exit(hash_netport_fini); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c new file mode 100644 index 00000000..7e095f90 --- /dev/null +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -0,0 +1,611 @@ +/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* Kernel module implementing an IP set type: the list:set type */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <linux/skbuff.h> +#include <linux/errno.h> + +#include <linux/netfilter/ipset/ip_set.h> +#include <linux/netfilter/ipset/ip_set_timeout.h> +#include <linux/netfilter/ipset/ip_set_list.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("list:set type of IP sets"); +MODULE_ALIAS("ip_set_list:set"); + +/* Member elements without and with timeout */ +struct set_elem { + ip_set_id_t id; +}; + +struct set_telem { + ip_set_id_t id; + unsigned long timeout; +}; + +/* Type structure */ +struct list_set { + size_t dsize; /* element size */ + u32 size; /* size of set list array */ + u32 timeout; /* timeout value */ + struct timer_list gc; /* garbage collection */ + struct set_elem members[0]; /* the set members */ +}; + +static inline struct set_elem * +list_set_elem(const struct list_set *map, u32 id) +{ + return (struct set_elem *)((void *)map->members + id * map->dsize); +} + +static inline struct set_telem * +list_set_telem(const struct list_set *map, u32 id) +{ + return (struct set_telem *)((void *)map->members + id * map->dsize); +} + +static inline bool +list_set_timeout(const struct list_set *map, u32 id) +{ + const struct set_telem *elem = list_set_telem(map, id); + + return ip_set_timeout_test(elem->timeout); +} + +static inline bool +list_set_expired(const struct list_set *map, u32 id) +{ + const struct set_telem *elem = list_set_telem(map, id); + + return ip_set_timeout_expired(elem->timeout); +} + +/* Set list without and with timeout */ + +static int +list_set_kadt(struct ip_set *set, const struct sk_buff *skb, + const struct xt_action_param *par, + enum ipset_adt adt, const struct ip_set_adt_opt *opt) +{ + struct list_set *map = set->data; + struct set_elem *elem; + u32 i; + int ret; + + for (i = 0; i < map->size; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) + return 0; + if (with_timeout(map->timeout) && list_set_expired(map, i)) + continue; + switch (adt) { + case IPSET_TEST: + ret = ip_set_test(elem->id, skb, par, opt); + if (ret > 0) + return ret; + break; + case IPSET_ADD: + ret = ip_set_add(elem->id, skb, par, opt); + if (ret == 0) + return ret; + break; + case IPSET_DEL: + ret = ip_set_del(elem->id, skb, par, opt); + if (ret == 0) + return ret; + break; + default: + break; + } + } + return -EINVAL; +} + +static bool +id_eq(const struct list_set *map, u32 i, ip_set_id_t id) +{ + const struct set_elem *elem; + + if (i < map->size) { + elem = list_set_elem(map, i); + return elem->id == id; + } + + return 0; +} + +static bool +id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id) +{ + const struct set_elem *elem; + + if (i < map->size) { + elem = list_set_elem(map, i); + return !!(elem->id == id && + !(with_timeout(map->timeout) && + list_set_expired(map, i))); + } + + return 0; +} + +static void +list_elem_add(struct list_set *map, u32 i, ip_set_id_t id) +{ + struct set_elem *e; + + for (; i < map->size; i++) { + e = list_set_elem(map, i); + swap(e->id, id); + if (e->id == IPSET_INVALID_ID) + break; + } +} + +static void +list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id, + unsigned long timeout) +{ + struct set_telem *e; + + for (; i < map->size; i++) { + e = list_set_telem(map, i); + swap(e->id, id); + swap(e->timeout, timeout); + if (e->id == IPSET_INVALID_ID) + break; + } +} + +static int +list_set_add(struct list_set *map, u32 i, 
ip_set_id_t id, + unsigned long timeout) +{ + const struct set_elem *e = list_set_elem(map, i); + + if (i == map->size - 1 && e->id != IPSET_INVALID_ID) + /* Last element replaced: e.g. add new,before,last */ + ip_set_put_byindex(e->id); + if (with_timeout(map->timeout)) + list_elem_tadd(map, i, id, ip_set_timeout_set(timeout)); + else + list_elem_add(map, i, id); + + return 0; +} + +static int +list_set_del(struct list_set *map, u32 i) +{ + struct set_elem *a = list_set_elem(map, i), *b; + + ip_set_put_byindex(a->id); + + for (; i < map->size - 1; i++) { + b = list_set_elem(map, i + 1); + a->id = b->id; + if (with_timeout(map->timeout)) + ((struct set_telem *)a)->timeout = + ((struct set_telem *)b)->timeout; + a = b; + if (a->id == IPSET_INVALID_ID) + break; + } + /* Last element */ + a->id = IPSET_INVALID_ID; + return 0; +} + +static void +cleanup_entries(struct list_set *map) +{ + struct set_telem *e; + u32 i; + + for (i = 0; i < map->size; i++) { + e = list_set_telem(map, i); + if (e->id != IPSET_INVALID_ID && list_set_expired(map, i)) + list_set_del(map, i); + } +} + +static int +list_set_uadt(struct ip_set *set, struct nlattr *tb[], + enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) +{ + struct list_set *map = set->data; + bool with_timeout = with_timeout(map->timeout); + bool flag_exist = flags & IPSET_FLAG_EXIST; + int before = 0; + u32 timeout = map->timeout; + ip_set_id_t id, refid = IPSET_INVALID_ID; + const struct set_elem *elem; + struct ip_set *s; + u32 i; + int ret = 0; + + if (unlikely(!tb[IPSET_ATTR_NAME] || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_LINENO]) + *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); + + id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s); + if (id == IPSET_INVALID_ID) + return -IPSET_ERR_NAME; + /* "Loop detection" */ + if (s->type->features & IPSET_TYPE_NAME) { + ret = -IPSET_ERR_LOOP; + goto finish; + } + + if (tb[IPSET_ATTR_CADT_FLAGS]) { + u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); + before = f & IPSET_FLAG_BEFORE; + } + + if (before && !tb[IPSET_ATTR_NAMEREF]) { + ret = -IPSET_ERR_BEFORE; + goto finish; + } + + if (tb[IPSET_ATTR_NAMEREF]) { + refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]), + &s); + if (refid == IPSET_INVALID_ID) { + ret = -IPSET_ERR_NAMEREF; + goto finish; + } + if (!before) + before = -1; + } + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!with_timeout) { + ret = -IPSET_ERR_TIMEOUT; + goto finish; + } + timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); + } + if (with_timeout && adt != IPSET_TEST) + cleanup_entries(map); + + switch (adt) { + case IPSET_TEST: + for (i = 0; i < map->size && !ret; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID || + (before != 0 && i + 1 >= map->size)) + break; + else if (with_timeout && list_set_expired(map, i)) + continue; + else if (before > 0 && elem->id == id) + ret = id_eq_timeout(map, i + 1, refid); + else if (before < 0 && elem->id == refid) + ret = id_eq_timeout(map, i + 1, id); + else if (before == 0 && elem->id == id) + ret = 1; + } + break; + case IPSET_ADD: + for (i = 0; i < map->size; i++) { + elem = list_set_elem(map, i); + if (elem->id != id) + continue; + if (!(with_timeout && flag_exist)) { + ret = -IPSET_ERR_EXIST; + goto finish; + } else { + struct set_telem *e = list_set_telem(map, i); + + if ((before > 1 && + !id_eq(map, i + 1, refid)) || + (before < 0 && + (i == 0 || !id_eq(map, i - 1, refid)))) { + 
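+					/* The element already exists, but the
+					 * requested before/after relation to the
+					 * reference entry no longer holds:
+					 * refuse to update the timeout. */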
ret = -IPSET_ERR_EXIST; + goto finish; + } + e->timeout = ip_set_timeout_set(timeout); + ip_set_put_byindex(id); + ret = 0; + goto finish; + } + } + ret = -IPSET_ERR_LIST_FULL; + for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) + ret = before != 0 ? -IPSET_ERR_REF_EXIST + : list_set_add(map, i, id, timeout); + else if (elem->id != refid) + continue; + else if (before > 0) + ret = list_set_add(map, i, id, timeout); + else if (i + 1 < map->size) + ret = list_set_add(map, i + 1, id, timeout); + } + break; + case IPSET_DEL: + ret = -IPSET_ERR_EXIST; + for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) { + elem = list_set_elem(map, i); + if (elem->id == IPSET_INVALID_ID) { + ret = before != 0 ? -IPSET_ERR_REF_EXIST + : -IPSET_ERR_EXIST; + break; + } else if (elem->id == id && + (before == 0 || + (before > 0 && id_eq(map, i + 1, refid)))) + ret = list_set_del(map, i); + else if (elem->id == refid && + before < 0 && id_eq(map, i + 1, id)) + ret = list_set_del(map, i + 1); + } + break; + default: + break; + } + +finish: + if (refid != IPSET_INVALID_ID) + ip_set_put_byindex(refid); + if (adt != IPSET_ADD || ret) + ip_set_put_byindex(id); + + return ip_set_eexist(ret, flags) ? 0 : ret; +} + +static void +list_set_flush(struct ip_set *set) +{ + struct list_set *map = set->data; + struct set_elem *elem; + u32 i; + + for (i = 0; i < map->size; i++) { + elem = list_set_elem(map, i); + if (elem->id != IPSET_INVALID_ID) { + ip_set_put_byindex(elem->id); + elem->id = IPSET_INVALID_ID; + } + } +} + +static void +list_set_destroy(struct ip_set *set) +{ + struct list_set *map = set->data; + + if (with_timeout(map->timeout)) + del_timer_sync(&map->gc); + list_set_flush(set); + kfree(map); + + set->data = NULL; +} + +static int +list_set_head(struct ip_set *set, struct sk_buff *skb) +{ + const struct list_set *map = set->data; + struct nlattr *nested; + + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) + goto nla_put_failure; + NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size)); + if (with_timeout(map->timeout)) + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); + NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, + htonl(sizeof(*map) + map->size * map->dsize)); + ipset_nest_end(skb, nested); + + return 0; +nla_put_failure: + return -EMSGSIZE; +} + +static int +list_set_list(const struct ip_set *set, + struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct list_set *map = set->data; + struct nlattr *atd, *nested; + u32 i, first = cb->args[2]; + const struct set_elem *e; + + atd = ipset_nest_start(skb, IPSET_ATTR_ADT); + if (!atd) + return -EMSGSIZE; + for (; cb->args[2] < map->size; cb->args[2]++) { + i = cb->args[2]; + e = list_set_elem(map, i); + if (e->id == IPSET_INVALID_ID) + goto finish; + if (with_timeout(map->timeout) && list_set_expired(map, i)) + continue; + nested = ipset_nest_start(skb, IPSET_ATTR_DATA); + if (!nested) { + if (i == first) { + nla_nest_cancel(skb, atd); + return -EMSGSIZE; + } else + goto nla_put_failure; + } + NLA_PUT_STRING(skb, IPSET_ATTR_NAME, + ip_set_name_byindex(e->id)); + if (with_timeout(map->timeout)) { + const struct set_telem *te = + (const struct set_telem *) e; + NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, + htonl(ip_set_timeout_get(te->timeout))); + } + ipset_nest_end(skb, nested); + } +finish: + ipset_nest_end(skb, atd); + /* Set listing finished */ + cb->args[2] = 0; + return 0; + 
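+/* The skb ran out of room mid-dump: cancel the partially added entry
+ * and let the dump resume from cb->args[2] on the next call; report
+ * -EMSGSIZE only when not even the first entry of this page fit. */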
+nla_put_failure: + nla_nest_cancel(skb, nested); + ipset_nest_end(skb, atd); + if (unlikely(i == first)) { + cb->args[2] = 0; + return -EMSGSIZE; + } + return 0; +} + +static bool +list_set_same_set(const struct ip_set *a, const struct ip_set *b) +{ + const struct list_set *x = a->data; + const struct list_set *y = b->data; + + return x->size == y->size && + x->timeout == y->timeout; +} + +static const struct ip_set_type_variant list_set = { + .kadt = list_set_kadt, + .uadt = list_set_uadt, + .destroy = list_set_destroy, + .flush = list_set_flush, + .head = list_set_head, + .list = list_set_list, + .same_set = list_set_same_set, +}; + +static void +list_set_gc(unsigned long ul_set) +{ + struct ip_set *set = (struct ip_set *) ul_set; + struct list_set *map = set->data; + + write_lock_bh(&set->lock); + cleanup_entries(map); + write_unlock_bh(&set->lock); + + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +static void +list_set_gc_init(struct ip_set *set) +{ + struct list_set *map = set->data; + + init_timer(&map->gc); + map->gc.data = (unsigned long) set; + map->gc.function = list_set_gc; + map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ; + add_timer(&map->gc); +} + +/* Create list:set type of sets */ + +static bool +init_list_set(struct ip_set *set, u32 size, size_t dsize, + unsigned long timeout) +{ + struct list_set *map; + struct set_elem *e; + u32 i; + + map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL); + if (!map) + return false; + + map->size = size; + map->dsize = dsize; + map->timeout = timeout; + set->data = map; + + for (i = 0; i < size; i++) { + e = list_set_elem(map, i); + e->id = IPSET_INVALID_ID; + } + + return true; +} + +static int +list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags) +{ + u32 size = IP_SET_LIST_DEFAULT_SIZE; + + if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || + !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT))) + return -IPSET_ERR_PROTOCOL; + + if (tb[IPSET_ATTR_SIZE]) + size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); + if (size < IP_SET_LIST_MIN_SIZE) + size = IP_SET_LIST_MIN_SIZE; + + if (tb[IPSET_ATTR_TIMEOUT]) { + if (!init_list_set(set, size, sizeof(struct set_telem), + ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]))) + return -ENOMEM; + + list_set_gc_init(set); + } else { + if (!init_list_set(set, size, sizeof(struct set_elem), + IPSET_NO_TIMEOUT)) + return -ENOMEM; + } + set->variant = &list_set; + return 0; +} + +static struct ip_set_type list_set_type __read_mostly = { + .name = "list:set", + .protocol = IPSET_PROTOCOL, + .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, + .dimension = IPSET_DIM_ONE, + .family = NFPROTO_UNSPEC, + .revision_min = 0, + .revision_max = 0, + .create = list_set_create, + .create_policy = { + [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + }, + .adt_policy = { + [IPSET_ATTR_NAME] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING, + .len = IPSET_MAXNAMELEN }, + [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, + [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + }, + .me = THIS_MODULE, +}; + +static int __init +list_set_init(void) +{ + return ip_set_type_register(&list_set_type); +} + +static void __exit +list_set_fini(void) +{ + ip_set_type_unregister(&list_set_type); +} + +module_init(list_set_init); +module_exit(list_set_fini); diff --git a/net/netfilter/ipset/pfxlen.c b/net/netfilter/ipset/pfxlen.c new file 
mode 100644 index 00000000..4f29fa97 --- /dev/null +++ b/net/netfilter/ipset/pfxlen.c @@ -0,0 +1,313 @@ +#include <linux/export.h> +#include <linux/netfilter/ipset/pfxlen.h> + +/* + * Prefixlen maps for fast conversions, by Jan Engelhardt. + */ + +#define E(a, b, c, d) \ + {.ip6 = { \ + __constant_htonl(a), __constant_htonl(b), \ + __constant_htonl(c), __constant_htonl(d), \ + } } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_netmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_netmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 
0x00000000), + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFE0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_netmask_map); + +#undef E +#define E(a, b, c, d) \ + {.ip6 = { (__force __be32) a, (__force __be32) b, \ + (__force __be32) c, (__force __be32) d, \ + } } + +/* + * This table works for both IPv4 and IPv6; + * just use prefixlen_hostmask_map[prefixlength].ip. + */ +const union nf_inet_addr ip_set_hostmask_map[] = { + E(0x00000000, 0x00000000, 0x00000000, 0x00000000), + E(0x80000000, 0x00000000, 0x00000000, 0x00000000), + E(0xC0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xE0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF0000000, 0x00000000, 0x00000000, 0x00000000), + E(0xF8000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFC000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFE000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF000000, 0x00000000, 0x00000000, 0x00000000), + E(0xFF800000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFC00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFE00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF00000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFF80000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFC0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFE0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF0000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFF8000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFC000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFE000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF000, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFF800, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFC00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFE00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF00, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFF80, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFC0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFE0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFF8, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFC, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFE, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0x80000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xC0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xE0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF0000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xF8000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFC000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFE000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF000000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFF800000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFC00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFE00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF00000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFF80000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFC0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFE0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF0000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFF8000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFC000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFE000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF000, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFF800, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFC00, 0x00000000, 0x00000000), + 
E(0xFFFFFFFF, 0xFFFFFE00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF00, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFF80, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFC0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFE0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF0, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFF8, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFC, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFE, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0x80000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x80000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xC0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xE0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF0000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xF8000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFC000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFE000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF000000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFF800000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFC00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFE00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF00000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFF80000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFC0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFE0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF0000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFF8000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFC000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFE000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF000), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFF800), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFC00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFE00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF00), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFF80), + 
E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFC0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFE0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF0), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFF8), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFC), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE), + E(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF), +}; +EXPORT_SYMBOL_GPL(ip_set_hostmask_map); + +/* Find the largest network which matches the range from left, in host order. */ +u32 +ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr) +{ + u32 last; + u8 i; + + for (i = 1; i < 32; i++) { + if ((from & ip_set_hostmask(i)) != from) + continue; + last = from | ~ip_set_hostmask(i); + if (!after(last, to)) { + *cidr = i; + return last; + } + } + *cidr = 32; + return from; +} +EXPORT_SYMBOL_GPL(ip_set_range_to_cidr); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 00000000..f9871385 --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,281 @@ +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support" + depends on NET && INET && NETFILTER + depends on (NF_CONNTRACK || NF_CONNTRACK=n) + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: <http://www.linuxvirtualserver.org/>. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS" + depends on IPV6 = y || IP_VS = IPV6 + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + See http://www.mindbasket.com/ipvs for more information. + + Say N if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 + default 12 + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. 
For example, if your
+ virtual server gets 200 connections per second and each connection
+ lasts for about 200 seconds in the connection table, the table size
+ should be not far below 200x200 = 40000, so setting the table size
+ to 32768 (2**15) is a good choice.
+
+ Note also that each connection effectively occupies 128 bytes and
+ each hash entry uses 8 bytes, so you can estimate how much memory
+ your box needs.
+
+ You can override this number by setting the conn_tab_bits module
+ parameter, or by appending ip_vs.conn_tab_bits=? to the kernel
+ command line if IPVS was compiled built-in.
+
+comment "IPVS transport protocol load balancing support"
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ ---help---
+ This option enables support for load balancing the TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ ---help---
+ This option enables support for load balancing the UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH_ESP
+ def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ ---help---
+ This option enables support for load balancing the ESP
+ (Encapsulation Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ ---help---
+ This option enables support for load balancing the AH
+ (Authentication Header) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_SCTP
+ bool "SCTP load balancing support"
+ select LIBCRC32C
+ ---help---
+ This option enables support for load balancing the SCTP transport
+ protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ ---help---
+ The round-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ ---help---
+ The weighted round-robin scheduling algorithm directs network
+ connections to different real servers based on server weights,
+ in a round-robin manner. Servers with higher weights receive new
+ connections before those with lower weights and get proportionally
+ more connections, while servers with equal weights get an equal
+ share of the connections.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling"
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection scheduling"
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in cache clusters.
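
As an editorial aside (not part of this commit), the IP_VS_TAB_BITS sizing rule above can be sanity-checked with a minimal user-space C sketch that derives a table-bits value from an expected connection rate and average connection lifetime:

#include <stdio.h>

/* Smallest bits in the Kconfig range 8..20 such that (1 << bits)
 * is not far below rate * duration. */
static int conn_tab_bits(unsigned long rate, unsigned long secs)
{
	unsigned long want = rate * secs;
	int bits = 8;

	while (bits < 20 && (1UL << bits) < want)
		bits++;
	return bits;
}

int main(void)
{
	/* The help text's example: 200 conn/s lasting ~200 s each.
	 * Rounding up gives 2**16; the suggested 2**15 = 32768 also
	 * qualifies, since the rule is only "not far below" 40000. */
	printf("conn_tab_bits = %d\n", conn_tab_bits(200, 200));
	return 0;
}
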
+ This algorithm usually directs packets destined for an IP address
+ to its server if that server is alive and under load. If the server
+ is overloaded (its number of active connections is larger than its
+ weight) and there is a server at half of its load, then the weighted
+ least-connection server is allocated to this IP address.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication scheduling"
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in cache clusters. It differs from LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the nodes in the server set are overloaded,
+ it picks a least-connection node from the cluster and adds it
+ to the server set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid a high degree of replication.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers by looking up their destination IP
+ addresses in a statically assigned hash table.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers by looking up their source IP
+ addresses in a statically assigned hash table.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the ith server and Ui is the fixed service rate (weight) of
+ the ith server (a small sketch follows below).
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimizes its expected delay (the Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled.
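
To make the SED formula above concrete, here is a small editorial sketch (not from this commit; the in-kernel scheduler is ip_vs_sed.c) that picks the server minimizing (Ci + 1) / Ui, using cross-multiplication so no floating point is needed:

#include <stddef.h>

struct server {
	unsigned int conns;	/* Ci: active connections */
	unsigned int weight;	/* Ui: service rate; assumed non-zero */
};

/* Index of the server with the shortest expected delay, comparing
 * (Ci+1)/Ui < (Cb+1)/Ub via (Ci+1)*Ub < (Cb+1)*Ui; assumes n >= 1. */
static size_t sed_pick(const struct server *s, size_t n)
{
	size_t best = 0, i;

	for (i = 1; i < n; i++)
		if ((unsigned long long)(s[i].conns + 1) * s[best].weight <
		    (unsigned long long)(s[best].conns + 1) * s[i].weight)
			best = i;
	return best;
}

Never-queue (IP_VS_NQ above) simply adds one rule in front of this: if some server is idle (Ci == 0), take it immediately rather than evaluating the delay formula.
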
When using weights to
+ allow destinations to receive more connections, the table is
+ tiled an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
+comment 'IPVS application helper'
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ select IP_VS_NFCT
+ ---help---
+ FTP is a protocol that transfers IP addresses and/or port numbers
+ in its payload. In a virtual server via Network Address Translation,
+ the IP address and port number of the real servers cannot be sent
+ to clients in FTP connections directly, so an FTP protocol helper
+ is required for tracking the connection and mangling it back to
+ that of the virtual service.
+
+ If you want to compile it into the kernel, say Y. To compile it as
+ a module, choose M here. If unsure, say N.
+
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allows persistence based on the SIP Call-ID.
+
+endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 00000000..34ee602d
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,40 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 00000000..52856178
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,590 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most of the code here is taken from ip_masq_app.c in kernel 2.2.
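
ip_vs_app.c below provides the plumbing for payload-mangling application helpers such as the FTP one configured above. As a user-space illustration (editorial, not part of this commit) of the kind of rewrite such a helper performs, here is the FTP PORT address rendering a NAT helper must substitute into the payload:

#include <stdio.h>
#include <stdint.h>

/* Render an IPv4 address (host byte order) and TCP port in FTP
 * PORT notation: h1,h2,h3,h4,p1,p2. */
static int ftp_port_arg(char *buf, size_t len, uint32_t ip, uint16_t port)
{
	return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
			(ip >> 24) & 0xff, (ip >> 16) & 0xff,
			(ip >> 8) & 0xff, ip & 0xff,
			port >> 8, port & 0xff);
}

int main(void)
{
	char buf[32];

	/* 192.0.2.10:2021 -- a documentation address, chosen arbitrarily */
	ftp_port_arg(buf, sizeof(buf), 0xC000020A, 2021);
	printf("PORT %s\r\n", buf);	/* PORT 192,0,2,10,7,229 */
	return 0;
}

Because the substituted string can differ in length from the original, the TCP payload may grow or shrink; that is precisely why the sequence-number delta machinery further down in this file exists.
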
The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +static DEFINE_MUTEX(__ip_vs_app_mutex); + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(net, inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); + + return 0; + + out: + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(net, inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, ntohs(inc->port)); + + list_del(&inc->a_list); + + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(net, app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + /* 
increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ipvs->app_list); + + mutex_unlock(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(net, inc); + } + + list_del(&app->a_list); + + mutex_unlock(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "%s(): added delta (%d) to seq\n", + __func__, vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "%s(): subtracted delta " + "(%d) from ack_seq\n", __func__, vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "%s(): subtracted " + "previous_delta (%d) from ack_seq\n", + __func__, vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. 
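
The delta bookkeeping that vs_fix_seq() and vs_fix_ack_seq() above apply can be modeled in a few lines of self-contained C. This is an editorial sketch, not part of this commit; after() is the usual wrap-safe TCP sequence comparison:

#include <stdint.h>

/* Wrap-safe "seq1 is after seq2", like the kernel's after() macro. */
static int seq_after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) > 0;
}

struct seq_shift {
	uint32_t init_seq;	/* seq of the most recently resized packet */
	int32_t delta;		/* shift for packets after init_seq */
	int32_t previous_delta;	/* shift for packets before init_seq */
};

/* Mirrors vs_fix_seq(): choose which accumulated shift applies. */
static uint32_t fix_seq(const struct seq_shift *s, uint32_t seq)
{
	if (!s->delta && !s->previous_delta)
		return seq;
	return seq + (uint32_t)(seq_after(seq, s->init_seq) ?
				s->delta : s->previous_delta);
}
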
+ */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. 
+ */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ipvs->app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(ipvs, 0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +int __net_init ip_vs_app_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->app_list); + proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + +void __net_exit ip_vs_app_net_cleanup(struct net *net) +{ + proc_net_remove(net, "ip_vs_app"); +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 00000000..29fa5bad --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1326 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
+ * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/net.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> /* for proc_net_* */ +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/jhash.h> +#include <linux/random.h> + +#include <net/net_namespace.h> +#include <net/ip_vs.h> + + +#ifndef CONFIG_IP_VS_TAB_BITS +#define CONFIG_IP_VS_TAB_BITS 12 +#endif + +/* + * Connection hash size. Default is what was selected at compile time. +*/ +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); +MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); + +/* size and mask values */ +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct hlist_head *ip_vs_conn_tab __read_mostly; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd __read_mostly; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 5 +#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) +#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) + +struct ip_vs_aligned_lock +{ + rwlock_t l; +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +/* lock array for conn table */ +static struct ip_vs_aligned_lock +__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; + +static inline void ct_read_lock(unsigned key) +{ + read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock(unsigned key) +{ + read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock(unsigned key) +{ + write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock(unsigned key) +{ + write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_lock_bh(unsigned key) +{ + read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_read_unlock_bh(unsigned key) +{ + read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_lock_bh(unsigned key) +{ + write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + +static inline void ct_write_unlock_bh(unsigned key) +{ + write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); +} + + +/* + * Returns hash value for IPVS connection entry + */ +static unsigned int 
ip_vs_conn_hashkey(struct net *net, int af, unsigned proto, + const union nf_inet_addr *addr, + __be16 port) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +#endif + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; +} + +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe_data && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); + + if (cp->pe) { + p.pe = cp->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return 0; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock(hash); + spin_lock(&cp->lock); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + ret = 0; + } + + spin_unlock(&cp->lock); + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey_conn(cp); + + ct_write_lock(hash); + spin_lock(&cp->lock); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + hlist_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + spin_unlock(&cp->lock); + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. 
+ * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct hlist_node *n; + + hash = ip_vs_conn_hashkey_param(p, false); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == p->af && + p->cport == cp->cport && p->vport == cp->vport && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? "hit" : "not hit"); + + return cp; +} + +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse, + struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); + return 0; +} + +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + return NULL; + + return ip_vs_conn_in_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp; + struct hlist_node *n; + + hash = ip_vs_conn_hashkey_param(p, false); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; + if (p->pe_data && p->pe->ct_match) { + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) + goto out; + continue; + } + + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + /* protocol should only be IPPROTO_IP if + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + p->protocol == cp->protocol) + goto out; + } + cp = NULL; + + out: + if (cp) + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + cp ? 
"hit" : "not hit"); + + return cp; +} + +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + struct hlist_node *n; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey_param(p, true); + + ct_read_lock(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == p->af && + p->vport == cp->cport && p->cport == cp->dport && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + struct ip_vs_conn_param p; + + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) + return NULL; + + return ip_vs_conn_out_get(&p); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? + 0 : cp->timeout; + mod_timer(&cp->timer, jiffies+t); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. 
+ */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + unsigned int conn_flags; + + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; + /* Bind with the destination and its corresponding transmitter */ + if (cp->flags & IP_VS_CONN_F_SYNC) { + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + cp->flags &= ~IP_VS_CONN_F_FWD_MASK; + } + cp->flags |= conn_flags; + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the persistent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. 
+ */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark, cp->flags); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the persistent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + +static int expire_quiescent_template(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest) +{ +#ifdef CONFIG_SYSCTL + return ipvs->sysctl_expire_quiescent_template && + (atomic_read(&dest->weight) == 0); +#else + return 0; +#endif +} + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + expire_quiescent_template(ipvs, dest)) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. 
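
The IP_VS_DEST_F_OVERLOAD flag handled by ip_vs_bind_dest() and ip_vs_unbind_dest() above follows a simple hysteresis. A merged editorial model of it (in the kernel the flag is set on the bind path and cleared on the unbind path; a threshold of 0 means unset):

#include <stdbool.h>

struct dest_load {
	unsigned int conns;		/* active + inactive connections */
	unsigned int u_threshold;	/* upper threshold, 0 = unset */
	unsigned int l_threshold;	/* lower threshold, 0 = unset */
	bool overloaded;
};

static void update_overload(struct dest_load *d)
{
	if (d->u_threshold && d->conns >= d->u_threshold)
		d->overloaded = true;		/* bind path */
	else if (d->l_threshold) {
		if (d->conns < d->l_threshold)
			d->overloaded = false;	/* dropped below lower bound */
	} else if (d->u_threshold) {
		if (d->conns * 4 < d->u_threshold * 3)
			d->overloaded = false;	/* below 3/4 of upper bound */
	} else {
		d->overloaded = false;		/* no thresholds configured */
	}
}
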
+ */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (cp->flags & IP_VS_CONN_F_NFCT) { + ip_vs_conn_drop_conntrack(cp); + /* Do not access conntracks during subsys cleanup + * because nf_conntrack_find_get can not be used after + * conntrack cleanup for the net. + */ + smp_rmb(); + if (ipvs->enable) + ip_vs_conn_drop_conntrack(cp); + } + + ip_vs_pe_put(cp->pe); + kfree(cp->pe_data); + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ipvs->conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(const struct ip_vs_conn_param *p, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + struct ip_vs_dest *dest, __u32 fwmark) +{ + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); + + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NULL; + } + + INIT_HLIST_NODE(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + ip_vs_conn_net_set(cp, p->net); + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + cp->vport = p->vport; + /* proto should only be IPPROTO_IP if d_addr is a fwmark */ + ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. 
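
The comment above states the lifetime rule of this file: an entry is published with refcnt 1 (held by the hash table) and may only be freed by whoever holds the very last reference. A condensed editorial sketch of the expire pattern, with the kernel's per-bucket locking deliberately elided:

#include <stdatomic.h>
#include <stdlib.h>

struct conn {
	atomic_int refcnt;	/* the hash table holds one reference */
	int hashed;
};

static void unhash(struct conn *cp)
{
	if (cp->hashed) {
		cp->hashed = 0;
		atomic_fetch_sub(&cp->refcnt, 1);
	}
}

static void rehash(struct conn *cp)
{
	cp->hashed = 1;
	atomic_fetch_add(&cp->refcnt, 1);
}

/* Take a temporary reference, drop the table's reference, and free
 * only if ours is now the last one; otherwise put the entry back and
 * let the timer retry later -- the shape of ip_vs_conn_expire(). */
static int try_expire(struct conn *cp)
{
	atomic_fetch_add(&cp->refcnt, 1);
	unhash(cp);
	if (atomic_load(&cp->refcnt) == 1) {
		free(cp);
		return 1;
	}
	rehash(cp);
	atomic_fetch_sub(&cp->refcnt, 1);
	return 0;
}
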
+ */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ipvs->conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (p->af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); + + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled(ipvs)) + cp->flags |= IP_VS_CONN_F_NFCT; + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct hlist_head *l; +}; + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *n; + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + ct_read_lock_bh(idx); + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *e; + struct hlist_head *l = iter->l; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? 
*/ + if ((e = cp->c_list.next)) + return hlist_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < ip_vs_conn_tab_size) { + ct_read_lock_bh(idx); + hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { + iter->l = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + iter->l = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct ip_vs_iter_state *iter = seq->private; + struct hlist_head *l = iter->l; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + if (cp->pe_data) { + pe_data[0] = ' '; + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->pe->show_pe_data(cp, pe_data + len); + } + pe_data[len] = '\0'; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu%s\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ, pe_data); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + &cp->caddr.in6, ntohs(cp->cport), + &cp->vaddr.in6, ntohs(cp->vport), + &cp->daddr.in6, ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + 
ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(struct net *net) +{ + int idx; + struct ip_vs_conn *cp; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { + unsigned hash = net_random() & ip_vs_conn_tab_mask; + struct hlist_node *n; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(hash); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + if (!ip_vs_conn_net_eq(cp, net)) + continue; + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(struct net *net) +{ + int idx; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); + +flush_again: + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + struct hlist_node *n; + + /* + * Lock is actually needed in this loop. 
+ */ + ct_write_lock_bh(idx); + + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { + if (!ip_vs_conn_net_eq(cp, net)) + continue; + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ipvs->conn_count) != 0) { + schedule(); + goto flush_again; + } +} +/* + * per netns init and exit + */ +int __net_init ip_vs_conn_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + atomic_set(&ipvs->conn_count, 0); + + proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + return 0; +} + +void __net_exit ip_vs_conn_net_cleanup(struct net *net) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(net); + proc_net_remove(net, "ip_vs_conn"); + proc_net_remove(net, "ip_vs_conn_sync"); +} + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* Compute size and mask */ + ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; + ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + pr_info("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + ip_vs_conn_tab_size, + (long)(ip_vs_conn_tab_size*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) + INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + +void ip_vs_conn_cleanup(void) +{ + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 00000000..00bdb1d9 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,2037 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. 
+ * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/sctp.h> +#include <linux/icmp.h> +#include <linux/slab.h> + +#include <net/ip.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> +#include <net/ip6_checksum.h> +#include <net/netns/generic.h> /* net_generic() */ + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#include <linux/netfilter_ipv6.h> +#include <net/ip6_route.h> +#endif + +#include <net/ip_vs.h> + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + +int ip_vs_net_id __read_mostly; +#ifdef IP_VS_GENERIC_NETNS +EXPORT_SYMBOL(ip_vs_net_id); +#endif +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_SCTP: + return "SCTP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->tot_stats.cpustats); + 
s->ustats.outpkts++;
+		u64_stats_update_begin(&s->syncp);
+		s->ustats.outbytes += skb->len;
+		u64_stats_update_end(&s->syncp);
+	}
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	struct ip_vs_cpu_stats *s;
+
+	s = this_cpu_ptr(cp->dest->stats.cpustats);
+	s->ustats.conns++;
+
+	s = this_cpu_ptr(svc->stats.cpustats);
+	s->ustats.conns++;
+
+	s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+	s->ustats.conns++;
+}
+
+
+static inline void
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb,
+		struct ip_vs_proto_data *pd)
+{
+	if (likely(pd->pp->state_transition))
+		pd->pp->state_transition(cp, direction, skb, pd);
+}
+
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+			      struct sk_buff *skb, int protocol,
+			      const union nf_inet_addr *caddr, __be16 cport,
+			      const union nf_inet_addr *vaddr, __be16 vport,
+			      struct ip_vs_conn_param *p)
+{
+	ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+			      vport, p);
+	p->pe = svc->pe;
+	if (p->pe && p->pe->fill_param)
+		return p->pe->fill_param(p, skb);
+
+	return 0;
+}
+
+/*
+ *  IPVS persistent scheduling function.
+ *  It creates a connection entry according to its template if one exists,
+ *  or selects a server and creates a connection entry plus a template.
+ *  Locking: we are an svc user (svc->refcnt), so we hold all dests too.
+ *  Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+		    struct sk_buff *skb,
+		    __be16 src_port, __be16 dst_port, int *ignored)
+{
+	struct ip_vs_conn *cp = NULL;
+	struct ip_vs_iphdr iph;
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *ct;
+	__be16 dport = 0;		/* destination port to forward */
+	unsigned int flags;
+	struct ip_vs_conn_param param;
+	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+	union nf_inet_addr snet;	/* source network of the client,
+					   after masking */
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	/* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
+	else
+#endif
+		snet.ip = iph.saddr.ip & svc->netmask;
+
+	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+		      "mnet %s\n",
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
+		      IP_VS_DBG_ADDR(svc->af, &snet));
+
+	/*
+	 * FTP is a complicated protocol: it uses a control connection and
+	 * separate data connections.  For active FTP, the FTP server
+	 * initiates the data connection to the client, usually from source
+	 * port 20.  For passive FTP, the FTP server tells the client which
+	 * port it passively listens on, and the client initiates the data
+	 * connection.  In tunneling or direct routing mode, the load
+	 * balancer sits on the client-to-server half of the connection, so
+	 * the port number is unknown to it.  Therefore, a conn template like
+	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
+	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+	 * is created for other persistent services.
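+ *
+ * To make the template shapes concrete (an editorial example with
+ * made-up addresses): a client 10.1.1.7 behind netmask /24 hitting a
+ * persistent FTP service on VIP 192.0.2.10:21 gets the template
+ * <TCP, 10.1.1.0, 0, 192.0.2.10, 0, daddr, 0>; the same client on a
+ * persistent HTTP service VIP 192.0.2.10:80 gets
+ * <TCP, 10.1.1.0, 0, 192.0.2.10, 80, daddr, dport>; and a persistent
+ * fwmark service gets <IPPROTO_IP, 10.1.1.0, 0, fwmark, 0, daddr, 0>.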
+ */ + { + int protocol = iph.protocol; + const union nf_inet_addr *vaddr = &iph.daddr; + __be16 vport = 0; + + if (dst_port == svc->port) { + /* non-FTP template: + * <protocol, caddr, 0, vaddr, vport, daddr, dport> + * FTP template: + * <protocol, caddr, 0, vaddr, 0, daddr, 0> + */ + if (svc->port != FTPPORT) + vport = dst_port; + } else { + /* Note: persistent fwmark-based services and + * persistent port zero service are handled here. + * fwmark template: + * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> + * port zero template: + * <protocol,caddr,0,vaddr,0,daddr,0> + */ + if (svc->fwmark) { + protocol = IPPROTO_IP; + vaddr = &fwmark; + } + } + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } + } + + /* Check if a template already exists */ + ct = ip_vs_ct_in_get(¶m); + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + * return *ignored=0 i.e. ICMP and NF_DROP + */ + dest = svc->scheduler->schedule(svc, skb); + if (!dest) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); + *ignored = 0; + return NULL; + } + + if (dst_port == svc->port && svc->port != FTPPORT) + dport = dest->port; + + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ + ct = ip_vs_conn_new(¶m, &dest->addr, dport, + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); + if (ct == NULL) { + kfree(param.pe_data); + *ignored = -1; + return NULL; + } + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + kfree(param.pe_data); + } + + dport = dst_port; + if (dport == svc->port && dest->port) + dport = dest->port; + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a new connection according to the template + */ + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, + src_port, &iph.daddr, dst_port, ¶m); + + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); + if (cp == NULL) { + ip_vs_conn_put(ct); + *ignored = -1; + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. 
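+ *
+ * As a sketch of the calling convention (the caller shown here is
+ * illustrative, not part of this file), a protocol's conn_schedule
+ * handler is expected to translate *ignored roughly as:
+ *
+ *	cp = ip_vs_schedule(svc, skb, pd, &ignored);
+ *	if (!cp && ignored <= 0) {
+ *		if (!ignored)
+ *			*verdict = ip_vs_leave(svc, skb, pd);
+ *		else
+ *			*verdict = NF_DROP;
+ *		return 0;
+ *	}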
+ */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_proto_data *pd, int *ignored) +{ + struct ip_vs_protocol *pp = pd->pp; + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + unsigned int flags; + + *ignored = 1; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + + *ignored = 0; + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + pr_err("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + flags = (svc->flags & IP_VS_SVC_F_ONEPACKET + && iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; + + /* + * Create a connection entry. + */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, + &iph.saddr, pptr[0], &iph.daddr, pptr[1], + &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; + return NULL; + } + } + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. 
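+ *
+ * In short (editorial summary): with a fwmark service and the
+ * cache_bypass sysctl enabled, a request with no destination is
+ * forwarded to its original non-local unicast destination through a
+ * one-off IP_VS_CONN_F_BYPASS connection; in every other case the
+ * client gets ICMP_PORT_UNREACH and the packet is dropped (FTP VIP
+ * traffic for unlisted ports excepted).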
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+		struct ip_vs_proto_data *pd)
+{
+	__be16 _ports[2], *pptr;
+	struct ip_vs_iphdr iph;
+#ifdef CONFIG_SYSCTL
+	struct net *net;
+	struct netns_ipvs *ipvs;
+	int unicast;
+#endif
+
+	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+
+	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		ip_vs_service_put(svc);
+		return NF_DROP;
+	}
+
+#ifdef CONFIG_SYSCTL
+	net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (svc->af == AF_INET6)
+		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
+	else
+#endif
+		unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
+
+	/* if it is a fwmark-based service, the cache_bypass sysctl is up
+	   and the destination is a non-local unicast, then create
+	   a cache_bypass connection entry */
+	ipvs = net_ipvs(net);
+	if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+		int ret;
+		struct ip_vs_conn *cp;
+		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+				      iph.protocol == IPPROTO_UDP)?
+				      IP_VS_CONN_F_ONE_PACKET : 0;
+		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
+
+		ip_vs_service_put(svc);
+
+		/* create a new connection entry */
+		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
+					      &iph.saddr, pptr[0],
+					      &iph.daddr, pptr[1], &p);
+			cp = ip_vs_conn_new(&p, &daddr, 0,
+					    IP_VS_CONN_F_BYPASS | flags,
+					    NULL, skb->mark);
+			if (!cp)
+				return NF_DROP;
+		}
+
+		/* statistics */
+		ip_vs_in_stats(cp, skb);
+
+		/* set state */
+		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+		/* transmit the first SYN packet */
+		ret = cp->packet_xmit(skb, cp, pd->pp);
+		/* do not touch skb anymore */
+
+		atomic_inc(&cp->in_pkts);
+		ip_vs_conn_put(cp);
+		return ret;
+	}
+#endif
+
+	/*
+	 * When a virtual FTP service is present, packets destined for
+	 * other services on the VIP may get here (except for services
+	 * listed in the IPVS table); pass those packets along, because
+	 * it is not the job of IPVS to decide to drop them.
+	 */
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+		ip_vs_service_put(svc);
+		return NF_ACCEPT;
+	}
+
+	ip_vs_service_put(svc);
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since this happens at the IP layer, the TCP socket is not
+	 * actually created and a TCP RST packet cannot be sent; instead,
+	 * ICMP_PORT_UNREACH is sent here regardless of whether the
+	 * packet is TCP or UDP.
--WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + } else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + +#ifdef CONFIG_SYSCTL + +static int sysctl_snat_reroute(struct sk_buff *skb) +{ + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + return ipvs->sysctl_snat_reroute; +} + +static int sysctl_nat_icmp_send(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + return ipvs->sysctl_nat_icmp_send; +} + +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_expire_nodest_conn; +} + +#else + +static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; } +static int sysctl_nat_icmp_send(struct net *net) { return 0; } +static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; } + +#endif + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err = ip_defrag(skb, user); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + +static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + return 1; + } else +#endif + if ((sysctl_snat_reroute(skb) || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + return 1; + + return 0; +} + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || + IPPROTO_SCTP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ 
+ struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP/SCTP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr || + IPPROTO_SCTP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; + + if (inout) + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + pr_err("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || + IPPROTO_SCTP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + if (ip_vs_route_me_harder(af, skb)) + goto out; + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + ip_vs_notrack(skb); + else + ip_vs_update_conntrack(skb, cp, 0); + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. 
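+ *
+ * For orientation, the buffer walked below has this layout, with the
+ * offsets exactly as the code advances them:
+ *
+ *	+-----------+-------------+--------------------+---------------+
+ *	| IP header | ICMP header | embedded IP header | TCP/UDP ports |
+ *	+-----------+-------------+--------------------+---------------+
+ *	  ihl * 4    sizeof(_icmph)   cih->ihl * 4
+ *
+ * "offset" is advanced across the first three regions before the
+ * connection lookup is done on the embedded header.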
+ */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. 
+ */
+	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
+	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
+	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->nexthdr);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	/* TODO: we don't support fragmentation at the moment anyways */
+	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
+		      "Checking outgoing ICMPv6 for");
+
+	offset += sizeof(struct ipv6hdr);
+
+	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	snet.in6 = iph->saddr;
+	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
+				    pp, offset, sizeof(struct ipv6hdr));
+}
+#endif
+
+/*
+ *	Check if the SCTP chunk is an ABORT chunk.
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+	sctp_chunkhdr_t *sch, schunk;
+	sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+			sizeof(schunk), &schunk);
+	if (sch == NULL)
+		return 0;
+	if (sch->type == SCTP_CID_ABORT)
+		return 1;
+	return 0;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+	return th->rst;
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+		struct ip_vs_conn *cp, int ihl)
+{
+	struct ip_vs_protocol *pp = pd->pp;
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+	if (!skb_make_writable(skb, ihl))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+		goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+	else
+#endif
+	{
+		ip_hdr(skb)->saddr = cp->vaddr.ip;
+		ip_send_check(ip_hdr(skb));
+	}
+
+	/*
+	 * nf_iterate does not expect a change in skb->dst->dev.
+	 * It looks like it is not fatal to enable this code for hooks
+	 * where our handlers are at the end of the chain list and
+	 * when all next handlers use skb->dst->dev and not outdev.
+	 * It will definitely route the in/out NAT traffic properly
+	 * when multiple paths are used.
+	 */
+
+	/* For policy routing, packets originating from this
+	 * machine itself may be routed differently from packets
+	 * passing through.  We want this packet to be routed as
+	 * if it came from this machine itself.  So re-compute
+	 * the routing information.
+	 */
+	if (ip_vs_route_me_harder(af, skb))
+		goto drop;
+
+	IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+
+	ip_vs_out_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
+	skb->ipvs_property = 1;
+	if (!(cp->flags & IP_VS_CONN_F_NFCT))
+		ip_vs_notrack(skb);
+	else
+		ip_vs_update_conntrack(skb, cp, 0);
+	ip_vs_conn_put(cp);
+
+	LeaveFunction(11);
+	return NF_ACCEPT;
+
+drop:
+	ip_vs_conn_put(cp);
+	kfree_skb(skb);
+	LeaveFunction(11);
+	return NF_STOLEN;
+}
+
+/*
+ *	Check if the outgoing packet belongs to an established ip_vs_conn.
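+ *
+ * A concrete NAT case (editorial, addresses hypothetical): for the
+ * mapping VIP 192.0.2.10:80 -> real server 10.0.0.2:8080, a reply
+ * 10.0.0.2:8080 -> 203.0.113.5:40312 seen on this hook matches the
+ * connection in the reply direction, and handle_response() rewrites
+ * its source back to 192.0.2.10:80 before the packet leaves the box.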
+ */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + + EnterFunction(11); + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) + return NF_ACCEPT; + pp = pd->pp; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } else +#endif + if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); + + if (likely(cp)) + return handle_response(af, skb, pd, cp, iph.len); + if (sysctl_nat_icmp_send(net) && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(net, af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + struct net *net = + dev_net(skb_dst(skb)->dev); + + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, af, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; +} + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. 
+ */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct net *net = NULL; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_is_fragment(ip_hdr(skb))) { + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", + ic->type, ntohs(icmp_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. 
+ */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + net = skb_net(skb); + + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", + &iph->saddr); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum); + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct net *net = NULL; + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + unsigned int offset, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + &iph->saddr, &iph->daddr); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->nexthdr); + if (!pd) + return NF_ACCEPT; + pp = pd->pp; + + /* Is the embedded protocol header present? 
*/ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr || + IPPROTO_SCTP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum); + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) +{ + struct net *net; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; + struct ip_vs_conn *cp; + int ret, pkts; + struct netns_ipvs *ipvs; + + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + + /* + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset + */ + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); + return NF_ACCEPT; + } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* Protocol supported? 
*/
+	pd = ip_vs_proto_data_get(net, iph.protocol);
+	if (unlikely(!pd))
+		return NF_ACCEPT;
+	pp = pd->pp;
+	/*
+	 * Check if the packet belongs to an existing connection entry
+	 */
+	cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
+
+	if (unlikely(!cp)) {
+		int v;
+
+		if (!pp->conn_schedule(af, skb, pd, &v, &cp))
+			return v;
+	}
+
+	if (unlikely(!cp)) {
+		/* sorry, all this trouble for a no-hit :) */
+		IP_VS_DBG_PKT(12, af, pp, skb, 0,
+			      "ip_vs_in: packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+	ipvs = net_ipvs(net);
+	/* Check the server status */
+	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		/* the destination server is not available */
+
+		if (sysctl_expire_nodest_conn(ipvs)) {
+			/* try to expire the connection immediately */
+			ip_vs_conn_expire_now(cp);
+		}
+		/* don't restart its timer, and silently
+		   drop the packet. */
+		__ip_vs_conn_put(cp);
+		return NF_DROP;
+	}
+
+	ip_vs_in_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+	if (cp->packet_xmit)
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+	else {
+		IP_VS_DBG_RL("warning: packet_xmit is null");
+		ret = NF_ACCEPT;
+	}
+
+	/* Increase its packet counter and check whether it needs
+	 * to be synchronized.
+	 *
+	 * Sync the connection if it is about to close, to
+	 * encourage the standby servers to update the connection timeouts.
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+	 */
+
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		pkts = sysctl_sync_threshold(ipvs);
+	else
+		pkts = atomic_add_return(1, &cp->in_pkts);
+
+	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
+	    cp->protocol == IPPROTO_SCTP) {
+		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
+			(pkts % sysctl_sync_period(ipvs)
+			 == sysctl_sync_threshold(ipvs))) ||
+				(cp->old_state != cp->state &&
+				 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
+				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
+				  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
+			ip_vs_sync_conn(net, cp);
+			goto out;
+		}
+	}
+
+	/* Keep this block last: TCP and others with pp->num_states <= 1 */
+	else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
+	    (((cp->protocol != IPPROTO_TCP ||
+	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+	      (pkts % sysctl_sync_period(ipvs)
+	       == sysctl_sync_threshold(ipvs))) ||
+	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_CLOSE) ||
+	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
+	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
+		ip_vs_sync_conn(net, cp);
+out:
+	cp->old_state = cp->state;
+
+	ip_vs_conn_put(cp);
+	return ret;
+}
+
+/*
+ *	AF_INET handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *	AF_INET handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET);
+	local_bh_enable();
+	return verdict;
+}
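+
+/*
+ * Editorial sketch (a standalone user-space program, not kernel code):
+ * the master-sync test in ip_vs_in() above fires once per sync_period
+ * packets, offset by sync_threshold.  Assuming the usual defaults of
+ * threshold 3 and period 50, an established connection is synced on
+ * packets 3, 53 and 103:
+ *
+ *	#include <stdio.h>
+ *
+ *	int main(void)
+ *	{
+ *		int sync_threshold = 3, sync_period = 50; // assumed defaults
+ *		int pkts;
+ *
+ *		for (pkts = 1; pkts <= 120; pkts++)
+ *			if (pkts % sync_period == sync_threshold)
+ *				printf("sync at packet %d\n", pkts);
+ *		return 0;
+ *	}
+ */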
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *	Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *	AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *	Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+		     const struct net_device *in, const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	unsigned int verdict;
+
+	/* Disable BH in LOCAL_OUT until all places are fixed */
+	local_bh_disable();
+	verdict = ip_vs_in(hooknum, skb, AF_INET6);
+	local_bh_enable();
+	return verdict;
+}
+
+#endif
+
+
+/*
+ *	It is hooked at the NF_INET_FORWARD chain, in order to catch
+ *	ICMP packets related to connections destined for 0.0.0.0/0.
+ *	When a fwmark-based virtual service is used, such as a transparent
+ *	cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ *	but ICMP destined for 0.0.0.0/0 cannot be easily marked and
+ *	sent to ip_vs_in_icmp.  So, catch them at the NF_INET_FORWARD chain
+ *	and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	int r;
+	struct net *net;
+
+	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+		return NF_ACCEPT;
+
+	/* ipvs enabled in this netns ? */
+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp(skb, &r, hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static unsigned int
+ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+		      const struct net_device *in, const struct net_device *out,
+		      int (*okfn)(struct sk_buff *))
+{
+	int r;
+	struct net *net;
+
+	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+		return NF_ACCEPT;
+
+	/* ipvs enabled in this netns ? */
+	net = skb_net(skb);
+	if (!net_ipvs(net)->enable)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp_v6(skb, &r, hooknum);
+}
+#endif
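+
+/*
+ * Editorial note on the priorities used below: netfilter runs hooks in
+ * ascending priority order, and NF_IP_PRI_NAT_SRC is 100 here (value
+ * assumed from include/linux/netfilter_ipv4.h).  On LOCAL_IN that
+ * gives:
+ *
+ *	 98  ip_vs_reply4           reverse NAT for VS/NAT replies
+ *	 99  ip_vs_remote_request4  schedule/forward client requests
+ *	100  netfilter source NAT
+ *
+ * so IPVS sees, and possibly owns, the packet before source NAT does.
+ */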
+
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT (change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply4,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#ifdef CONFIG_IP_VS_IPV6
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 2,
+	},
+	/* After packet filtering, forward packet through VS/DR, VS/TUN,
+	 * or VS/NAT (change destination), so that filtering rules can be
+	 * applied to IPVS. */
+	{
+		.hook		= ip_vs_remote_request6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_NAT_SRC - 1,
+	},
+	/* Before ip_vs_in, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_local_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 1,
+	},
+	/* After mangle, schedule and forward local requests */
+	{
+		.hook		= ip_vs_local_request6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_NAT_DST + 2,
+	},
+	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+	{
+		.hook		= ip_vs_forward_icmp_v6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 99,
+	},
+	/* After packet filtering, change source only for VS/NAT */
+	{
+		.hook		= ip_vs_reply6,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET6,
+		.hooknum	= NF_INET_FORWARD,
+		.priority	= 100,
+	},
+#endif
+};
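+
+/*
+ * Editorial sketch: __ip_vs_init() below uses the common goto-unwind
+ * pattern, tearing down in reverse order exactly the stages that have
+ * already succeeded.  Reduced to two hypothetical stages:
+ *
+ *	static int demo_init(void)
+ *	{
+ *		if (stage_a_init() < 0)
+ *			goto a_fail;
+ *		if (stage_b_init() < 0)
+ *			goto b_fail;
+ *		return 0;
+ *
+ *	b_fail:
+ *		stage_a_cleanup();	// undo only what was set up
+ *	a_fail:
+ *		return -ENOMEM;
+ *	}
+ */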
+
+/*
+ *	Initialize IP Virtual Server netns memory.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_generic(net, ip_vs_net_id);
+	if (ipvs == NULL)
+		return -ENOMEM;
+
+	/* Hold the beast until a service is registered */
+	ipvs->enable = 0;
+	ipvs->net = net;
+	/* Counters used for creating unique names */
+	ipvs->gen = atomic_read(&ipvs_netns_cnt);
+	atomic_inc(&ipvs_netns_cnt);
+	net->ipvs = ipvs;
+
+	if (ip_vs_estimator_net_init(net) < 0)
+		goto estimator_fail;
+
+	if (ip_vs_control_net_init(net) < 0)
+		goto control_fail;
+
+	if (ip_vs_protocol_net_init(net) < 0)
+		goto protocol_fail;
+
+	if (ip_vs_app_net_init(net) < 0)
+		goto app_fail;
+
+	if (ip_vs_conn_net_init(net) < 0)
+		goto conn_fail;
+
+	if (ip_vs_sync_net_init(net) < 0)
+		goto sync_fail;
+
+	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+			 sizeof(struct netns_ipvs), ipvs->gen);
+	return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+	ip_vs_conn_net_cleanup(net);
+conn_fail:
+	ip_vs_app_net_cleanup(net);
+app_fail:
+	ip_vs_protocol_net_cleanup(net);
+protocol_fail:
+	ip_vs_control_net_cleanup(net);
+control_fail:
+	ip_vs_estimator_net_cleanup(net);
+estimator_fail:
+	net->ipvs = NULL;
+	return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+	ip_vs_service_net_cleanup(net);	/* ip_vs_flush() with locks */
+	ip_vs_conn_net_cleanup(net);
+	ip_vs_app_net_cleanup(net);
+	ip_vs_protocol_net_cleanup(net);
+	ip_vs_control_net_cleanup(net);
+	ip_vs_estimator_net_cleanup(net);
+	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+	net->ipvs = NULL;
+}
+
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+	EnterFunction(2);
+	net_ipvs(net)->enable = 0;	/* Disable packet reception */
+	smp_wmb();
+	ip_vs_sync_net_cleanup(net);
+	LeaveFunction(2);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+	.init = __ip_vs_init,
+	.exit = __ip_vs_cleanup,
+	.id   = &ip_vs_net_id,
+	.size = sizeof(struct netns_ipvs),
+};
+
+static struct pernet_operations ipvs_core_dev_ops = {
+	.exit = __ip_vs_dev_cleanup,
+};
+
+/*
+ *	Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+	int ret;
+
+	ret = ip_vs_control_init();
+	if (ret < 0) {
+		pr_err("can't setup control.\n");
+		goto exit;
+	}
+
+	ip_vs_protocol_init();
+
+	ret = ip_vs_conn_init();
+	if (ret < 0) {
+		pr_err("can't setup connection table.\n");
+		goto cleanup_protocol;
+	}
+
+	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
+	if (ret < 0)
+		goto cleanup_conn;
+
+	ret = register_pernet_device(&ipvs_core_dev_ops);
+	if (ret < 0)
+		goto cleanup_sub;
+
+	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	if (ret < 0) {
+		pr_err("can't register hooks.\n");
+		goto cleanup_dev;
+	}
+
+	ret = ip_vs_register_nl_ioctl();
+	if (ret < 0) {
+		pr_err("can't register netlink/ioctl.\n");
+		goto cleanup_hooks;
+	}
+
+	pr_info("ipvs loaded.\n");
+
+	return ret;
+
+cleanup_hooks:
+	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+cleanup_dev:
+	unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+	unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_conn:
+	ip_vs_conn_cleanup();
+cleanup_protocol:
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+exit:
+	return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+	ip_vs_unregister_nl_ioctl();
+	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	unregister_pernet_device(&ipvs_core_dev_ops);
+	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
+	ip_vs_conn_cleanup();
+	ip_vs_protocol_cleanup();
+
ip_vs_control_cleanup(); + pr_info("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 00000000..f5589987 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3811 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/swap.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/mutex.h> + +#include <net/net_namespace.h> +#include <linux/nsproxy.h> +#include <net/ip.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#include <net/ip6_route.h> +#endif +#include <net/route.h> +#include <net/sock.h> +#include <net/genetlink.h> + +#include <asm/uaccess.h> + +#include <net/ip_vs.h> + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* sysctl variables */ + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? 
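+ * (The trick: do a route lookup on the address and treat it as local
+ * when the result points at a loopback device.)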
 */
+static int __ip_vs_addr_is_local_v6(struct net *net,
+				    const struct in6_addr *addr)
+{
+	struct rt6_info *rt;
+	struct flowi6 fl6 = {
+		.daddr = *addr,
+	};
+
+	rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
+	if (rt && rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+		return 1;
+
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/*
+ *	update_defense_level is called from keventd and from sysctl,
+ *	so it needs to protect itself from softirqs
+ */
+static void update_defense_level(struct netns_ipvs *ipvs)
+{
+	struct sysinfo i;
+	static int old_secure_tcp = 0;
+	int availmem;
+	int nomem;
+	int to_change = -1;
+
+	/* we only count free and buffered memory (in pages) */
+	si_meminfo(&i);
+	availmem = i.freeram + i.bufferram;
+	/* however in linux 2.5 the i.bufferram is total page cache size,
+	   we need to adjust it */
+	/* si_swapinfo(&i); */
+	/* availmem = availmem - (i.totalswap - i.freeswap); */
+
+	nomem = (availmem < ipvs->sysctl_amemthresh);
+
+	local_bh_disable();
+
+	/* drop_entry */
+	spin_lock(&ipvs->dropentry_lock);
+	switch (ipvs->sysctl_drop_entry) {
+	case 0:
+		atomic_set(&ipvs->dropentry, 0);
+		break;
+	case 1:
+		if (nomem) {
+			atomic_set(&ipvs->dropentry, 1);
+			ipvs->sysctl_drop_entry = 2;
+		} else {
+			atomic_set(&ipvs->dropentry, 0);
+		}
+		break;
+	case 2:
+		if (nomem) {
+			atomic_set(&ipvs->dropentry, 1);
+		} else {
+			atomic_set(&ipvs->dropentry, 0);
+			ipvs->sysctl_drop_entry = 1;
+		}
+		break;
+	case 3:
+		atomic_set(&ipvs->dropentry, 1);
+		break;
+	}
+	spin_unlock(&ipvs->dropentry_lock);
+
+	/* drop_packet */
+	spin_lock(&ipvs->droppacket_lock);
+	switch (ipvs->sysctl_drop_packet) {
+	case 0:
+		ipvs->drop_rate = 0;
+		break;
+	case 1:
+		if (nomem) {
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
+			ipvs->sysctl_drop_packet = 2;
+		} else {
+			ipvs->drop_rate = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			ipvs->drop_rate = ipvs->drop_counter
+				= ipvs->sysctl_amemthresh /
+				(ipvs->sysctl_amemthresh-availmem);
+		} else {
+			ipvs->drop_rate = 0;
+			ipvs->sysctl_drop_packet = 1;
+		}
+		break;
+	case 3:
+		ipvs->drop_rate = ipvs->sysctl_am_droprate;
+		break;
+	}
+	spin_unlock(&ipvs->droppacket_lock);
+
+	/* secure_tcp */
+	spin_lock(&ipvs->securetcp_lock);
+	switch (ipvs->sysctl_secure_tcp) {
+	case 0:
+		if (old_secure_tcp >= 2)
+			to_change = 0;
+		break;
+	case 1:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+			ipvs->sysctl_secure_tcp = 2;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+			ipvs->sysctl_secure_tcp = 1;
+		}
+		break;
+	case 3:
+		if (old_secure_tcp < 2)
+			to_change = 1;
+		break;
+	}
+	old_secure_tcp = ipvs->sysctl_secure_tcp;
+	if (to_change >= 0)
+		ip_vs_protocol_timeout_change(ipvs,
+					      ipvs->sysctl_secure_tcp > 1);
+	spin_unlock(&ipvs->securetcp_lock);
+
+	local_bh_enable();
+}
+
+
+/*
+ *	Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD	1*HZ
+
+static void defense_work_handler(struct work_struct *work)
+{
+	struct netns_ipvs *ipvs =
+		container_of(work, struct netns_ipvs, defense_work.work);
+
+	update_defense_level(ipvs);
+	if (atomic_read(&ipvs->dropentry))
+		ip_vs_random_dropentry(ipvs->net);
+	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+}
+#endif
+
+int
+ip_vs_use_count_inc(void)
+{
+	return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+	module_put(THIS_MODULE);
+}
+
+
+/*
+ *
Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by <protocol, addr, port> */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + + +/* + * Returns hash value for virtual service + */ +static inline unsigned +ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, + const union nf_inet_addr *addr, __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + addr_fold ^= ((size_t)net>>8); + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) +{ + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + pr_err("%s(): request for already hashed, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from svc_table / svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + pr_err("%s(): request for unhash flagged, called from %pF\n", + __func__, __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {netns, proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. 
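+ * A fwmark service matches on the netfilter mark instead of on
+ * <protocol,addr,port>.  Illustrative setup (assumed, not part of
+ * this patch): mark the traffic in the mangle table with
+ *	iptables -t mangle -A PREROUTING -d 10.0.0.1 -j MARK --set-mark 1
+ * and create the matching virtual service with
+ *	ipvsadm -A -f 1 -s rr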
+ */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(net, fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { + /* HIT */ + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + struct netns_ipvs *ipvs = net_ipvs(net); + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark) { + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (svc) + goto out; + } + + /* + * Check the table hashed by <protocol,addr,port> + * for "full" addressed entries + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ipvs->ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ipvs->nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); + } + + out: + if (svc) + atomic_inc(&svc->usecnt); + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); + kfree(svc); + } +} + + +/* + * Returns hash value for real service + */ +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in rs_table by <proto,addr,port>. + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + list_add(&dest->d_list, &ipvs->rs_table[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from rs_table. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the rs_table table. 
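+ * The list head is re-initialized below so that ip_vs_rs_hash() can
+ * keep using list_empty() as its "already hashed" test.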
+ */
+	if (!list_empty(&dest->d_list)) {
+		list_del(&dest->d_list);
+		INIT_LIST_HEAD(&dest->d_list);
+	}
+
+	return 1;
+}
+
+/*
+ *	Lookup real service by <proto,addr,port> in the real service table.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
+			  const union nf_inet_addr *daddr,
+			  __be16 dport)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	unsigned hash;
+	struct ip_vs_dest *dest;
+
+	/*
+	 *	Check for "full" addressed entries
+	 *	Return the first found entry
+	 */
+	hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+	read_lock(&ipvs->rs_lock);
+	list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
+		if ((dest->af == af)
+		    && ip_vs_addr_equal(af, &dest->addr, daddr)
+		    && (dest->port == dport)
+		    && ((dest->protocol == protocol) ||
+			dest->vfwmark)) {
+			/* HIT */
+			read_unlock(&ipvs->rs_lock);
+			return dest;
+		}
+	}
+	read_unlock(&ipvs->rs_lock);
+
+	return NULL;
+}
+
+/*
+ *	Lookup destination by {addr,port} in the given service
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		  __be16 dport)
+{
+	struct ip_vs_dest *dest;
+
+	/*
+	 * Find the destination for the given service
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->af == svc->af)
+		    && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
+		    && (dest->port == dport)) {
+			/* HIT */
+			return dest;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ *
+ * ip_vs_lookup_real_service() looked promising, but
+ * seems not to work as expected.
+ */
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
+				   const union nf_inet_addr *daddr,
+				   __be16 dport,
+				   const union nf_inet_addr *vaddr,
+				   __be16 vport, __u16 protocol, __u32 fwmark,
+				   __u32 flags)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_service *svc;
+	__be16 port = dport;
+
+	svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+	if (!svc)
+		return NULL;
+	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+		port = 0;
+	dest = ip_vs_lookup_dest(svc, daddr, port);
+	if (!dest)
+		dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
+	if (dest)
+		atomic_inc(&dest->refcnt);
+	ip_vs_service_put(svc);
+	return dest;
+}
+
+/*
+ *	Lookup dest by {svc,addr,port} in the destination trash.
+ *	The destination trash is used to hold the destinations that are removed
+ *	from the service table but are still referenced by some conn entries.
+ *	The reason to add the destination trash is that when the dest is
+ *	temporarily down (either by administrator or by monitor program), the
+ *	dest can be picked back from the trash, the remaining connections to
+ *	the dest can continue, and the counting information of the dest is
+ *	also useful for scheduling.
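+ *	(Purging is lazy: a trashed dest is actually freed only once its
+ *	refcnt has dropped to 1, see the loop in ip_vs_trash_get_dest().)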
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
+		     __be16 dport)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+	/*
+	 * Find the destination in trash
+	 */
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+			      "dest->refcnt=%d\n",
+			      dest->vfwmark,
+			      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+			      ntohs(dest->port),
+			      atomic_read(&dest->refcnt));
+		if (dest->af == svc->af &&
+		    ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
+		    dest->port == dport &&
+		    dest->vfwmark == svc->fwmark &&
+		    dest->protocol == svc->protocol &&
+		    (svc->fwmark ||
+		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+		      dest->vport == svc->port))) {
+			/* HIT */
+			return dest;
+		}
+
+		/*
+		 * Try to purge the destination from trash if not referenced
+		 */
+		if (atomic_read(&dest->refcnt) == 1) {
+			IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
+				      "from trash\n",
+				      dest->vfwmark,
+				      IP_VS_DBG_ADDR(svc->af, &dest->addr),
+				      ntohs(dest->port));
+			list_del(&dest->n_list);
+			ip_vs_dst_reset(dest);
+			__ip_vs_unbind_svc(dest);
+			free_percpu(dest->stats.cpustats);
+			kfree(dest);
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Clean up all the destinations in the trash
+ *	Called by ip_vs_control_cleanup()
+ *
+ *	When ip_vs_control_cleanup() is activated by ipvs module exit,
+ *	the service tables must have been flushed and all the connections
+ *	are expired, and the refcnt of each destination in the trash must
+ *	be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(struct net *net)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+		list_del(&dest->n_list);
+		ip_vs_dst_reset(dest);
+		__ip_vs_unbind_svc(dest);
+		free_percpu(dest->stats.cpustats);
+		kfree(dest);
+	}
+}
+
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
+
+	spin_lock_bh(&src->lock);
+
+	IP_VS_SHOW_STATS_COUNTER(conns);
+	IP_VS_SHOW_STATS_COUNTER(inpkts);
+	IP_VS_SHOW_STATS_COUNTER(outpkts);
+	IP_VS_SHOW_STATS_COUNTER(inbytes);
+	IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+	ip_vs_read_estimator(dst, src);
+
+	spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+	spin_lock_bh(&stats->lock);
+
+	/* get current counters as zero point, rates are zeroed */
+
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
+
+	IP_VS_ZERO_STATS_COUNTER(conns);
+	IP_VS_ZERO_STATS_COUNTER(inpkts);
+	IP_VS_ZERO_STATS_COUNTER(outpkts);
+	IP_VS_ZERO_STATS_COUNTER(inbytes);
+	IP_VS_ZERO_STATS_COUNTER(outbytes);
+
+	ip_vs_zero_estimator(stats);
+
+	spin_unlock_bh(&stats->lock);
+}
+
+/*
+ *	Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+		    struct ip_vs_dest_user_kern *udest, int add)
+{
+	struct netns_ipvs *ipvs = net_ipvs(svc->net);
+	int conn_flags;
+
+	/* set the weight and the flags */
+	atomic_set(&dest->weight, udest->weight);
+	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+	conn_flags |= IP_VS_CONN_F_INACTIVE;
+
+	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
+		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+	} else {
+		/*
+		 *	Put the real service in
rs_table if not present. + * For now only for NAT! + */ + write_lock_bh(&ipvs->rs_lock); + ip_vs_rs_hash(ipvs, dest); + write_unlock_bh(&ipvs->rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; + + spin_lock_bh(&dest->dst_lock); + ip_vs_dst_reset(dest); + spin_unlock_bh(&dest->dst_lock); + + if (add) + ip_vs_start_estimator(svc->net, &dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + if (add) { + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + } + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(svc->net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); + if (dest == NULL) + return -ENOMEM; + + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) + goto err_alloc; + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 1); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest, 1); + + *dest_p = dest; + + LeaveFunction(2); + return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG(1, "%s(): dest already exists\n", __func__); + return -EEXIST; + } + + /* 
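+	 * (A recently deleted dest may still sit in ipvs->dest_trash with
+	 * live connections; resurrecting it is preferred over allocating
+	 * a fresh one.)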
+ * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + } + LeaveFunction(2); + + return ret; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + pr_err("%s(): server weight less than zero\n", __func__); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + pr_err("%s(): lower threshold is higher than upper threshold\n", + __func__); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest, 0); + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_stop_estimator(net, &dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&ipvs->rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&ipvs->rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + free_percpu(dest->stats.cpustats); + kfree(dest); + } else { + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ipvs->dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. 
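+	 * ("d-linked" = doubly linked.  Only the unlink happens here; the
+	 * dest itself is freed or trashed later by __ip_vs_del_dest().)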
+ */ + list_del(&dest->n_list); + svc->num_dests--; + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "%s(): destination not found!\n", __func__); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(svc->net, dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; + struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + ret = -ENOENT; + goto out_err; + } + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); + if (svc == NULL) { + IP_VS_DBG(1, "%s(): no memory\n", __func__); + ret = -ENOMEM; + goto out_err; + } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) + goto out_err; + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 0); + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + svc->net = net; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Bind the ct retriever */ + ip_vs_bind_pe(svc, pe); + pe = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ipvs->nullsvc_counter); + + ip_vs_start_estimator(net, &svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; + return 0; + + + out_err: + if (svc != NULL) { + 
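+		/* Roll back a partially constructed service: the scheduler,
+		 * app inc and per-cpu stats may each have been bound already. */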
ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); + kfree(svc); + } + ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); + return -ENOENT; + } + old_sched = sched; + + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_getbyname(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { + ret = -EINVAL; + goto out; + } +#endif + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out_unlock; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out_unlock; + } + } + + old_pe = svc->pe; + if (pe != old_pe) { + ip_vs_unbind_pe(svc); + ip_vs_bind_pe(svc, pe); + } + +out_unlock: + write_unlock_bh(&__ip_vs_svc_lock); +out: + ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! 
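+ *	- Its usecnt must already be zero (callers spin in IP_VS_WAIT_WHILE)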
+ * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); + + pr_info("%s: enter\n", __func__); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services--; + + ip_vs_stop_estimator(svc->net, &svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + ip_vs_scheduler_put(old_sched); + + /* Unbind persistence engine */ + old_pe = svc->pe; + ip_vs_unbind_pe(svc); + ip_vs_pe_put(old_pe); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(svc->net, dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ipvs->nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); + kfree(svc); + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Unlink a service from list and try to delete it if its refcnt reached 0 + */ +static void ip_vs_unlink_service(struct ip_vs_service *svc) +{ + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(struct net *net) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by <netns,protocol,addr,port> + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); + } + } + + return 0; +} + +/* + * Delete service by {netns} in the service table. 
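+ *	(i.e. flush every virtual service that belongs to the dying netns)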
+ * Called by __ip_vs_cleanup() + */ +void ip_vs_service_net_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(struct net *net) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&net_ipvs(net)->tot_stats); + return 0; +} + +#ifdef CONFIG_SYSCTL +static int +proc_do_defense_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = current->nsproxy->net_ns; + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(net_ipvs(net)); + } + } + return rc; +} + +static int +proc_do_sync_threshold(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = 
proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + +static int +proc_do_sync_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } else { + struct net *net = current->nsproxy->net_ns; + ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in ip_vs_control_net_init() + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "am_droprate", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_entry", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "secure_tcp", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_defense_mode, + }, + { + .procname = "snat_reroute", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, 
+ .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif + { } +}; + +const struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "vs", }, + { } +}; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); +#endif + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net) && pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [%pI6]:%04X %s ", + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } else { + seq_printf(seq, "FWM %08X %s %s", + svc->fwmark, svc->scheduler->name, + (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [%pI6]:%04X" + " %-7s %-6d %-10d %-10d\n", + &dest->addr.in6, + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations 
ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats_user show; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns, + show.inpkts, show.outpkts, + (unsigned long long) show.inbytes, + (unsigned long long) show.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, "%8X %8X %8X %16X %16X\n", + show.cps, show.inpps, show.outpps, + show.inbps, show.outbps); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_show); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats; + struct ip_vs_stats_user rates; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); + unsigned int start; + __u64 inbytes, outbytes; + + do { + start = u64_stats_fetch_begin_bh(&u->syncp); + inbytes = u->ustats.inbytes; + outbytes = u->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&u->syncp, start)); + + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)inbytes, + (__u64)outbytes); + } + + spin_lock_bh(&tot_stats->lock); + + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + + ip_vs_read_estimator(&rates, tot_stats); + + spin_unlock_bh(&tot_stats->lock); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + rates.cps, + rates.inpps, + rates.outpps, + rates.inbps, + rates.outbps); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. 
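+ *	Userspace supplies seconds; they are converted to jiffies (* HZ)
+ *	below before being stored in the per-protocol timeout tables.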
+ */ +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + memset(udest, 0, sizeof(*udest)); + + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + struct net *net = sock_net(sk); + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) + return -EINVAL; + if (len < 0 || len > MAX_ARG_LEN) + return -EINVAL; + if (len != set_arglen[SET_CMDID(cmd)]) { + pr_err("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + 
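+		/* The fixed-size sockopt argument must match the command's
+		 * expected length exactly before it is copied in. */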
return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Handle daemons since they have another lock */ + if (cmd == IP_VS_SO_SET_STARTDAEMON || + cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + + if (mutex_lock_interruptible(&ipvs->sync_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + if (cmd == IP_VS_SO_SET_STARTDAEMON) + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); + else + ret = stop_sync_thread(net, dm->state); + mutex_unlock(&ipvs->sync_mutex); + goto out_dec; + } + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(net); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(net); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && + usvc.protocol != IPPROTO_SCTP) { + pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n", + usvc.protocol, &usvc.addr.ip, + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by <protocol, addr, port> or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(net, &usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static 
inline int +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET || !net_eq(svc->net, net)) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } +out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, + get->port); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) +{ +#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) + struct ip_vs_proto_data *pd; +#endif + +#ifdef CONFIG_IP_VS_PROTO_TCP + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + u->udp_timeout = + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + 
[GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); + + BUG_ON(!net); + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) + return -EINVAL; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + pr_err("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + copylen = get_arglen[GET_CMDID(cmd)]; + if (copylen > 128) + return -EINVAL; + + if (copy_from_user(arg, user, copylen) != 0) + return -EFAULT; + /* + * Handle daemons first since it has its own locking + */ + if (cmd == IP_VS_SO_GET_DAEMON) { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (mutex_lock_interruptible(&ipvs->sync_mutex)) + return -ERESTARTSYS; + + if (ipvs->sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; + } + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + mutex_unlock(&ipvs->sync_mutex); + return ret; + } + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = ip_vs_conn_tab_size; + info.num_services = ipvs->num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + pr_err("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); + else + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + pr_err("length: %u != %u\n", *len, 
size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(net, get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct ip_vs_stats_user ustats; + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) 
+ return -EMSGSIZE; + + ip_vs_copy_stats(&ustats, stats); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + if (svc->pe) + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start || !net_eq(svc->net, net)) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry, + struct ip_vs_service **ret_svc) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + 
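/* Fills *usvc from the nested SERVICE attribute and also looks up
+	 * any existing service, returned through *ret_svc.  A service is
+	 * identified either by fwmark alone or by the
+	 * <af, protocol, addr, port> tuple, as parsed below.
+	 */
+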
struct ip_vs_service *svc; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + memset(usvc, 0, sizeof(*usvc)); + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + *ret_svc = svc; + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (svc) + usvc->flags = svc->flags; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + int ret; + + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); + return ret ? 
ERR_PTR(ret) : svc; +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = skb_sknet(skb); + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + memset(udest, 0, sizeof(*udest)); + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh 
&& nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = skb_sknet(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&ipvs->sync_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(net, &t); +} + +static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) +{ + int ret = 0, cmd; + struct net *net; + struct netns_ipvs *ipvs; + + net = skb_sknet(skb); + ipvs = net_ipvs(net); + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + 
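/* Daemon start/stop is serialized by its own sync_mutex rather
+		 * than __ip_vs_mutex, matching the sockopt path above. */
+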
mutex_lock(&ipvs->sync_mutex); + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(net, daemon_attrs); + else + ret = ip_vs_genl_del_daemon(net, daemon_attrs); +out: + mutex_unlock(&ipvs->sync_mutex); + } + return ret; +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(net); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(net, info->attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(net); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(net, &usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc, &svc); + if (ret) + goto out; + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. 
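+	 * For IPVS_CMD_DEL_DEST only the identifying <addr, port> pair
+	 * needs to be present (see ip_vs_genl_parse_dest() above).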
*/ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(net, &usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + struct net *net; + + net = skb_sknet(skb); + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + pr_err("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(net, &t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + ip_vs_conn_tab_size); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + goto out; + +nla_put_failure: + pr_err("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = 
IPVS_CMD_NEW_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DEST,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.dumpit	= ip_vs_genl_dump_dests,
+	},
+	{
+		.cmd	= IPVS_CMD_NEW_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_daemon,
+	},
+	{
+		.cmd	= IPVS_CMD_DEL_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_daemon,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_DAEMON,
+		.flags	= GENL_ADMIN_PERM,
+		.dumpit	= ip_vs_genl_dump_daemons,
+	},
+	{
+		.cmd	= IPVS_CMD_SET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_CONFIG,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_GET_INFO,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_get_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_ZERO,
+		.flags	= GENL_ADMIN_PERM,
+		.policy	= ip_vs_cmd_policy,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+	{
+		.cmd	= IPVS_CMD_FLUSH,
+		.flags	= GENL_ADMIN_PERM,
+		.doit	= ip_vs_genl_set_cmd,
+	},
+};
+
+static int __init ip_vs_genl_register(void)
+{
+	return genl_register_family_with_ops(&ip_vs_genl_family,
+		ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
+}
+
+static void ip_vs_genl_unregister(void)
+{
+	genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+/*
+ * per netns init/exit func.
+ */
+#ifdef CONFIG_SYSCTL
+int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+{
+	int idx;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ctl_table *tbl;
+
+	atomic_set(&ipvs->dropentry, 0);
+	spin_lock_init(&ipvs->dropentry_lock);
+	spin_lock_init(&ipvs->droppacket_lock);
+	spin_lock_init(&ipvs->securetcp_lock);
+
+	if (!net_eq(net, &init_net)) {
+		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+		if (tbl == NULL)
+			return -ENOMEM;
+	} else
+		tbl = vs_vars;
+	/* Initialize sysctl defaults */
+	idx = 0;
+	ipvs->sysctl_amemthresh = 1024;
+	tbl[idx++].data = &ipvs->sysctl_amemthresh;
+	ipvs->sysctl_am_droprate = 10;
+	tbl[idx++].data = &ipvs->sysctl_am_droprate;
+	tbl[idx++].data = &ipvs->sysctl_drop_entry;
+	tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+	tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+	ipvs->sysctl_snat_reroute = 1;
+	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+	ipvs->sysctl_sync_ver = 1;
+	tbl[idx++].data = &ipvs->sysctl_sync_ver;
+	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+	tbl[idx].data = &ipvs->sysctl_sync_threshold;
+	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+
+
+	ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
+						     tbl);
+	if (ipvs->sysctl_hdr == NULL) {
+		if (!net_eq(net, &init_net))
+			kfree(tbl);
+		return -ENOMEM;
+	}
+	ip_vs_start_estimator(net, &ipvs->tot_stats);
+	ipvs->sysctl_tbl = tbl;
+	/* Schedule defense work */
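+	/* The defense work re-arms itself every DEFENSE_TIMER_PERIOD and
+	 * re-evaluates the drop_entry/drop_packet/secure_tcp strategies
+	 * against available memory (cf. sysctl_amemthresh above). */
+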
INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + + return 0; +} + +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); +} + +#else + +int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; } +void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { } + +#endif + +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + +int __net_init ip_vs_control_net_init(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + + rwlock_init(&ipvs->rs_lock); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats.cpustats) + return -ENOMEM; + + spin_lock_init(&ipvs->tot_stats.lock); + + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + proc_net_fops_create(net, "ip_vs_stats_percpu", 0, + &ip_vs_stats_percpu_fops); + + if (ip_vs_control_net_init_sysctl(net)) + goto err; + + return 0; + +err: + free_percpu(ipvs->tot_stats.cpustats); + return -ENOMEM; +} + +void __net_exit ip_vs_control_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_trash_cleanup(net); + ip_vs_stop_estimator(net, &ipvs->tot_stats); + ip_vs_control_net_cleanup_sysctl(net); + proc_net_remove(net, "ip_vs_stats_percpu"); + proc_net_remove(net, "ip_vs_stats"); + proc_net_remove(net, "ip_vs"); + free_percpu(ipvs->tot_stats.cpustats); +} + +int __init ip_vs_register_nl_ioctl(void) +{ + int ret; + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + pr_err("cannot register sockopt.\n"); + goto err_sock; + } + + ret = ip_vs_genl_register(); + if (ret) { + pr_err("cannot register Generic Netlink interface.\n"); + goto err_genl; + } + return 0; + +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: + return ret; +} + +void ip_vs_unregister_nl_ioctl(void) +{ + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); +} + +int __init ip_vs_control_init(void) +{ + int idx; + int ret; + + EnterFunction(2); + + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + + smp_wmb(); /* Do we really need it now ? 
*/
+
+	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+	if (ret < 0)
+		return ret;
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+	EnterFunction(2);
+	unregister_netdevice_notifier(&ip_vs_dst_notifier);
+	LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 00000000..1c269e56
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,270 @@
+/*
+ * IPVS:	Destination Hashing scheduling module
+ *
+ * Authors:	Wensong Zhang <wensong@gnuchina.org>
+ *
+ *		Inspired by the consistent hashing scheduler patch from
+ *		Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm selects a server by the hash key of the destination
+ * IP address.  The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) OR (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet destination IP address to the current
+ * server array.  If the dh scheduler is used in a cache cluster, it is
+ * good to combine it with the cache_bypass feature.  When the statically
+ * assigned server is dead or overloaded, the load balancer can bypass
+ * the cache server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS        8
+#endif
+#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+{
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0]^addr->ip6[1]^
+			    addr->ip6[2]^addr->ip6[3];
+#endif
+	return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
+	     const union nf_inet_addr *addr)
+{
+	return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_dh_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
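+ *      Each bucket drops its reference on the cached destination.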
+ */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 00000000..0fac6017 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,209 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General 
Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e. struct netns_ipvs
+ *              Affected data: est_list and est_lock.
+ *              estimation_timer() runs with timer per netns.
+ *              get_stats() does the per-cpu summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code estimates the rate in a shorter interval (such as 8
+  seconds) for virtual services and real servers.  To measure the rate
+  over a long interval, it is easy to implement a user level daemon
+  which periodically reads those statistical counters and measures the
+  rate.
+
+  Currently, the measurement is activated by a slow timer handler.
+  Hopefully this measurement will not introduce too much load.
+
+  We measure rate during the last 8 seconds every 2 seconds:
+
+    avgrate = avgrate*(1-W) + rate*W
+
+    where W = 2^(-2)
+
+  NOTES.
+
+  * The stored value for average bps is scaled by 2^5, so that maximal
+    rate is ~2.15Gbits/s; average pps and cps are scaled by 2^10.
+
+  * A lot of code is taken from net/sched/estimator.c
+ */
+
+
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
+				 struct ip_vs_cpu_stats *stats)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+		unsigned int start;
+		__u64 inbytes, outbytes;
+		if (i) {
+			sum->conns += s->ustats.conns;
+			sum->inpkts += s->ustats.inpkts;
+			sum->outpkts += s->ustats.outpkts;
+			do {
+				start = u64_stats_fetch_begin(&s->syncp);
+				inbytes = s->ustats.inbytes;
+				outbytes = s->ustats.outbytes;
+			} while (u64_stats_fetch_retry(&s->syncp, start));
+			sum->inbytes += inbytes;
+			sum->outbytes += outbytes;
+		} else {
+			sum->conns = s->ustats.conns;
+			sum->inpkts = s->ustats.inpkts;
+			sum->outpkts = s->ustats.outpkts;
+			do {
+				start = u64_stats_fetch_begin(&s->syncp);
+				sum->inbytes = s->ustats.inbytes;
+				sum->outbytes = s->ustats.outbytes;
+			} while (u64_stats_fetch_retry(&s->syncp, start));
+		}
+	}
+}
+
+
+static void estimation_timer(unsigned long arg)
+{
+	struct ip_vs_estimator *e;
+	struct ip_vs_stats *s;
+	u32 n_conns;
+	u32 n_inpkts, n_outpkts;
+	u64 n_inbytes, n_outbytes;
+	u32 rate;
+	struct net *net = (struct net *)arg;
+	struct netns_ipvs *ipvs;
+
+	ipvs = net_ipvs(net);
+	spin_lock(&ipvs->est_lock);
+	list_for_each_entry(e, &ipvs->est_list, list) {
+		s = container_of(e, struct ip_vs_stats, est);
+
+		spin_lock(&s->lock);
+		ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
+		n_conns = s->ustats.conns;
+		n_inpkts = s->ustats.inpkts;
+		n_outpkts = s->ustats.outpkts;
+		n_inbytes = s->ustats.inbytes;
+		n_outbytes = s->ustats.outbytes;
+
+		/* scaled by 2^10, but divided by 2 seconds */
+		rate = (n_conns - e->last_conns) << 9;
+		e->last_conns = n_conns;
+		e->cps += ((long)rate - (long)e->cps) >> 2;
+
+		rate = (n_inpkts - e->last_inpkts) << 9;
+		e->last_inpkts = n_inpkts;
+		e->inpps += ((long)rate - (long)e->inpps) >> 2;
+
+		rate = (n_outpkts - e->last_outpkts) << 9;
+		e->last_outpkts = n_outpkts;
+		e->outpps += ((long)rate - (long)e->outpps) >> 2;
+
+		rate = (n_inbytes - e->last_inbytes) << 4;
+		e->last_inbytes = n_inbytes;
+		e->inbps += ((long)rate - (long)e->inbps) >> 2;
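+
+		/* The >> 2 steps implement the EWMA from the header
+		 * comment (W = 2^-2).  Each delta covers 2 seconds, so
+		 * bytes<<4 is bytes/sec scaled by 2^5, and the <<9
+		 * shifts are conns- or pkts/sec scaled by 2^10. */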
+ + rate = (n_outbytes - e->last_outbytes) << 4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps) >> 2; + spin_unlock(&s->lock); + } + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +} + +void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&ipvs->est_lock); + list_del(&est->list); + spin_unlock_bh(&ipvs->est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + struct ip_vs_stats_user *u = &stats->ustats; + + /* reset counters, caller must hold the stats->lock lock */ + est->last_inbytes = u->inbytes; + est->last_outbytes = u->outbytes; + est->last_conns = u->conns; + est->last_inpkts = u->inpkts; + est->last_outpkts = u->outpkts; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +/* Get decoded rates */ +void ip_vs_read_estimator(struct ip_vs_stats_user *dst, + struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e = &stats->est; + + dst->cps = (e->cps + 0x1FF) >> 10; + dst->inpps = (e->inpps + 0x1FF) >> 10; + dst->outpps = (e->outpps + 0x1FF) >> 10; + dst->inbps = (e->inbps + 0xF) >> 5; + dst->outbps = (e->outbps + 0xF) >> 5; +} + +int __net_init ip_vs_estimator_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + return 0; +} + +void __net_exit ip_vs_estimator_net_cleanup(struct net *net) +{ + del_timer_sync(&net_ipvs(net)->est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 00000000..e39f693d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,507 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. 
+ *
+ *		IP_MASQ_FTP ftp masquerading module
+ *
+ * Version:	@(#)ip_masq_ftp.c 0.04   02/05/96
+ *
+ * Author:	Wouter Gadeyne
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/gfp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static unsigned int ports_count = 1;
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, &ports_count, 0444);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/* Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	/* We use connection tracking for the command connection */
+	cp->flags |= IP_VS_CONN_F_NFCT;
+	return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", starting
+ * with the "pattern", ignoring characters before "skip" and terminated
+ * with the "term" character.
+ * <addr,port> is in network order.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+				  const char *pattern, size_t plen,
+				  char skip, char term,
+				  __be32 *addr, __be16 *port,
+				  char **start, char **end)
+{
+	char *s, c;
+	unsigned char p[6];
+	int i = 0;
+
+	if (data_limit - data < plen) {
+		/* check if there is partial match */
+		if (strnicmp(data, pattern, data_limit - data) == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	if (strnicmp(data, pattern, plen) != 0) {
+		return 0;
+	}
+	s = data + plen;
+	if (skip) {
+		int found = 0;
+
+		for (;; s++) {
+			if (s == data_limit)
+				return -1;
+			if (!found) {
+				if (*s == skip)
+					found = 1;
+			} else if (*s != skip) {
+				break;
+			}
+		}
+	}
+
+	for (data = s; ; data++) {
+		if (data == data_limit)
+			return -1;
+		if (*data == term)
+			break;
+	}
+	*end = data;
+
+	memset(p, 0, sizeof(p));
+	for (data = s; ; data++) {
+		c = *data;
+		if (c == term)
+			break;
+		if (c >= '0' && c <= '9') {
+			p[i] = p[i]*10 + c - '0';
+		} else if (c == ',' && i < 5) {
+			i++;
+		} else {
+			/* unexpected character */
+			return -1;
+		}
+	}
+
+	if (i != 5)
+		return -1;
+
+	*start = s;
+	*addr = get_unaligned((__be32 *) p);
+	*port = get_unaligned((__be16 *) (p + 4));
+	return 1;
+}
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port.  Mark the current connection entry as a control channel
+ * of the new entry.  All this work is just so that the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
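+ *
+ * For example, "227 Entering Passive Mode (192,168,0,10,195,80)"
+ * encodes server address 192.168.0.10 and port 195*256 + 80 = 50000.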
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			 struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_limit;
+	char *start, *end;
+	union nf_inet_addr from;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+	unsigned buf_len;
+	int ret = 0;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+	struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	if (cp->app_data == &ip_vs_ftp_pasv) {
+		iph = ip_hdr(skb);
+		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+		data = (char *)th + (th->doff << 2);
+		data_limit = skb_tail_pointer(skb);
+
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING,
+					   sizeof(SERVER_STRING)-1,
+					   '(', ')',
+					   &from.ip, &port,
+					   &start, &end) != 1)
+			return 1;
+
+		IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+			  &from.ip, ntohs(port), &cp->caddr.ip, 0);
+
+		/*
+		 * Now update or create a connection entry for it
+		 */
+		{
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+					      iph->protocol, &from, port,
+					      &cp->caddr, 0, &p);
+			n_cp = ip_vs_conn_out_get(&p);
+		}
+		if (!n_cp) {
+			struct ip_vs_conn_param p;
+			ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+					      AF_INET, IPPROTO_TCP, &cp->caddr,
+					      0, &cp->vaddr, port, &p);
+			n_cp = ip_vs_conn_new(&p, &from, port,
+					      IP_VS_CONN_F_NO_CPORT |
+					      IP_VS_CONN_F_NFCT,
+					      cp->dest, skb->mark);
+			if (!n_cp)
+				return 0;
+
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
+
+		/*
+		 * Replace the old passive address with the new one
+		 */
+		from.ip = n_cp->vaddr.ip;
+		port = n_cp->vport;
+		snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
+			 ((unsigned char *)&from.ip)[0],
+			 ((unsigned char *)&from.ip)[1],
+			 ((unsigned char *)&from.ip)[2],
+			 ((unsigned char *)&from.ip)[3],
+			 ntohs(port) >> 8,
+			 ntohs(port) & 0xFF);
+
+		buf_len = strlen(buf);
+
+		ct = nf_ct_get(skb, &ctinfo);
+		if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+			/* If mangling fails this function will return 0
+			 * which will cause the packet to be dropped.
+			 * Mangling can only fail under memory pressure,
+			 * hopefully it will succeed on the retransmitted
+			 * packet.
+			 */
+			ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+						       start-data, end-start,
+						       buf, buf_len);
+			if (ret) {
+				ip_vs_nfct_expect_related(skb, ct, n_cp,
+							  IPPROTO_TCP, 0, 0);
+				if (skb->ip_summed == CHECKSUM_COMPLETE)
+					skb->ip_summed = CHECKSUM_UNNECESSARY;
+				/* csum is updated */
+				ret = 1;
+			}
+		}
+
+		/*
+		 * Not setting 'diff' is intentional, otherwise the sequence
+		 * would be adjusted twice.
+		 */
+
+		net = skb_net(skb);
+		cp->app_data = NULL;
+		ip_vs_tcp_conn_listen(net, n_cp);
+		ip_vs_conn_put(n_cp);
+		return ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ *   "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
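+ * For example, "PORT 10,0,0,5,4,1\r\n" encodes client 10.0.0.5 and
+ * port 4*256 + 1 = 1025.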
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			struct sk_buff *skb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_start, *data_limit;
+	char *start, *end;
+	union nf_inet_addr to;
+	__be16 port;
+	struct ip_vs_conn *n_cp;
+	struct net *net;
+
+#ifdef CONFIG_IP_VS_IPV6
+	/* This application helper doesn't work with IPv6 yet,
+	 * so turn this into a no-op for IPv6 packets
+	 */
+	if (cp->af == AF_INET6)
+		return 1;
+#endif
+
+	/* no diff required for incoming packets */
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!skb_make_writable(skb, skb->len))
+		return 0;
+
+	/*
+	 * Detecting whether it is passive
+	 */
+	iph = ip_hdr(skb);
+	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be OPTIONS in the TCP packet and the HLEN is
+	   the length of the header in 32-bit multiples, it is accurate
+	   to calculate the data address as th+HLEN*4 */
+	data = data_start = (char *)th + (th->doff << 2);
+	data_limit = skb_tail_pointer(skb);
+
+	while (data <= data_limit - 6) {
+		if (strnicmp(data, "PASV\r\n", 6) == 0) {
+			/* Passive mode on */
+			IP_VS_DBG(7, "got PASV at %td of %td\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = &ip_vs_ftp_pasv;
+			return 1;
+		}
+		data++;
+	}
+
+	/*
+	 * To support a virtual FTP server, the scenario is as follows:
+	 *       FTP client ----> Load Balancer ----> FTP server
+	 * First detect the port number in the application data,
+	 * then create a new connection entry for the coming data
+	 * connection.
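+	 * The new entry uses vport-1/dport-1 for the data channel
+	 * (ftp-data is port 20 when the control port is 21), as set up
+	 * below with htons(ntohs(cp->vport)-1).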
+ */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + ' ', '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", + ip_vs_proto_name(iph->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, 0); + + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + } + + /* + * Move tunnel to listen state + */ + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + +/* + * per netns ip_vs_ftp initialization + */ +static int __net_init __ip_vs_ftp_init(struct net *net) +{ + int i, ret; + struct ip_vs_app *app; + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL); + if (!app) + return -ENOMEM; + INIT_LIST_HEAD(&app->a_list); + INIT_LIST_HEAD(&app->incs_list); + ipvs->ftp_app = app; + + ret = register_ip_vs_app(net, app); + if (ret) + goto err_exit; + + for (i = 0; i < ports_count; i++) { + if (!ports[i]) + continue; + ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); + if (ret) + goto err_unreg; + pr_info("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + return 0; + +err_unreg: + unregister_ip_vs_app(net, app); +err_exit: + kfree(ipvs->ftp_app); + return ret; +} +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_ip_vs_app(net, ipvs->ftp_app); + kfree(ipvs->ftp_app); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; + +int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + return rv; +} + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_pernet_subsys(&ip_vs_ftp_ops); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 00000000..caa43704 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,625 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitialized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * + * return n; + * + * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing + * me to write this module. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/jiffies.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ +#ifdef CONFIG_SYSCTL +static ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is referred either by its service + * or the trash dest list. 
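+	 * Only the reference taken in ip_vs_lblc_new() is dropped here;
+	 * the final free is left to whoever holds the last reference.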
+ */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned +ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash = ip_vs_lblc_hashkey(en->af, &en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned hash = ip_vs_lblc_hashkey(af, addr); + struct ip_vs_lblc_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(dest->af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + struct ip_vs_lblc_entry *en, *nxt; + int i; + + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + } +} + +static int sysctl_lblc_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblc_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; + unsigned long now = jiffies; + int i, j; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + write_lock(&svc->sched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + + sysctl_lblc_expiration(svc))) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
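+ *
+ * As an illustration, with the default 10-bit table (max_size =
+ * 1024*16 = 16384) and 20000 entries, the partial check below aims
+ * to free goal = (20000-16384)*4/3 = 4821 entries, capped at
+ * max_size/2 = 8192.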
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLC_TAB_MASK; + + write_lock(&svc->sched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. 
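+	 *
+	 * Illustrative numbers: loh=50 at weight 2 versus doh=60 at
+	 * weight 3 gives 50*3 = 150 > 60*2 = 120, i.e. 50/2 > 60/3,
+	 * so the second server becomes the new "least".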
+ */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLC: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, &iph.daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + +/* + * per netns init. 
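+ * Every namespace other than init_net gets its own copy of the
+ * sysctl table (see the kmemdup() below), so lblc_expiration can
+ * be tuned per netns.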
+ */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + return -ENOMEM; + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblc_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblc_ops); + return ret; +} + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 00000000..548bf37a --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,819 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. 
+ * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns<m.weight/2) then + * n <- {weighted least-conn node}; + * add n to serverSet[dest_ip]; + * if |serverSet[dest_ip]| > 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/jiffies.h> +#include <linux/list.h> +#include <linux/slab.h> + +/* for sysctl */ +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <net/net_namespace.h> + +#include <net/ip_vs.h> + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +#define DEFAULT_EXPIRATION (24*60*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_set_elem { + struct list_head list; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct list_head list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_set_elem * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) + return NULL; + + atomic_inc(&dest->refcnt); + e->dest = dest; + + list_add(&e->list, &set->list); + atomic_inc(&set->size); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_set_elem *e; + + list_for_each_entry(e, &set->list, list) { + if (e->dest == dest) { + /* HIT */ + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + list_del(&e->list); + kfree(e); + break; + } + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_set_elem *e, *ep; + + write_lock(&set->lock); + list_for_each_entry_safe(e, ep, &set->list, list) { + /* + * We don't kfree dest because it is referred either + * by its service or by the trash dest list. 
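+		 * The set lock is held for writing here, so readers in
+		 * ip_vs_dest_set_min/max cannot walk the list while it
+		 * is being emptied.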
+ */ + atomic_dec(&e->dest->refcnt); + list_del(&e->list); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + list_for_each_entry(e, &set->list, list) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_set_elem *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + list_for_each_entry(e, &set->list, list) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = ip_vs_dest_conn_overhead(most); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + list_for_each_entry(e, &set->list, list) { + dest = e->dest; + doh = ip_vs_dest_conn_overhead(dest); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG_BUF(6, "%s(): server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + __func__, + IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + int af; /* address family */ + union nf_inet_addr addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +#ifdef CONFIG_SYSCTL +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = NULL, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned +ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash = ip_vs_lblcr_hashkey(en->af, &en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, + const union nf_inet_addr *addr) +{ + unsigned hash = ip_vs_lblcr_hashkey(af, addr); + struct ip_vs_lblcr_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (ip_vs_addr_equal(af, &en->addr, addr)) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(dest->af, tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) + return NULL; + + en->af = dest->af; + ip_vs_addr_copy(dest->af, &en->addr, daddr); + en->lastuse = jiffies; + + /* initialize its dest set */ + atomic_set(&(en->set.size), 0); + INIT_LIST_HEAD(&en->set.list); + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); + } + + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + /* No locking required, only called during cleanup. 
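+	 * By the time we get here ip_vs_lblcr_done_svc() has stopped
+	 * the periodic timer with del_timer_sync(), so nothing races
+	 * with the flush.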
*/ + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblcr_free(en); + } + } +} + +static int sysctl_lblcr_expiration(struct ip_vs_service *svc) +{ +#ifdef CONFIG_SYSCTL + struct netns_ipvs *ipvs = net_ipvs(svc->net); + return ipvs->sysctl_lblcr_expiration; +#else + return DEFAULT_EXPIRATION; +#endif +} + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + write_lock(&svc->sched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse + + sysctl_lblcr_expiration(svc), now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { + j = (j + 1) & IP_VS_LBLCR_TAB_MASK; + + write_lock(&svc->sched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { + INIT_LIST_HEAD(&tbl->bucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table 
entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We use the following formula to estimate the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_dest_conn_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = ip_vs_dest_conn_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(least->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); + dest = ip_vs_dest_set_min(&en->set); + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_lblcr_expiration(svc))) { + struct ip_vs_dest *m; + + write_lock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + read_unlock(&svc->sched_lock); + return NULL; + } + + /* 
Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, &iph.daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.daddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + +/* + * per netns init. + */ +#ifdef CONFIG_SYSCTL +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs) + return -ENOENT; + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + return -ENOMEM; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) { + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + return -ENOMEM; + } + + return 0; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +#else + +static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } +static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } + +#endif + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_pernet_subsys(&ip_vs_lblcr_ops); + return ret; +} + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 00000000..f391819c --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,91 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_dest_conn_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (!least) + ip_vs_scheduler_err(svc, "no destination available"); + else + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " + "inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), + ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 00000000..022e77e1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,294 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>		Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>	Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections;
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, e.g. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded, passive FTP will not work (the
+ * PASV response cannot be NAT-ed) but active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE	"%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)	&(T)->src.u3.ip, ntohs((T)->src.u.all), \
+			&(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+			(T)->dst.protonum
+
+#define FMT_CONN	"%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)	&((C)->caddr.ip), ntohs((C)->cport), \
+			&((C)->vaddr.ip), ntohs((C)->vport), \
+			&((C)->daddr.ip), ntohs((C)->dport), \
+			(C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conntrack_tuple new_tuple;
+
+	if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+	    nf_ct_is_dying(ct))
+		return;
+
+	/* Never alter conntrack for non-NAT conns */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return;
+
+	/* Alter reply only in original direction */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return;
+
+	/*
+	 * The connection is not yet in the hashtable, so we update it.
+	 * CIP->VIP will remain the same, so leave the tuple in
+	 * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+	 * real-server we will see RIP->DIP.
+	 */
+	new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	/*
+	 * This will also take care of UDP and other protocols.
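+	 * Schematically, for a NAT connection CIP:cport -> VIP:vport ->
+	 * RIP:dport: with outin set the reply tuple's source becomes
+	 * RIP:dport (cp->daddr), otherwise its destination becomes
+	 * VIP:vport (cp->vaddr), as the assignments below show.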
+ */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(&p); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. 
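+ * The FTP helper, for instance, passes port 0 after mangling a PASV
+ * reply (see ip_vs_ftp_out()), so the data connection may come from
+ * any client port.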
+ */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? &cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 00000000..984d9c13 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,140 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. 
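+ *
+ * As pseudo code (mirroring the scheduler below):
+ *
+ *   if there is a server with activeconns == 0 then
+ *       return it immediately;
+ *   else
+ *       return the server minimizing (activeconns + 1) / weight;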
+ * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 00000000..5cf859cc --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,140 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/string.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_pe_lock); + +/* Bind a service with a pe */ +void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) +{ + svc->pe = pe; +} + +/* Unbind a service from its pe */ +void ip_vs_unbind_pe(struct ip_vs_service *svc) +{ + svc->pe = NULL; +} + +/* Get pe in the pe list by name */ +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + spin_lock_bh(&ip_vs_pe_lock); + + list_for_each_entry(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if 
(pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_pe_lock); + return pe; + } + if (pe->module) + module_put(pe->module); + } + + spin_unlock_bh(&ip_vs_pe_lock); + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = __ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = __ip_vs_pe_getbyname(name); + } + + return pe; +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_pe_lock); + + if (!list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already linked\n", + __func__, pe->name); + return -EINVAL; + } + + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add(&pe->n_list, &ip_vs_pe); + spin_unlock_bh(&ip_vs_pe_lock); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + spin_lock_bh(&ip_vs_pe_lock); + if (list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + pr_err("%s(): [%s] pe is not in the list. 
failed\n", + __func__, pe->name); + return -EINVAL; + } + + /* Remove it from the d-linked pe list */ + list_del(&pe->n_list); + spin_unlock_bh(&ip_vs_pe_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 00000000..1aa5cac7 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,171 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +#ifdef CONFIG_IP_VS_DEBUG +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) +#endif + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return 0; + dataoff += *matchoff; + } + + /* Empty callid is useless */ + if (!*matchlen) + return -EINVAL; + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return -EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. */ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + int retc; + + ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); + + /* Only useful with UDP */ + if (iph.protocol != IPPROTO_UDP) + return -EINVAL; + + /* No Data ? */ + dataoff = iph.len + sizeof(struct udphdr); + if (dataoff >= skb->len) + return -EINVAL; + + if ((retc=skb_linearize(skb)) < 0) + return retc; + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + return -EINVAL; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = false; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? 
AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = true; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 00000000..ed835e67 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,409 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/gfp.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> + +#include <net/ip_vs.h> + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. 
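+ * + * The table is a small fixed-size hash keyed by the IP protocol + * number, e.g. IP_VS_PROTO_HASH(IPPROTO_TCP) = 6 & (32 - 1) = 6, so a + * lookup is just a short, lock-free chain walk.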
+ */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + +/* + * register an ipvs protocol's netns-related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + + if (!pd) + return -ENOMEM; + + pd->pp = pp; /* cached for fast access */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) { + int ret = pp->init_netns(net, pd); + if (ret) { + /* unlink and free proto data */ + ipvs->proto_data_table[hash] = pd->next; + kfree(pd); + return ret; + } + } + + return 0; +} + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + +/* + * unregister an ipvs protocol's netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} + +/* + * get ip_vs_protocol object by its proto. 
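+ * Returns NULL when the protocol is not handled by IPVS, so callers + * check the result; a minimal, illustrative use: + * + * struct ip_vs_protocol *pp = ip_vs_proto_get(IPPROTO_TCP); + * if (pp) + * pr_info("IPVS handles %s\n", pp->name);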
+ */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} +EXPORT_SYMBOL(ip_vs_proto_get); + +/* + * get ip_vs_protocol object data by netns and proto + */ +struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get); + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) +{ + struct ip_vs_proto_data *pd; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_ATOMIC); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, const char *const *names, + const char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? 
"NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI4->%pI4", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI4:%u->%pI4:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "TRUNCATED"); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%pI6->%pI6 frag", &ih->saddr, &ih->daddr); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "TRUNCATED %pI6->%pI6", + &ih->saddr, &ih->daddr); + else + sprintf(buf, "%pI6:%u->%pI6:%u", + &ih->saddr, ntohs(pptr[0]), + &ih->daddr, ntohs(pptr[1])); + } + + pr_debug("%s: %s %s\n", msg, pp->name, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + +/* + * per network name-space init + */ +int __net_init ip_vs_protocol_net_init(struct net *net) +{ + int i, ret; + static struct ip_vs_protocol *protos[] = { +#ifdef CONFIG_IP_VS_PROTO_TCP + &ip_vs_protocol_tcp, +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + &ip_vs_protocol_udp, +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + &ip_vs_protocol_sctp, +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + &ip_vs_protocol_ah, +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + &ip_vs_protocol_esp, +#endif + }; + + for (i = 0; i < ARRAY_SIZE(protos); i++) { + ret = register_ip_vs_proto_netns(net, protos[i]); + if (ret < 0) + goto cleanup; + } + return 0; + +cleanup: + ip_vs_protocol_net_cleanup(net); + return ret; +} + +void __net_exit ip_vs_protocol_net_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } +} + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + REGISTER_PROTOCOL(&ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif 
+#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + pr_info("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000..5b8eb8b1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,166 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 + * Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + +static void +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + struct ip_vs_conn_param p; + struct net *net = skb_net(skb); + + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + ip_vs_proto_get(iph->protocol)->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. 
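+ * A connection is never created from an AH/ESP packet itself: the + * conn_in_get/conn_out_get helpers above map AH/ESP flows onto the + * UDP port 500 (PORT_ISAKMP) control flow, so only ISAKMP traffic is + * ever scheduled and AH/ESP packets simply match that existing + * connection.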
+ */ + *verdict = NF_ACCEPT; + return 0; +} + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = NULL, + .exit = NULL, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c new file mode 100644 index 00000000..9f3fb751 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -0,0 +1,1133 @@ +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <net/ip.h> +#include <net/ip6_checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/sctp/checksum.h> +#include <net/ip_vs.h> + +static int +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + sctp_chunkhdr_t _schunkh, *sch; + sctp_sctphdr_t *sh, _sctph; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + sch = skb_header_pointer(skb, iph.len + sizeof(sctp_sctphdr_t), + sizeof(_schunkh), &_schunkh); + if (sch == NULL) + return 0; + net = skb_net(skb); + if ((sch->type == SCTP_CID_INIT) && + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, sh->dest))) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
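+ * + * ip_vs_schedule() reports back through "ignored": a positive value + * lets the packet continue unscheduled, 0 means scheduling failed and + * ip_vs_leave() picks the verdict, and a negative value is an error, + * so the packet is dropped below.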
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + +static int +sctp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff; + struct sk_buff *iter; + __be32 crc32; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + sctph->source = cp->vport; + + /* Calculate the checksum */ + crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), + crc32); + crc32 = sctp_end_cksum(crc32); + sctph->checksum = crc32; + + return 1; +} + +static int +sctp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + sctp_sctphdr_t *sctph; + unsigned int sctphoff; + struct sk_buff *iter; + __be32 crc32; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, sctphoff + sizeof(*sctph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + sctph = (void *) skb_network_header(skb) + sctphoff; + sctph->dest = cp->dport; + + /* Calculate the checksum */ + crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), + crc32); + crc32 = sctp_end_cksum(crc32); + sctph->checksum = crc32; + + return 1; +} + +static int +sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int sctphoff; + struct sctphdr *sh, _sctph; + struct sk_buff *iter; + __le32 cmp; + __le32 val; + __u32 tmp; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + sctphoff = sizeof(struct ipv6hdr); + else +#endif + sctphoff = ip_hdrlen(skb); + + sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + cmp = sh->checksum; + + tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); + skb_walk_frags(skb, iter) + tmp = sctp_update_cksum((__u8 *) iter->data, + skb_headlen(iter), tmp); + + val = sctp_end_cksum(tmp); + + if (val != cmp) { + /* CRC failure, dump it. 
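SCTP checksums the whole packet with CRC32c rather than a 16-bit one's-complement sum, hence the full sctp_start_cksum()/sctp_update_cksum()/sctp_end_cksum() walk in the handlers above instead of an incremental update. 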
*/ + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + return 1; +} + +struct ipvs_sctp_nextstate { + int next_state; +}; +enum ipvs_sctp_event_t { + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_SER, + IP_VS_SCTP_EVE_INIT_CLI, + IP_VS_SCTP_EVE_INIT_SER, + IP_VS_SCTP_EVE_INIT_ACK_CLI, + IP_VS_SCTP_EVE_INIT_ACK_SER, + IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, + IP_VS_SCTP_EVE_COOKIE_ECHO_SER, + IP_VS_SCTP_EVE_COOKIE_ACK_CLI, + IP_VS_SCTP_EVE_COOKIE_ACK_SER, + IP_VS_SCTP_EVE_ABORT_CLI, + IP_VS_SCTP_EVE_ABORT_SER, + IP_VS_SCTP_EVE_SHUT_CLI, + IP_VS_SCTP_EVE_SHUT_SER, + IP_VS_SCTP_EVE_SHUT_ACK_CLI, + IP_VS_SCTP_EVE_SHUT_ACK_SER, + IP_VS_SCTP_EVE_SHUT_COM_CLI, + IP_VS_SCTP_EVE_SHUT_COM_SER, + IP_VS_SCTP_EVE_LAST +}; + +/* indexed by the 8-bit SCTP chunk type */ +static enum ipvs_sctp_event_t sctp_events[256] = { + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_INIT_CLI, + IP_VS_SCTP_EVE_INIT_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_ABORT_CLI, + IP_VS_SCTP_EVE_SHUT_CLI, + IP_VS_SCTP_EVE_SHUT_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_COOKIE_ECHO_CLI, + IP_VS_SCTP_EVE_COOKIE_ACK_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_DATA_CLI, + IP_VS_SCTP_EVE_SHUT_COM_CLI, + /* remaining chunk types are left 0 == IP_VS_SCTP_EVE_DATA_CLI */ +}; + +static struct ipvs_sctp_nextstate + sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = { + /* + * STATE : IP_VS_SCTP_S_NONE + */ + /*next state *//*event */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ }, + }, + /* + * STATE : IP_VS_SCTP_S_INIT_CLI + * Client sent INIT and is waiting for a reply from the server + * (in ECHO_WAIT) + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* 
IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_SER + * Server sent INIT and waiting for INIT ACK from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_ACK_CLI + * Client sent INIT ACK and waiting for ECHO from the server + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK has been resent by the client, let us stay is in + * the same state + */ + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK sent by the server, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * ECHO by client, it should not happen, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO by server, this is what we are expecting, move to ECHO_SER + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, it should not happen, close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * Unexpected COOKIE ACK from server, staty in the same state + */ + {IP_VS_SCTP_S_INIT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_INIT_ACK_SER + * Server sent INIT ACK and waiting for ECHO from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. 
From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * Unexpected INIT_ACK by the client, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK resent by the server, let us move to same state + */ + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client send the ECHO, this is what we are expecting, + * move to ECHO_CLI + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO received from the server, Not sure what to do, + * let us close it + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, let us stay in the same state + */ + {IP_VS_SCTP_S_INIT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, hmm... this should not happen, lets close + * the connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ECHO_CLI + * Cient sent ECHO and waiting COOKEI ACK from the Server + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK has been by the client, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client resent the ECHO, let us stay in the same state + */ + {IP_VS_SCTP_S_ECHO_CLI /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO received from the server, Not sure what to do, + * let us close it + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, this shoud not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, this is what we are awaiting,lets move to + * ESTABLISHED. 
+ */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ECHO_SER + * Server sent ECHO and waiting COOKEI ACK from the client + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + /* + * INIT_ACK has been by the server, let us close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent the ECHO, not sure what to do, let's close the + * connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + /* + * ECHO resent by the server, stay in the same state + */ + {IP_VS_SCTP_S_ECHO_SER /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, this is what we are expecting, let's move + * to ESTABLISHED. + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + /* + * COOKIE ACK from server, this should not happen, lets close the + * connection. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_ESTABLISHED + * Association established + */ + {{IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. 
Stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen, let's close + * the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_SHUT_CLI + * SHUTDOWN sent from the client, waitinf for SHUT ACK from the server + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN resent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen, let's close + * the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this is what we are expecting, let's move + * to SHUDOWN_ACK_SER + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_SHUT_SER + * SHUTDOWN sent from the server, waitinf for SHUTDOWN ACK from client + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN resent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN resent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this is what we are expecting, let's + * move to SHUT_ACK_CLI + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + + /* + * State : IP_VS_SCTP_S_SHUT_ACK_CLI + * SHUTDOWN ACK from the client, awaiting for SHUTDOWN COM from server + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. 
So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN sent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN sent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client resent SHUDTDOWN_ACK, let's stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_CLI /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server sent SHUTDOWN ACK, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this should not happen, let's close the + * connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + /* + * SHUTDOWN COMPLETE from server this is what we are expecting. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + + /* + * State : IP_VS_SCTP_S_SHUT_ACK_SER + * SHUTDOWN ACK from the server, awaiting for SHUTDOWN COM from client + */ + /* + * We received the data chuck, keep the state unchanged. I assume + * that still data chuncks can be received by both the peers in + * SHUDOWN state + */ + + {{IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_DATA_SER */ }, + /* + * We have got an INIT from client. From the spec.“Upon receipt of + * an INIT in the COOKIE-WAIT state, an endpoint MUST respond with + * an INIT ACK using the same parameters it sent in its original + * INIT chunk (including its Initiate Tag, unchanged”). + */ + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + /* + * INIT_ACK sent by the server, Unexpected INIT ACK, spec says, + * “If an INIT ACK is received by an endpoint in any state other + * than the COOKIE-WAIT state, the endpoint should discard the + * INIT ACK chunk”. Stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + /* + * Client sent ECHO, Spec(sec 5.2.4) says it may be handled by the + * peer and peer shall move to the ESTABISHED. if it doesn't handle + * it will send ERROR chunk. So, stay in the same state + */ + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_ESTABLISHED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + /* + * COOKIE ACK from client, not sure what to do stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + /* + * SHUTDOWN sent from the client, move to SHUDDOWN_CLI + */ + {IP_VS_SCTP_S_SHUT_CLI /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + /* + * SHUTDOWN sent from the server, move to SHUTDOWN_SER + */ + {IP_VS_SCTP_S_SHUT_SER /* IP_VS_SCTP_EVE_SHUT_SER */ }, + /* + * client sent SHUDTDOWN_ACK, this should not happen let's close + * the connection. 
+ */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + /* + * Server resent SHUTDOWN ACK, stay in the same state + */ + {IP_VS_SCTP_S_SHUT_ACK_SER /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + /* + * SHUTDOWN COM from client, this is what we are expecting, let's + * close the connection + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + /* + * SHUTDOWN COMPLETE from server, this should not happen. + */ + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + }, + /* + * State : IP_VS_SCTP_S_CLOSED + */ + {{IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_DATA_SER */ }, + {IP_VS_SCTP_S_INIT_CLI /* IP_VS_SCTP_EVE_INIT_CLI */ }, + {IP_VS_SCTP_S_INIT_SER /* IP_VS_SCTP_EVE_INIT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_INIT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ECHO_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_COOKIE_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_ABORT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_ACK_SER */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_CLI */ }, + {IP_VS_SCTP_S_CLOSED /* IP_VS_SCTP_EVE_SHUT_COM_SER */ } + } +}; + +/* + * Timeout table[state] + */ +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = 2 * HZ, + [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_ACK_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_INIT_ACK_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ECHO_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ECHO_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_ACK_CLI] = 1 * 60 * HZ, + [IP_VS_SCTP_S_SHUT_ACK_SER] = 1 * 60 * HZ, + [IP_VS_SCTP_S_CLOSED] = 10 * HZ, + [IP_VS_SCTP_S_LAST] = 2 * HZ, +}; + +static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = { + [IP_VS_SCTP_S_NONE] = "NONE", + [IP_VS_SCTP_S_INIT_CLI] = "INIT_CLI", + [IP_VS_SCTP_S_INIT_SER] = "INIT_SER", + [IP_VS_SCTP_S_INIT_ACK_CLI] = "INIT_ACK_CLI", + [IP_VS_SCTP_S_INIT_ACK_SER] = "INIT_ACK_SER", + [IP_VS_SCTP_S_ECHO_CLI] = "COOKIE_ECHO_CLI", + [IP_VS_SCTP_S_ECHO_SER] = "COOKIE_ECHO_SER", + [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_SCTP_S_SHUT_CLI] = "SHUTDOWN_CLI", + [IP_VS_SCTP_S_SHUT_SER] = "SHUTDOWN_SER", + [IP_VS_SCTP_S_SHUT_ACK_CLI] = "SHUTDOWN_ACK_CLI", + [IP_VS_SCTP_S_SHUT_ACK_SER] = "SHUTDOWN_ACK_SER", + [IP_VS_SCTP_S_CLOSED] = "CLOSED", + [IP_VS_SCTP_S_LAST] = "BUG!" +}; + + +static const char *sctp_state_name(int state) +{ + if (state >= IP_VS_SCTP_S_LAST) + return "ERR!"; + if (sctp_state_name_table[state]) + return sctp_state_name_table[state]; + return "?"; +} + +static inline void +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, const struct sk_buff *skb) +{ + sctp_chunkhdr_t _sctpch, *sch; + unsigned char chunk_type; + int event, next_state; + int ihl; + +#ifdef CONFIG_IP_VS_IPV6 + ihl = cp->af == AF_INET ? 
ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + ihl = ip_hdrlen(skb); +#endif + + sch = skb_header_pointer(skb, ihl + sizeof(sctp_sctphdr_t), + sizeof(_sctpch), &_sctpch); + if (sch == NULL) + return; + + chunk_type = sch->type; + /* + * Section 3: Multiple chunks can be bundled into one SCTP packet + * up to the MTU size, except for the INIT, INIT ACK, and + * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with + * any other chunk in a packet. + * + * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control + * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be + * bundled with an ABORT, but they MUST be placed before the ABORT + * in the SCTP packet or they will be ignored by the receiver. + */ + if ((sch->type == SCTP_CID_COOKIE_ECHO) || + (sch->type == SCTP_CID_COOKIE_ACK)) { + sch = skb_header_pointer(skb, (ihl + sizeof(sctp_sctphdr_t) + + sch->length), sizeof(_sctpch), &_sctpch); + if (sch) { + if (sch->type == SCTP_CID_ABORT) + chunk_type = sch->type; + } + } + + event = sctp_events[chunk_type]; + + /* + * If the direction is IP_VS_DIR_OUTPUT, this event is from server + */ + if (direction == IP_VS_DIR_OUTPUT) + event++; + /* + * get next state + */ + next_state = sctp_states_table[cp->state][event].next_state; + + if (next_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((direction == IP_VS_DIR_OUTPUT) ? + "output " : "input "), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + sctp_state_name(cp->state), + sctp_state_name(next_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (next_state == IP_VS_SCTP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? 
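Fall back to the static sctp_timeouts table when no per-netns proto data is attached. 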
*/ + cp->timeout = sctp_timeouts[cp->state = next_state]; +} + +static void +sctp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, struct ip_vs_proto_data *pd) +{ + spin_lock(&cp->lock); + set_sctp_state(pd, cp, direction, skb); + spin_unlock(&cp->lock); +} + +static inline __u16 sctp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) + & SCTP_APP_TAB_MASK; +} + +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + hash = sctp_app_hashkey(port); + + spin_lock_bh(&ipvs->sctp_app_lock); + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); +out: + spin_unlock_bh(&ipvs->sctp_app_lock); + + return ret; +} + +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + spin_lock_bh(&ipvs->sctp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->sctp_app_lock); +} + +static int sctp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + /* Lookup application incarnations and bind the right one */ + hash = sctp_app_hashkey(cp->vport); + + spin_lock(&ipvs->sctp_app_lock); + list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->sctp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->sctp_app_lock); +out: + return result; +} + +/* --------------------------------------------- + * timeouts is netns related now. 
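+ * Each netns gets its own kmemdup() copy of the static sctp_timeouts[] + * table (see __ip_vs_sctp_init() below), so per-state timeout changes + * in one namespace never affect another.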
+ * --------------------------------------------- + */ +static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + spin_lock_init(&ipvs->sctp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + +struct ip_vs_protocol ip_vs_protocol_sctp = { + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, + .unregister_app = sctp_unregister_app, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, + .state_transition = sctp_state_transition, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 00000000..cd609cc6 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,722 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/ip.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <net/ip6_checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ + if (th->syn && + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
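+ * + * As in sctp_conn_schedule() above, "ignored" tells us whether to let + * the packet pass unscheduled, defer the verdict to ip_vs_leave(), or + * drop it on error.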
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); + else +#endif + tcph->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
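tcp_fast_csum_update() above avoids re-summing the whole segment by applying the RFC 1624 incremental rule: fold the old 16-bit words out of the checksum and the new ones in. A standalone sketch of that arithmetic for a single changed field; csum_update16() is a hypothetical helper, not the kernel's ip_vs_check_diff2():

#include <stdint.h>

/* RFC 1624, eqn. 3: HC' = ~(~HC + ~m + m'), in ones-complement arithmetic */
static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new16)
{
        uint32_t sum = (uint16_t)~check;

        sum += (uint16_t)~old;
        sum += new16;
        sum = (sum & 0xffff) + (sum >> 16);     /* fold the carries back in */
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}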
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - tcphoff)); + } else if (!payload_csum) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. 
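The missing break after case CHECKSUM_NONE above is intentional: the code first sums the transport data into skb->csum, then falls through to the CHECKSUM_COMPLETE branch, which adds the pseudo-header and expects the ones-complement total to fold to zero. A from-scratch IPv4/TCP verifier showing the same math; an illustration only, not the kernel's csum_tcpudp_magic(), and the addresses are taken in host order:

#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>

/* Ones-complement sum of the TCP segment plus the IPv4 pseudo-header.
 * Returns 0 when the checksum embedded in the segment is consistent. */
static uint16_t tcp4_verify(uint32_t saddr, uint32_t daddr,
                            const uint8_t *seg, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)(seg[i] << 8 | seg[i + 1]);
        if (len & 1)
                sum += (uint32_t)seg[len - 1] << 8;

        /* pseudo-header: source, destination, protocol, TCP length */
        sum += (saddr >> 16) + (saddr & 0xffff);
        sum += (daddr >> 16) + (daddr & 0xffff);
        sum += IPPROTO_TCP + (uint32_t)len;

        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;                  /* 0 means "checksum OK" */
}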
*/ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? 
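The timeout table above is expressed in jiffies, i.e. seconds multiplied by the kernel tick rate HZ, so the IP_VS_TCP_S_ESTABLISHED entry defaults to 15 minutes. A trivial sketch; the HZ value here is an assumption, since the real one is a kernel configuration option:

#define HZ 100                                  /* assumed tick rate */

static const long tcp_est_timeout = 15 * 60 * HZ;  /* ESTABLISHED, 15 min */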
tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + pd->tcp_state_table = (on ? 
tcp_states_dos : tcp_states); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pd->pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; +} + +/* + * Handle state transitions + */ +static void +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? 
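set_tcp_state() above resolves the next state with two table lookups: the packet direction selects a row block (offsets 0, 4 and 8), the TCP flags select the row within that block, and the current state selects the column. A small sketch of the indexing, using the same flag priority as tcp_state_idx():

enum { DIR_INPUT = 0, DIR_OUTPUT = 4, DIR_INPUT_ONLY = 8 };

/* Same priority as tcp_state_idx(): RST wins, then SYN, FIN, ACK. */
static int flag_idx(int syn, int fin, int ack, int rst)
{
        if (rst)
                return 3;
        if (syn)
                return 0;
        if (fin)
                return 1;
        if (ack)
                return 2;
        return -1;                      /* no flag set: state is kept */
}

/* new_state = table[dir_offset + flag_idx].next_state[current_state];
 * e.g. an input SYN (row DIR_INPUT + 0) while in sNO moves to sSR. */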
ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return; + + spin_lock(&cp->lock); + set_tcp_state(pd, cp, direction, th); + spin_unlock(&cp->lock); +} + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&ipvs->tcp_app_lock); + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + spin_unlock_bh(&ipvs->tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock_bh(&ipvs->tcp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&ipvs->tcp_app_lock); + list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->tcp_app_lock); + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); + spin_unlock(&cp->lock); +} + +/* --------------------------------------------- + * timeouts is netns related now. 
+ * --------------------------------------------- + */ +static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + pd->tcp_state_table = tcp_states; + return 0; +} + +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 00000000..2fedb2dc --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,511 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/udp.h> + +#include <net/ip_vs.h> +#include <net/ip.h> +#include <net/ip6_checksum.h> + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, + int *verdict, struct ip_vs_conn **cpp) +{ + struct net *net; + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + net = skb_net(skb); + svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { + int ignored; + + if (ip_vs_todrop(net_ipvs(net))) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
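__ip_vs_tcp_init() above gives each network namespace a private, writable copy of the const default timeouts so they can be tuned independently. A userspace sketch of that copy-on-init step; ip_vs_create_timeout_table() is essentially this kmemdup()-style duplication:

#include <stdlib.h>
#include <string.h>

static int *create_timeout_table(const int *defaults, size_t bytes)
{
        int *copy = malloc(bytes);      /* kernel: kmemdup(..., GFP_ATOMIC) */

        if (copy)
                memcpy(copy, defaults, bytes);
        return copy;                    /* NULL maps to -ENOMEM above */
}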
+ */ + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } + return 0; + } + ip_vs_service_put(svc); + } + /* NF_ACCEPT */ + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!(ret = ip_vs_app_pkt_out(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
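udp_fast_csum_update() above ends with a UDP-specific twist: on the wire, a checksum of 0 means "no checksum" for UDP over IPv4, so a computed value of 0 must be transmitted as all-ones (CSUM_MANGLED_0). A sketch of that finalization step, assuming a 32-bit running ones-complement sum:

#include <stdint.h>

#define CSUM_MANGLED_0 ((uint16_t)0xffff)

static uint16_t udp_finalize_csum(uint32_t sum)
{
        uint16_t check;

        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        check = (uint16_t)~sum;
        /* 0 would mean "no checksum" on IPv4, so send all-ones instead */
        return check ? check : CSUM_MANGLED_0;
}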
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + int payload_csum = 0; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + int ret; + + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!(ret = ip_vs_app_pkt_in(cp, skb))) + return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + htons(oldlen), + htons(skb->len - udphoff)); + } else if (!payload_csum && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = (cp->app && pp->csum_check) ? 
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&ipvs->udp_app_lock); + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); + + out: + spin_unlock_bh(&ipvs->udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct net *net, struct ip_vs_app *inc) +{ + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(net); + + spin_lock_bh(&ipvs->udp_app_lock); + atomic_dec(&pd->appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&ipvs->udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&ipvs->udp_app_lock); + list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&ipvs->udp_app_lock); + + IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + 
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&ipvs->udp_app_lock); + + out: + return result; +} + + +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static void +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_proto_data *pd) +{ + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; +} + +static int __udp_init(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + spin_lock_init(&ipvs->udp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); + if (!pd->timeout_table) + return -ENOMEM; + return 0; +} + +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) +{ + kfree(pd->timeout_table); +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 00000000..c49b388d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,113 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), + .init_service = ip_vs_rr_init_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 00000000..08dbdd5b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,260 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
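ip_vs_rr_schedule() above stores nothing but a list cursor in svc->sched_data and walks the circular destination list once per call, skipping the list head, overloaded servers and weight-0 (quiesced) entries. The same cursor discipline in a self-contained sketch over an array; the kernel version walks its doubly linked list instead:

#include <stddef.h>

struct dest {
        int weight;
        int overloaded;
};

/* Return the index of the next usable server, or -1 if none is available;
 * *cursor persists between calls, like svc->sched_data does above. */
static int rr_pick(const struct dest *d, size_t n, size_t *cursor)
{
        size_t i;

        for (i = 0; i < n; i++) {
                size_t idx = (*cursor + 1 + i) % n;

                if (!d[idx].overloaded && d[idx].weight > 0) {
                        *cursor = idx;
                        return (int)idx;
                }
        }
        return -1;
}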
+ * + * Changes: + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <asm/string.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#include <net/ip_vs.h> + +EXPORT_SYMBOL(ip_vs_scheduler_err); +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + pr_err("%s(): init error\n", __func__); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched = svc->scheduler; + + if (!sched) + return 0; + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + pr_err("%s(): done error\n", __func__); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); + + spin_lock_bh(&ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + spin_unlock_bh(&ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler && scheduler->module) + module_put(scheduler->module); +} + +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + if (svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + svc->scheduler->name, svc->fwmark, + svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + if (!scheduler->name) { + pr_err("%s(): NULL scheduler_name\n", __func__); + return -EINVAL; + } + + /* increase the module 
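ip_vs_scheduler_get() above shows the kernel's lookup-or-load pattern: search the registered schedulers first, and only on a miss ask kmod to load ip_vs_<name> and search again, the module's init having registered it in between. A loose userspace analogy with dlopen, purely illustrative: the plugin naming and location are assumptions, and the program would be linked with -ldl:

#define _GNU_SOURCE             /* RTLD_NOLOAD is a glibc extension */
#include <dlfcn.h>
#include <stdio.h>

/* Look the plugin up first, load it only on a miss, then look again. */
static void *plugin_get(const char *name)
{
        char path[256];
        void *h;

        snprintf(path, sizeof(path), "./ip_vs_%s.so", name);
        h = dlopen(path, RTLD_NOW | RTLD_NOLOAD);       /* already loaded? */
        if (!h)
                h = dlopen(path, RTLD_NOW);             /* load, then use */
        return h;
}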
use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_sched_lock); + + if (!list_empty(&scheduler->n_list)) { + spin_unlock_bh(&ip_vs_sched_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already linked\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + spin_unlock_bh(&ip_vs_sched_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] scheduler already existed " + "in the system\n", __func__, scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + spin_unlock_bh(&ip_vs_sched_lock); + + pr_info("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + pr_err("%s(): NULL arg\n", __func__); + return -EINVAL; + } + + spin_lock_bh(&ip_vs_sched_lock); + if (list_empty(&scheduler->n_list)) { + spin_unlock_bh(&ip_vs_sched_lock); + pr_err("%s(): [%s] scheduler is not in the list. failed\n", + __func__, scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + spin_unlock_bh(&ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 00000000..89ead246 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,141 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). 
+ * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <net/ip_vs.h> + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 00000000..069e8d4d --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,284 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang <wensong@gnuchina.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. 
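The comment in ip_vs_sed_schedule() above spells out the no-floats trick: the comparison o1/w1 > o2/w2 is rewritten as o1*w2 > o2*w1, valid for positive weights, and the SED overhead counts the incoming job as activeconns + 1. The same comparison as a standalone sketch:

#include <stdint.h>

struct dest {
        int32_t activeconns;
        int32_t weight;
};

static uint32_t sed_overhead(const struct dest *d)
{
        return (uint32_t)d->activeconns + 1;    /* count the incoming job */
}

/* a has lower expected delay than b: oa/wa < ob/wb <=> oa*wb < ob*wa,
 * valid as long as both weights are positive. */
static int sed_better(const struct dest *a, const struct dest *b)
{
        return (uint64_t)sed_overhead(a) * (uint32_t)b->weight <
               (uint64_t)sed_overhead(b) * (uint32_t)a->weight;
}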
If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + * The weight destination attribute can be used to control the + * distribution of connections to the destinations in servernode. The + * greater the weight, the more connections the destination + * will receive. + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr) +{ + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + return (ntohl(addr_fold)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl, + const union nf_inet_addr *addr) +{ + return (tbl[ip_vs_sh_hashkey(af, addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + int d_count; + + b = tbl; + p = &svc->destinations; + d_count = 0; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (list_empty(p)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n", + i, IP_VS_DBG_ADDR(svc->af, &dest->addr), + atomic_read(&dest->weight)); + + /* Don't move to next dest until filling weight */ + if (++d_count >= atomic_read(&dest->weight)) { + p = p->next; + d_count = 0; + } + + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
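ip_vs_sh_hashkey() above is a multiplicative hash: the folded source address is multiplied by 2654435761 (Knuth's golden-ratio constant for 32 bits) and the low 8 bits select one of the 256 buckets that ip_vs_sh_assign() fills in proportion to each destination's weight. A sketch for IPv4 addresses:

#include <stdint.h>
#include <arpa/inet.h>

#define SH_TAB_BITS 8
#define SH_TAB_MASK ((1u << SH_TAB_BITS) - 1)

/* Multiplicative hash of an IPv4 source address, as in ip_vs_sh_hashkey() */
static unsigned int sh_bucket(uint32_t saddr)   /* network byte order */
{
        return (ntohl(saddr) * 2654435761u) & SH_TAB_MASK;
}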
+ */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) + return -ENOMEM; + + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 00000000..8a0d6d68 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,1697 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. 
IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/inetdevice.h> +#include <linux/net.h> +#include <linux/completion.h> +#include <linux/delay.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/igmp.h> /* for ip_mc_join_group */ +#include <linux/udp.h> +#include <linux/err.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/kernel.h> + +#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ + +#include <net/ip.h> +#include <net/sock.h> + +#include <net/ip_vs.h> + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + +#define SYNC_PROTO_VER 1 /* Protocol version in header */ + +static struct lock_class_key __ipvs_sync_key; +/* + * IPVS sync connection entry + * Version 0, i.e. original version. + */ +struct ip_vs_sync_conn_v0 { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. 
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + +struct ip_vs_sync_thread_data { + struct net *net; + struct socket *sock; + char *buf; +}; + +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + ~ . ~ + | . 
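The ver_size fields in the structures above pack the protocol version into the top 4 bits and the entry size into the remaining 12, per SVER_SHIFT and SVER_MASK. A sketch of the packing and unpacking:

#include <stdint.h>
#include <arpa/inet.h>

#define SVER_SHIFT 12           /* shift to get version */
#define SVER_MASK  0x0fff       /* mask to strip version */

static uint16_t pack_ver_size(unsigned int ver, unsigned int size)
{
        return htons((uint16_t)((ver << SVER_SHIFT) | (size & SVER_MASK)));
}

static unsigned int unpack_size(uint16_t ver_size)  /* network byte order */
{
        return ntohs(ver_size) & SVER_MASK;
}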
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | +*/ + +#define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ + +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* Version 1 header */ +struct ip_vs_sync_mesg { + __u8 reserved; /* must be zero */ + __u8 syncid; + __u16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + +/* multicast addr */ +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = cpu_to_be16(IP_VS_SYNC_PORT), + .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), +}; + +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} + +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} + +static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ipvs->sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ipvs->sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ipvs->sync_lock); + + return sb; +} + +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */
+	sb->mesg->version = SYNC_PROTO_VER;
+	sb->mesg->syncid = ipvs->master_syncid;
+	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
+	sb->mesg->nr_conns = 0;
+	sb->mesg->spare = 0;
+	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+	sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+	kfree(sb->mesg);
+	kfree(sb);
+}
+
+static inline void sb_queue_tail(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+
+	spin_lock(&ipvs->sync_lock);
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		list_add_tail(&sb->list, &ipvs->sync_queue);
+	else
+		ip_vs_sync_buff_release(sb);
+	spin_unlock(&ipvs->sync_lock);
+}
+
+/*
+ *	Get the current sync buffer if it has been created for more
+ *	than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
+{
+	struct ip_vs_sync_buff *sb;
+
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (ipvs->sync_buff &&
+	    time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
+		sb = ipvs->sync_buff;
+		ipvs->sync_buff = NULL;
+	} else
+		sb = NULL;
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+	return sb;
+}
+
+/*
+ *	Switch the mode of sending between version 0 and 1
+ *	- must handle sync_buff
+ */
+void ip_vs_sync_switch_mode(struct net *net, int mode)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
+		return;
+	if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
+		return;
+
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	/* Buffer empty ? then let buf_create do the job */
+	if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
+		kfree(ipvs->sync_buff);
+		ipvs->sync_buff = NULL;
+	} else {
+		spin_lock_bh(&ipvs->sync_lock);
+		if (ipvs->sync_state & IP_VS_STATE_MASTER)
+			list_add_tail(&ipvs->sync_buff->list,
+				      &ipvs->sync_queue);
+		else
+			ip_vs_sync_buff_release(ipvs->sync_buff);
+		spin_unlock_bh(&ipvs->sync_lock);
+	}
+	spin_unlock_bh(&ipvs->sync_buff_lock);
+}
+
+/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+	struct ip_vs_sync_buff *sb;
+	struct ip_vs_sync_mesg_v0 *mesg;
+
+	if (!(sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+		return NULL;
+
+	sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+	if (!sb->mesg) {
+		kfree(sb);
+		return NULL;
+	}
+	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+	mesg->nr_conns = 0;
+	mesg->syncid = ipvs->master_syncid;
+	mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+	sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+	sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+/*
+ *	Version 0, can be switched in by sysctl.
+ *	Add an ip_vs_conn information into the current sync_buff.
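+ *	(For example, on the master: sysctl -w net.ipv4.vs.sync_version=0,
+ *	 as noted in the protocol description at the top of this file.)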
+ */ +void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; + int len; + + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + spin_lock(&ipvs->sync_buff_lock); + if (!ipvs->sync_buff) { + ipvs->sync_buff = + ip_vs_sync_buff_create_v0(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; + s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; + + /* copy members */ + s->reserved = 0; + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + ipvs->sync_buff->head += len; + + /* check if there is a space for next one */ + if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; + } + spin_unlock(&ipvs->sync_buff_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(net, cp->control); +} + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + * Sending Version 1 messages + */ +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (sysctl_sync_ver(ipvs) == 0) { + ip_vs_sync_conn_v0(net, cp); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } + + spin_lock(&ipvs->sync_buff_lock); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + if (ipvs->sync_buff) { + pad = (4 - (size_t)ipvs->sync_buff->head) & 3; + if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; + pad = 0; + } + } + + if (!ipvs->sync_buff) { + ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + m = ipvs->sync_buff->mesg; + p = ipvs->sync_buff->head; + ipvs->sync_buff->head += pad + len; + m->size += pad + len; + /* Add ev. padding from prev. 
sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + s->v6.caddr = cp->caddr.in6; + s->v6.vaddr = cp->vaddr.in6; + s->v6.daddr = cp->daddr.in6; + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) + return; + /* + * Reduce sync rate for templates + * i.e only increment in_pkts for Templates. + */ + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + int pkts = atomic_add_return(1, &cp->in_pkts); + + if (pkts % sysctl_sync_period(ipvs) != 1) + return; + } + goto sloop; +} + +/* + * fill_param used by version 1 + */ +static inline int +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(net, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(net, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + p->pe_data_len = pe_data_len; + } + return 0; +} + +/* + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. 
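+ * A note on the call graph (a summary of the code below, not new
+ * behaviour): both receive paths funnel here -- ip_vs_process_message_v0()
+ * for version 0 and ip_vs_proc_sync_conn() for version 1 -- with a param
+ * filled in by ip_vs_conn_fill_param() or ip_vs_conn_fill_param_sync()
+ * respectively.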
+ */
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+			    unsigned int flags, unsigned int state,
+			    unsigned int protocol, unsigned int type,
+			    const union nf_inet_addr *daddr, __be16 dport,
+			    unsigned long timeout, __u32 fwmark,
+			    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *cp;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+		cp = ip_vs_conn_in_get(param);
+	else
+		cp = ip_vs_ct_in_get(param);
+
+	if (cp && param->pe_data)	/* Free pe_data */
+		kfree(param->pe_data);
+	if (!cp) {
+		/*
+		 * Find the appropriate destination for the connection.
+		 * If it is not found the connection will remain unbound
+		 * but still handled.
+		 */
+		dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
+				       param->vport, protocol, fwmark, flags);
+
+		/* Set the appropriate activity flag */
+		if (protocol == IPPROTO_TCP) {
+			if (state != IP_VS_TCP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		} else if (protocol == IPPROTO_SCTP) {
+			if (state != IP_VS_SCTP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+		if (!cp) {
+			if (param->pe_data)
+				kfree(param->pe_data);
+			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+			return;
+		}
+	} else if (!cp->dest) {
+		dest = ip_vs_try_bind_dest(cp);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+		   (cp->state != state)) {
+		/* update active/inactive flag for the connection */
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+		    (state != IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;
+		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			   (state == IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_inc(&dest->activeconns);
+			atomic_dec(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+		   (cp->state != state)) {
+		/* mark the connection inactive to match the counters */
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+		    (state != IP_VS_SCTP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;
+		}
+	}
+
+	if (opt)
+		memcpy(&cp->in_seq, opt, sizeof(*opt));
+	atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+	cp->state = state;
+	cp->old_state = cp->state;
+	/*
+	 * For Ver 0 message style:
+	 *  - it is not possible to recover the right timeout for templates
+	 *  - it cannot find the right fwmark virtual service. If needed,
+	 *    we can do it for non-fwmark persistent services.
+	 * Ver 1 message style:
+	 *  - no problem.
+	 */
+	if (timeout) {
+		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+		cp->timeout = timeout*HZ;
+	} else {
+		struct ip_vs_proto_data *pd;
+
+		pd = ip_vs_proto_data_get(net, protocol);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+			cp->timeout = pd->timeout_table[state];
+		else
+			cp->timeout = (3*60*HZ);
+	}
+	ip_vs_conn_put(cp);
+}
+
+/*
+ *	Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+				     const size_t buflen)
+{
+	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+	struct ip_vs_sync_conn_v0 *s;
+	struct ip_vs_sync_conn_options *opt;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	char *p;
+	int i;
+
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
+	for (i = 0; i < m->nr_conns; i++) {
+		unsigned flags, state;
+
+		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
+			return;
+		}
+		s = (struct ip_vs_sync_conn_v0 *) p;
+		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+		flags &= ~IP_VS_CONN_F_HASHED;
+		if (flags & IP_VS_CONN_F_SEQ_MASK) {
+			opt = (struct ip_vs_sync_conn_options *)&s[1];
+			p += FULL_CONN_SIZE;
+			if (p > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP v0, Dropping buffer, bogus conn options\n");
+				return;
+			}
+		} else {
+			opt = NULL;
+			p += SIMPLE_CONN_SIZE;
+		}
+
+		state = ntohs(s->state);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+			pp = ip_vs_proto_get(s->protocol);
+			if (!pp) {
+				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
+					s->protocol);
+				continue;
+			}
+			if (state >= pp->num_states) {
+				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
+					pp->name, state);
+				continue;
+			}
+		} else {
+			/* protocol in templates is not used for state/timeout */
+			if (state > 0) {
+				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
+					state);
+				state = 0;
+			}
+		}
+
+		ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+				      (const union nf_inet_addr *)&s->caddr,
+				      s->cport,
+				      (const union nf_inet_addr *)&s->vaddr,
+				      s->vport, &param);
+
+		/* Send timeout as Zero */
+		ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+				(union nf_inet_addr *)&s->daddr, s->dport,
+				0, 0, opt);
+	}
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+				    __u32 *opt_flags,
+				    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_sync_conn_options *topt;
+
+	topt = (struct ip_vs_sync_conn_options *)p;
+
+	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+		return -EINVAL;
+	}
+	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+		return -EINVAL;
+	}
+	ntoh_seq(&topt->in_seq, &opt->in_seq);
+	ntoh_seq(&topt->out_seq, &opt->out_seq);
+	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
+	return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+			  __u8 **data, unsigned int maxlen,
+			  __u32 *opt_flags, __u32 flag)
+{
+	if (plen > maxlen) {
+		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+		return -EINVAL;
+	}
+	if (*opt_flags & flag) {
+		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+		return -EINVAL;
+	}
+	*data_len = plen;
+	*data = p;
+	*opt_flags |= flag;
+	return 0;
+}
+/*
+ *	Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+	struct ip_vs_sync_conn_options opt;
+	union ip_vs_sync_conn *s;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	__u32 flags;
+	unsigned int af, state, pe_data_len = 0, pe_name_len = 0;
+	__u8 *pe_data = NULL, *pe_name = NULL;
+	__u32 opt_flags = 0;
+	int retc = 0;
+
+	s = (union ip_vs_sync_conn *) p;
+
+	if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+		af = AF_INET6;
+		p += sizeof(struct ip_vs_sync_v6);
+#else
+		IP_VS_DBG(3, "BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+		retc = 10;
+		goto out;
+#endif
+	} else if (!s->v4.type) {
+		af = AF_INET;
+		p += sizeof(struct ip_vs_sync_v4);
+	} else {
+		return -10;
+	}
+	if (p > msg_end)
+		return -20;
+
+	/* Process optional params check Type & Len. */
+	while (p < msg_end) {
+		int ptype;
+		int plen;
+
+		if (p+2 > msg_end)
+			return -30;
+		ptype = *(p++);
+		plen  = *(p++);
+
+		if (!plen || ((p + plen) > msg_end))
+			return -40;
+		/* Handle seq option  p = param data */
+		switch (ptype & ~IPVS_OPT_F_PARAM) {
+		case IPVS_OPT_SEQ_DATA:
+			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+				return -50;
+			break;
+
+		case IPVS_OPT_PE_DATA:
+			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+					   IP_VS_PEDATA_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_DATA))
+				return -60;
+			break;
+
+		case IPVS_OPT_PE_NAME:
+			if (ip_vs_proc_str(p, plen, &pe_name_len, &pe_name,
+					   IP_VS_PENAME_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_NAME))
+				return -70;
+			break;
+
+		default:
+			/* Param data mandatory ? */
+			if (!(ptype & IPVS_OPT_F_PARAM)) {
+				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+					  ptype & ~IPVS_OPT_F_PARAM);
+				retc = 20;
+				goto out;
+			}
+		}
+		p += plen;  /* Next option */
+	}
+
+	/* Get flags and Mask off unsupported */
+	flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+	flags |= IP_VS_CONN_F_SYNC;
+	state = ntohs(s->v4.state);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+		pp = ip_vs_proto_get(s->v4.protocol);
+		if (!pp) {
+			IP_VS_DBG(3, "BACKUP, Unsupported protocol %u\n",
+				s->v4.protocol);
+			retc = 30;
+			goto out;
+		}
+		if (state >= pp->num_states) {
+			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+				pp->name, state);
+			retc = 40;
+			goto out;
+		}
+	} else {
+		/* protocol in templates is not used for state/timeout */
+		if (state > 0) {
+			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+				state);
+			state = 0;
+		}
+	}
+	if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+				       pe_data_len, pe_name, pe_name_len)) {
+		retc = 50;
+		goto out;
+	}
+	/* If only IPv4, just silently skip IPv6 */
+	if (af == AF_INET)
+		ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
+#ifdef CONFIG_IP_VS_IPV6
+	else
+		ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+				);
+#endif
+	return 0;
+	/* Error exit */
+out:
+	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+	return retc;
+
+}
+/*
+ *	Process received multicast message and create the corresponding
+ *	ip_vs_conn entries.
+ *	Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+				  const size_t buflen)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+	__u8 *p, *msg_end;
+	int i, nr_conns;
+
+	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+		IP_VS_DBG(2, "BACKUP, message header too short\n");
+		return;
+	}
+	/* Convert size back to host byte order */
+	m2->size = ntohs(m2->size);
+
+	if (buflen != m2->size) {
+		IP_VS_DBG(2, "BACKUP, bogus message size\n");
+		return;
+	}
+	/* SyncID sanity check */
+	if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+		return;
+	}
+	/* Handle version 1 message */
+	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+	    && (m2->spare == 0)) {
+
+		msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+		nr_conns = m2->nr_conns;
+
+		for (i = 0; i < nr_conns; i++) {
+			union ip_vs_sync_conn *s;
+			unsigned size;
+			int retc;
+
+			p = msg_end;
+			if (p + sizeof(s->v4) > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
+				return;
+			}
+			s = (union ip_vs_sync_conn *)p;
+			size = ntohs(s->v4.ver_size) & SVER_MASK;
+			msg_end = p + size;
+			/* Basic sanity checks */
+			if (msg_end > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+				return;
+			}
+			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+					      ntohs(s->v4.ver_size) >> SVER_SHIFT);
+				return;
+			}
+			/* Process a single sync_conn */
+			retc = ip_vs_proc_sync_conn(net, p, msg_end);
+			if (retc < 0) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+					     retc);
+				return;
+			}
+			/* Make sure we have 32 bit alignment */
+			msg_end = p + ((size + 3) & ~3);
+		}
+	} else {
+		/* Old type of message */
+		ip_vs_process_message_v0(net, buffer, buflen);
+		return;
+	}
+}
+
+
+/*
+ *	Set up loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+	lock_sock(sk);
+	inet->mc_loop = loop ? 1 : 0;
+	release_sock(sk);
+}
+
+/*
+ *	Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+	lock_sock(sk);
+	inet->mc_ttl = ttl;
+	release_sock(sk);
+}
+
+/*
+ *	Specify default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+	struct net_device *dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+
+	dev = __dev_get_by_name(net, ifname);
+	if (!dev)
+		return -ENODEV;
+
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet->mc_index = dev->ifindex;
+	/*  inet->mc_addr  = 0; */
+	release_sock(sk);
+
+	return 0;
+}
+
+
+/*
+ *	Set the maximum length of sync message according to the
+ *	specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct net_device *dev;
+	int num;
+
+	if (sync_state == IP_VS_STATE_MASTER) {
+		dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+		if (!dev)
+			return -ENODEV;
+
+		num = (dev->mtu - sizeof(struct iphdr) -
+		       sizeof(struct udphdr) -
+		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+		ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+		IP_VS_DBG(7, "setting the maximum length of sync sending "
+			  "message %d.\n", ipvs->send_mesg_maxlen);
+	} else if (sync_state == IP_VS_STATE_BACKUP) {
+		dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+		if (!dev)
+			return -ENODEV;
+
+		ipvs->recv_mesg_maxlen = dev->mtu -
+			sizeof(struct iphdr) - sizeof(struct udphdr);
+		IP_VS_DBG(7, "setting the maximum length of sync receiving "
+			  "message %d.\n", ipvs->recv_mesg_maxlen);
+	}
+
+	return 0;
+}
+
+
+/*
+ *	Join a multicast group.
+ *	The group is specified by a class D multicast address 224.0.0.0/4
+ *	in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+	struct net *net = sock_net(sk);
+	struct ip_mreqn mreq;
+	struct net_device *dev;
+	int ret;
+
+	memset(&mreq, 0, sizeof(mreq));
+	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+	dev = __dev_get_by_name(net, ifname);
+	if (!dev)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	mreq.imr_ifindex = dev->ifindex;
+
+	lock_sock(sk);
+	ret = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+	struct net *net = sock_net(sock->sk);
+	struct net_device *dev;
+	__be32 addr;
+	struct sockaddr_in sin;
+
+	dev = __dev_get_by_name(net, ifname);
+	if (!dev)
+		return -ENODEV;
+
+	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	if (!addr)
+		pr_err("You probably need to specify IP address on "
+		       "multicast interface.\n");
+
+	IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
+		  ifname, &addr);
+
+	/* Now bind the socket with the address of multicast interface */
+	sin.sin_family	     = AF_INET;
+	sin.sin_addr.s_addr  = addr;
+	sin.sin_port         = 0;
+
+	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ *	Set up sending multicast socket over UDP
+ */
+static struct socket *make_send_sock(struct net *net)
+{
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	struct socket *sock;
+	int result;
+
+	/* First create a socket, then move it to the right namespace later */
+	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	if (result < 0) {
+		pr_err("Error during creation of socket; terminating\n");
+		return ERR_PTR(result);
+	}
+	/*
+	 * Kernel sockets that are a part of a namespace should not
+	 * hold a reference to the namespace, in order to allow stopping it.
+	 * After sk_change_net, the socket should be released using
+	 * sk_release_kernel.
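+	 * (A sketch of the intended lifecycle, as used by the sync threads
+	 *  below: sock_create_kern() -> sk_change_net() -> ... ->
+	 *  sk_release_kernel().)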
+ */ + sk_change_net(sock->sk, net); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + if (result < 0) { + pr_err("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + pr_err("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket *make_receive_sock(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + pr_err("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + pr_err("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ipvs->backup_mcast_ifn); + if (result < 0) { + pr_err("Error joining to the multicast group\n"); + goto error; + } + + return sock; + +error: + sk_release_kernel(sock->sk); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + pr_err("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); + struct ip_vs_sync_buff *sb; + + pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ipvs->master_mcast_ifn, ipvs->master_syncid); + + while (!kthread_should_stop()) { + while ((sb = sb_dequeue(ipvs))) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in ipvs->sync_buff for 2 seconds */ + sb = 
get_curr_sync_buff(ipvs, 2 * HZ);
+		if (sb) {
+			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		schedule_timeout_interruptible(HZ);
+	}
+
+	/* clean up the sync_buff queue */
+	while ((sb = sb_dequeue(ipvs)))
+		ip_vs_sync_buff_release(sb);
+
+	/* clean up the current sync_buff */
+	sb = get_curr_sync_buff(ipvs, 0);
+	if (sb)
+		ip_vs_sync_buff_release(sb);
+
+	/* release the sending multicast socket */
+	sk_release_kernel(tinfo->sock->sk);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+static int sync_thread_backup(void *data)
+{
+	struct ip_vs_sync_thread_data *tinfo = data;
+	struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+	int len;
+
+	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
+		"syncid = %d\n",
+		ipvs->backup_mcast_ifn, ipvs->backup_syncid);
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
+			 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
+			 || kthread_should_stop());
+
+		/* do we have data now? */
+		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+			len = ip_vs_receive(tinfo->sock, tinfo->buf,
+					ipvs->recv_mesg_maxlen);
+			if (len <= 0) {
+				pr_err("receiving message error\n");
+				break;
+			}
+
+			/* disable bottom half, because it accesses the data
+			   shared by softirq while getting/creating conns */
+			local_bh_disable();
+			ip_vs_process_message(tinfo->net, tinfo->buf, len);
+			local_bh_enable();
+		}
+	}
+
+	/* release the receiving multicast socket */
+	sk_release_kernel(tinfo->sock->sk);
+	kfree(tinfo->buf);
+	kfree(tinfo);
+
+	return 0;
+}
+
+
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+{
+	struct ip_vs_sync_thread_data *tinfo;
+	struct task_struct **realtask, *task;
+	struct socket *sock;
+	struct netns_ipvs *ipvs = net_ipvs(net);
+	char *name, *buf = NULL;
+	int (*threadfn)(void *data);
+	int result = -ENOMEM;
+
+	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+		  sizeof(struct ip_vs_sync_conn_v0));
+
+
+	if (state == IP_VS_STATE_MASTER) {
+		if (ipvs->master_thread)
+			return -EEXIST;
+
+		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->master_mcast_ifn));
+		ipvs->master_syncid = syncid;
+		realtask = &ipvs->master_thread;
+		name = "ipvs_master:%d";
+		threadfn = sync_thread_master;
+		sock = make_send_sock(net);
+	} else if (state == IP_VS_STATE_BACKUP) {
+		if (ipvs->backup_thread)
+			return -EEXIST;
+
+		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->backup_mcast_ifn));
+		ipvs->backup_syncid = syncid;
+		realtask = &ipvs->backup_thread;
+		name = "ipvs_backup:%d";
+		threadfn = sync_thread_backup;
+		sock = make_receive_sock(net);
+	} else {
+		return -EINVAL;
+	}
+
+	if (IS_ERR(sock)) {
+		result = PTR_ERR(sock);
+		goto out;
+	}
+
+	set_sync_mesg_maxlen(net, state);
+	if (state == IP_VS_STATE_BACKUP) {
+		buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
+		if (!buf)
+			goto outsocket;
+	}
+
+	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+	if (!tinfo)
+		goto outbuf;
+
+	tinfo->net = net;
+	tinfo->sock = sock;
+	tinfo->buf = buf;
+
+	task = kthread_run(threadfn, tinfo, name, ipvs->gen);
+	if (IS_ERR(task)) {
+		result = PTR_ERR(task);
+		goto outtinfo;
+	}
+
+	/* mark as active */
+	*realtask = task;
+	ipvs->sync_state |= state;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	return 0;
+
+outtinfo:
+	kfree(tinfo);
+outbuf:
+	kfree(buf);
+outsocket:
+	sk_release_kernel(sock->sk);
+out:
+	return result;
+}
+
+
+int stop_sync_thread(struct 
net *net, int state) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; + + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!ipvs->master_thread) + return -ESRCH; + + pr_info("stopping master sync thread %d ...\n", + task_pid_nr(ipvs->master_thread)); + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ipvs->sync_lock); + retc = kthread_stop(ipvs->master_thread); + ipvs->master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!ipvs->backup_thread) + return -ESRCH; + + pr_info("stopping backup sync thread %d ...\n", + task_pid_nr(ipvs->backup_thread)); + + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + retc = kthread_stop(ipvs->backup_thread); + ipvs->backup_thread = NULL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return retc; +} + +/* + * Initialize data struct for each netns + */ +int __net_init ip_vs_sync_net_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); + INIT_LIST_HEAD(&ipvs->sync_queue); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + + ipvs->sync_mcast_addr.sin_family = AF_INET; + ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); + ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); + return 0; +} + +void ip_vs_sync_net_cleanup(struct net *net) +{ + int retc; + struct netns_ipvs *ipvs = net_ipvs(net); + + mutex_lock(&ipvs->sync_mutex); + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); + mutex_unlock(&ipvs->sync_mutex); +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 00000000..bc1bfc48 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,113 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Peter Kese <peter.kese@ijs.si> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
+ *     Wensong Zhang            :     changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *		      (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 * h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_dest_conn_overhead(least);
+			goto nextstage;
+		}
+	}
+	ip_vs_scheduler_err(svc, "no destination available");
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_dest_conn_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+		      "activeconns %d refcnt %d weight %d overhead %d\n",
+		      IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port),
+		      atomic_read(&least->activeconns),
+		      atomic_read(&least->refcnt),
+		      atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+	.name =			"wlc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list =		LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+	.schedule =		ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 00000000..fd0d4e09
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,231 @@
+/*
+ * IPVS:        Weighted Round-Robin Scheduling module
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some cosmetic things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
+ *     Julian Anastasov         :     fixed the bug of returning destination
+ *                                    with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+	struct list_head *cl;	/* current list head */
+	int cw;			/* current weight */
+	int mw;			/* maximum weight */
+	int di;			/* decreasing interval */
+};
+
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g ? g : 1;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int new_weight, weight = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		new_weight = atomic_read(&dest->weight);
+		if (new_weight > weight)
+			weight = new_weight;
+	}
+
+	return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark;
+
+	/*
+	 *    Allocate the mark variable for WRR scheduling
+	 */
+	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+	if (mark == NULL)
+		return -ENOMEM;
+
+	mark->cl = &svc->destinations;
+	mark->cw = 0;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	svc->sched_data = mark;
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	/*
+	 *    Release the mark variable
+	 */
+	kfree(svc->sched_data);
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	mark->cl = &svc->destinations;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	if (mark->cw > mark->mw)
+		mark->cw = 0;
+	return 0;
+}
+
+
+/*
+ *    Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+	struct list_head *p;
+
+	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * This loop will always terminate, because mark->cw in (0, max_weight]
+	 * and at least one server has its weight equal to max_weight.
+	 */
+	write_lock(&svc->sched_lock);
+	p = mark->cl;
+	while (1) {
+		if (mark->cl == &svc->destinations) {
+			/* it is at the head of the destination list */
+
+			if (mark->cl == mark->cl->next) {
+				/* no dest entry */
+				ip_vs_scheduler_err(svc,
+					"no destination available: "
+					"no destinations present");
+				dest = NULL;
+				goto out;
+			}
+
+			mark->cl = svc->destinations.next;
+			mark->cw -= mark->di;
+			if (mark->cw <= 0) {
+				mark->cw = mark->mw;
+				/*
+				 * Still zero, which means no available servers.
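+				 * (cw can only still be zero right after this
+				 *  reset if mw itself is zero, i.e. no dest
+				 *  has a positive weight. For contrast, a
+				 *  worked example with weights A=4, B=2, C=1:
+				 *  di = gcd = 1, mw = 4, and successive calls
+				 *  yield A A A B A B C per cycle before cw
+				 *  wraps from 1 back to mw.)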
+ */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + ip_vs_scheduler_err(svc, + "no destination available"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + ip_vs_scheduler_err(svc, + "no destination available: " + "all destinations are overloaded"); + goto out; + } + } + + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 00000000..7fd66dec --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1370 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> + * Julian Anastasov <ja@ssi.bg> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/tcp.h> /* for tcphdr */ +#include <net/ip.h> +#include <net/tcp.h> /* for csum_tcpudp_magic */ +#include <net/udp.h> +#include <net/icmp.h> /* for icmp_send */ +#include <net/route.h> /* for ip_route_output */ +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <linux/icmpv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> + +#include <net/ip_vs.h> + +enum { + IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ + IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ + IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to + * local + */ +}; + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, + u32 dst_cookie) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dest->dst_cookie = dst_cookie; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, dest->dst_cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +/* Get route to destination or remote server */ +static struct rtable * +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct rtable *rt; /* Route to the other host */ + struct rtable *ort; /* Original route */ + int local; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos))) { + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = dest->addr.ip; + fl4.flowi4_tos = rtos; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", + &dest->addr.ip); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); + dest->dst_saddr.ip = fl4.saddr; + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, " + "rtos=%X\n", + &dest->addr.ip, &dest->dst_saddr.ip, + atomic_read(&rt->dst.__refcnt), rtos); + } + daddr = dest->addr.ip; + if (ret_saddr) + *ret_saddr = dest->dst_saddr.ip; + spin_unlock(&dest->dst_lock); + } else { + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + fl4.flowi4_tos = rtos; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) { + IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", + &daddr); + return NULL; + } + if (ret_saddr) + *ret_saddr = fl4.saddr; + } + + local = rt->rt_flags & RTCF_LOCAL; + if (!((local ? 
IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? + "local":"non-local", &daddr); + ip_rt_put(rt); + return NULL; + } + if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && + !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " + "requires NAT method, dest: %pI4\n", + &ip_hdr(skb)->daddr, &daddr); + ip_rt_put(rt); + return NULL; + } + if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " + "to non-local address, dest: %pI4\n", + &ip_hdr(skb)->saddr, &daddr); + ip_rt_put(rt); + return NULL; + } + + return rt; +} + +/* Reroute packet to local IPv4 stack after DNAT */ +static int +__ip_vs_reroute_locally(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + struct net *net = dev_net(dev); + struct iphdr *iph = ip_hdr(skb); + + if (rt_is_input_route(rt)) { + unsigned long orefdst = skb->_skb_refdst; + + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + return 0; + refdst_drop(orefdst); + } else { + struct flowi4 fl4 = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowi4_tos = RT_TOS(iph->tos), + .flowi4_mark = skb->mark, + }; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return 0; + if (!(rt->rt_flags & RTCF_LOCAL)) { + ip_rt_put(rt); + return 0; + } + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + return 1; +} + +#ifdef CONFIG_IP_VS_IPV6 + +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK; +} + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi6 fl6 = { + .daddr = *daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl6.saddr) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6.daddr, 0, &fl6.saddr) < 0) + goto out_err; + if (do_xfrm) { + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) { + dst = NULL; + goto out_err; + } + } + *ret_saddr = fl6.saddr; + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + +/* + * Get route to destination or remote server + */ +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + int do_xfrm, int rt_mode) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + struct rt6_info *rt; /* Route to the other host */ + struct rt6_info *ort; /* Original route */ + struct dst_entry *dst; + int local; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); + if (!rt) { + u32 cookie; + + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest->dst_saddr.in6, + do_xfrm); + if (!dst) { + spin_unlock(&dest->dst_lock); + return NULL; + } + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest->dst_saddr.in6, + atomic_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest->dst_saddr.in6; + spin_unlock(&dest->dst_lock); + } else { + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + if (!dst) + return NULL; + rt = (struct rt6_info *) dst; + } + + local = __ip_vs_is_local_route6(rt); + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + local ? "local":"non-local", daddr); + dst_release(&rt->dst); + return NULL; + } + if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && + !((ort = (struct rt6_info *) skb_dst(skb)) && + __ip_vs_is_local_route6(ort))) { + IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " + "requires NAT method, dest: %pI6\n", + &ipv6_hdr(skb)->daddr, daddr); + dst_release(&rt->dst); + return NULL; + } + if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " + "to non-local address, dest: %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + dst_release(&rt->dst); + return NULL; + } + + return rt; +} +#endif + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT_TUNNEL(skb, cp) \ +({ \ + int __ret = NF_ACCEPT; \ + \ + (skb)->ipvs_property = 1; \ + if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ + __ret = ip_vs_confirm_conntrack(skb); \ + if (__ret == NF_ACCEPT) { \ + nf_reset(skb); \ + skb_forward_csum(skb); \ + } \ + __ret; \ +}) + +#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ +do { \ + (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + ip_vs_notrack(skb); \ + else \ + ip_vs_update_conntrack(skb, cp, 1); \ + if (local) \ + return NF_ACCEPT; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + +#define IP_VS_XMIT(pf, skb, cp, local) \ +do { \ + (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + ip_vs_notrack(skb); \ + if (local) \ + return NF_ACCEPT; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. 
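+ *      (Note how this is wired up below: the route is looked up from the
+ *      packet's own daddr via __ip_vs_get_out_rt(skb, NULL, iph->daddr, ...),
+ *      i.e. no struct ip_vs_dest is consulted.)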
+ */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), + IP_VS_RT_MODE_NON_LOCAL, NULL))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, + IP_VS_RT_MODE_NON_LOCAL))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu && !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? 
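+	 * (skb_share_check() below answers that: a shared skb is cloned
+	 *  before we drop its old route, and on clone failure the original
+	 *  is freed inside the helper, which is why NF_STOLEN is returned
+	 *  without a further kfree_skb().)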
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ *      Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	       struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	int mtu;
+	struct iphdr *iph = ip_hdr(skb);
+	int local;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(iph->tos),
+				      IP_VS_RT_MODE_LOCAL |
+				      IP_VS_RT_MODE_NON_LOCAL |
+				      IP_VS_RT_MODE_RDR, NULL)))
+		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+					 "ip_vs_nat_xmit(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(cp->daddr.ip) &&
+	    rt_is_input_route(skb_rtable(skb))) {
+		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
+				 "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct iphdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error_put;
+	ip_hdr(skb)->daddr = cp->daddr.ip;
+	ip_send_check(ip_hdr(skb));
+
+	if (!local) {
+		/* drop old route */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rt6_info *rt;		/* Route to the other host */
+	int mtu;
+	int local;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__be16 _pt, *p;
+		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
+				       sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, (IP_VS_RT_MODE_LOCAL |
+					     IP_VS_RT_MODE_NON_LOCAL |
+					     IP_VS_RT_MODE_RDR))))
+		goto tx_error_icmp;
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+					 "ip_vs_nat_xmit_v6(): "
+					 "stopping DNAT to local address");
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): "
+				 "stopping DNAT to loopback address");
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
+				 "ip_vs_nat_xmit_v6(): frag needed for");
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp))
+		goto tx_error;
+	ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
+	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
+}
+#endif
+
+
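A minimal user-space sketch of the checksum that ip_send_check() recomputes after the IPv4 DNAT above (IPv6 has no header checksum, so the v6 path has no equivalent step). This helper is illustrative only, not kernel code; it assumes the checksum field was zeroed beforehand and that ihl counts 32-bit words, per RFC 1071:

	/* One's-complement sum over the IPv4 header, folded to 16 bits. */
	static unsigned short ip_hdr_csum(const unsigned short *hdr, int ihl)
	{
		unsigned long sum = 0;
		int i;

		for (i = 0; i < ihl * 2; i++)	/* ihl * 2 16-bit words */
			sum += hdr[i];
		while (sum >> 16)		/* fold the carries back in */
			sum = (sum & 0xffff) + (sum >> 16);
		return (unsigned short)(~sum & 0xffff);
	}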
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in the VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, process
+ *   the request and return the response packets directly to the client
+ *   without passing through the load balancer. This can greatly
+ *   increase the scalability of the virtual server.
+ *
+ *   Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	__be32 saddr;			/* Source for tunnel */
+	struct net_device *tdev;	/* Device to other host */
+	struct iphdr *old_iph = ip_hdr(skb);
+	u8 tos = old_iph->tos;
+	__be16 df = old_iph->frag_off;
+	struct iphdr *iph;		/* Our new IP header */
+	unsigned int max_headroom;	/* The extra header space needed */
+	int mtu;
+	int ret;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
+						   IP_VS_RT_MODE_NON_LOCAL,
+				      &saddr)))
+		goto tx_error_icmp;
+	if (rt->rt_flags & RTCF_LOCAL) {
+		ip_rt_put(rt);
+		IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+	}
+
+	tdev = rt->dst.dev;
+
+	mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+	if (mtu < 68) {
+		IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+		goto tx_error_put;
+	}
+	if (skb_dst(skb))
+		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
+
+	df |= (old_iph->frag_off & htons(IP_DF));
+
+	if ((old_iph->frag_off & htons(IP_DF)) &&
+	    mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("%s(): no memory\n", __func__);
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = ip_hdr(skb);
+	}
+
+	skb->transport_header = skb->network_header;
+
+	/* fix old IP header checksum */
+	ip_send_check(old_iph);
+
+	skb_push(skb, sizeof(struct iphdr));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/*
+	 *	Push down and install the IPIP header.
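+	 *	The result is the original datagram behind a fresh outer
+	 *	IPv4 header:
+	 *
+	 *	  [outer IP: saddr = tunnel source, daddr = cp->daddr,
+	 *	   protocol = IPPROTO_IPIP][original IP header][payload]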
+ */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = cp->daddr.ip; + iph->saddr = saddr; + iph->ttl = old_iph->ttl; + ip_select_ident(iph, &rt->dst, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + int ret; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, 1, (IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL)))) + goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } + + tdev = rt->dst.dev; + + mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); + if (mtu < IPV6_MIN_MTU) { + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); + goto tx_error_put; + } + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && + !skb_is_gso(skb)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error_put; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->dst); + kfree_skb(skb); + IP_VS_ERR_RL("%s(): no memory\n", __func__); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = skb->network_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. 
+ */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len; + be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = cp->daddr.in6; + iph->saddr = saddr; + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL, NULL))) + goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && + !skb_is_gso(skb)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, (IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL)))) + goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } + + /* MTU checking */ + mtu = dst_mtu(&rt->dst); + if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); + IP_VS_DBG_RL("%s(): frag needed\n", __func__); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? 
+	 */
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (unlikely(skb == NULL)) {
+		dst_release(&rt->dst);
+		return NF_STOLEN;
+	}
+
+	/* drop old route */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+#endif
+
+
+/*
+ *	ICMP packet transmitter
+ *	called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+{
+	struct rtable *rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+	int local;
+	int rt_mode;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	/* LOCALNODE from FORWARD hook is not supported */
+	rt_mode = (hooknum != NF_INET_FORWARD) ?
+		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+				      RT_TOS(ip_hdr(skb)->tos),
+				      rt_mode, NULL)))
+		goto tx_error_icmp;
+	local = rt->rt_flags & RTCF_LOCAL;
+
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI4\n",
+				  __func__, &cp->daddr.ip);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && ipv4_is_loopback(cp->daddr.ip) &&
+	    rt_is_input_route(skb_rtable(skb))) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI4\n",
+			  __func__, &cp->daddr.ip);
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
+	    !skb_is_gso(skb)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	ip_vs_nat_icmp(skb, pp, cp, 0);
+
+	if (!local) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		ip_rt_put(rt);
+		/*
+		 * Some IPv4 replies get local address from routes,
+		 * not from iph, so while we DNAT after routing
+		 * we need this second input/output route.
+		 */
+		if (!__ip_vs_reroute_locally(skb))
+			goto tx_error;
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+
+	rc = NF_STOLEN;
+	goto out;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+  out:
+	LeaveFunction(10);
+	return rc;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum)
+{
+	struct rt6_info *rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+	int local;
+	int rt_mode;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	/* LOCALNODE from FORWARD hook is not supported */
+	rt_mode = (hooknum != NF_INET_FORWARD) ?
+		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+	if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+					 0, rt_mode)))
+		goto tx_error_icmp;
+
+	local = __ip_vs_is_local_route6(rt);
+	/*
+	 * Avoid duplicate tuple in reply direction for NAT traffic
+	 * to local address when connection is sync-ed
+	 */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+		if (ct && !nf_ct_is_untracked(ct)) {
+			IP_VS_DBG(10, "%s(): "
+				  "stopping DNAT to local address %pI6\n",
+				  __func__, &cp->daddr.in6);
+			goto tx_error_put;
+		}
+	}
+#endif
+
+	/* From world but DNAT to loopback address? */
+	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+	    ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+		IP_VS_DBG(1, "%s(): "
+			  "stopping DNAT to loopback %pI6\n",
+			  __func__, &cp->daddr.in6);
+		goto tx_error_put;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->dst);
+	if (skb->len > mtu && !skb_is_gso(skb)) {
+		if (!skb->dev) {
+			struct net *net = dev_net(skb_dst(skb)->dev);
+
+			skb->dev = net->loopback_dev;
+		}
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+		goto tx_error_put;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!skb_make_writable(skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+	if (!local || !skb->dev) {
+		/* drop the old route when skb is not shared */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, &rt->dst);
+	} else {
+		/* destined to loopback, do we need to change route? */
+		dst_release(&rt->dst);
+	}
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+
+	rc = NF_STOLEN;
+	goto out;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+out:
+	LeaveFunction(10);
+	return rc;
+tx_error_put:
+	dst_release(&rt->dst);
+	goto tx_error;
+}
+#endif
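The file that follows keeps two counters per connection (packets and bytes, one pair per direction) and updates them with atomic64 operations. Below is a minimal user-space sketch of the same bookkeeping using C11 atomics; the names are illustrative, not kernel API:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* One packet/byte counter pair per direction, mirroring the
	 * per-conntrack extension (struct nf_conn_counter). */
	enum { DIR_ORIGINAL, DIR_REPLY, DIR_MAX };

	struct flow_counter {
		atomic_uint_fast64_t packets;
		atomic_uint_fast64_t bytes;
	};

	/* Account one packet of 'len' bytes seen in direction 'dir'.
	 * Relaxed ordering is enough: the counters are independent
	 * statistics, which is also why the kernel can use plain
	 * atomic64 adds here. */
	static void acct_packet(struct flow_counter *acct, int dir,
				uint64_t len)
	{
		atomic_fetch_add_explicit(&acct[dir].packets, 1,
					  memory_order_relaxed);
		atomic_fetch_add_explicit(&acct[dir].bytes, len,
					  memory_order_relaxed);
	}

	int main(void)
	{
		struct flow_counter acct[DIR_MAX];
		int dir;

		for (dir = 0; dir < DIR_MAX; dir++) {
			atomic_init(&acct[dir].packets, 0);
			atomic_init(&acct[dir].bytes, 0);
		}

		acct_packet(acct, DIR_ORIGINAL, 1500);	/* client -> server */
		acct_packet(acct, DIR_REPLY, 40);	/* server -> client */

		/* Same layout as the procfs output of seq_print_acct(). */
		printf("packets=%llu bytes=%llu\n",
		       (unsigned long long)atomic_load(&acct[DIR_ORIGINAL].packets),
		       (unsigned long long)atomic_load(&acct[DIR_ORIGINAL].bytes));
		return 0;
	}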
diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
new file mode 100644
index 00000000..f4f8cda0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -0,0 +1,137 @@
+/* Accounting handling for netfilter. */
+
+/*
+ * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_acct.h>
+
+static bool nf_ct_acct __read_mostly;
+
+module_param_named(acct, nf_ct_acct, bool, 0644);
+MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting.");
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table acct_sysctl_table[] = {
+	{
+		.procname	= "nf_conntrack_acct",
+		.data		= &init_net.ct.sysctl_acct,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{}
+};
+#endif /* CONFIG_SYSCTL */
+
+unsigned int
+seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir)
+{
+	struct nf_conn_counter *acct;
+
+	acct = nf_conn_acct_find(ct);
+	if (!acct)
+		return 0;
+
+	return seq_printf(s, "packets=%llu bytes=%llu ",
+			  (unsigned long long)atomic64_read(&acct[dir].packets),
+			  (unsigned long long)atomic64_read(&acct[dir].bytes));
+}
+EXPORT_SYMBOL_GPL(seq_print_acct);
+
+static struct nf_ct_ext_type acct_extend __read_mostly = {
+	.len	= sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]),
+	.align	= __alignof__(struct nf_conn_counter[IP_CT_DIR_MAX]),
+	.id	= NF_CT_EXT_ACCT,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	table[0].data = &net->ct.sysctl_acct;
+
+	net->ct.acct_sysctl_header = register_net_sysctl_table(net,
+			nf_net_netfilter_sysctl_path, table);
+	if (!net->ct.acct_sysctl_header) {
+		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+		goto out_register;
+	}
+	return 0;
+
+out_register:
+	kfree(table);
+out:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.acct_sysctl_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.acct_sysctl_header);
+	kfree(table);
+}
+#else
+static int nf_conntrack_acct_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_acct_fini_sysctl(struct net *net)
+{
+}
+#endif
+
+int nf_conntrack_acct_init(struct net *net)
+{
+	int ret;
+
+	net->ct.sysctl_acct = nf_ct_acct;
+
+	if (net_eq(net, &init_net)) {
+		ret = nf_ct_extend_register(&acct_extend);
+		if (ret < 0) {
+			printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n");
+			goto out_extend_register;
+		}
+	}
+
+	ret = nf_conntrack_acct_init_sysctl(net);
+	if (ret < 0)
+		goto out_sysctl;
+
+	return 0;
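+	/* Unwind below runs in the reverse order of the setup above; the
+	 * extension was only registered for init_net, so it is only
+	 * unregistered there as well. */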
+ +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&acct_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_acct_fini(struct net *net) +{ + nf_conntrack_acct_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&acct_extend); +} diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c new file mode 100644 index 00000000..13fd2c55 --- /dev/null +++ b/net/netfilter/nf_conntrack_amanda.c @@ -0,0 +1,237 @@ +/* Amanda extension for IP connection tracking + * + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on HW's ip_conntrack_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/textsearch.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/netfilter.h> +#include <linux/gfp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_amanda.h> + +static unsigned int master_timeout __read_mostly = 300; +static char *ts_algo = "kmp"; + +MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda connection tracking module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_amanda"); +MODULE_ALIAS_NFCT_HELPER("amanda"); + +module_param(master_timeout, uint, 0600); +MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); + +unsigned int (*nf_nat_amanda_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_amanda_hook); + +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + const char *string; + size_t len; + struct ts_config *ts; +} search[] __read_mostly = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + +static int amanda_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct ts_state ts; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; + u_int16_t len; + __be16 port; + int ret = NF_ACCEPT; + typeof(nf_nat_amanda_hook) nf_nat_amanda; + + /* Only look at packets from the Amanda server */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* increase the UDP timeout of the master connection as replies from + * Amanda clients to the server can be quite delayed */ + nf_ct_refresh(ct, skb, master_timeout * HZ); + + /* No data? 
*/ + dataoff = protoff + sizeof(struct udphdr); + if (dataoff >= skb->len) { + if (net_ratelimit()) + printk(KERN_ERR "amanda_help: skblen = %u\n", skb->len); + return NF_ACCEPT; + } + + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(skb, dataoff, skb->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) + goto out; + start += dataoff + search[SEARCH_CONNECT].len; + + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(skb, start, skb->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; + + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(skb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) + continue; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(skb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = htons(simple_strtoul(pbuf, &tmp, 10)); + len = tmp - pbuf; + if (port == 0 || len > 5) + break; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); + + nf_nat_amanda = rcu_dereference(nf_nat_amanda_hook); + if (nf_nat_amanda && ct->status & IPS_NAT_MASK) + ret = nf_nat_amanda(skb, ctinfo, off - dataoff, + len, exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + } + +out: + return ret; +} + +static const struct nf_conntrack_expect_policy amanda_exp_policy = { + .max_expected = 3, + .timeout = 180, +}; + +static struct nf_conntrack_helper amanda_helper[2] __read_mostly = { + { + .name = "amanda", + .me = THIS_MODULE, + .help = amanda_help, + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(10080), + .tuple.dst.protonum = IPPROTO_UDP, + .expect_policy = &amanda_exp_policy, + }, + { + .name = "amanda", + .me = THIS_MODULE, + .help = amanda_help, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.udp.port = cpu_to_be16(10080), + .tuple.dst.protonum = IPPROTO_UDP, + .expect_policy = &amanda_exp_policy, + }, +}; + +static void __exit nf_conntrack_amanda_fini(void) +{ + int i; + + nf_conntrack_helper_unregister(&amanda_helper[0]); + nf_conntrack_helper_unregister(&amanda_helper[1]); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); +} + +static int __init nf_conntrack_amanda_init(void) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (IS_ERR(search[i].ts)) { + ret = PTR_ERR(search[i].ts); + goto err1; + } + } + ret = nf_conntrack_helper_register(&amanda_helper[0]); + if (ret < 0) + goto err1; + ret = nf_conntrack_helper_register(&amanda_helper[1]); + if (ret < 0) + goto err2; + return 0; + +err2: + nf_conntrack_helper_unregister(&amanda_helper[0]); +err1: + while (--i >= 0) + textsearch_destroy(search[i].ts); + + return ret; +} + +module_init(nf_conntrack_amanda_init); +module_exit(nf_conntrack_amanda_fini); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 00000000..4e99cca6 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + * broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * 
modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <net/route.h> +#include <linux/inetdevice.h> +#include <linux/skbuff.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout) +{ + struct nf_conntrack_expect *exp; + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct nf_conn_help *help = nfct_help(ct); + __be32 mask = 0; + + /* we're only interested in locally generated packets */ + if (skb->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + goto out; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + exp->mask.src.u3.ip = mask; + exp->mask.src.u.udp.port = htons(0xFFFF); + + exp->expectfn = NULL; + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->class = NF_CT_EXPECT_CLASS_DEFAULT; + exp->helper = NULL; + + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); + + nf_ct_refresh(ct, skb, timeout * HZ); +out: + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c new file mode 100644 index 00000000..729f157a --- /dev/null +++ b/net/netfilter/nf_conntrack_core.c @@ -0,0 +1,1647 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/socket.h> +#include <linux/mm.h> +#include <linux/nsproxy.h> +#include <linux/rculist_nulls.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_nat.h> +#include <net/netfilter/nf_nat_core.h> + +#define NF_CONNTRACK_VERSION "0.5.0" + +int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) __read_mostly; +EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); + +DEFINE_SPINLOCK(nf_conntrack_lock); +EXPORT_SYMBOL_GPL(nf_conntrack_lock); + +unsigned int nf_conntrack_htable_size __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); + +unsigned int nf_conntrack_max __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_max); + +DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); + +unsigned int nf_conntrack_hash_rnd __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); + +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) +{ + unsigned int n; + + /* The direction must be ignored, so we hash everything up to the + * destination ports (which is a multiple of 4) and treat the last + * three bytes manually. 
+ */ + n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); + return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + (((__force __u16)tuple->dst.u.all << 16) | + tuple->dst.protonum)); +} + +static u32 __hash_bucket(u32 hash, unsigned int size) +{ + return ((u64)hash * size) >> 32; +} + +static u32 hash_bucket(u32 hash, const struct net *net) +{ + return __hash_bucket(hash, net->ct.htable_size); +} + +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + u16 zone, unsigned int size) +{ + return __hash_bucket(hash_conntrack_raw(tuple, zone), size); +} + +static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __hash_conntrack(tuple, zone, net->ct.htable_size); +} + +bool +nf_ct_get_tuple(const struct sk_buff *skb, + unsigned int nhoff, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + memset(tuple, 0, sizeof(*tuple)); + + tuple->src.l3num = l3num; + if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + return false; + + tuple->dst.protonum = protonum; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return l4proto->pkt_to_tuple(skb, dataoff, tuple); +} +EXPORT_SYMBOL_GPL(nf_ct_get_tuple); + +bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, + u_int16_t l3num, struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + unsigned int protoff; + u_int8_t protonum; + int ret; + + rcu_read_lock(); + + l3proto = __nf_ct_l3proto_find(l3num); + ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum); + if (ret != NF_ACCEPT) { + rcu_read_unlock(); + return false; + } + + l4proto = __nf_ct_l4proto_find(l3num, protonum); + + ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple, + l3proto, l4proto); + + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); + +bool +nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + memset(inverse, 0, sizeof(*inverse)); + + inverse->src.l3num = orig->src.l3num; + if (l3proto->invert_tuple(inverse, orig) == 0) + return false; + + inverse->dst.dir = !orig->dst.dir; + + inverse->dst.protonum = orig->dst.protonum; + return l4proto->invert_tuple(inverse, orig); +} +EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); + +static void +clean_from_lists(struct nf_conn *ct) +{ + pr_debug("clean_from_lists(%p)\n", ct); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); + + /* Destroy all pending expectations */ + nf_ct_remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + struct net *net = nf_ct_net(ct); + struct nf_conntrack_l4proto *l4proto; + + pr_debug("destroy_conntrack(%p)\n", ct); + NF_CT_ASSERT(atomic_read(&nfct->use) == 0); + NF_CT_ASSERT(!timer_pending(&ct->timeout)); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to nf_conntrack_lock!!! 
-HW */ + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto && l4proto->destroy) + l4proto->destroy(ct); + + rcu_read_unlock(); + + spin_lock_bh(&nf_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + nf_ct_remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!nf_ct_is_confirmed(ct)) { + BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + } + + NF_CT_STAT_INC(net, delete); + spin_unlock_bh(&nf_conntrack_lock); + + if (ct->master) + nf_ct_put(ct->master); + + pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); + nf_conntrack_free(ct); +} + +void nf_ct_delete_from_lists(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + nf_ct_helper_destroy(ct); + spin_lock_bh(&nf_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + NF_CT_STAT_INC(net, delete_list); + clean_from_lists(ct); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); + +static void death_by_event(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); + return; + } + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + spin_lock(&nf_conntrack_lock); + hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&nf_conntrack_lock); + nf_ct_put(ct); +} + +void nf_ct_insert_dying_list(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + /* add this conntrack to the dying list */ + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.dying); + spin_unlock_bh(&nf_conntrack_lock); + /* set a new timer to retry event delivery */ + setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); +} +EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); + + if (!test_bit(IPS_DYING_BIT, &ct->status) && + unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + /* destroy event was not delivered */ + nf_ct_delete_from_lists(ct); + nf_ct_insert_dying_list(ct); + return; + } + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); + nf_ct_put(ct); +} + +/* + * Warning : + * - Caller must take a reference on returned object + * and recheck nf_ct_tuple_equal(tuple, &h->tuple) + * OR + * - Caller must lock nf_conntrack_lock before calling this function + */ +static struct nf_conntrack_tuple_hash * +____nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) +{ + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + unsigned int bucket = hash_bucket(hash, net); + + /* Disable BHs the 
entire time since we normally need to disable them + * at least once for the stats anyway. + */ + local_bh_disable(); +begin: + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { + if (nf_ct_tuple_equal(tuple, &h->tuple) && + nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { + NF_CT_STAT_INC(net, found); + local_bh_enable(); + return h; + } + NF_CT_STAT_INC(net, searched); + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(n) != bucket) { + NF_CT_STAT_INC(net, search_restart); + goto begin; + } + local_bh_enable(); + + return NULL; +} + +struct nf_conntrack_tuple_hash * +__nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return ____nf_conntrack_find(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} +EXPORT_SYMBOL_GPL(__nf_conntrack_find); + +/* Find a connection corresponding to a tuple. */ +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + rcu_read_lock(); +begin: + h = ____nf_conntrack_find(net, zone, tuple, hash); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) + h = NULL; + else { + if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple) || + nf_ct_zone(ct) != zone)) { + nf_ct_put(ct); + goto begin; + } + } + } + rcu_read_unlock(); + + return h; +} + +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} +EXPORT_SYMBOL_GPL(nf_conntrack_find_get); + +static void __nf_conntrack_hash_insert(struct nf_conn *ct, + unsigned int hash, + unsigned int repl_hash) +{ + struct net *net = nf_ct_net(ct); + + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.hash[hash]); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &net->ct.hash[repl_hash]); +} + +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + u16 zone; + + zone = nf_ct_zone(ct); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); + __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; +} +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); + +/* Confirm a connection given 
skb; places it in hash table */ +int +__nf_conntrack_confirm(struct sk_buff *skb) +{ + unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conn_help *help; + struct nf_conn_tstamp *tstamp; + struct hlist_nulls_node *n; + enum ip_conntrack_info ctinfo; + struct net *net; + u16 zone; + + ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); + + /* ipt_REJECT uses nf_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + zone = nf_ct_zone(ct); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means no one else could have + confirmed us. */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + pr_debug("Confirming conntrack %p\n", ct); + + spin_lock_bh(&nf_conntrack_lock); + + /* We have to check the DYING flag inside the lock to prevent + a race against nf_ct_get_next_corpse() possibly called from + user context, else we insert an already 'dead' hash, blocking + further use of that particular connection -JM */ + + if (unlikely(nf_ct_is_dying(ct))) { + spin_unlock_bh(&nf_conntrack_lock); + return NF_ACCEPT; + } + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + + /* Remove from unconfirmed list */ + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + tstamp->start = ktime_to_ns(skb->tstamp); + } + /* Since the lookup is lockless, hash insertion must be done after + * starting the timer and setting the CONFIRMED bit. The RCU barriers + * guarantee that no other CPU can find the conntrack before the above + * stores are visible. + */ + __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + help = nfct_help(ct); + if (help && help->helper) + nf_conntrack_event_cache(IPCT_HELPER, ct); + + nf_conntrack_event_cache(master_ct(ct) ? 
+				     IPCT_RELATED : IPCT_NEW, ct);
+	return NF_ACCEPT;
+
+out:
+	NF_CT_STAT_INC(net, insert_failed);
+	spin_unlock_bh(&nf_conntrack_lock);
+	return NF_DROP;
+}
+EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
+
+/* Returns true if a connection corresponds to the tuple (required
+   for NAT). */
+int
+nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conn *ignored_conntrack)
+{
+	struct net *net = nf_ct_net(ignored_conntrack);
+	struct nf_conntrack_tuple_hash *h;
+	struct hlist_nulls_node *n;
+	struct nf_conn *ct;
+	u16 zone = nf_ct_zone(ignored_conntrack);
+	unsigned int hash = hash_conntrack(net, zone, tuple);
+
+	/* Disable BHs the entire time since we need to disable them at
+	 * least once for the stats anyway.
+	 */
+	rcu_read_lock_bh();
+	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (ct != ignored_conntrack &&
+		    nf_ct_tuple_equal(tuple, &h->tuple) &&
+		    nf_ct_zone(ct) == zone) {
+			NF_CT_STAT_INC(net, found);
+			rcu_read_unlock_bh();
+			return 1;
+		}
+		NF_CT_STAT_INC(net, searched);
+	}
+	rcu_read_unlock_bh();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+
+#define NF_CT_EVICTION_RANGE	8
+
+/* There's a small race here where we may free a just-assured
+   connection.  Too bad: we're in trouble anyway. */
+static noinline int early_drop(struct net *net, unsigned int hash)
+{
+	/* Use oldest entry, which is roughly LRU */
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct = NULL, *tmp;
+	struct hlist_nulls_node *n;
+	unsigned int i, cnt = 0;
+	int dropped = 0;
+
+	rcu_read_lock();
+	for (i = 0; i < net->ct.htable_size; i++) {
+		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
+					       hnnode) {
+			tmp = nf_ct_tuplehash_to_ctrack(h);
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+				ct = tmp;
+			cnt++;
+		}
+
+		if (ct != NULL) {
+			if (likely(!nf_ct_is_dying(ct) &&
+				   atomic_inc_not_zero(&ct->ct_general.use)))
+				break;
+			else
+				ct = NULL;
+		}
+
+		if (cnt >= NF_CT_EVICTION_RANGE)
+			break;
+
+		hash = (hash + 1) % net->ct.htable_size;
+	}
+	rcu_read_unlock();
+
+	if (!ct)
+		return dropped;
+
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
+		/* Check if we indeed killed this entry. Reliable event
+		   delivery may have inserted it into the dying list. */
+		if (test_bit(IPS_DYING_BIT, &ct->status)) {
+			dropped = 1;
+			NF_CT_STAT_INC_ATOMIC(net, early_drop);
+		}
+	}
+	nf_ct_put(ct);
+	return dropped;
+}
+
+void init_nf_conntrack_hash_rnd(void)
+{
+	unsigned int rand;
+
+	/*
+	 * Why not initialize nf_conntrack_hash_rnd in an "init()" function?
+	 * Because there isn't enough entropy while the system is
+	 * initializing, and we initialize it as late as possible.
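+	 * The cmpxchg() below also makes this safe against concurrent
+	 * callers: if several CPUs race here, only the first non-zero
+	 * value written to nf_conntrack_hash_rnd is kept.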
+ */ + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&nf_conntrack_hash_rnd, 0, rand); +} + +static struct nf_conn * +__nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp, u32 hash) +{ + struct nf_conn *ct; + + if (unlikely(!nf_conntrack_hash_rnd)) { + init_nf_conntrack_hash_rnd(); + /* recompute the hash as nf_conntrack_hash_rnd is initialized */ + hash = hash_conntrack_raw(orig, zone); + } + + /* We don't want any race condition at early drop stage */ + atomic_inc(&net->ct.count); + + if (nf_conntrack_max && + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (!early_drop(net, hash_bucket(hash, net))) { + atomic_dec(&net->ct.count); + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + /* + * Do not use kmem_cache_zalloc(), as this cache uses + * SLAB_DESTROY_BY_RCU. + */ + ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + if (ct == NULL) { + atomic_dec(&net->ct.count); + return ERR_PTR(-ENOMEM); + } + /* + * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next + * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. + */ + memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, + offsetof(struct nf_conn, proto) - + offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + spin_lock_init(&ct->lock); + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* save hash for reusing when confirming */ + *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; + /* Don't set timer yet: wait for confirmation */ + setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); + write_pnet(&ct->ct_net, net); +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (zone) { + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); + if (!nf_ct_zone) + goto out_free; + nf_ct_zone->id = zone; + } +#endif + /* + * changes to lookup keys must be done before setting refcnt to 1 + */ + smp_wmb(); + atomic_set(&ct->ct_general.use, 1); + return ct; + +#ifdef CONFIG_NF_CONNTRACK_ZONES +out_free: + atomic_dec(&net->ct.count); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + return ERR_PTR(-ENOMEM); +#endif +} + +struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp) +{ + return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); +} +EXPORT_SYMBOL_GPL(nf_conntrack_alloc); + +void nf_conntrack_free(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + nf_ct_ext_destroy(ct); + atomic_dec(&net->ct.count); + nf_ct_ext_free(ct); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +} +EXPORT_SYMBOL_GPL(nf_conntrack_free); + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct nf_conntrack_tuple_hash * +init_conntrack(struct net *net, struct nf_conn *tmpl, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_l4proto *l4proto, + struct sk_buff *skb, + unsigned int dataoff, u32 hash) +{ + struct nf_conn *ct; + struct nf_conn_help *help; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_ecache *ecache; + struct nf_conntrack_expect *exp; + u16 zone = tmpl ? 
nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + struct nf_conn_timeout *timeout_ext; + unsigned int *timeouts; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + pr_debug("Can't invert tuple.\n"); + return NULL; + } + + ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, + hash); + if (IS_ERR(ct)) + return (struct nf_conntrack_tuple_hash *)ct; + + timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; + if (timeout_ext) + timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext); + else + timeouts = l4proto->get_timeouts(net); + + if (!l4proto->new(ct, skb, dataoff, timeouts)) { + nf_conntrack_free(ct); + pr_debug("init conntrack: can't track with proto module\n"); + return NULL; + } + + if (timeout_ext) + nf_ct_timeout_ext_add(ct, timeout_ext->timeout, GFP_ATOMIC); + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC); + + spin_lock_bh(&nf_conntrack_lock); + exp = nf_ct_find_expectation(net, zone, tuple); + if (exp) { + pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + ct, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &ct->status); + ct->master = exp->master; + if (exp->helper) { + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (help) + rcu_assign_pointer(help->helper, exp->helper); + } + +#ifdef CONFIG_NF_CONNTRACK_MARK + ct->mark = exp->master->mark; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + ct->secmark = exp->master->secmark; +#endif + nf_conntrack_get(&ct->master->ct_general); + NF_CT_STAT_INC(net, expect_new); + } else { + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); + NF_CT_STAT_INC(net, new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.unconfirmed); + + spin_unlock_bh(&nf_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(ct, exp); + nf_ct_expect_put(exp); + } + + return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct nf_conn * +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + u_int16_t l3num, + u_int8_t protonum, + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_l4proto *l4proto, + int *set_reply, + enum ip_conntrack_info *ctinfo) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + u32 hash; + + if (!nf_ct_get_tuple(skb, skb_network_offset(skb), + dataoff, l3num, protonum, &tuple, l3proto, + l4proto)) { + pr_debug("resolve_normal_ct: Can't get tuple\n"); + return NULL; + } + + /* look for tuple match */ + hash = hash_conntrack_raw(&tuple, zone); + h = __nf_conntrack_find_get(net, zone, &tuple, hash); + if (!h) { + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff, hash); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = nf_ct_tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. 
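+ * In the original direction below, a connection that has already seen
+ * a reply is ESTABLISHED, one created from an expectation is RELATED,
+ * and anything else is NEW.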
*/
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+ pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ *ctinfo = IP_CT_ESTABLISHED;
+ } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+ pr_debug("nf_conntrack_in: related packet for %p\n",
+ ct);
+ *ctinfo = IP_CT_RELATED;
+ } else {
+ pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ *ctinfo = IP_CT_NEW;
+ }
+ *set_reply = 0;
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = *ctinfo;
+ return ct;
+}
+
+unsigned int
+nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ struct nf_conn *ct, *tmpl = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nf_conn_timeout *timeout_ext;
+ unsigned int *timeouts;
+ unsigned int dataoff;
+ u_int8_t protonum;
+ int set_reply = 0;
+ int ret;
+
+ if (skb->nfct) {
+ /* Previously seen (loopback or untracked)? Ignore. */
+ tmpl = (struct nf_conn *)skb->nfct;
+ if (!nf_ct_is_template(tmpl)) {
+ NF_CT_STAT_INC_ATOMIC(net, ignore);
+ return NF_ACCEPT;
+ }
+ skb->nfct = NULL;
+ }
+
+ /* rcu_read_lock()ed by nf_hook_slow */
+ l3proto = __nf_ct_l3proto_find(pf);
+ ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
+ &dataoff, &protonum);
+ if (ret <= 0) {
+ pr_debug("not prepared to track yet or error occurred\n");
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
+ }
+
+ l4proto = __nf_ct_l4proto_find(pf, protonum);
+
+ /* It may be a special packet, error, unclean...
+ * inverse of the return code tells the netfilter
+ * core what to do with the packet. */
+ if (l4proto->error != NULL) {
+ ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
+ pf, hooknum);
+ if (ret <= 0) {
+ NF_CT_STAT_INC_ATOMIC(net, error);
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = -ret;
+ goto out;
+ }
+ /* ICMP[v6] protocol trackers may assign one conntrack. */
+ if (skb->nfct)
+ goto out;
+ }
+
+ ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
+ l3proto, l4proto, &set_reply, &ctinfo);
+ if (!ct) {
+ /* Not valid part of a connection */
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ ret = NF_ACCEPT;
+ goto out;
+ }
+
+ if (IS_ERR(ct)) {
+ /* Too stressed to deal. */
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = NF_DROP;
+ goto out;
+ }
+
+ NF_CT_ASSERT(skb->nfct);
+
+ /* Decide what timeout policy we want to apply to this flow. */
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (timeout_ext)
+ timeouts = NF_CT_TIMEOUT_EXT_DATA(timeout_ext);
+ else
+ timeouts = l4proto->get_timeouts(net);
+
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
+ if (ret <= 0) {
+ /* Invalid: inverse of the return code tells
+ * the netfilter core what to do */
+ pr_debug("nf_conntrack_in: Can't track with proto module\n");
+ nf_conntrack_put(skb->nfct);
+ skb->nfct = NULL;
+ NF_CT_STAT_INC_ATOMIC(net, invalid);
+ if (ret == -NF_DROP)
+ NF_CT_STAT_INC_ATOMIC(net, drop);
+ ret = -ret;
+ goto out;
+ }
+
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ nf_conntrack_event_cache(IPCT_REPLY, ct);
+out:
+ if (tmpl) {
+ /* Special case: we have to repeat this hook, assign the
+ * template again to this packet. We assume that this packet
+ * has no conntrack assigned. This is used by nf_ct_tcp.
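+ * (For example, the TCP tracker returns NF_REPEAT when a fresh SYN
+ * reopens a closing connection: the old entry is killed and this
+ * packet is run through the hook again as the start of a new flow.)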
*/ + if (ret == NF_REPEAT) + skb->nfct = (struct nf_conntrack *)tmpl; + else + nf_ct_put(tmpl); + } + + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_in); + +bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, + const struct nf_conntrack_tuple *orig) +{ + bool ret; + + rcu_read_lock(); + ret = nf_ct_invert_tuple(inverse, orig, + __nf_ct_l3proto_find(orig->src.l3num), + __nf_ct_l4proto_find(orig->src.l3num, + orig->dst.protonum)); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr); + +/* Alter reply tuple (maybe alter helper). This is for NAT, and is + implicitly racy: see __nf_conntrack_confirm */ +void nf_conntrack_alter_reply(struct nf_conn *ct, + const struct nf_conntrack_tuple *newreply) +{ + struct nf_conn_help *help = nfct_help(ct); + + /* Should be unconfirmed, so not in hash table yet */ + NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); + + pr_debug("Altering reply tuple of %p to ", ct); + nf_ct_dump_tuple(newreply); + + ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (ct->master || (help && !hlist_empty(&help->expectations))) + return; + + rcu_read_lock(); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); + +/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ +void __nf_ct_refresh_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies, + int do_acct) +{ + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + NF_CT_ASSERT(skb); + + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + goto acct; + + /* If not in hash table, timer will not be active yet */ + if (!nf_ct_is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + } else { + unsigned long newtime = jiffies + extra_jiffies; + + /* Only update the timeout if the new timeout is at least + HZ jiffies from the old timeout. Need del_timer for race + avoidance (may already be dying). */ + if (newtime - ct->timeout.expires >= HZ) + mod_timer_pending(&ct->timeout, newtime); + } + +acct: + if (do_acct) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); + atomic64_add(skb->len, &acct[CTINFO2DIR(ctinfo)].bytes); + } + } +} +EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); + +bool __nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + int do_acct) +{ + if (do_acct) { + struct nf_conn_counter *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + atomic64_inc(&acct[CTINFO2DIR(ctinfo)].packets); + atomic64_add(skb->len - skb_network_offset(skb), + &acct[CTINFO2DIR(ctinfo)].bytes); + } + } + + if (del_timer(&ct->timeout)) { + ct->timeout.function((unsigned long)ct); + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); + +#ifdef CONFIG_NF_CONNTRACK_ZONES +static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_zone), + .align = __alignof__(struct nf_conntrack_zone), + .id = NF_CT_EXT_ZONE, +}; +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <linux/mutex.h> + +/* Generic function for tcp/udp/sctp/dccp and alike. 
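+ * (All of these protocols keep their 16-bit ports in the same slot of
+ * the tuple's protocol union, so reading the tcp member is valid for
+ * each of them.)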
This needs to be
+ * in ip_conntrack_core, since we don't want the protocols to autoload
+ * or depend on ctnetlink */
+int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
+ NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
+
+const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
+ [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
+ [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
+};
+EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
+
+int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
+ return -EINVAL;
+
+ t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
+ t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
+
+int nf_ct_port_nlattr_tuple_size(void)
+{
+ return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
+}
+EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
+#endif
+
+/* Used by ipt_REJECT and ip6t_REJECT. */
+static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ /* This ICMP is in the reverse direction to the packet which caused it */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+ ctinfo = IP_CT_RELATED_REPLY;
+ else
+ ctinfo = IP_CT_RELATED;
+
+ /* Attach to new skbuff, and increment count */
+ nskb->nfct = &ct->ct_general;
+ nskb->nfctinfo = ctinfo;
+ nf_conntrack_get(nskb->nfct);
+}
+
+/* Bring out ya dead! */
+static struct nf_conn *
+get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
+ void *data, unsigned int *bucket)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+
+ spin_lock_bh(&nf_conntrack_lock);
+ for (; *bucket < net->ct.htable_size; (*bucket)++) {
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ goto found;
+ }
+ }
+ hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (iter(ct, data))
+ set_bit(IPS_DYING_BIT, &ct->status);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
+ return NULL;
+found:
+ atomic_inc(&ct->ct_general.use);
+ spin_unlock_bh(&nf_conntrack_lock);
+ return ct;
+}
+
+void nf_ct_iterate_cleanup(struct net *net,
+ int (*iter)(struct nf_conn *i, void *data),
+ void *data)
+{
+ struct nf_conn *ct;
+ unsigned int bucket = 0;
+
+ while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
+ /* Time to push up daisies... */
+ if (del_timer(&ct->timeout))
+ death_by_timeout((unsigned long)ct);
+ /* ... else the timer will get him soon.
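+ (death_by_timeout() unhashes the entry and drops its reference; if
+ del_timer() lost the race, the already-running timer does the same.)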
*/ + + nf_ct_put(ct); + } +} +EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); + +struct __nf_ct_flush_report { + u32 pid; + int report; +}; + +static int kill_report(struct nf_conn *i, void *data) +{ + struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(i); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); + + /* If we fail to deliver the event, death_by_timeout() will retry */ + if (nf_conntrack_event_report(IPCT_DESTROY, i, + fr->pid, fr->report) < 0) + return 1; + + /* Avoid the delivery of the destroy event in death_by_timeout(). */ + set_bit(IPS_DYING_BIT, &i->status); + return 1; +} + +static int kill_all(struct nf_conn *i, void *data) +{ + return 1; +} + +void nf_ct_free_hashtable(void *hash, unsigned int size) +{ + if (is_vmalloc_addr(hash)) + vfree(hash); + else + free_pages((unsigned long)hash, + get_order(sizeof(struct hlist_head) * size)); +} +EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); + +void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +{ + struct __nf_ct_flush_report fr = { + .pid = pid, + .report = report, + }; + nf_ct_iterate_cleanup(net, kill_report, &fr); +} +EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); + +static void nf_ct_release_dying_list(struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&nf_conntrack_lock); +} + +static int untrack_refs(void) +{ + int cnt = 0, cpu; + + for_each_possible_cpu(cpu) { + struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); + + cnt += atomic_read(&ct->ct_general.use) - 1; + } + return cnt; +} + +static void nf_conntrack_cleanup_init_net(void) +{ + while (untrack_refs() > 0) + schedule(); + + nf_conntrack_helper_fini(); + nf_conntrack_proto_fini(); +#ifdef CONFIG_NF_CONNTRACK_ZONES + nf_ct_extend_unregister(&nf_ct_zone_extend); +#endif +} + +static void nf_conntrack_cleanup_net(struct net *net) +{ + i_see_dead_people: + nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(net); + if (atomic_read(&net->ct.count) != 0) { + schedule(); + goto i_see_dead_people; + } + + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); + nf_conntrack_timeout_fini(net); + nf_conntrack_ecache_fini(net); + nf_conntrack_tstamp_fini(net); + nf_conntrack_acct_fini(net); + nf_conntrack_expect_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); + free_percpu(net->ct.stat); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(struct net *net) +{ + if (net_eq(net, &init_net)) + RCU_INIT_POINTER(ip_ct_attach, NULL); + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... 
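+ (synchronize_net() waits for all in-flight RCU read-side critical
+ sections, so no packet still being processed can reference the
+ tables that are freed below.)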
*/ + synchronize_net(); + + nf_conntrack_cleanup_net(net); + + if (net_eq(net, &init_net)) { + RCU_INIT_POINTER(nf_ct_destroy, NULL); + nf_conntrack_cleanup_init_net(); + } +} + +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) +{ + struct hlist_nulls_head *hash; + unsigned int nr_slots, i; + size_t sz; + + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); + nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + sz = nr_slots * sizeof(struct hlist_nulls_head); + hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, + get_order(sz)); + if (!hash) { + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); + hash = vzalloc(sz); + } + + if (hash && nulls) + for (i = 0; i < nr_slots; i++) + INIT_HLIST_NULLS_HEAD(&hash[i], i); + + return hash; +} +EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); + +int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) +{ + int i, bucket; + unsigned int hashsize, old_size; + struct hlist_nulls_head *hash, *old_hash; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + + /* On boot, we can set this without any fancy locking. */ + if (!nf_conntrack_htable_size) + return param_set_uint(val, kp); + + hashsize = simple_strtoul(val, NULL, 0); + if (!hashsize) + return -EINVAL; + + hash = nf_ct_alloc_hashtable(&hashsize, 1); + if (!hash) + return -ENOMEM; + + /* Lookups in the old hash might happen in parallel, which means we + * might get false negatives during connection lookup. New connections + * created because of a false negative won't make it into the hash + * though since that required taking the lock. + */ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < init_net.ct.htable_size; i++) { + while (!hlist_nulls_empty(&init_net.ct.hash[i])) { + h = hlist_nulls_entry(init_net.ct.hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); + ct = nf_ct_tuplehash_to_ctrack(h); + hlist_nulls_del_rcu(&h->hnnode); + bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), + hashsize); + hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); + } + } + old_size = init_net.ct.htable_size; + old_hash = init_net.ct.hash; + + init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; + init_net.ct.hash = hash; + spin_unlock_bh(&nf_conntrack_lock); + + nf_ct_free_hashtable(old_hash, old_size); + return 0; +} +EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); + +module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +void nf_ct_untracked_status_or(unsigned long bits) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(nf_conntrack_untracked, cpu).status |= bits; +} +EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + +static int nf_conntrack_init_init_net(void) +{ + int max_factor = 8; + int ret, cpu; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ + if (!nf_conntrack_htable_size) { + nf_conntrack_htable_size + = (((totalram_pages << PAGE_SHIFT) / 16384) + / sizeof(struct hlist_head)); + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + nf_conntrack_htable_size = 16384; + if (nf_conntrack_htable_size < 32) + nf_conntrack_htable_size = 32; + + /* Use a max. factor of four by default to get the same max as + * with the old struct list_heads. When a table size is given + * we use the old value of 8 to avoid reducing the max. + * entries. 
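+ * Worked example: a machine that auto-sizes to 16384 buckets ends up
+ * with nf_conntrack_max = 4 * 16384 = 65536 tracked connections.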
*/
+ max_factor = 4;
+ }
+ nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+
+ printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
+ NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
+ nf_conntrack_max);
+
+ ret = nf_conntrack_proto_init();
+ if (ret < 0)
+ goto err_proto;
+
+ ret = nf_conntrack_helper_init();
+ if (ret < 0)
+ goto err_helper;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ ret = nf_ct_extend_register(&nf_ct_zone_extend);
+ if (ret < 0)
+ goto err_extend;
+#endif
+ /* Set up fake conntrack: to never be deleted, not in any hashes */
+ for_each_possible_cpu(cpu) {
+ struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
+ write_pnet(&ct->ct_net, &init_net);
+ atomic_set(&ct->ct_general.use, 1);
+ }
+ /* - and make it look like a confirmed connection */
+ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+ return 0;
+
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+err_extend:
+ nf_conntrack_helper_fini();
+#endif
+err_helper:
+ nf_conntrack_proto_fini();
+err_proto:
+ return ret;
+}
+
+/*
+ * We need to use special "null" values, not used in the hash table
+ */
+#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
+#define DYING_NULLS_VAL ((1<<30)+1)
+
+static int nf_conntrack_init_net(struct net *net)
+{
+ int ret;
+
+ atomic_set(&net->ct.count, 0);
+ INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
+ net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+ if (!net->ct.stat) {
+ ret = -ENOMEM;
+ goto err_stat;
+ }
+
+ net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
+ if (!net->ct.slabname) {
+ ret = -ENOMEM;
+ goto err_slabname;
+ }
+
+ net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
+ sizeof(struct nf_conn), 0,
+ SLAB_DESTROY_BY_RCU, NULL);
+ if (!net->ct.nf_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create nf_conn slab cache\n");
+ ret = -ENOMEM;
+ goto err_cache;
+ }
+
+ net->ct.htable_size = nf_conntrack_htable_size;
+ net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
+ if (!net->ct.hash) {
+ ret = -ENOMEM;
+ printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
+ goto err_hash;
+ }
+ ret = nf_conntrack_expect_init(net);
+ if (ret < 0)
+ goto err_expect;
+ ret = nf_conntrack_acct_init(net);
+ if (ret < 0)
+ goto err_acct;
+ ret = nf_conntrack_tstamp_init(net);
+ if (ret < 0)
+ goto err_tstamp;
+ ret = nf_conntrack_ecache_init(net);
+ if (ret < 0)
+ goto err_ecache;
+ ret = nf_conntrack_timeout_init(net);
+ if (ret < 0)
+ goto err_timeout;
+
+ return 0;
+
+err_timeout:
+ nf_conntrack_ecache_fini(net);
+err_ecache:
+ nf_conntrack_tstamp_fini(net);
+err_tstamp:
+ nf_conntrack_acct_fini(net);
+err_acct:
+ nf_conntrack_expect_fini(net);
+err_expect:
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+err_hash:
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+err_cache:
+ kfree(net->ct.slabname);
+err_slabname:
+ free_percpu(net->ct.stat);
+err_stat:
+ return ret;
+}
+
+s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ u32 seq);
+EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
+
+int nf_conntrack_init(struct net *net)
+{
+ int ret;
+
+ if (net_eq(net, &init_net)) {
+ ret = nf_conntrack_init_init_net();
+ if (ret < 0)
+ goto out_init_net;
+ }
+ ret = nf_conntrack_init_net(net);
+ if (ret < 0)
+ goto out_net;
+
+ if (net_eq(net, &init_net)) {
+ /* For use by REJECT target */
+ RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
+ RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
+
+ /* How to get NAT offsets */
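+ /* (left NULL here; the NAT core installs the real callback if loaded) */
+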
RCU_INIT_POINTER(nf_ct_nat_offset, NULL); + } + return 0; + +out_net: + if (net_eq(net, &init_net)) + nf_conntrack_cleanup_init_net(); +out_init_net: + return ret; +} diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c new file mode 100644 index 00000000..5bd3047d --- /dev/null +++ b/net/netfilter/nf_conntrack_ecache.c @@ -0,0 +1,269 @@ +/* Event cache for netfilter. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> + +static DEFINE_MUTEX(nf_ct_ecache_mutex); + +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +void nf_ct_deliver_cached_events(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + unsigned long events, missed; + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + struct nf_ct_event item; + int ret; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (notify == NULL) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (e == NULL) + goto out_unlock; + + events = xchg(&e->cache, 0); + + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) + goto out_unlock; + + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. 
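+ * (The worst case is that a listener sees the same event twice; an
+ * event is never lost this way.)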
*/ + missed = e->missed; + + if (!((events | missed) & e->ctmask)) + goto out_unlock; + + item.ct = ct; + item.pid = 0; + item.report = 0; + + ret = notify->fcn(events | missed, &item); + + if (likely(ret >= 0 && !missed)) + goto out_unlock; + + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + +out_unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); + +int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *new) +{ + int ret = 0; + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); + +void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *new) +{ + struct nf_ct_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); +} +EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); + +int nf_ct_expect_register_notifier(struct net *net, + struct nf_exp_event_notifier *new) +{ + int ret = 0; + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + if (notify != NULL) { + ret = -EBUSY; + goto out_unlock; + } + rcu_assign_pointer(net->ct.nf_expect_event_cb, new); + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + +out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); + +void nf_ct_expect_unregister_notifier(struct net *net, + struct nf_exp_event_notifier *new) +{ + struct nf_exp_event_notifier *notify; + + mutex_lock(&nf_ct_ecache_mutex); + notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, + lockdep_is_held(&nf_ct_ecache_mutex)); + BUG_ON(notify != new); + RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); + +#define NF_CT_EVENTS_DEFAULT 1 +static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; +static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; + +#ifdef CONFIG_SYSCTL +static struct ctl_table event_sysctl_table[] = { + { + .procname = "nf_conntrack_events", + .data = &init_net.ct.sysctl_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_events_retry_timeout", + .data = &init_net.ct.sysctl_events_retry_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type event_extend __read_mostly = { + .len = sizeof(struct nf_conntrack_ecache), + .align = __alignof__(struct nf_conntrack_ecache), + .id = NF_CT_EXT_ECACHE, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table), + 
GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_events; + table[1].data = &net->ct.sysctl_events_retry_timeout; + + net->ct.event_sysctl_header = + register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.event_sysctl_header) { + printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.event_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.event_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_event_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_event_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +int nf_conntrack_ecache_init(struct net *net) +{ + int ret; + + net->ct.sysctl_events = nf_ct_events; + net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&event_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_event: Unable to register " + "event extension.\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_event_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_ecache_fini(struct net *net) +{ + nf_conntrack_event_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&event_extend); +} diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c new file mode 100644 index 00000000..4147ba3f --- /dev/null +++ b/net/netfilter/nf_conntrack_expect.c @@ -0,0 +1,658 @@ +/* Expectation handling for nf_conntrack. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+unsigned int nf_ct_expect_hsize __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
+
+unsigned int nf_ct_expect_max __read_mostly;
+
+static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
+
+/* nf_conntrack_expect helper functions */
+void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
+ u32 pid, int report)
+{
+ struct nf_conn_help *master_help = nfct_help(exp->master);
+ struct net *net = nf_ct_exp_net(exp);
+
+ NF_CT_ASSERT(master_help);
+ NF_CT_ASSERT(!timer_pending(&exp->timeout));
+
+ hlist_del_rcu(&exp->hnode);
+ net->ct.expect_count--;
+
+ hlist_del(&exp->lnode);
+ master_help->expecting[exp->class]--;
+
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
+ nf_ct_expect_put(exp);
+
+ NF_CT_STAT_INC(net, expect_delete);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
+
+static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+{
+ struct nf_conntrack_expect *exp = (void *)ul_expect;
+
+ spin_lock_bh(&nf_conntrack_lock);
+ nf_ct_unlink_expect(exp);
+ spin_unlock_bh(&nf_conntrack_lock);
+ nf_ct_expect_put(exp);
+}
+
+static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
+{
+ unsigned int hash;
+
+ if (unlikely(!nf_conntrack_hash_rnd))
+ init_nf_conntrack_hash_rnd();
+
+ hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
+ (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
+ (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
+ return ((u64)hash * nf_ct_expect_hsize) >> 32;
+}
+
+struct nf_conntrack_expect *
+__nf_ct_expect_find(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+ struct hlist_node *n;
+ unsigned int h;
+
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) {
+ if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone)
+ return i;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(__nf_ct_expect_find);
+
+/* Just find an expectation corresponding to a tuple. */
+struct nf_conntrack_expect *
+nf_ct_expect_find_get(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i;
+
+ rcu_read_lock();
+ i = __nf_ct_expect_find(net, zone, tuple);
+ if (i && !atomic_inc_not_zero(&i->use))
+ i = NULL;
+ rcu_read_unlock();
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);
+
+/* If an expectation for this connection is found, it gets deleted from
+ * the global list and then returned.
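+ * The caller inherits a reference: either the one taken explicitly
+ * for the PERMANENT case or the one that the now-stopped timeout
+ * timer was holding.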
*/
+struct nf_conntrack_expect *
+nf_ct_find_expectation(struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
+{
+ struct nf_conntrack_expect *i, *exp = NULL;
+ struct hlist_node *n;
+ unsigned int h;
+
+ if (!net->ct.expect_count)
+ return NULL;
+
+ h = nf_ct_expect_dst_hash(tuple);
+ hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) {
+ if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
+ nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
+ nf_ct_zone(i->master) == zone) {
+ exp = i;
+ break;
+ }
+ }
+ if (!exp)
+ return NULL;
+
+ /* If the master is not in the hash table yet (i.e. the packet hasn't
+ left this machine yet), how can the other end know about the
+ expectation? Hence these are not the droids you are looking for
+ (if the master ct never got confirmed, we'd hold a reference to it
+ and weird things would happen to future packets). */
+ if (!nf_ct_is_confirmed(exp->master))
+ return NULL;
+
+ if (exp->flags & NF_CT_EXPECT_PERMANENT) {
+ atomic_inc(&exp->use);
+ return exp;
+ } else if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ return exp;
+ }
+
+ return NULL;
+}
+
+/* delete all expectations for this conntrack */
+void nf_ct_remove_expectations(struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+ struct hlist_node *n, *next;
+
+ /* Optimization: most connections never expect any others. */
+ if (!help)
+ return;
+
+ hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) {
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ /* Part covered by intersection of masks must be unequal,
+ otherwise they clash */
+ struct nf_conntrack_tuple_mask intersect_mask;
+ int count;
+
+ intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
+
+ for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
+ intersect_mask.src.u3.all[count] =
+ a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
+ }
+
+ return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+}
+
+static inline int expect_matches(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_expect *b)
+{
+ return a->master == b->master && a->class == b->class &&
+ nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
+ nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
+ nf_ct_zone(a->master) == nf_ct_zone(b->master);
+}
+
+/* Generally a bad idea to call this: could have matched already. */
+void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
+{
+ spin_lock_bh(&nf_conntrack_lock);
+ if (del_timer(&exp->timeout)) {
+ nf_ct_unlink_expect(exp);
+ nf_ct_expect_put(exp);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);
+
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks.
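+ * Holding no reference is nonetheless safe, because the lifetimes
+ * nest: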
During the conntrack destruction, the expectations are + * always killed before the conntrack itself */ +struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) +{ + struct nf_conntrack_expect *new; + + new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); + if (!new) + return NULL; + + new->master = me; + atomic_set(&new->use, 1); + return new; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); + +void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, + u_int8_t family, + const union nf_inet_addr *saddr, + const union nf_inet_addr *daddr, + u_int8_t proto, const __be16 *src, const __be16 *dst) +{ + int len; + + if (family == AF_INET) + len = 4; + else + len = 16; + + exp->flags = 0; + exp->class = class; + exp->expectfn = NULL; + exp->helper = NULL; + exp->tuple.src.l3num = family; + exp->tuple.dst.protonum = proto; + + if (saddr) { + memcpy(&exp->tuple.src.u3, saddr, len); + if (sizeof(exp->tuple.src.u3) > len) + /* address needs to be cleared for nf_ct_tuple_equal */ + memset((void *)&exp->tuple.src.u3 + len, 0x00, + sizeof(exp->tuple.src.u3) - len); + memset(&exp->mask.src.u3, 0xFF, len); + if (sizeof(exp->mask.src.u3) > len) + memset((void *)&exp->mask.src.u3 + len, 0x00, + sizeof(exp->mask.src.u3) - len); + } else { + memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); + memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); + } + + if (src) { + exp->tuple.src.u.all = *src; + exp->mask.src.u.all = htons(0xFFFF); + } else { + exp->tuple.src.u.all = 0; + exp->mask.src.u.all = 0; + } + + memcpy(&exp->tuple.dst.u3, daddr, len); + if (sizeof(exp->tuple.dst.u3) > len) + /* address needs to be cleared for nf_ct_tuple_equal */ + memset((void *)&exp->tuple.dst.u3 + len, 0x00, + sizeof(exp->tuple.dst.u3) - len); + + exp->tuple.dst.u.all = *dst; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_init); + +static void nf_ct_expect_free_rcu(struct rcu_head *head) +{ + struct nf_conntrack_expect *exp; + + exp = container_of(head, struct nf_conntrack_expect, rcu); + kmem_cache_free(nf_ct_expect_cachep, exp); +} + +void nf_ct_expect_put(struct nf_conntrack_expect *exp) +{ + if (atomic_dec_and_test(&exp->use)) + call_rcu(&exp->rcu, nf_ct_expect_free_rcu); +} +EXPORT_SYMBOL_GPL(nf_ct_expect_put); + +static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) +{ + struct nf_conn_help *master_help = nfct_help(exp->master); + struct nf_conntrack_helper *helper; + struct net *net = nf_ct_exp_net(exp); + unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + + /* two references : one for hash insert, one for the timer */ + atomic_add(2, &exp->use); + + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + + hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + net->ct.expect_count++; + + setup_timer(&exp->timeout, nf_ct_expectation_timed_out, + (unsigned long)exp); + helper = rcu_dereference_protected(master_help->helper, + lockdep_is_held(&nf_conntrack_lock)); + if (helper) { + exp->timeout.expires = jiffies + + helper->expect_policy[exp->class].timeout * HZ; + } + add_timer(&exp->timeout); + + NF_CT_STAT_INC(net, expect_create); + return 0; +} + +/* Race with expectations being used means we could have none to find; OK. 
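+ (An expectation can be consumed between the caller's limit check and
+ this walk, in which case there is simply nothing left to evict.)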
*/ +static void evict_oldest_expect(struct nf_conn *master, + struct nf_conntrack_expect *new) +{ + struct nf_conn_help *master_help = nfct_help(master); + struct nf_conntrack_expect *exp, *last = NULL; + struct hlist_node *n; + + hlist_for_each_entry(exp, n, &master_help->expectations, lnode) { + if (exp->class == new->class) + last = exp; + } + + if (last && del_timer(&last->timeout)) { + nf_ct_unlink_expect(last); + nf_ct_expect_put(last); + } +} + +static inline int refresh_timer(struct nf_conntrack_expect *i) +{ + struct nf_conn_help *master_help = nfct_help(i->master); + const struct nf_conntrack_expect_policy *p; + + if (!del_timer(&i->timeout)) + return 0; + + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[i->class]; + i->timeout.expires = jiffies + p->timeout * HZ; + add_timer(&i->timeout); + return 1; +} + +static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) +{ + const struct nf_conntrack_expect_policy *p; + struct nf_conntrack_expect *i; + struct nf_conn *master = expect->master; + struct nf_conn_help *master_help = nfct_help(master); + struct nf_conntrack_helper *helper; + struct net *net = nf_ct_exp_net(expect); + struct hlist_node *n; + unsigned int h; + int ret = 1; + + if (!master_help) { + ret = -ESHUTDOWN; + goto out; + } + h = nf_ct_expect_dst_hash(&expect->tuple); + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + /* Will be over limit? */ + helper = rcu_dereference_protected(master_help->helper, + lockdep_is_held(&nf_conntrack_lock)); + if (helper) { + p = &helper->expect_policy[expect->class]; + if (p->max_expected && + master_help->expecting[expect->class] >= p->max_expected) { + evict_oldest_expect(master, expect); + if (master_help->expecting[expect->class] + >= p->max_expected) { + ret = -EMFILE; + goto out; + } + } + } + + if (net->ct.expect_count >= nf_ct_expect_max) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: expectation table full\n"); + ret = -EMFILE; + } +out: + return ret; +} + +int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, + u32 pid, int report) +{ + int ret; + + spin_lock_bh(&nf_conntrack_lock); + ret = __nf_ct_expect_check(expect); + if (ret <= 0) + goto out; + + ret = nf_ct_expect_insert(expect); + if (ret < 0) + goto out; + spin_unlock_bh(&nf_conntrack_lock); + nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + return ret; +out: + spin_unlock_bh(&nf_conntrack_lock); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); + +#ifdef CONFIG_NF_CONNTRACK_PROCFS +struct ct_expect_iter_state { + struct seq_net_private p; + unsigned int bucket; +}; + +static struct hlist_node *ct_expect_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { + n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + if (n) + return n; + } + return NULL; +} + +static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + struct hlist_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_expect_iter_state *st = seq->private; + + head = rcu_dereference(hlist_next_rcu(head)); + while (head == NULL) { + if 
(++st->bucket >= nf_ct_expect_hsize) + return NULL; + head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + } + return head; +} + +static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head = ct_expect_get_first(seq); + + if (head) + while (pos && (head = ct_expect_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *exp_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ct_expect_get_idx(seq, *pos); +} + +static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return ct_expect_get_next(seq, v); +} + +static void exp_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_expect *expect; + struct nf_conntrack_helper *helper; + struct hlist_node *n = v; + char *delim = ""; + + expect = hlist_entry(n, struct nf_conntrack_expect, hnode); + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + seq_printf(s, "l3proto = %u proto=%u ", + expect->tuple.src.l3num, + expect->tuple.dst.protonum); + print_tuple(s, &expect->tuple, + __nf_ct_l3proto_find(expect->tuple.src.l3num), + __nf_ct_l4proto_find(expect->tuple.src.l3num, + expect->tuple.dst.protonum)); + + if (expect->flags & NF_CT_EXPECT_PERMANENT) { + seq_printf(s, "PERMANENT"); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_INACTIVE) { + seq_printf(s, "%sINACTIVE", delim); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_USERSPACE) + seq_printf(s, "%sUSERSPACE", delim); + + helper = rcu_dereference(nfct_help(expect->master)->helper); + if (helper) { + seq_printf(s, "%s%s", expect->flags ? 
" " : "", helper->name); + if (helper->expect_policy[expect->class].name) + seq_printf(s, "/%s", + helper->expect_policy[expect->class].name); + } + + return seq_putc(s, '\n'); +} + +static const struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); +} + +static const struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + +static int exp_proc_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_PROCFS + struct proc_dir_entry *proc; + + proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); + if (!proc) + return -ENOMEM; +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + return 0; +} + +static void exp_proc_remove(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_PROCFS + proc_net_remove(net, "nf_conntrack_expect"); +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ +} + +module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); + +int nf_conntrack_expect_init(struct net *net) +{ + int err = -ENOMEM; + + if (net_eq(net, &init_net)) { + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = net->ct.htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + } + + net->ct.expect_count = 0; + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (net->ct.expect_hash == NULL) + goto err1; + + if (net_eq(net, &init_net)) { + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), + 0, 0, NULL); + if (!nf_ct_expect_cachep) + goto err2; + } + + err = exp_proc_init(net); + if (err < 0) + goto err3; + + return 0; + +err3: + if (net_eq(net, &init_net)) + kmem_cache_destroy(nf_ct_expect_cachep); +err2: + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +err1: + return err; +} + +void nf_conntrack_expect_fini(struct net *net) +{ + exp_proc_remove(net); + if (net_eq(net, &init_net)) { + rcu_barrier(); /* Wait for call_rcu() before destroy */ + kmem_cache_destroy(nf_ct_expect_cachep); + } + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); +} diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c new file mode 100644 index 00000000..641ff5f9 --- /dev/null +++ b/net/netfilter/nf_conntrack_extend.c @@ -0,0 +1,189 @@ +/* Structure dynamic extension infrastructure + * Copyright (C) 2004 Rusty Russell IBM Corporation + * Copyright (C) 2007 Netfilter Core Team <coreteam@netfilter.org> + * Copyright (C) 2007 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
+static DEFINE_MUTEX(nf_ct_ext_type_mutex);
+
+void __nf_ct_ext_destroy(struct nf_conn *ct)
+{
+ unsigned int i;
+ struct nf_ct_ext_type *t;
+ struct nf_ct_ext *ext = ct->ext;
+
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(ext, i))
+ continue;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+
+ /* Here the nf_ct_ext_type might have been unregistered.
+ * I.e., it is responsible for cleaning up the private
+ * area in all conntracks when it is unregistered.
+ */
+ if (t && t->destroy)
+ t->destroy(ct);
+ rcu_read_unlock();
+ }
+}
+EXPORT_SYMBOL(__nf_ct_ext_destroy);
+
+static void *
+nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, gfp_t gfp)
+{
+ unsigned int off, len;
+ struct nf_ct_ext_type *t;
+ size_t alloc_size;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL);
+ off = ALIGN(sizeof(struct nf_ct_ext), t->align);
+ len = off + t->len;
+ alloc_size = t->alloc_size;
+ rcu_read_unlock();
+
+ *ext = kzalloc(alloc_size, gfp);
+ if (!*ext)
+ return NULL;
+
+ (*ext)->offset[id] = off;
+ (*ext)->len = len;
+
+ return (void *)(*ext) + off;
+}
+
+void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
+{
+ struct nf_ct_ext *old, *new;
+ int i, newlen, newoff;
+ struct nf_ct_ext_type *t;
+
+ /* Conntrack must not be confirmed to avoid races on reallocation. */
+ NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
+
+ old = ct->ext;
+ if (!old)
+ return nf_ct_ext_create(&ct->ext, id, gfp);
+
+ if (__nf_ct_ext_exist(old, id))
+ return NULL;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[id]);
+ BUG_ON(t == NULL);
+
+ newoff = ALIGN(old->len, t->align);
+ newlen = newoff + t->len;
+ rcu_read_unlock();
+
+ new = __krealloc(old, newlen, gfp);
+ if (!new)
+ return NULL;
+
+ if (new != old) {
+ for (i = 0; i < NF_CT_EXT_NUM; i++) {
+ if (!__nf_ct_ext_exist(old, i))
+ continue;
+
+ rcu_read_lock();
+ t = rcu_dereference(nf_ct_ext_types[i]);
+ if (t && t->move)
+ t->move((void *)new + new->offset[i],
+ (void *)old + old->offset[i]);
+ rcu_read_unlock();
+ }
+ kfree_rcu(old, rcu);
+ ct->ext = new;
+ }
+
+ new->offset[id] = newoff;
+ new->len = newlen;
+ memset((void *)new + newoff, 0, newlen - newoff);
+ return (void *)new + newoff;
+}
+EXPORT_SYMBOL(__nf_ct_ext_add);
+
+static void update_alloc_size(struct nf_ct_ext_type *type)
+{
+ int i, j;
+ struct nf_ct_ext_type *t1, *t2;
+ enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1;
+
+ /* unnecessary to update all types */
+ if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) {
+ min = type->id;
+ max = type->id;
+ }
+
+ /* This assumes that extended areas in conntrack for the types
+ whose NF_CT_EXT_F_PREALLOC bit is set are allocated in order */
+ for (i = min; i <= max; i++) {
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (!t1)
+ continue;
+
+ t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align)
+ + t1->len;
+ for (j = 0; j < NF_CT_EXT_NUM; j++) {
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
+ if (t2 == NULL || t2 == t1 ||
+ (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
+ continue;
+
+ t1->alloc_size = ALIGN(t1->alloc_size, t2->align)
+ + t2->len;
+ }
+ }
+}
+
+/* This MUST be called in
process context. */ +int nf_ct_extend_register(struct nf_ct_ext_type *type) +{ + int ret = 0; + + mutex_lock(&nf_ct_ext_type_mutex); + if (nf_ct_ext_types[type->id]) { + ret = -EBUSY; + goto out; + } + + /* This ensures that nf_ct_ext_create() can allocate enough area + before updating alloc_size */ + type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align) + + type->len; + rcu_assign_pointer(nf_ct_ext_types[type->id], type); + update_alloc_size(type); +out: + mutex_unlock(&nf_ct_ext_type_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_extend_register); + +/* This MUST be called in process context. */ +void nf_ct_extend_unregister(struct nf_ct_ext_type *type) +{ + mutex_lock(&nf_ct_ext_type_mutex); + RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); + update_alloc_size(type); + mutex_unlock(&nf_ct_ext_type_mutex); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c new file mode 100644 index 00000000..8c5c95c6 --- /dev/null +++ b/net/netfilter/nf_conntrack_ftp.c @@ -0,0 +1,589 @@ +/* FTP extension for connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/ipv6.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_ftp.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp connection tracking helper"); +MODULE_ALIAS("ip_conntrack_ftp"); +MODULE_ALIAS_NFCT_HELPER("ftp"); + +/* This is slow, but it's simple. 
--RR */ +static char *ftp_buffer; + +static DEFINE_SPINLOCK(nf_ftp_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +static bool loose; +module_param(loose, bool, 0600); + +unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + enum nf_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(nf_nat_ftp_hook); + +static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char); +static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char); +static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, + char); + +static struct ftp_search { + const char *pattern; + size_t plen; + char skip; + char term; + enum nf_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = NF_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = NF_CT_FTP_EPRT, + .getnum = try_eprt, + }, + }, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = NF_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = NF_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, + }, +}; + +static int +get_ipv6_addr(const char *src, size_t dlen, struct in6_addr *dst, u_int8_t term) +{ + const char *end; + int ret = in6_pton(src, min_t(size_t, dlen, 0xffff), (u8 *)dst, term, &end); + if (ret > 0) + return (int)(end - src); + return 0; +} + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + pr_debug("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + pr_debug("Failed to fill %u numbers separated by %c\n", + array_size, sep); + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + int length; + u_int32_t array[6]; + + length = try_number(data, dlen, array, 6, ',', term); + if (length == 0) + return 0; + + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) | + (array[2] << 8) | array[3]); + cmd->u.tcp.port = htons((array[4] << 8) | array[5]); + return length; +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + __be16 *port) +{ + u_int16_t tmp_port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? 
*/ + if (data[i] == delim) { + if (tmp_port == 0) + break; + *port = htons(tmp_port); + pr_debug("get_port: return %d\n", tmp_port); + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + tmp_port = tmp_port*10 + data[i] - '0'; + else { /* Some other crap */ + pr_debug("get_port: invalid char.\n"); + break; + } + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */ +static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4 or "2" for IPv6, + then delimiter again. */ + if (dlen <= 3) { + pr_debug("EPRT: too short\n"); + return 0; + } + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { + pr_debug("try_eprt: invalid delimiter.\n"); + return 0; + } + + if ((cmd->l3num == PF_INET && data[1] != '1') || + (cmd->l3num == PF_INET6 && data[1] != '2')) { + pr_debug("EPRT: invalid protocol number.\n"); + return 0; + } + + pr_debug("EPRT: Got %c%c%c\n", delim, data[1], delim); + + if (data[1] == '1') { + u_int32_t array[4]; + + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length != 0) + cmd->u3.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } else { + /* Now we have IPv6 address. */ + length = get_ipv6_addr(data + 3, dlen - 3, + (struct in6_addr *)cmd->u3.ip6, delim); + } + + if (length == 0) + return 0; + pr_debug("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, &cmd->u.tcp.port); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, + struct nf_conntrack_man *cmd, char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 || + data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, &cmd->u.tcp.port); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + struct nf_conntrack_man *cmd, + int (*getnum)(const char *, size_t, + struct nf_conntrack_man *, char)) +{ + size_t i; + + pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + pr_debug("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + pr_debug("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + pr_debug("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, cmd, term); + if (!*numlen) + return -1; + + pr_debug("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. 
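FTP is line-oriented, so a command or response can only begin + directly after a newline; the helper records (per direction) the + sequence numbers that followed a '\n' and only parses payloads that + start at one of them. 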
*/ +static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, + struct nf_ct_ftp_master *info, int dir, + struct sk_buff *skb) +{ + unsigned int i, oldest; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + } else { + if (before(info->seq_aft_nl[dir][0], info->seq_aft_nl[dir][1])) + oldest = 0; + else + oldest = 1; + + if (after(nl_seq, info->seq_aft_nl[dir][oldest])) + info->seq_aft_nl[dir][oldest] = nl_seq; + } +} + +static int help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *fb_ptr; + int ret; + u32 seq; + int dir = CTINFO2DIR(ctinfo); + unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff); + struct nf_ct_ftp_master *ct_ftp_info = &nfct_help(ct)->help.ct_ftp_info; + struct nf_conntrack_expect *exp; + union nf_inet_addr *daddr; + struct nf_conntrack_man cmd = {}; + unsigned int i; + int found = 0, ends_in_nl; + typeof(nf_nat_ftp_hook) nf_nat_ftp; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) { + pr_debug("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = protoff + th->doff * 4; + /* No data? */ + if (dataoff >= skb->len) { + pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff, + skb->len); + return NF_ACCEPT; + } + datalen = skb->len - dataoff; + + spin_lock_bh(&nf_ftp_lock); + fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][0], + ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)", + ct_ftp_info->seq_aft_nl[dir][1]); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP/IPv6 addr to expected address (it's not mentioned + in EPSV responses) */ + cmd.l3num = nf_ct_l3num(ct); + memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all)); + + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { + found = find_pattern(fb_ptr, datalen, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, + &matchoff, &matchlen, + &cmd, + search[dir][i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. 
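A partial match means the command was split across TCP + segments; accepting it would let the data connection slip past + the helper, so we drop and let the sender retransmit. 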
*/ + pr_debug("conntrack_ftp: partial %s %u+%u\n", + search[dir][i].pattern, ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n", + matchlen, fb_ptr + matchoff, + matchlen, ntohl(th->seq) + matchoff); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + daddr = &ct->tuplehash[!dir].tuple.dst.u3; + + /* Update the ftp info */ + if ((cmd.l3num == nf_ct_l3num(ct)) && + memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, + sizeof(cmd.u3.all))) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + if (cmd.l3num == PF_INET) { + pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n", + &cmd.u3.ip, + &ct->tuplehash[dir].tuple.src.u3.ip); + } else { + pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n", + cmd.u3.ip6, + ct->tuplehash[dir].tuple.src.u3.ip6); + } + + /* Thanks to Cristiano Lincoln Mattos + <lincoln@cesar.org.br> for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + goto out_put_expect; + } + daddr = &cmd.u3; + } + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num, + &ct->tuplehash[!dir].tuple.src.u3, daddr, + IPPROTO_TCP, NULL, &cmd.u.tcp.port); + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook); + if (nf_nat_ftp && ct->status & IPS_NAT_MASK) + ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype, + matchoff, matchlen, exp); + else { + /* Can't expect this? Best to drop packet now. */ + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + +out_put_expect: + nf_ct_expect_put(exp); + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. */ + if (ends_in_nl) + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); + out: + spin_unlock_bh(&nf_ftp_lock); + return ret; +} + +static struct nf_conntrack_helper ftp[MAX_PORTS][2] __read_mostly; +static char ftp_names[MAX_PORTS][2][sizeof("ftp-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy ftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +/* don't make this __exit, since it's called from __init ! 
*/ +static void nf_conntrack_ftp_fini(void) +{ + int i, j; + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + if (ftp[i][j].me == NULL) + continue; + + pr_debug("nf_ct_ftp: unregistering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&ftp[i][j]); + } + } + + kfree(ftp_buffer); +} + +static int __init nf_conntrack_ftp_init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 FTP connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + ftp[i][0].tuple.src.l3num = PF_INET; + ftp[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + ftp[i][j].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i][j].tuple.dst.protonum = IPPROTO_TCP; + ftp[i][j].expect_policy = &ftp_exp_policy; + ftp[i][j].me = THIS_MODULE; + ftp[i][j].help = help; + tmpname = &ftp_names[i][j][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i][j].name = tmpname; + + pr_debug("nf_ct_ftp: registering helper for pf: %d " + "port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&ftp[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_ftp: failed to register" + " helper for pf: %d port: %d\n", + ftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_ftp_fini(); + return ret; + } + } + } + + return 0; +} + +module_init(nf_conntrack_ftp_init); +module_exit(nf_conntrack_ftp_fini); diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c new file mode 100644 index 00000000..bcd5ed6b --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -0,0 +1,888 @@ +/**************************************************************************** + * ip_conntrack_helper_h323_asn1.c - BER and PER decoding library for H.323 + * conntrack/NAT module. + * + * Copyright (c) 2006 by Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * + * This source code is licensed under General Public License version 2. + * + * See ip_conntrack_helper_h323_asn1.h for details. + * + ****************************************************************************/ + +#ifdef __KERNEL__ +#include <linux/kernel.h> +#else +#include <stdio.h> +#endif +#include <linux/netfilter/nf_conntrack_h323_asn1.h> + +/* Trace Flag */ +#ifndef H323_TRACE +#define H323_TRACE 0 +#endif + +#if H323_TRACE +#define TAB_SIZE 4 +#define IFTHEN(cond, act) if(cond){act;} +#ifdef __KERNEL__ +#define PRINT printk +#else +#define PRINT printf +#endif +#define FNAME(name) name, +#else +#define IFTHEN(cond, act) +#define PRINT(fmt, args...) 
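+/* With H323_TRACE disabled the trace macros expand to nothing, and + FNAME() drops the field name entirely: the name member of field_t + below only exists when H323_TRACE is set. */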
+#define FNAME(name) +#endif + +/* ASN.1 Types */ +#define NUL 0 +#define BOOL 1 +#define OID 2 +#define INT 3 +#define ENUM 4 +#define BITSTR 5 +#define NUMSTR 6 +#define NUMDGT 6 +#define TBCDSTR 6 +#define OCTSTR 7 +#define PRTSTR 7 +#define IA5STR 7 +#define GENSTR 7 +#define BMPSTR 8 +#define SEQ 9 +#define SET 9 +#define SEQOF 10 +#define SETOF 10 +#define CHOICE 11 + +/* Constraint Types */ +#define FIXD 0 +/* #define BITS 1-8 */ +#define BYTE 9 +#define WORD 10 +#define CONS 11 +#define SEMI 12 +#define UNCO 13 + +/* ASN.1 Type Attributes */ +#define SKIP 0 +#define STOP 1 +#define DECODE 2 +#define EXT 4 +#define OPEN 8 +#define OPT 16 + + +/* ASN.1 Field Structure */ +typedef struct field_t { +#if H323_TRACE + char *name; +#endif + unsigned char type; + unsigned char sz; + unsigned char lb; + unsigned char ub; + unsigned short attr; + unsigned short offset; + const struct field_t *fields; +} field_t; + +/* Bit Stream */ +typedef struct { + unsigned char *buf; + unsigned char *beg; + unsigned char *end; + unsigned char *cur; + unsigned int bit; +} bitstr_t; + +/* Tool Functions */ +#define INC_BIT(bs) if((++(bs)->bit)>7){(bs)->cur++;(bs)->bit=0;} +#define INC_BITS(bs,b) if(((bs)->bit+=(b))>7){(bs)->cur+=(bs)->bit>>3;(bs)->bit&=7;} +#define BYTE_ALIGN(bs) if((bs)->bit){(bs)->cur++;(bs)->bit=0;} +#define CHECK_BOUND(bs,n) if((bs)->cur+(n)>(bs)->end)return(H323_ERROR_BOUND) +static unsigned int get_len(bitstr_t *bs); +static unsigned int get_bit(bitstr_t *bs); +static unsigned int get_bits(bitstr_t *bs, unsigned int b); +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b); +static unsigned int get_uint(bitstr_t *bs, int b); + +/* Decoder Functions */ +static int decode_nul(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bool(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_oid(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_int(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_enum(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_numstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_octstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seq(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_seqof(bitstr_t *bs, const struct field_t *f, char *base, int level); +static int decode_choice(bitstr_t *bs, const struct field_t *f, char *base, int level); + +/* Decoder Functions Vector */ +typedef int (*decoder_t)(bitstr_t *, const struct field_t *, char *, int); +static const decoder_t Decoders[] = { + decode_nul, + decode_bool, + decode_oid, + decode_int, + decode_enum, + decode_bitstr, + decode_numstr, + decode_octstr, + decode_bmpstr, + decode_seq, + decode_seqof, + decode_choice, +}; + +/**************************************************************************** + * H.323 Types + ****************************************************************************/ +#include "nf_conntrack_h323_types.c" + +/**************************************************************************** + * Functions + ****************************************************************************/ +/* Assume bs is aligned && v < 16384 */ +static unsigned int get_len(bitstr_t 
*bs) +{ + unsigned int v; + + v = *bs->cur++; + + if (v & 0x80) { + v &= 0x3f; + v <<= 8; + v += *bs->cur++; + } + + return v; +} + +/****************************************************************************/ +static unsigned int get_bit(bitstr_t *bs) +{ + unsigned int b = (*bs->cur) & (0x80 >> bs->bit); + + INC_BIT(bs); + + return b; +} + +/****************************************************************************/ +/* Assume b <= 8 */ +static unsigned int get_bits(bitstr_t *bs, unsigned int b) +{ + unsigned int v, l; + + v = (*bs->cur) & (0xffU >> bs->bit); + l = b + bs->bit; + + if (l < 8) { + v >>= 8 - l; + bs->bit = l; + } else if (l == 8) { + bs->cur++; + bs->bit = 0; + } else { /* l > 8 */ + + v <<= 8; + v += *(++bs->cur); + v >>= 16 - l; + bs->bit = l - 8; + } + + return v; +} + +/****************************************************************************/ +/* Assume b <= 32 */ +static unsigned int get_bitmap(bitstr_t *bs, unsigned int b) +{ + unsigned int v, l, shift, bytes; + + if (!b) + return 0; + + l = bs->bit + b; + + if (l < 8) { + v = (unsigned int)(*bs->cur) << (bs->bit + 24); + bs->bit = l; + } else if (l == 8) { + v = (unsigned int)(*bs->cur++) << (bs->bit + 24); + bs->bit = 0; + } else { + for (bytes = l >> 3, shift = 24, v = 0; bytes; + bytes--, shift -= 8) + v |= (unsigned int)(*bs->cur++) << shift; + + if (l < 32) { + v |= (unsigned int)(*bs->cur) << shift; + v <<= bs->bit; + } else if (l > 32) { + v <<= bs->bit; + v |= (*bs->cur) >> (8 - bs->bit); + } + + bs->bit = l & 0x7; + } + + v &= 0xffffffff << (32 - b); + + return v; +} + +/**************************************************************************** + * Assume bs is aligned and sizeof(unsigned int) == 4 + ****************************************************************************/ +static unsigned int get_uint(bitstr_t *bs, int b) +{ + unsigned int v = 0; + + switch (b) { + case 4: + v |= *bs->cur++; + v <<= 8; + case 3: + v |= *bs->cur++; + v <<= 8; + case 2: + v |= *bs->cur++; + v <<= 8; + case 1: + v |= *bs->cur++; + break; + } + return v; +} + +/****************************************************************************/ +static int decode_nul(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bool(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + INC_BIT(bs); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_oid(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = *bs->cur++; + bs->cur += len; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_int(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + bs->cur++; + break; + case WORD: /* 257 <= Range <= 64K */ + BYTE_ALIGN(bs); + bs->cur += 2; + break; + case CONS: /* 64K < Range < 4G */ + len = get_bits(bs, 2) + 1; + BYTE_ALIGN(bs); + if (base && (f->attr & DECODE)) { /* timeToLive */ + 
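+ /* Constrained PER integers are transmitted as an offset from + the lower bound, so f->lb is added back to the decoded value + (used here to recover the RRQ timeToLive). */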
unsigned int v = get_uint(bs, len) + f->lb; + PRINT(" = %u", v); + *((unsigned int *)(base + f->offset)) = v; + } + bs->cur += len; + break; + case UNCO: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + len = get_len(bs); + bs->cur += len; + break; + default: /* 2 <= Range <= 255 */ + INC_BITS(bs, f->sz); + break; + } + + PRINT("\n"); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_enum(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + if ((f->attr & EXT) && get_bit(bs)) { + INC_BITS(bs, 7); + } else { + INC_BITS(bs, f->sz); + } + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bitstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + BYTE_ALIGN(bs); + switch (f->sz) { + case FIXD: /* fixed length > 16 */ + len = f->lb; + break; + case WORD: /* 2-byte length */ + CHECK_BOUND(bs, 2); + len = (*bs->cur++) << 8; + len += (*bs->cur++) + f->lb; + break; + case SEMI: + CHECK_BOUND(bs, 2); + len = get_len(bs); + break; + default: + len = 0; + break; + } + + bs->cur += len >> 3; + bs->bit = len & 7; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_numstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + + BYTE_ALIGN(bs); + INC_BITS(bs, (len << 2)); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_octstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case FIXD: /* Range == 1 */ + if (f->lb > 2) { + BYTE_ALIGN(bs); + if (base && (f->attr & DECODE)) { + /* The IP Address */ + IFTHEN(f->lb == 4, + PRINT(" = %d.%d.%d.%d:%d", + bs->cur[0], bs->cur[1], + bs->cur[2], bs->cur[3], + bs->cur[4] * 256 + bs->cur[5])); + *((unsigned int *)(base + f->offset)) = + bs->cur - bs->buf; + } + } + len = f->lb; + break; + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = (*bs->cur++) + f->lb; + break; + case SEMI: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + len = get_len(bs) + f->lb; + break; + default: /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + BYTE_ALIGN(bs); + break; + } + + bs->cur += len; + + PRINT("\n"); + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_bmpstr(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int len; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + switch (f->sz) { + case BYTE: /* Range == 256 */ + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + len = (*bs->cur++) + f->lb; + break; + default: /* 2 <= Range <= 255 */ + len = get_bits(bs, f->sz) + f->lb; + BYTE_ALIGN(bs); + break; + } + + bs->cur += len << 1; + + CHECK_BOUND(bs, 0); + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_seq(bitstr_t *bs, 
const struct field_t *f, + char *base, int level) +{ + unsigned int ext, bmp, i, opt, len = 0, bmp2, bmp2_len; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Extensible? */ + ext = (f->attr & EXT) ? get_bit(bs) : 0; + + /* Get fields bitmap */ + bmp = get_bitmap(bs, f->sz); + if (base) + *(unsigned int *)base = bmp; + + /* Decode the root components */ + for (i = opt = 0, son = f->fields; i < f->lb; i++, son++) { + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + return H323_ERROR_STOP; + } + + if (son->attr & OPT) { /* Optional component */ + if (!((0x80000000U >> (opt++)) & bmp)) /* Not exist */ + continue; + } + + /* Decode */ + if (son->attr & OPEN) { /* Open field */ + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, + " ", son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + /* Decode */ + if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + } + + /* No extension? */ + if (!ext) + return H323_ERROR_NONE; + + /* Get the extension bitmap */ + bmp2_len = get_bits(bs, 7) + 1; + CHECK_BOUND(bs, (bmp2_len + 7) >> 3); + bmp2 = get_bitmap(bs, bmp2_len); + bmp |= bmp2 >> f->sz; + if (base) + *(unsigned int *)base = bmp; + BYTE_ALIGN(bs); + + /* Decode the extension components */ + for (opt = 0; opt < bmp2_len; opt++, i++, son++) { + /* Check Range */ + if (i >= f->ub) { /* Newer Version? */ + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + bs->cur += len; + continue; + } + + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + return H323_ERROR_STOP; + } + + if (!((0x80000000 >> opt) & bmp2)) /* Not present */ + continue; + + CHECK_BOUND(bs, 2); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, base, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } + return H323_ERROR_NONE; +} + +/****************************************************************************/ +static int decode_seqof(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int count, effective_count = 0, i, len = 0; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Decode item count */ + switch (f->sz) { + case BYTE: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 1); + count = *bs->cur++; + break; + case WORD: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + count = *bs->cur++; + count <<= 8; + count += *bs->cur++; + break; + case SEMI: + BYTE_ALIGN(bs); + CHECK_BOUND(bs, 2); + count = get_len(bs); + break; + default: + count = get_bits(bs, f->sz); + break; + } + count += f->lb; + + /* Write Count */ + if (base) { + effective_count = count > f->ub ? 
f->ub : count; + *(unsigned int *)base = effective_count; + base += sizeof(unsigned int); + } + + /* Decode nested field */ + son = f->fields; + if (base) + base -= son->offset; + for (i = 0; i < count; i++) { + if (son->attr & OPEN) { + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, + " ", son->name); + bs->cur += len; + continue; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, + i < + effective_count ? + base : NULL, + level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else + if ((err = (Decoders[son->type]) (bs, son, + i < + effective_count ? + base : NULL, + level + 1)) < + H323_ERROR_NONE) + return err; + + if (base) + base += son->offset; + } + + return H323_ERROR_NONE; +} + + +/****************************************************************************/ +static int decode_choice(bitstr_t *bs, const struct field_t *f, + char *base, int level) +{ + unsigned int type, ext, len = 0; + int err; + const struct field_t *son; + unsigned char *beg = NULL; + + PRINT("%*.s%s\n", level * TAB_SIZE, " ", f->name); + + /* Decode? */ + base = (base && (f->attr & DECODE)) ? base + f->offset : NULL; + + /* Decode the choice index number */ + if ((f->attr & EXT) && get_bit(bs)) { + ext = 1; + type = get_bits(bs, 7) + f->lb; + } else { + ext = 0; + type = get_bits(bs, f->sz); + if (type >= f->lb) + return H323_ERROR_RANGE; + } + + /* Write Type */ + if (base) + *(unsigned int *)base = type; + + /* Check Range */ + if (type >= f->ub) { /* Newer version? */ + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + bs->cur += len; + return H323_ERROR_NONE; + } + + /* Transfer to son level */ + son = &f->fields[type]; + if (son->attr & STOP) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", son->name); + return H323_ERROR_STOP; + } + + if (ext || (son->attr & OPEN)) { + BYTE_ALIGN(bs); + len = get_len(bs); + CHECK_BOUND(bs, len); + if (!base || !(son->attr & DECODE)) { + PRINT("%*.s%s\n", (level + 1) * TAB_SIZE, " ", + son->name); + bs->cur += len; + return H323_ERROR_NONE; + } + beg = bs->cur; + + if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) < + H323_ERROR_NONE) + return err; + + bs->cur = beg + len; + bs->bit = 0; + } else if ((err = (Decoders[son->type]) (bs, son, base, level + 1)) < + H323_ERROR_NONE) + return err; + + return H323_ERROR_NONE; +} + +/****************************************************************************/ +int DecodeRasMessage(unsigned char *buf, size_t sz, RasMessage *ras) +{ + static const struct field_t ras_message = { + FNAME("RasMessage") CHOICE, 5, 24, 32, DECODE | EXT, + 0, _RasMessage + }; + bitstr_t bs; + + bs.buf = bs.beg = bs.cur = buf; + bs.end = buf + sz; + bs.bit = 0; + + return decode_choice(&bs, &ras_message, (char *) ras, 0); +} + +/****************************************************************************/ +static int DecodeH323_UserInformation(unsigned char *buf, unsigned char *beg, + size_t sz, H323_UserInformation *uuie) +{ + static const struct field_t h323_userinformation = { + FNAME("H323-UserInformation") SEQ, 1, 2, 2, DECODE | EXT, + 0, _H323_UserInformation + }; + bitstr_t bs; + + bs.buf = buf; + bs.beg = bs.cur = beg; + bs.end = beg + sz; + bs.bit = 0; + + return decode_seq(&bs, &h323_userinformation, (char *) uuie, 0); +} + +/****************************************************************************/ +int DecodeMultimediaSystemControlMessage(unsigned char *buf, size_t 
sz, + MultimediaSystemControlMessage * + mscm) +{ + static const struct field_t multimediasystemcontrolmessage = { + FNAME("MultimediaSystemControlMessage") CHOICE, 2, 4, 4, + DECODE | EXT, 0, _MultimediaSystemControlMessage + }; + bitstr_t bs; + + bs.buf = bs.beg = bs.cur = buf; + bs.end = buf + sz; + bs.bit = 0; + + return decode_choice(&bs, &multimediasystemcontrolmessage, + (char *) mscm, 0); +} + +/****************************************************************************/ +int DecodeQ931(unsigned char *buf, size_t sz, Q931 *q931) +{ + unsigned char *p = buf; + int len; + + if (!p || sz < 1) + return H323_ERROR_BOUND; + + /* Protocol Discriminator */ + if (*p != 0x08) { + PRINT("Unknown Protocol Discriminator\n"); + return H323_ERROR_RANGE; + } + p++; + sz--; + + /* CallReferenceValue */ + if (sz < 1) + return H323_ERROR_BOUND; + len = *p++; + sz--; + if (sz < len) + return H323_ERROR_BOUND; + p += len; + sz -= len; + + /* Message Type */ + if (sz < 1) + return H323_ERROR_BOUND; + q931->MessageType = *p++; + PRINT("MessageType = %02X\n", q931->MessageType); + if (*p & 0x80) { + p++; + sz--; + } + + /* Decode Information Elements */ + while (sz > 0) { + if (*p == 0x7e) { /* UserUserIE */ + if (sz < 3) + break; + p++; + len = *p++ << 8; + len |= *p++; + sz -= 3; + if (sz < len) + break; + p++; + len--; + return DecodeH323_UserInformation(buf, p, len, + &q931->UUIE); + } + p++; + sz--; + if (sz < 1) + break; + len = *p++; + if (sz < len) + break; + p += len; + sz -= len; + } + + PRINT("Q.931 UUIE not found\n"); + + return H323_ERROR_BOUND; +} diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c new file mode 100644 index 00000000..722291f8 --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -0,0 +1,1836 @@ +/* + * H.323 connection tracking helper + * + * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * + * This source code is licensed under General Public License version 2. 
+ * + * Based on the 'brute force' H.323 connection tracking module by + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * For more information, please see http://nath323.sourceforge.net/ + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/ctype.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/skbuff.h> +#include <net/route.h> +#include <net/ip6_route.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_conntrack_h323.h> + +/* Parameters */ +static unsigned int default_rrq_ttl __read_mostly = 300; +module_param(default_rrq_ttl, uint, 0600); +MODULE_PARM_DESC(default_rrq_ttl, "use this TTL if it's missing in RRQ"); + +static int gkrouted_only __read_mostly = 1; +module_param(gkrouted_only, int, 0600); +MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); + +static bool callforward_filter __read_mostly = true; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); + +/* Hooks for NAT */ +int (*set_h245_addr_hook) (struct sk_buff *skb, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) + __read_mostly; +int (*set_h225_addr_hook) (struct sk_buff *skb, + unsigned char **data, int dataoff, + TransportAddress *taddr, + union nf_inet_addr *addr, __be16 port) + __read_mostly; +int (*set_sig_addr_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) __read_mostly; +int (*set_ras_addr_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) __read_mostly; +int (*nat_rtp_rtcp_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, + __be16 port, __be16 rtp_port, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp) __read_mostly; +int (*nat_t120_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_h245_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_callforwarding_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr, __be16 port, + struct nf_conntrack_expect *exp) __read_mostly; +int (*nat_q931_hook) (struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, TransportAddress *taddr, int idx, + __be16 port, struct nf_conntrack_expect *exp) + __read_mostly; + +static DEFINE_SPINLOCK(nf_h323_lock); +static char *h323_buffer; + +static struct 
nf_conntrack_helper nf_conntrack_helper_h245; +static struct nf_conntrack_helper nf_conntrack_helper_q931[]; +static struct nf_conntrack_helper nf_conntrack_helper_ras[]; + +/****************************************************************************/ +static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned char **data, int *datalen, int *dataoff) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + const struct tcphdr *th; + struct tcphdr _tcph; + int tcpdatalen; + int tcpdataoff; + unsigned char *tpkt; + int tpktlen; + int tpktoff; + + /* Get TCP header */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + /* Get TCP data offset */ + tcpdataoff = protoff + th->doff * 4; + + /* Get TCP data length */ + tcpdatalen = skb->len - tcpdataoff; + if (tcpdatalen <= 0) /* No TCP data */ + goto clear_out; + + if (*data == NULL) { /* first TPKT */ + /* Get first TPKT pointer */ + tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, + h323_buffer); + BUG_ON(tpkt == NULL); + + /* Validate TPKT identifier */ + if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { + /* Netmeeting sends TPKT header and data separately */ + if (info->tpkt_len[dir] > 0) { + pr_debug("nf_ct_h323: previous packet " + "indicated separate TPKT data of %hu " + "bytes\n", info->tpkt_len[dir]); + if (info->tpkt_len[dir] <= tcpdatalen) { + /* Yes, there was a TPKT header + * received */ + *data = tpkt; + *datalen = info->tpkt_len[dir]; + *dataoff = 0; + goto out; + } + + /* Fragmented TPKT */ + pr_debug("nf_ct_h323: fragmented TPKT\n"); + goto clear_out; + } + + /* It is not even a TPKT */ + return 0; + } + tpktoff = 0; + } else { /* Next TPKT */ + tpktoff = *dataoff + *datalen; + tcpdatalen -= tpktoff; + if (tcpdatalen <= 4) /* No more TPKT */ + goto clear_out; + tpkt = *data + *datalen; + + /* Validate TPKT identifier */ + if (tpkt[0] != 0x03 || tpkt[1] != 0) + goto clear_out; + } + + /* Validate TPKT length */ + tpktlen = tpkt[2] * 256 + tpkt[3]; + if (tpktlen < 4) + goto clear_out; + if (tpktlen > tcpdatalen) { + if (tcpdatalen == 4) { /* Separate TPKT header */ + /* Netmeeting sends TPKT header and data separately */ + pr_debug("nf_ct_h323: separate TPKT header indicates " + "there will be TPKT data of %hu bytes\n", + tpktlen - 4); + info->tpkt_len[dir] = tpktlen - 4; + return 0; + } + + pr_debug("nf_ct_h323: incomplete TPKT (fragmented?)\n"); + goto clear_out; + } + + /* This is the encapsulated data */ + *data = tpkt + 4; + *datalen = tpktlen - 4; + *dataoff = tpktoff + 4; + + out: + /* Clear TPKT length */ + info->tpkt_len[dir] = 0; + return 1; + + clear_out: + info->tpkt_len[dir] = 0; + return 0; +} + +/****************************************************************************/ +static int get_h245_addr(struct nf_conn *ct, const unsigned char *data, + H245_TransportAddress *taddr, + union nf_inet_addr *addr, __be16 *port) +{ + const unsigned char *p; + int len; + + if (taddr->choice != eH245_TransportAddress_unicastAddress) + return 0; + + switch (taddr->unicastAddress.choice) { + case eUnicastAddress_iPAddress: + if (nf_ct_l3num(ct) != AF_INET) + return 0; + p = data + taddr->unicastAddress.iPAddress.network; + len = 4; + break; + case eUnicastAddress_iP6Address: + if (nf_ct_l3num(ct) != AF_INET6) + return 0; + p = data + taddr->unicastAddress.iP6Address.network; + len = 16; + break; + default: + return 0; + } + + memcpy(addr, p, len); + 
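+ /* The decoded TransportAddress carries the raw 4- or 16-byte IP + address immediately followed by the 16-bit port, so the port is + read from p + len below. */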
memset((void *)addr + len, 0, sizeof(*addr) - len); + memcpy(port, p + len, sizeof(__be16)); + + return 1; +} + +/****************************************************************************/ +static int expect_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + __be16 rtp_port, rtcp_port; + union nf_inet_addr addr; + struct nf_conntrack_expect *rtp_exp; + struct nf_conntrack_expect *rtcp_exp; + typeof(nat_rtp_rtcp_hook) nat_rtp_rtcp; + + /* Read RTP or RTCP address */ + if (!get_h245_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* RTP port is even */ + port &= htons(~1); + rtp_port = port; + rtcp_port = htons(ntohs(port) + 1); + + /* Create expect for RTP */ + if ((rtp_exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(rtp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtp_port); + + /* Create expect for RTCP */ + if ((rtcp_exp = nf_ct_expect_alloc(ct)) == NULL) { + nf_ct_expect_put(rtp_exp); + return -1; + } + nf_ct_expect_init(rtcp_exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_UDP, NULL, &rtcp_port); + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_rtp_rtcp = rcu_dereference(nat_rtp_rtcp_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + taddr, port, rtp_port, rtp_exp, rtcp_exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) == 0) { + pr_debug("nf_ct_h323: expect RTP "); + nf_ct_dump_tuple(&rtp_exp->tuple); + pr_debug("nf_ct_h323: expect RTCP "); + nf_ct_dump_tuple(&rtcp_exp->tuple); + } else { + nf_ct_unexpect_related(rtp_exp); + ret = -1; + } + } else + ret = -1; + } + + nf_ct_expect_put(rtp_exp); + nf_ct_expect_put(rtcp_exp); + + return ret; +} + +/****************************************************************************/ +static int expect_t120(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H245_TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_t120_hook) nat_t120; + + /* Read T.120 address */ + if (!get_h245_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* Create expect for T.120 connections */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple channels */ + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_t120 = rcu_dereference(nat_t120_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_t120(skb, ct, ctinfo, data, dataoff, taddr, + port, exp); + } else { /* Conntrack only */ + if 
(nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_h323: expect T.120 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_h245_channel(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + H2250LogicalChannelParameters *channel) +{ + int ret; + + if (channel->options & eH2250LogicalChannelParameters_mediaChannel) { + /* RTP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &channel->mediaChannel); + if (ret < 0) + return -1; + } + + if (channel-> + options & eH2250LogicalChannelParameters_mediaControlChannel) { + /* RTCP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &channel->mediaControlChannel); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_olc(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + OpenLogicalChannel *olc) +{ + int ret; + + pr_debug("nf_ct_h323: OpenLogicalChannel\n"); + + if (olc->forwardLogicalChannelParameters.multiplexParameters.choice == + eOpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters) + { + ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olc-> + forwardLogicalChannelParameters. + multiplexParameters. + h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olc->options & + eOpenLogicalChannel_reverseLogicalChannelParameters) && + (olc->reverseLogicalChannelParameters.options & + eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters) + && (olc->reverseLogicalChannelParameters.multiplexParameters. + choice == + eOpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) + { + ret = + process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olc-> + reverseLogicalChannelParameters. + multiplexParameters. + h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olc->options & eOpenLogicalChannel_separateStack) && + olc->forwardLogicalChannelParameters.dataType.choice == + eDataType_data && + olc->forwardLogicalChannelParameters.dataType.data.application. + choice == eDataApplicationCapability_application_t120 && + olc->forwardLogicalChannelParameters.dataType.data.application. + t120.choice == eDataProtocolCapability_separateLANStack && + olc->separateStack.networkAddress.choice == + eNetworkAccessParameters_networkAddress_localAreaAddress) { + ret = expect_t120(skb, ct, ctinfo, data, dataoff, + &olc->separateStack.networkAddress. + localAreaAddress); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_olca(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + OpenLogicalChannelAck *olca) +{ + H2250LogicalChannelAckParameters *ack; + int ret; + + pr_debug("nf_ct_h323: OpenLogicalChannelAck\n"); + + if ((olca->options & + eOpenLogicalChannelAck_reverseLogicalChannelParameters) && + (olca->reverseLogicalChannelParameters.options & + eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters) + && (olca->reverseLogicalChannelParameters.multiplexParameters. 
+ choice == + eOpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters_h2250LogicalChannelParameters)) + { + ret = process_h245_channel(skb, ct, ctinfo, data, dataoff, + &olca-> + reverseLogicalChannelParameters. + multiplexParameters. + h2250LogicalChannelParameters); + if (ret < 0) + return -1; + } + + if ((olca->options & + eOpenLogicalChannelAck_forwardMultiplexAckParameters) && + (olca->forwardMultiplexAckParameters.choice == + eOpenLogicalChannelAck_forwardMultiplexAckParameters_h2250LogicalChannelAckParameters)) + { + ack = &olca->forwardMultiplexAckParameters. + h2250LogicalChannelAckParameters; + if (ack->options & + eH2250LogicalChannelAckParameters_mediaChannel) { + /* RTP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &ack->mediaChannel); + if (ret < 0) + return -1; + } + + if (ack->options & + eH2250LogicalChannelAckParameters_mediaControlChannel) { + /* RTCP */ + ret = expect_rtp_rtcp(skb, ct, ctinfo, data, dataoff, + &ack->mediaControlChannel); + if (ret < 0) + return -1; + } + } + + if ((olca->options & eOpenLogicalChannelAck_separateStack) && + olca->separateStack.networkAddress.choice == + eNetworkAccessParameters_networkAddress_localAreaAddress) { + ret = expect_t120(skb, ct, ctinfo, data, dataoff, + &olca->separateStack.networkAddress. + localAreaAddress); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_h245(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + MultimediaSystemControlMessage *mscm) +{ + switch (mscm->choice) { + case eMultimediaSystemControlMessage_request: + if (mscm->request.choice == + eRequestMessage_openLogicalChannel) { + return process_olc(skb, ct, ctinfo, data, dataoff, + &mscm->request.openLogicalChannel); + } + pr_debug("nf_ct_h323: H.245 Request %d\n", + mscm->request.choice); + break; + case eMultimediaSystemControlMessage_response: + if (mscm->response.choice == + eResponseMessage_openLogicalChannelAck) { + return process_olca(skb, ct, ctinfo, data, dataoff, + &mscm->response. + openLogicalChannelAck); + } + pr_debug("nf_ct_h323: H.245 Response %d\n", + mscm->response.choice); + break; + default: + pr_debug("nf_ct_h323: H.245 signal %d\n", mscm->choice); + break; + } + + return 0; +} + +/****************************************************************************/ +static int h245_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static MultimediaSystemControlMessage mscm; + unsigned char *data = NULL; + int datalen; + int dataoff; + int ret; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + pr_debug("nf_ct_h245: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Process each TPKT */ + while (get_tpkt_data(skb, protoff, ct, ctinfo, + &data, &datalen, &dataoff)) { + pr_debug("nf_ct_h245: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode H.245 signal */ + ret = DecodeMultimediaSystemControlMessage(data, datalen, + &mscm); + if (ret < 0) { + pr_debug("nf_ct_h245: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); + /* We don't drop when decoding error */ + break; + } + + /* Process H.245 signal */ + if (process_h245(skb, ct, ctinfo, &data, dataoff, &mscm) < 0) + goto drop; + } + + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_h245: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy h245_exp_policy = { + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 2 /* T.120 */, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { + .name = "H.245", + .me = THIS_MODULE, + .tuple.src.l3num = AF_UNSPEC, + .tuple.dst.protonum = IPPROTO_UDP, + .help = h245_help, + .expect_policy = &h245_exp_policy, +}; + +/****************************************************************************/ +int get_h225_addr(struct nf_conn *ct, unsigned char *data, + TransportAddress *taddr, + union nf_inet_addr *addr, __be16 *port) +{ + const unsigned char *p; + int len; + + switch (taddr->choice) { + case eTransportAddress_ipAddress: + if (nf_ct_l3num(ct) != AF_INET) + return 0; + p = data + taddr->ipAddress.ip; + len = 4; + break; + case eTransportAddress_ip6Address: + if (nf_ct_l3num(ct) != AF_INET6) + return 0; + p = data + taddr->ip6Address.ip; + len = 16; + break; + default: + return 0; + } + + memcpy(addr, p, len); + memset((void *)addr + len, 0, sizeof(*addr) - len); + memcpy(port, p + len, sizeof(__be16)); + + return 1; +} + +/****************************************************************************/ +static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_h245_hook) nat_h245; + + /* Read h245Address */ + if (!get_h225_addr(ct, *data, taddr, &addr, &port) || + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) || + port == 0) + return 0; + + /* Create expect for h245 connection */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->helper = &nf_conntrack_helper_h245; + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_h245 = rcu_dereference(nat_h245_hook)) && + ct->status & IPS_NAT_MASK) { + /* NAT needed */ + ret = nat_h245(skb, ct, ctinfo, data, dataoff, taddr, + port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect H.245 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ +static int callforward_do_filter(const union nf_inet_addr *src, + const union nf_inet_addr *dst, + u_int8_t family) +{ + const struct nf_afinfo *afinfo; + int ret = 0; + + /* rcu_read_lock()ed by nf_hook_slow() */ + afinfo = nf_get_afinfo(family); + if (!afinfo) + return 0; + + switch (family) { + case AF_INET: { + struct flowi4 fl1, fl2; + struct rtable *rt1, *rt2; + + memset(&fl1, 0, sizeof(fl1)); + fl1.daddr = src->ip; 
+ + memset(&fl2, 0, sizeof(fl2)); + fl2.daddr = dst->ip; + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi4_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi4_to_flowi(&fl2), false)) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->dst.dev == rt2->dst.dev) + ret = 1; + dst_release(&rt2->dst); + } + dst_release(&rt1->dst); + } + break; + } +#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6) + case AF_INET6: { + struct flowi6 fl1, fl2; + struct rt6_info *rt1, *rt2; + + memset(&fl1, 0, sizeof(fl1)); + fl1.daddr = src->in6; + + memset(&fl2, 0, sizeof(fl2)); + fl2.daddr = dst->in6; + if (!afinfo->route(&init_net, (struct dst_entry **)&rt1, + flowi6_to_flowi(&fl1), false)) { + if (!afinfo->route(&init_net, (struct dst_entry **)&rt2, + flowi6_to_flowi(&fl2), false)) { + if (!memcmp(&rt1->rt6i_gateway, &rt2->rt6i_gateway, + sizeof(rt1->rt6i_gateway)) && + rt1->dst.dev == rt2->dst.dev) + ret = 1; + dst_release(&rt2->dst); + } + dst_release(&rt1->dst); + } + break; + } +#endif + } + return ret; + +} + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress *taddr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_callforwarding_hook) nat_callforwarding; + + /* Read alternativeAddress */ + if (!get_h225_addr(ct, *data, taddr, &addr, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (callforward_filter && + callforward_do_filter(&addr, &ct->tuplehash[!dir].tuple.src.u3, + nf_ct_l3num(ct))) { + pr_debug("nf_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + + /* Create expect for the second call leg */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->helper = nf_conntrack_helper_q931; + + if (memcmp(&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[!dir].tuple.dst.u3, + sizeof(ct->tuplehash[dir].tuple.src.u3)) && + (nat_callforwarding = rcu_dereference(nat_callforwarding_hook)) && + ct->status & IPS_NAT_MASK) { + /* Need NAT */ + ret = nat_callforwarding(skb, ct, ctinfo, data, dataoff, + taddr, port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_q931: expect Call Forwarding "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_setup(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Setup_UUIE *setup) +{ + int dir = CTINFO2DIR(ctinfo); + int ret; + int i; + __be16 port; + union nf_inet_addr addr; + typeof(set_h225_addr_hook) set_h225_addr; + + pr_debug("nf_ct_q931: Setup\n"); + + if (setup->options & eSetup_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &setup->h245Address); + if (ret < 0) + return -1; + } + + set_h225_addr = rcu_dereference(set_h225_addr_hook); + if ((setup->options & eSetup_UUIE_destCallSignalAddress) && + (set_h225_addr) && ct->status & IPS_NAT_MASK && + get_h225_addr(ct, *data, 
&setup->destCallSignalAddress, + &addr, &port) && + memcmp(&addr, &ct->tuplehash[!dir].tuple.src.u3, sizeof(addr))) { + pr_debug("nf_ct_q931: set destCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.src.u3, + ntohs(ct->tuplehash[!dir].tuple.src.u.tcp.port)); + ret = set_h225_addr(skb, data, dataoff, + &setup->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.src.u3, + ct->tuplehash[!dir].tuple.src.u.tcp.port); + if (ret < 0) + return -1; + } + + if ((setup->options & eSetup_UUIE_sourceCallSignalAddress) && + (set_h225_addr) && ct->status & IPS_NAT_MASK && + get_h225_addr(ct, *data, &setup->sourceCallSignalAddress, + &addr, &port) && + memcmp(&addr, &ct->tuplehash[!dir].tuple.dst.u3, sizeof(addr))) { + pr_debug("nf_ct_q931: set sourceCallSignalAddress %pI6:%hu->%pI6:%hu\n", + &addr, ntohs(port), &ct->tuplehash[!dir].tuple.dst.u3, + ntohs(ct->tuplehash[!dir].tuple.dst.u.tcp.port)); + ret = set_h225_addr(skb, data, dataoff, + &setup->sourceCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + ct->tuplehash[!dir].tuple.dst.u.tcp.port); + if (ret < 0) + return -1; + } + + if (setup->options & eSetup_UUIE_fastStart) { + for (i = 0; i < setup->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &setup->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_callproceeding(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + CallProceeding_UUIE *callproc) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: CallProceeding\n"); + + if (callproc->options & eCallProceeding_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &callproc->h245Address); + if (ret < 0) + return -1; + } + + if (callproc->options & eCallProceeding_UUIE_fastStart) { + for (i = 0; i < callproc->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &callproc->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_connect(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Connect_UUIE *connect) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Connect\n"); + + if (connect->options & eConnect_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &connect->h245Address); + if (ret < 0) + return -1; + } + + if (connect->options & eConnect_UUIE_fastStart) { + for (i = 0; i < connect->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &connect->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_alerting(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Alerting_UUIE *alert) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Alerting\n"); + + if (alert->options & eAlerting_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &alert->h245Address); + if (ret < 0) + return -1; + } + + if (alert->options & eAlerting_UUIE_fastStart) { + for (i = 0; i < alert->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &alert->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + 
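+/**************************************************************************** + * Each of the Q.931 UUIE handlers here follows the same pattern: expect the + * upcoming H.245 connection if the message carries an h245Address, then walk + * any fastStart OpenLogicalChannel items so that RTP/RTCP expectations are + * created even when fast connect bypasses a separate H.245 session. + ****************************************************************************/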
+/****************************************************************************/ +static int process_facility(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Facility_UUIE *facility) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Facility\n"); + + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(skb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + + if (facility->options & eFacility_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &facility->h245Address); + if (ret < 0) + return -1; + } + + if (facility->options & eFacility_UUIE_fastStart) { + for (i = 0; i < facility->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &facility->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_progress(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + Progress_UUIE *progress) +{ + int ret; + int i; + + pr_debug("nf_ct_q931: Progress\n"); + + if (progress->options & eProgress_UUIE_h245Address) { + ret = expect_h245(skb, ct, ctinfo, data, dataoff, + &progress->h245Address); + if (ret < 0) + return -1; + } + + if (progress->options & eProgress_UUIE_fastStart) { + for (i = 0; i < progress->fastStart.count; i++) { + ret = process_olc(skb, ct, ctinfo, data, dataoff, + &progress->fastStart.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int process_q931(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, Q931 *q931) +{ + H323_UU_PDU *pdu = &q931->UUIE.h323_uu_pdu; + int i; + int ret = 0; + + switch (pdu->h323_message_body.choice) { + case eH323_UU_PDU_h323_message_body_setup: + ret = process_setup(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.setup); + break; + case eH323_UU_PDU_h323_message_body_callProceeding: + ret = process_callproceeding(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body. 
+ callProceeding); + break; + case eH323_UU_PDU_h323_message_body_connect: + ret = process_connect(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.connect); + break; + case eH323_UU_PDU_h323_message_body_alerting: + ret = process_alerting(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.alerting); + break; + case eH323_UU_PDU_h323_message_body_facility: + ret = process_facility(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.facility); + break; + case eH323_UU_PDU_h323_message_body_progress: + ret = process_progress(skb, ct, ctinfo, data, dataoff, + &pdu->h323_message_body.progress); + break; + default: + pr_debug("nf_ct_q931: Q.931 signal %d\n", + pdu->h323_message_body.choice); + break; + } + + if (ret < 0) + return -1; + + if (pdu->options & eH323_UU_PDU_h245Control) { + for (i = 0; i < pdu->h245Control.count; i++) { + ret = process_h245(skb, ct, ctinfo, data, dataoff, + &pdu->h245Control.item[i]); + if (ret < 0) + return -1; + } + } + + return 0; +} + +/****************************************************************************/ +static int q931_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static Q931 q931; + unsigned char *data = NULL; + int datalen; + int dataoff; + int ret; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + pr_debug("nf_ct_q931: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Process each TPKT */ + while (get_tpkt_data(skb, protoff, ct, ctinfo, + &data, &datalen, &dataoff)) { + pr_debug("nf_ct_q931: TPKT len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode Q.931 signal */ + ret = DecodeQ931(data, datalen, &q931); + if (ret < 0) { + pr_debug("nf_ct_q931: decoding error: %s\n", + ret == H323_ERROR_BOUND ? 
+ "out of bound" : "out of range"); + /* We don't drop when decoding error */ + break; + } + + /* Process Q.931 signal */ + if (process_q931(skb, ct, ctinfo, &data, dataoff, &q931) < 0) + goto drop; + } + + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_q931: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy q931_exp_policy = { + /* T.120 and H.245 */ + .max_expected = H323_RTP_CHANNEL_MAX * 4 + 4, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { + { + .name = "Q.931", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = q931_help, + .expect_policy = &q931_exp_policy, + }, + { + .name = "Q.931", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = q931_help, + .expect_policy = &q931_exp_policy, + }, +}; + +/****************************************************************************/ +static unsigned char *get_udp_data(struct sk_buff *skb, unsigned int protoff, + int *datalen) +{ + const struct udphdr *uh; + struct udphdr _uh; + int dataoff; + + uh = skb_header_pointer(skb, protoff, sizeof(_uh), &_uh); + if (uh == NULL) + return NULL; + dataoff = protoff + sizeof(_uh); + if (dataoff >= skb->len) + return NULL; + *datalen = skb->len - dataoff; + return skb_header_pointer(skb, dataoff, *datalen, h323_buffer); +} + +/****************************************************************************/ +static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, + union nf_inet_addr *addr, + __be16 port) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple tuple; + + memset(&tuple.src.u3, 0, sizeof(tuple.src.u3)); + tuple.src.u.tcp.port = 0; + memcpy(&tuple.dst.u3, addr, sizeof(tuple.dst.u3)); + tuple.dst.u.tcp.port = port; + tuple.dst.protonum = IPPROTO_TCP; + + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); + if (exp && exp->master == ct) + return exp; + return NULL; +} + +/****************************************************************************/ +static int set_expect_timeout(struct nf_conntrack_expect *exp, + unsigned timeout) +{ + if (!exp || !del_timer(&exp->timeout)) + return 0; + + exp->timeout.expires = jiffies + timeout * HZ; + add_timer(&exp->timeout); + + return 1; +} + +/****************************************************************************/ +static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, + TransportAddress *taddr, int count) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + int i; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(nat_q931_hook) nat_q931; + + /* Look for the first related address */ + for (i = 0; i < count; i++) { + if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) && + memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, + sizeof(addr)) == 0 && port != 0) + break; + } + + if (i >= count) /* Not found */ + return 0; + + /* Create expect for Q.931 */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), 
+ gkrouted_only ? /* only accept calls from GK? */ + &ct->tuplehash[!dir].tuple.src.u3 : NULL, + &ct->tuplehash[!dir].tuple.dst.u3, + IPPROTO_TCP, NULL, &port); + exp->helper = nf_conntrack_helper_q931; + exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ + + nat_q931 = rcu_dereference(nat_q931_hook); + if (nat_q931 && ct->status & IPS_NAT_MASK) { /* Need NAT */ + ret = nat_q931(skb, ct, ctinfo, data, taddr, i, port, exp); + } else { /* Conntrack only */ + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + + /* Save port for looking up expect in processing RCF */ + info->sig_port[dir] = port; + } else + ret = -1; + } + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_grq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, GatekeeperRequest *grq) +{ + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: GRQ\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) /* NATed */ + return set_ras_addr(skb, ct, ctinfo, data, + &grq->rasAddress, 1); + return 0; +} + +/****************************************************************************/ +static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, GatekeeperConfirm *gcf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + + pr_debug("nf_ct_ras: GCF\n"); + + if (!get_h225_addr(ct, *data, &gcf->rasAddress, &addr, &port)) + return 0; + + /* Registration port is the same as discovery port */ + if (!memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + port == ct->tuplehash[dir].tuple.src.u.udp.port) + return 0; + + /* Avoid RAS expectation loops. A GCF is never expected. 
*/ + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return 0; + + /* Need new expect */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_UDP, NULL, &port); + exp->helper = nf_conntrack_helper_ras; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect RAS "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_rrq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RegistrationRequest *rrq) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int ret; + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: RRQ\n"); + + ret = expect_q931(skb, ct, ctinfo, data, + rrq->callSignalAddress.item, + rrq->callSignalAddress.count); + if (ret < 0) + return -1; + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, data, + rrq->rasAddress.item, + rrq->rasAddress.count); + if (ret < 0) + return -1; + } + + if (rrq->options & eRegistrationRequest_timeToLive) { + pr_debug("nf_ct_ras: RRQ TTL = %u seconds\n", rrq->timeToLive); + info->timeout = rrq->timeToLive; + } else + info->timeout = default_rrq_ttl; + + return 0; +} + +/****************************************************************************/ +static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RegistrationConfirm *rcf) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret; + struct nf_conntrack_expect *exp; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: RCF\n"); + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + rcf->callSignalAddress.item, + rcf->callSignalAddress.count); + if (ret < 0) + return -1; + } + + if (rcf->options & eRegistrationConfirm_timeToLive) { + pr_debug("nf_ct_ras: RCF TTL = %u seconds\n", rcf->timeToLive); + info->timeout = rcf->timeToLive; + } + + if (info->timeout > 0) { + pr_debug("nf_ct_ras: set RAS connection timeout to " + "%u seconds\n", info->timeout); + nf_ct_refresh(ct, skb, info->timeout * HZ); + + /* Set expect timeout */ + spin_lock_bh(&nf_conntrack_lock); + exp = find_expect(ct, &ct->tuplehash[dir].tuple.dst.u3, + info->sig_port[!dir]); + if (exp) { + pr_debug("nf_ct_ras: set Q.931 expect " + "timeout to %u seconds for", + info->timeout); + nf_ct_dump_tuple(&exp->tuple); + set_expect_timeout(exp, info->timeout); + } + spin_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} + +/****************************************************************************/ +static int process_urq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, UnregistrationRequest *urq) +{ + struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + int ret; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: URQ\n"); + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + urq->callSignalAddress.item, + urq->callSignalAddress.count); + if 
(ret < 0) + return -1; + } + + /* Clear old expect */ + nf_ct_remove_expectations(ct); + info->sig_port[dir] = 0; + info->sig_port[!dir] = 0; + + /* Give it 30 seconds for UCF or URJ */ + nf_ct_refresh(ct, skb, 30 * HZ); + + return 0; +} + +/****************************************************************************/ +static int process_arq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, AdmissionRequest *arq) +{ + const struct nf_ct_h323_master *info = &nfct_help(ct)->help.ct_h323_info; + int dir = CTINFO2DIR(ctinfo); + __be16 port; + union nf_inet_addr addr; + typeof(set_h225_addr_hook) set_h225_addr; + + pr_debug("nf_ct_ras: ARQ\n"); + + set_h225_addr = rcu_dereference(set_h225_addr_hook); + if ((arq->options & eAdmissionRequest_destCallSignalAddress) && + get_h225_addr(ct, *data, &arq->destCallSignalAddress, + &addr, &port) && + !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + port == info->sig_port[dir] && + set_h225_addr && ct->status & IPS_NAT_MASK) { + /* Answering ARQ */ + return set_h225_addr(skb, data, 0, + &arq->destCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + info->sig_port[!dir]); + } + + if ((arq->options & eAdmissionRequest_srcCallSignalAddress) && + get_h225_addr(ct, *data, &arq->srcCallSignalAddress, + &addr, &port) && + !memcmp(&addr, &ct->tuplehash[dir].tuple.src.u3, sizeof(addr)) && + set_h225_addr && ct->status & IPS_NAT_MASK) { + /* Calling ARQ */ + return set_h225_addr(skb, data, 0, + &arq->srcCallSignalAddress, + &ct->tuplehash[!dir].tuple.dst.u3, + port); + } + + return 0; +} + +/****************************************************************************/ +static int process_acf(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, AdmissionConfirm *acf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: ACF\n"); + + if (!get_h225_addr(ct, *data, &acf->destCallSignalAddress, + &addr, &port)) + return 0; + + if (!memcmp(&addr, &ct->tuplehash[dir].tuple.dst.u3, sizeof(addr))) { + /* Answering ACF */ + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) + return set_sig_addr(skb, ct, ctinfo, data, + &acf->destCallSignalAddress, 1); + return 0; + } + + /* Need new expect */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->helper = nf_conntrack_helper_q931; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + return ret; +} + +/****************************************************************************/ +static int process_lrq(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, LocationRequest *lrq) +{ + typeof(set_ras_addr_hook) set_ras_addr; + + pr_debug("nf_ct_ras: LRQ\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) + return set_ras_addr(skb, ct, ctinfo, data, + &lrq->replyAddress, 1); + return 0; +} + +/****************************************************************************/ +static int process_lcf(struct sk_buff *skb, struct nf_conn 
*ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, LocationConfirm *lcf) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + __be16 port; + union nf_inet_addr addr; + struct nf_conntrack_expect *exp; + + pr_debug("nf_ct_ras: LCF\n"); + + if (!get_h225_addr(ct, *data, &lcf->callSignalAddress, + &addr, &port)) + return 0; + + /* Need new expect for call signal */ + if ((exp = nf_ct_expect_alloc(ct)) == NULL) + return -1; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &ct->tuplehash[!dir].tuple.src.u3, &addr, + IPPROTO_TCP, NULL, &port); + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->helper = nf_conntrack_helper_q931; + + if (nf_ct_expect_related(exp) == 0) { + pr_debug("nf_ct_ras: expect Q.931 "); + nf_ct_dump_tuple(&exp->tuple); + } else + ret = -1; + + nf_ct_expect_put(exp); + + /* Ignore rasAddress */ + + return ret; +} + +/****************************************************************************/ +static int process_irr(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, InfoRequestResponse *irr) +{ + int ret; + typeof(set_ras_addr_hook) set_ras_addr; + typeof(set_sig_addr_hook) set_sig_addr; + + pr_debug("nf_ct_ras: IRR\n"); + + set_ras_addr = rcu_dereference(set_ras_addr_hook); + if (set_ras_addr && ct->status & IPS_NAT_MASK) { + ret = set_ras_addr(skb, ct, ctinfo, data, + &irr->rasAddress, 1); + if (ret < 0) + return -1; + } + + set_sig_addr = rcu_dereference(set_sig_addr_hook); + if (set_sig_addr && ct->status & IPS_NAT_MASK) { + ret = set_sig_addr(skb, ct, ctinfo, data, + irr->callSignalAddress.item, + irr->callSignalAddress.count); + if (ret < 0) + return -1; + } + + return 0; +} + +/****************************************************************************/ +static int process_ras(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, RasMessage *ras) +{ + switch (ras->choice) { + case eRasMessage_gatekeeperRequest: + return process_grq(skb, ct, ctinfo, data, + &ras->gatekeeperRequest); + case eRasMessage_gatekeeperConfirm: + return process_gcf(skb, ct, ctinfo, data, + &ras->gatekeeperConfirm); + case eRasMessage_registrationRequest: + return process_rrq(skb, ct, ctinfo, data, + &ras->registrationRequest); + case eRasMessage_registrationConfirm: + return process_rcf(skb, ct, ctinfo, data, + &ras->registrationConfirm); + case eRasMessage_unregistrationRequest: + return process_urq(skb, ct, ctinfo, data, + &ras->unregistrationRequest); + case eRasMessage_admissionRequest: + return process_arq(skb, ct, ctinfo, data, + &ras->admissionRequest); + case eRasMessage_admissionConfirm: + return process_acf(skb, ct, ctinfo, data, + &ras->admissionConfirm); + case eRasMessage_locationRequest: + return process_lrq(skb, ct, ctinfo, data, + &ras->locationRequest); + case eRasMessage_locationConfirm: + return process_lcf(skb, ct, ctinfo, data, + &ras->locationConfirm); + case eRasMessage_infoRequestResponse: + return process_irr(skb, ct, ctinfo, data, + &ras->infoRequestResponse); + default: + pr_debug("nf_ct_ras: RAS message %d\n", ras->choice); + break; + } + + return 0; +} + +/****************************************************************************/ +static int ras_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + static RasMessage ras; + unsigned char *data; + int datalen = 0; + int ret; + + pr_debug("nf_ct_ras: skblen = %u\n", skb->len); + + spin_lock_bh(&nf_h323_lock); + + /* Get UDP data */ + data = 
get_udp_data(skb, protoff, &datalen); + if (data == NULL) + goto accept; + pr_debug("nf_ct_ras: RAS message len=%d ", datalen); + nf_ct_dump_tuple(&ct->tuplehash[CTINFO2DIR(ctinfo)].tuple); + + /* Decode RAS message */ + ret = DecodeRasMessage(data, datalen, &ras); + if (ret < 0) { + pr_debug("nf_ct_ras: decoding error: %s\n", + ret == H323_ERROR_BOUND ? + "out of bound" : "out of range"); + goto accept; + } + + /* Process RAS message */ + if (process_ras(skb, ct, ctinfo, &data, &ras) < 0) + goto drop; + + accept: + spin_unlock_bh(&nf_h323_lock); + return NF_ACCEPT; + + drop: + spin_unlock_bh(&nf_h323_lock); + if (net_ratelimit()) + pr_info("nf_ct_ras: packet dropped\n"); + return NF_DROP; +} + +/****************************************************************************/ +static const struct nf_conntrack_expect_policy ras_exp_policy = { + .max_expected = 32, + .timeout = 240, +}; + +static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { + { + .name = "RAS", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .help = ras_help, + .expect_policy = &ras_exp_policy, + }, + { + .name = "RAS", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET6, + .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .help = ras_help, + .expect_policy = &ras_exp_policy, + }, +}; + +/****************************************************************************/ +static void __exit nf_conntrack_h323_fini(void) +{ + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[1]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); + kfree(h323_buffer); + pr_debug("nf_ct_h323: fini\n"); +} + +/****************************************************************************/ +static int __init nf_conntrack_h323_init(void) +{ + int ret; + + h323_buffer = kmalloc(65536, GFP_KERNEL); + if (!h323_buffer) + return -ENOMEM; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_h245); + if (ret < 0) + goto err1; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[0]); + if (ret < 0) + goto err2; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_q931[1]); + if (ret < 0) + goto err3; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[0]); + if (ret < 0) + goto err4; + ret = nf_conntrack_helper_register(&nf_conntrack_helper_ras[1]); + if (ret < 0) + goto err5; + pr_debug("nf_ct_h323: init success\n"); + return 0; + +err5: + nf_conntrack_helper_unregister(&nf_conntrack_helper_ras[0]); +err4: + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[1]); +err3: + nf_conntrack_helper_unregister(&nf_conntrack_helper_q931[0]); +err2: + nf_conntrack_helper_unregister(&nf_conntrack_helper_h245); +err1: + kfree(h323_buffer); + return ret; +} + +/****************************************************************************/ +module_init(nf_conntrack_h323_init); +module_exit(nf_conntrack_h323_fini); + +EXPORT_SYMBOL_GPL(get_h225_addr); +EXPORT_SYMBOL_GPL(set_h245_addr_hook); +EXPORT_SYMBOL_GPL(set_h225_addr_hook); +EXPORT_SYMBOL_GPL(set_sig_addr_hook); +EXPORT_SYMBOL_GPL(set_ras_addr_hook); +EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); +EXPORT_SYMBOL_GPL(nat_t120_hook); +EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); +EXPORT_SYMBOL_GPL(nat_q931_hook); 
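The EXPORT_SYMBOL_GPL block above is the seam between connection tracking and NAT: q931_help() and ras_help() only ever read the *_hook pointers through rcu_dereference(), so the NAT side can attach and detach at runtime. Below is a minimal sketch of that attach/detach dance, modelled on the companion nf_nat_h323 module (which lives outside net/netfilter); set_my_h225_addr() is a placeholder whose signature is inferred from the set_h225_addr() calls in process_setup() above, and it is not part of this patch.

/* Sketch only -- not the actual nf_nat_h323 source; error handling and
 * the other eight hooks are elided.  This would live in a separate
 * NAT helper module. */
static int set_my_h225_addr(struct sk_buff *skb, unsigned char **data,
			    int dataoff, TransportAddress *taddr,
			    union nf_inet_addr *addr, __be16 port)
{
	/* A real implementation rewrites taddr inside the TPKT payload. */
	return 0;
}

static int __init h323_nat_sketch_init(void)
{
	BUG_ON(set_h225_addr_hook != NULL);	/* refuse double registration */
	rcu_assign_pointer(set_h225_addr_hook, set_my_h225_addr);
	return 0;
}

static void __exit h323_nat_sketch_exit(void)
{
	RCU_INIT_POINTER(set_h225_addr_hook, NULL);
	synchronize_rcu();	/* wait out in-flight q931_help()/ras_help() readers */
}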
+ +MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); +MODULE_DESCRIPTION("H.323 connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_h323"); +MODULE_ALIAS_NFCT_HELPER("h323"); diff --git a/net/netfilter/nf_conntrack_h323_types.c b/net/netfilter/nf_conntrack_h323_types.c new file mode 100644 index 00000000..d880f352 --- /dev/null +++ b/net/netfilter/nf_conntrack_h323_types.c @@ -0,0 +1,1922 @@ +/* Generated by Jing Min Zhao's ASN.1 parser, May 16 2007 + * + * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> + * + * This source code is licensed under General Public License version 2. + */ + +static const struct field_t _TransportAddress_ipAddress[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 4, 0, DECODE, + offsetof(TransportAddress_ipAddress, ip), NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute_route[] = { /* SEQUENCE OF */ + {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute_routing[] = { /* CHOICE */ + {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ipSourceRoute[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0, + _TransportAddress_ipSourceRoute_route}, + {FNAME("routing") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _TransportAddress_ipSourceRoute_routing}, +}; + +static const struct field_t _TransportAddress_ipxAddress[] = { /* SEQUENCE */ + {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, + {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("port") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress_ip6Address[] = { /* SEQUENCE */ + {FNAME("ip") OCTSTR, FIXD, 16, 0, DECODE, + offsetof(TransportAddress_ip6Address, ip), NULL}, + {FNAME("port") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H221NonStandard[] = { /* SEQUENCE */ + {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _NonStandardIdentifier[] = { /* CHOICE */ + {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP | EXT, 0, + _H221NonStandard}, +}; + +static const struct field_t _NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _NonStandardIdentifier}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TransportAddress[] = { /* CHOICE */ + {FNAME("ipAddress") SEQ, 0, 2, 2, DECODE, + offsetof(TransportAddress, ipAddress), _TransportAddress_ipAddress}, + {FNAME("ipSourceRoute") SEQ, 0, 4, 4, SKIP | EXT, 0, + _TransportAddress_ipSourceRoute}, + {FNAME("ipxAddress") SEQ, 0, 3, 3, SKIP, 0, + _TransportAddress_ipxAddress}, + {FNAME("ip6Address") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(TransportAddress, ip6Address), + _TransportAddress_ip6Address}, + {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, + _NonStandardParameter}, +}; + +static const struct field_t _AliasAddress[] = { /* CHOICE */ + {FNAME("dialedDigits") NUMDGT, 7, 1, 0, SKIP, 0, 
NULL}, + {FNAME("h323-ID") BMPSTR, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("url-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("transportID") CHOICE, 3, 7, 7, SKIP | EXT, 0, NULL}, + {FNAME("email-ID") IA5STR, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("partyNumber") CHOICE, 3, 5, 5, SKIP | EXT, 0, NULL}, + {FNAME("mobileUIM") CHOICE, 1, 2, 2, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_sourceAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _VendorIdentifier[] = { /* SEQUENCE */ + {FNAME("vendor") SEQ, 0, 3, 3, SKIP | EXT, 0, _H221NonStandard}, + {FNAME("productId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("versionId") OCTSTR, BYTE, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _H310Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H320Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H321Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H322Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H323Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H324Caps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _VoiceCaps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T120OnlyCaps[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("dataRatesSupported") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SupportedProtocols[] = { /* CHOICE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP, 0, + _NonStandardParameter}, + {FNAME("h310") SEQ, 1, 1, 3, SKIP | EXT, 0, _H310Caps}, + {FNAME("h320") SEQ, 1, 1, 3, SKIP | EXT, 0, _H320Caps}, + {FNAME("h321") SEQ, 1, 1, 3, SKIP | EXT, 0, _H321Caps}, 
+ {FNAME("h322") SEQ, 1, 1, 3, SKIP | EXT, 0, _H322Caps}, + {FNAME("h323") SEQ, 1, 1, 3, SKIP | EXT, 0, _H323Caps}, + {FNAME("h324") SEQ, 1, 1, 3, SKIP | EXT, 0, _H324Caps}, + {FNAME("voice") SEQ, 1, 1, 3, SKIP | EXT, 0, _VoiceCaps}, + {FNAME("t120-only") SEQ, 1, 1, 3, SKIP | EXT, 0, _T120OnlyCaps}, + {FNAME("nonStandardProtocol") SEQ, 2, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("t38FaxAnnexbOnly") SEQ, 2, 5, 5, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _GatewayInfo_protocol[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 4, 9, 11, SKIP | EXT, 0, _SupportedProtocols}, +}; + +static const struct field_t _GatewayInfo[] = { /* SEQUENCE */ + {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _GatewayInfo_protocol}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _McuInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("protocol") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _TerminalInfo[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, +}; + +static const struct field_t _EndpointType[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("vendor") SEQ, 2, 3, 3, SKIP | EXT | OPT, 0, + _VendorIdentifier}, + {FNAME("gatekeeper") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, + _GatekeeperInfo}, + {FNAME("gateway") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, _GatewayInfo}, + {FNAME("mcu") SEQ, 1, 1, 2, SKIP | EXT | OPT, 0, _McuInfo}, + {FNAME("terminal") SEQ, 1, 1, 1, SKIP | EXT | OPT, 0, _TerminalInfo}, + {FNAME("mc") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedNode") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("set") BITSTR, FIXD, 32, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedTunnelledProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, + 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_destinationAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _Setup_UUIE_destExtraCallInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _Setup_UUIE_destExtraCRV[] = { /* SEQUENCE OF */ + {FNAME("item") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_conferenceGoal[] = { /* CHOICE */ + {FNAME("create") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("join") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("invite") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("capability-negotiation") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callIndependentSupplementaryService") NUL, FIXD, 0, 0, SKIP, + 0, NULL}, +}; + +static const struct field_t _Q954Details[] = { /* SEQUENCE */ + {FNAME("conferenceCalling") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threePartyService") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _QseriesOptions[] = { /* SEQUENCE */ + {FNAME("q932Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q951Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q952Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q953Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q955Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q956Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q957Full") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("q954Info") SEQ, 0, 2, 2, SKIP | EXT, 0, _Q954Details}, +}; + +static const struct field_t _CallType[] = { /* CHOICE 
*/ + {FNAME("pointToPoint") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("oneToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nToOne") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nToN") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_NonStandardIdentifier_h221NonStandard[] = { /* SEQUENCE */ + {FNAME("t35CountryCode") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("t35Extension") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("manufacturerCode") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_NonStandardIdentifier[] = { /* CHOICE */ + {FNAME("object") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h221NonStandard") SEQ, 0, 3, 3, SKIP, 0, + _H245_NonStandardIdentifier_h221NonStandard}, +}; + +static const struct field_t _H245_NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") CHOICE, 1, 2, 2, SKIP, 0, + _H245_NonStandardIdentifier}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H261VideoCapability[] = { /* SEQUENCE */ + {FNAME("qcifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cifMPI") INT, 2, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("maxBitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("stillImageTransmission") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H262VideoCapability[] = { /* SEQUENCE */ + {FNAME("profileAndLevel-SPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-MPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SNRatLL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SNRatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-SpatialatH-14") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("profileAndLevel-HPatML") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-HPatH-14") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("profileAndLevel-HPatHL") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("framesPerSecond") INT, 4, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H263VideoCapability[] = { /* SEQUENCE */ + {FNAME("sqcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("qcifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cifMPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cif4MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("cif16MPI") INT, 5, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("maxBitRate") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("unrestrictedVector") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("arithmeticCoding") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("advancedPrediction") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("pbFrames") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("temporalSpatialTradeOffCapability") BOOL, FIXD, 0, 0, SKIP, 0, + NULL}, + {FNAME("hrd-B") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("bppMaxKb") INT, WORD, 0, 
0, SKIP | OPT, 0, NULL}, + {FNAME("slowSqcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowQcifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCifMPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCif4MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("slowCif16MPI") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("errorCompensation") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("enhancementLayerInfo") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("h263Options") SEQ, 5, 29, 31, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _IS11172VideoCapability[] = { /* SEQUENCE */ + {FNAME("constrainedBitstream") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoBitRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("vbvBufferSize") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("samplesPerLine") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("linesPerFrame") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("pictureRate") INT, 4, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("luminanceSampleRate") INT, CONS, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("videoBadMBsCap") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _VideoCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("h261VideoCapability") SEQ, 2, 5, 6, SKIP | EXT, 0, + _H261VideoCapability}, + {FNAME("h262VideoCapability") SEQ, 6, 17, 18, SKIP | EXT, 0, + _H262VideoCapability}, + {FNAME("h263VideoCapability") SEQ, 7, 13, 21, SKIP | EXT, 0, + _H263VideoCapability}, + {FNAME("is11172VideoCapability") SEQ, 6, 7, 8, SKIP | EXT, 0, + _IS11172VideoCapability}, + {FNAME("genericVideoCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _AudioCapability_g7231[] = { /* SEQUENCE */ + {FNAME("maxAl-sduAudioFrames") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _IS11172AudioCapability[] = { /* SEQUENCE */ + {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _IS13818AudioCapability[] = { /* SEQUENCE */ + {FNAME("audioLayer1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioLayer3") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling16k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling22k05") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling24k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling32k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling44k1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("audioSampling48k") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("singleChannel") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("twoChannels") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threeChannels2-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("threeChannels3-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fourChannels2-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fourChannels2-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, 
+ {FNAME("fourChannels3-1") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fiveChannels3-0-2-0") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fiveChannels3-2") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("lowFrequencyEnhancement") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("multilingual") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("bitRate") INT, WORD, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _AudioCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("g711Alaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Alaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Ulaw64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g711Ulaw56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-64k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-56k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g722-48k") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g7231") SEQ, 0, 2, 2, SKIP, 0, _AudioCapability_g7231}, + {FNAME("g728") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729AnnexA") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("is11172AudioCapability") SEQ, 0, 9, 9, SKIP | EXT, 0, + _IS11172AudioCapability}, + {FNAME("is13818AudioCapability") SEQ, 0, 21, 21, SKIP | EXT, 0, + _IS13818AudioCapability}, + {FNAME("g729wAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g729AnnexAwAnnexB") INT, BYTE, 1, 0, SKIP, 0, NULL}, + {FNAME("g7231AnnexCCapability") SEQ, 1, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmHalfRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("gsmEnhancedFullRate") SEQ, 0, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("genericAudioCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, + {FNAME("g729Extensions") SEQ, 1, 8, 8, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _DataProtocolCapability[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("v14buffered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v42lapm") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdlcFrameTunnelling") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h310SeparateVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h310SingleVCStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("transparent") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("segmentationAndReassembly") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdlcFrameTunnelingwSAR") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v120") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("separateLANStack") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("v76wCompression") CHOICE, 2, 3, 3, SKIP | EXT, 0, NULL}, + {FNAME("tcp") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("udp") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T84Profile_t84Restricted[] = { /* SEQUENCE */ + {FNAME("qcif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("cif") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("ccir601Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("ccir601Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdtvSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("hdtvProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g3FacsMH200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g3FacsMH200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g4FacsMMR200x100") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("g4FacsMMR200x200") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig200x200Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig200x200Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + 
{FNAME("jbig300x300Seq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("jbig300x300Prog") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoLow") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoMedSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoMedProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoHighSeq") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("digPhotoHighProg") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _T84Profile[] = { /* CHOICE */ + {FNAME("t84Unrestricted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("t84Restricted") SEQ, 0, 19, 19, SKIP | EXT, 0, + _T84Profile_t84Restricted}, +}; + +static const struct field_t _DataApplicationCapability_application_t84[] = { /* SEQUENCE */ + {FNAME("t84Protocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t84Profile") CHOICE, 1, 2, 2, SKIP, 0, _T84Profile}, +}; + +static const struct field_t _DataApplicationCapability_application_nlpid[] = { /* SEQUENCE */ + {FNAME("nlpidProtocol") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("nlpidData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _DataApplicationCapability_application[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("t120") CHOICE, 3, 7, 14, DECODE | EXT, + offsetof(DataApplicationCapability_application, t120), + _DataProtocolCapability}, + {FNAME("dsm-cc") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("userData") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t84") SEQ, 0, 2, 2, SKIP, 0, + _DataApplicationCapability_application_t84}, + {FNAME("t434") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("h224") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("nlpid") SEQ, 0, 2, 2, SKIP, 0, + _DataApplicationCapability_application_nlpid}, + {FNAME("dsvdControl") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h222DataPartitioning") CHOICE, 3, 7, 14, SKIP | EXT, 0, + _DataProtocolCapability}, + {FNAME("t30fax") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL}, + {FNAME("t140") CHOICE, 3, 7, 14, SKIP | EXT, 0, NULL}, + {FNAME("t38fax") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("genericDataCapability") SEQ, 5, 6, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _DataApplicationCapability[] = { /* SEQUENCE */ + {FNAME("application") CHOICE, 4, 10, 14, DECODE | EXT, + offsetof(DataApplicationCapability, application), + _DataApplicationCapability_application}, + {FNAME("maxBitRate") INT, CONS, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _EncryptionMode[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("h233Encryption") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _DataType[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("nullData") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("videoData") CHOICE, 3, 5, 6, SKIP | EXT, 0, _VideoCapability}, + {FNAME("audioData") CHOICE, 4, 14, 22, SKIP | EXT, 0, + _AudioCapability}, + {FNAME("data") SEQ, 0, 2, 2, DECODE | EXT, offsetof(DataType, data), + _DataApplicationCapability}, + {FNAME("encryptionData") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _EncryptionMode}, + {FNAME("h235Control") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("h235Media") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, + {FNAME("multiplexedStream") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, +}; + +static const struct 
field_t _H222LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("resourceID") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("subChannelID") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("pcr-pid") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("programDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("streamDescriptors") OCTSTR, SEMI, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType_al3[] = { /* SEQUENCE */ + {FNAME("controlFieldOctets") INT, 2, 0, 0, SKIP, 0, NULL}, + {FNAME("sendBufferSize") INT, CONS, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters_adaptationLayerType[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, + _H245_NonStandardParameter}, + {FNAME("al1Framed") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al1NotFramed") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al2WithoutSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al2WithSequenceNumbers") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("al3") SEQ, 0, 2, 2, SKIP, 0, + _H223LogicalChannelParameters_adaptationLayerType_al3}, + {FNAME("al1M") SEQ, 0, 7, 8, SKIP | EXT, 0, NULL}, + {FNAME("al2M") SEQ, 0, 2, 2, SKIP | EXT, 0, NULL}, + {FNAME("al3M") SEQ, 0, 5, 6, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _H223LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("adaptationLayerType") CHOICE, 3, 6, 9, SKIP | EXT, 0, + _H223LogicalChannelParameters_adaptationLayerType}, + {FNAME("segmentableFlag") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CRCLength[] = { /* CHOICE */ + {FNAME("crc8bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("crc16bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("crc32bit") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76HDLCParameters[] = { /* SEQUENCE */ + {FNAME("crcLength") CHOICE, 2, 3, 3, SKIP | EXT, 0, _CRCLength}, + {FNAME("n401") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("loopbackTestProcedure") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_suspendResume[] = { /* CHOICE */ + {FNAME("noSuspendResume") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("suspendResumewAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("suspendResumewoAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode_eRM_recovery[] = { /* CHOICE */ + {FNAME("rej") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("sREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("mSREJ") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode_eRM[] = { /* SEQUENCE */ + {FNAME("windowSize") INT, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("recovery") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode_eRM_recovery}, +}; + +static const struct field_t _V76LogicalChannelParameters_mode[] = { /* CHOICE */ + {FNAME("eRM") SEQ, 0, 2, 2, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode_eRM}, + {FNAME("uNERM") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V75Parameters[] = { /* SEQUENCE */ + {FNAME("audioHeaderPresent") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _V76LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("hdlcParameters") SEQ, 0, 3, 3, SKIP | EXT, 0, + _V76HDLCParameters}, + {FNAME("suspendResume") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _V76LogicalChannelParameters_suspendResume}, + {FNAME("uIH") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("mode") CHOICE, 1, 
2, 2, SKIP | EXT, 0, + _V76LogicalChannelParameters_mode}, + {FNAME("v75Parameters") SEQ, 0, 1, 1, SKIP | EXT, 0, _V75Parameters}, +}; + +static const struct field_t _H2250LogicalChannelParameters_nonStandard[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, +}; + +static const struct field_t _UnicastAddress_iPAddress[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 4, 0, DECODE, + offsetof(UnicastAddress_iPAddress, network), NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPXAddress[] = { /* SEQUENCE */ + {FNAME("node") OCTSTR, FIXD, 6, 0, SKIP, 0, NULL}, + {FNAME("netnum") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") OCTSTR, FIXD, 2, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iP6Address[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 16, 0, DECODE, + offsetof(UnicastAddress_iP6Address, network), NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress_routing[] = { /* CHOICE */ + {FNAME("strict") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("loose") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress_route[] = { /* SEQUENCE OF */ + {FNAME("item") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _UnicastAddress_iPSourceRouteAddress[] = { /* SEQUENCE */ + {FNAME("routing") CHOICE, 1, 2, 2, SKIP, 0, + _UnicastAddress_iPSourceRouteAddress_routing}, + {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("route") SEQOF, SEMI, 0, 0, SKIP, 0, + _UnicastAddress_iPSourceRouteAddress_route}, +}; + +static const struct field_t _UnicastAddress[] = { /* CHOICE */ + {FNAME("iPAddress") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(UnicastAddress, iPAddress), _UnicastAddress_iPAddress}, + {FNAME("iPXAddress") SEQ, 0, 3, 3, SKIP | EXT, 0, + _UnicastAddress_iPXAddress}, + {FNAME("iP6Address") SEQ, 0, 2, 2, DECODE | EXT, + offsetof(UnicastAddress, iP6Address), _UnicastAddress_iP6Address}, + {FNAME("netBios") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("iPSourceRouteAddress") SEQ, 0, 4, 4, SKIP | EXT, 0, + _UnicastAddress_iPSourceRouteAddress}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress_iPAddress[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 4, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress_iP6Address[] = { /* SEQUENCE */ + {FNAME("network") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("tsapIdentifier") INT, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _MulticastAddress[] = { /* CHOICE */ + {FNAME("iPAddress") SEQ, 0, 2, 2, SKIP | EXT, 0, + _MulticastAddress_iPAddress}, + {FNAME("iP6Address") SEQ, 0, 2, 2, SKIP | EXT, 0, + _MulticastAddress_iP6Address}, + {FNAME("nsap") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, + {FNAME("nonStandardAddress") SEQ, 0, 2, 2, SKIP, 0, NULL}, +}; + +static const struct field_t _H245_TransportAddress[] = { /* CHOICE */ + {FNAME("unicastAddress") CHOICE, 3, 5, 7, DECODE | EXT, + offsetof(H245_TransportAddress, unicastAddress), _UnicastAddress}, + {FNAME("multicastAddress") CHOICE, 1, 2, 4, SKIP | EXT, 0, + _MulticastAddress}, +}; + +static const struct field_t 
_H2250LogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _H2250LogicalChannelParameters_nonStandard}, + {FNAME("sessionID") INT, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("associatedSessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelParameters, mediaChannel), + _H245_TransportAddress}, + {FNAME("mediaGuaranteedDelivery") BOOL, FIXD, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelParameters, mediaControlChannel), + _H245_TransportAddress}, + {FNAME("mediaControlGuaranteedDelivery") BOOL, FIXD, 0, 0, STOP | OPT, + 0, NULL}, + {FNAME("silenceSuppression") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destination") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, STOP | OPT, 0, NULL}, + {FNAME("mediaPacketization") CHOICE, 0, 1, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("transportCapability") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, + NULL}, + {FNAME("redundancyEncoding") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("source") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, + _H222LogicalChannelParameters}, + {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, + _H223LogicalChannelParameters}, + {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0, + _V76LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, + offsetof + (OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, + {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_forwardLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("dataType") CHOICE, 3, 6, 9, DECODE | EXT, + offsetof(OpenLogicalChannel_forwardLogicalChannelParameters, + dataType), _DataType}, + {FNAME("multiplexParameters") CHOICE, 2, 3, 5, DECODE | EXT, + offsetof(OpenLogicalChannel_forwardLogicalChannelParameters, + multiplexParameters), + _OpenLogicalChannel_forwardLogicalChannelParameters_multiplexParameters}, + {FNAME("forwardLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT, + 0, NULL}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h223LogicalChannelParameters") SEQ, 0, 2, 2, SKIP | EXT, 0, + _H223LogicalChannelParameters}, + {FNAME("v76LogicalChannelParameters") SEQ, 0, 5, 5, SKIP | EXT, 0, + _V76LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, + offsetof + (OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, +}; + +static const struct field_t _OpenLogicalChannel_reverseLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("dataType") CHOICE, 3, 6, 9, SKIP | EXT, 0, _DataType}, + {FNAME("multiplexParameters") CHOICE, 1, 2, 3, DECODE | EXT | OPT, + offsetof(OpenLogicalChannel_reverseLogicalChannelParameters, + multiplexParameters), + 
_OpenLogicalChannel_reverseLogicalChannelParameters_multiplexParameters}, + {FNAME("reverseLogicalChannelDependency") INT, WORD, 1, 0, SKIP | OPT, + 0, NULL}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _NetworkAccessParameters_distribution[] = { /* CHOICE */ + {FNAME("unicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("multicast") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Q2931Address_address[] = { /* CHOICE */ + {FNAME("internationalNumber") NUMSTR, 4, 1, 0, SKIP, 0, NULL}, + {FNAME("nsapAddress") OCTSTR, 5, 1, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Q2931Address[] = { /* SEQUENCE */ + {FNAME("address") CHOICE, 1, 2, 2, SKIP | EXT, 0, + _Q2931Address_address}, + {FNAME("subaddress") OCTSTR, 5, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _NetworkAccessParameters_networkAddress[] = { /* CHOICE */ + {FNAME("q2931Address") SEQ, 1, 2, 2, SKIP | EXT, 0, _Q2931Address}, + {FNAME("e164Address") NUMDGT, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("localAreaAddress") CHOICE, 1, 2, 2, DECODE | EXT, + offsetof(NetworkAccessParameters_networkAddress, localAreaAddress), + _H245_TransportAddress}, +}; + +static const struct field_t _NetworkAccessParameters[] = { /* SEQUENCE */ + {FNAME("distribution") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, + _NetworkAccessParameters_distribution}, + {FNAME("networkAddress") CHOICE, 2, 3, 3, DECODE | EXT, + offsetof(NetworkAccessParameters, networkAddress), + _NetworkAccessParameters_networkAddress}, + {FNAME("associateConference") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("externalReference") OCTSTR, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("t120SetupProcedure") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, +}; + +static const struct field_t _OpenLogicalChannel[] = { /* SEQUENCE */ + {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("forwardLogicalChannelParameters") SEQ, 1, 3, 5, DECODE | EXT, + offsetof(OpenLogicalChannel, forwardLogicalChannelParameters), + _OpenLogicalChannel_forwardLogicalChannelParameters}, + {FNAME("reverseLogicalChannelParameters") SEQ, 1, 2, 4, + DECODE | EXT | OPT, offsetof(OpenLogicalChannel, + reverseLogicalChannelParameters), + _OpenLogicalChannel_reverseLogicalChannelParameters}, + {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT, + offsetof(OpenLogicalChannel, separateStack), + _NetworkAccessParameters}, + {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Setup_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Setup_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, h245Address), _TransportAddress}, + {FNAME("sourceAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_sourceAddress}, + {FNAME("sourceInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("destinationAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destinationAddress}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, destCallSignalAddress), _TransportAddress}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destExtraCallInfo}, + {FNAME("destExtraCRV") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Setup_UUIE_destExtraCRV}, + {FNAME("activeMC") 
BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("conferenceGoal") CHOICE, 2, 3, 5, SKIP | EXT, 0, + _Setup_UUIE_conferenceGoal}, + {FNAME("callServices") SEQ, 0, 8, 8, SKIP | EXT | OPT, 0, + _QseriesOptions}, + {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType}, + {FNAME("sourceCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Setup_UUIE, sourceCallSignalAddress), _TransportAddress}, + {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityCapability") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Setup_UUIE, fastStart), _Setup_UUIE_fastStart}, + {FNAME("mediaWaitForConnect") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("canOverlapSend") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("connectionParameters") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("symmetricOperationRequired") NUL, FIXD, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("neededFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("desiredFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("supportedFeatures") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("parallelH245Control") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("additionalSourceAddresses") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, +}; + +static const struct field_t _CallProceeding_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _CallProceeding_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(CallProceeding_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(CallProceeding_UUIE, fastStart), + _CallProceeding_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Connect_UUIE_fastStart[] = { /* SEQUENCE OF */ + 
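+	/* Each fastStart item is a PER OPEN type wrapping a complete
+	 * OpenLogicalChannel.  Note that the offset slot below holds
+	 * sizeof(OpenLogicalChannel), which the decoder appears to use
+	 * as the stride between decoded array elements rather than as a
+	 * structure offset.
+	 */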
{FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Connect_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Connect_UUIE, h245Address), _TransportAddress}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Connect_UUIE, fastStart), _Connect_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("connectedAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Alerting_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Alerting_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Alerting_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Alerting_UUIE, fastStart), _Alerting_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("alertingAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Information_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, SKIP | OPT, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + 
{FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _ReleaseCompleteReason[] = { /* CHOICE */ + {FNAME("noBandwidth") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatekeeperResources") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("unreachableDestination") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationRejection") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("invalidRevision") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("noPermission") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("unreachableGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatewayResources") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("badFormatAddress") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("adaptiveBusy") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("inConf") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("facilityCallDeflection") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("securityDenied") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("calledPartyNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callerNotRegistered") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("newConnectionNeeded") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardReason") SEQ, 0, 2, 2, SKIP, 0, NULL}, + {FNAME("replaceWithConferenceInvite") OCTSTR, FIXD, 16, 0, SKIP, 0, + NULL}, + {FNAME("genericDataReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("neededFeatureNotSupported") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("tunnelledSignallingRejected") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _ReleaseComplete_UUIE[] = { /* SEQUENCE */ + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("reason") CHOICE, 4, 12, 22, SKIP | EXT | OPT, 0, + _ReleaseCompleteReason}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("busyAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("presentationIndicator") CHOICE, 2, 3, 3, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("screeningIndicator") ENUM, 2, 0, 0, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _Facility_UUIE_alternativeAliasAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _FacilityReason[] = { /* CHOICE */ + {FNAME("routeCallToGatekeeper") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callForwarded") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("routeCallToMC") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("undefinedReason") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("conferenceListChoice") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("startH245") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("noH245") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("newTokens") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("featureSetUpdate") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("forwardedElements") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("transportedInformation") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Facility_UUIE[] = { /* SEQUENCE */ + 
{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, + {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Facility_UUIE_alternativeAliasAddress}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, + {FNAME("reason") CHOICE, 2, 4, 11, DECODE | EXT, + offsetof(Facility_UUIE, reason), _FacilityReason}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") CHOICE, 1, 2, 7, SKIP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("conferences") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, h245Address), _TransportAddress}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Facility_UUIE, fastStart), _Facility_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, SKIP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, SKIP | EXT | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT | OPT, 0, NULL}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + NULL}, +}; + +static const struct field_t _CallIdentifier[] = { /* SEQUENCE */ + {FNAME("guid") OCTSTR, FIXD, 16, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SecurityServiceMode[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, + {FNAME("none") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("default") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _SecurityCapabilities[] = { /* SEQUENCE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("encryption") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, + {FNAME("authenticaton") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, + {FNAME("integrity") CHOICE, 2, 3, 3, SKIP | EXT, 0, + _SecurityServiceMode}, +}; + +static const struct field_t _H245Security[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP, 0, _NonStandardParameter}, + {FNAME("noSecurity") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("tls") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, + {FNAME("ipsec") SEQ, 1, 4, 4, SKIP | EXT, 0, _SecurityCapabilities}, +}; + +static const struct field_t _DHset[] = { /* SEQUENCE */ + {FNAME("halfkey") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("modSize") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, + {FNAME("generator") BITSTR, WORD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _TypedCertificate[] = { /* SEQUENCE */ + {FNAME("type") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("certificate") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _H235_NonStandardParameter[] = { /* SEQUENCE */ + {FNAME("nonStandardIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("data") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _ClearToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("timeStamp") INT, 
CONS, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("password") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("dhkey") SEQ, 0, 3, 3, SKIP | EXT | OPT, 0, _DHset}, + {FNAME("challenge") OCTSTR, 7, 8, 0, SKIP | OPT, 0, NULL}, + {FNAME("random") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("certificate") SEQ, 0, 2, 2, SKIP | EXT | OPT, 0, + _TypedCertificate}, + {FNAME("generalID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("nonStandard") SEQ, 0, 2, 2, SKIP | OPT, 0, + _H235_NonStandardParameter}, + {FNAME("eckasdhkey") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, NULL}, + {FNAME("sendersID") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _Progress_UUIE_tokens[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, +}; + +static const struct field_t _Params[] = { /* SEQUENCE */ + {FNAME("ranInt") INT, UNCO, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("iv8") OCTSTR, FIXD, 8, 0, SKIP | OPT, 0, NULL}, + {FNAME("iv16") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdHash_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdHash[] = { /* SEQUENCE */ + {FNAME("alias") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, + {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdHash_token}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdHash_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdHash[] = { /* SEQUENCE */ + {FNAME("gatekeeperId") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("timeStamp") INT, CONS, 1, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdHash_token}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoEPCert[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoGKCert[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoH323Token_cryptoFastStart[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, 
SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoEncryptedToken_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoEncryptedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoEncryptedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoSignedToken_token[] = { /* SEQUENCE */ + {FNAME("toBeSigned") SEQ, 8, 9, 11, SKIP | OPEN | EXT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("signature") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoSignedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("token") SEQ, 0, 4, 4, SKIP, 0, + _CryptoToken_cryptoSignedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoHashedToken_token[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("hash") BITSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken_cryptoHashedToken[] = { /* SEQUENCE */ + {FNAME("tokenOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("hashedVals") SEQ, 8, 9, 11, SKIP | EXT, 0, _ClearToken}, + {FNAME("token") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoHashedToken_token}, +}; + +static const struct field_t _CryptoToken_cryptoPwdEncr[] = { /* SEQUENCE */ + {FNAME("algorithmOID") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("paramS") SEQ, 2, 2, 3, SKIP | EXT, 0, _Params}, + {FNAME("encryptedData") OCTSTR, SEMI, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _CryptoToken[] = { /* CHOICE */ + {FNAME("cryptoEncryptedToken") SEQ, 0, 2, 2, SKIP, 0, + _CryptoToken_cryptoEncryptedToken}, + {FNAME("cryptoSignedToken") SEQ, 0, 2, 2, SKIP, 0, + _CryptoToken_cryptoSignedToken}, + {FNAME("cryptoHashedToken") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoHashedToken}, + {FNAME("cryptoPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoToken_cryptoPwdEncr}, +}; + +static const struct field_t _CryptoH323Token[] = { /* CHOICE */ + {FNAME("cryptoEPPwdHash") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdHash}, + {FNAME("cryptoGKPwdHash") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdHash}, + {FNAME("cryptoEPPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoEPPwdEncr}, + {FNAME("cryptoGKPwdEncr") SEQ, 0, 3, 3, SKIP, 0, + _CryptoH323Token_cryptoGKPwdEncr}, + {FNAME("cryptoEPCert") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoEPCert}, + {FNAME("cryptoGKCert") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoGKCert}, + {FNAME("cryptoFastStart") SEQ, 0, 4, 4, SKIP, 0, + _CryptoH323Token_cryptoFastStart}, + {FNAME("nestedcryptoToken") CHOICE, 2, 4, 4, SKIP | EXT, 0, + _CryptoToken}, +}; + +static const struct field_t _Progress_UUIE_cryptoTokens[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 8, 8, SKIP | EXT, 0, _CryptoH323Token}, +}; + +static const struct field_t _Progress_UUIE_fastStart[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 1, 3, 5, DECODE | OPEN | EXT, + sizeof(OpenLogicalChannel), _OpenLogicalChannel} + , +}; + +static const struct field_t _Progress_UUIE[] = { /* SEQUENCE */ + 
{FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQ, 6, 8, 10, SKIP | EXT, 0, + _EndpointType}, + {FNAME("h245Address") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Progress_UUIE, h245Address), _TransportAddress}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, SKIP | EXT, 0, + _CallIdentifier}, + {FNAME("h245SecurityMode") CHOICE, 2, 4, 4, SKIP | EXT | OPT, 0, + _H245Security}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Progress_UUIE_tokens}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _Progress_UUIE_cryptoTokens}, + {FNAME("fastStart") SEQOF, SEMI, 0, 30, DECODE | OPT, + offsetof(Progress_UUIE, fastStart), _Progress_UUIE_fastStart}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("fastConnectRefused") NUL, FIXD, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H323_UU_PDU_h323_message_body[] = { /* CHOICE */ + {FNAME("setup") SEQ, 7, 13, 39, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, setup), _Setup_UUIE}, + {FNAME("callProceeding") SEQ, 1, 3, 12, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, callProceeding), + _CallProceeding_UUIE}, + {FNAME("connect") SEQ, 1, 4, 19, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, connect), _Connect_UUIE}, + {FNAME("alerting") SEQ, 1, 3, 17, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, alerting), _Alerting_UUIE}, + {FNAME("information") SEQ, 0, 1, 7, SKIP | EXT, 0, _Information_UUIE}, + {FNAME("releaseComplete") SEQ, 1, 2, 11, SKIP | EXT, 0, + _ReleaseComplete_UUIE}, + {FNAME("facility") SEQ, 3, 5, 21, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, facility), _Facility_UUIE}, + {FNAME("progress") SEQ, 5, 8, 11, DECODE | EXT, + offsetof(H323_UU_PDU_h323_message_body, progress), _Progress_UUIE}, + {FNAME("empty") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("status") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("statusInquiry") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("setupAcknowledge") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, + {FNAME("notify") SEQ, 2, 4, 4, SKIP | EXT, 0, NULL}, +}; + +static const struct field_t _RequestMessage[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("masterSlaveDetermination") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("terminalCapabilitySet") SEQ, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("openLogicalChannel") SEQ, 1, 3, 5, DECODE | EXT, + offsetof(RequestMessage, openLogicalChannel), _OpenLogicalChannel}, + {FNAME("closeLogicalChannel") SEQ, 0, 2, 3, STOP | EXT, 0, NULL}, + {FNAME("requestChannelClose") SEQ, 0, 1, 3, STOP | EXT, 0, NULL}, + {FNAME("multiplexEntrySend") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntry") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestMode") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("roundTripDelayRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopRequest") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("communicationModeRequest") SEQ, 0, 0, 0, STOP | EXT, 0, NULL}, + {FNAME("conferenceRequest") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL}, + {FNAME("multilinkRequest") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("logicalChannelRateRequest") SEQ, 0, 3, 3, STOP | EXT, 0, + NULL}, +}; + +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters[] = { /* CHOICE */ + {FNAME("h222LogicalChannelParameters") SEQ, 3, 5, 5, SKIP | EXT, 0, + 
_H222LogicalChannelParameters}, + {FNAME("h2250LogicalChannelParameters") SEQ, 10, 11, 14, DECODE | EXT, + offsetof + (OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters, + h2250LogicalChannelParameters), _H2250LogicalChannelParameters}, +}; + +static const struct field_t _OpenLogicalChannelAck_reverseLogicalChannelParameters[] = { /* SEQUENCE */ + {FNAME("reverseLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("multiplexParameters") CHOICE, 0, 1, 2, DECODE | EXT | OPT, + offsetof(OpenLogicalChannelAck_reverseLogicalChannelParameters, + multiplexParameters), + _OpenLogicalChannelAck_reverseLogicalChannelParameters_multiplexParameters}, + {FNAME("replacementFor") INT, WORD, 1, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _H2250LogicalChannelAckParameters_nonStandard[] = { /* SEQUENCE OF */ + {FNAME("item") SEQ, 0, 2, 2, SKIP, 0, _H245_NonStandardParameter}, +}; + +static const struct field_t _H2250LogicalChannelAckParameters[] = { /* SEQUENCE */ + {FNAME("nonStandard") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _H2250LogicalChannelAckParameters_nonStandard}, + {FNAME("sessionID") INT, 8, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("mediaChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelAckParameters, mediaChannel), + _H245_TransportAddress}, + {FNAME("mediaControlChannel") CHOICE, 1, 2, 2, DECODE | EXT | OPT, + offsetof(H2250LogicalChannelAckParameters, mediaControlChannel), + _H245_TransportAddress}, + {FNAME("dynamicRTPPayloadType") INT, 5, 96, 0, SKIP | OPT, 0, NULL}, + {FNAME("flowControlToZero") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("portNumber") INT, WORD, 0, 0, SKIP | OPT, 0, NULL}, +}; + +static const struct field_t _OpenLogicalChannelAck_forwardMultiplexAckParameters[] = { /* CHOICE */ + {FNAME("h2250LogicalChannelAckParameters") SEQ, 5, 5, 7, DECODE | EXT, + offsetof(OpenLogicalChannelAck_forwardMultiplexAckParameters, + h2250LogicalChannelAckParameters), + _H2250LogicalChannelAckParameters}, +}; + +static const struct field_t _OpenLogicalChannelAck[] = { /* SEQUENCE */ + {FNAME("forwardLogicalChannelNumber") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("reverseLogicalChannelParameters") SEQ, 2, 3, 4, + DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, + reverseLogicalChannelParameters), + _OpenLogicalChannelAck_reverseLogicalChannelParameters}, + {FNAME("separateStack") SEQ, 2, 4, 5, DECODE | EXT | OPT, + offsetof(OpenLogicalChannelAck, separateStack), + _NetworkAccessParameters}, + {FNAME("forwardMultiplexAckParameters") CHOICE, 0, 1, 1, + DECODE | EXT | OPT, offsetof(OpenLogicalChannelAck, + forwardMultiplexAckParameters), + _OpenLogicalChannelAck_forwardMultiplexAckParameters}, + {FNAME("encryptionSync") SEQ, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _ResponseMessage[] = { /* CHOICE */ + {FNAME("nonStandard") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("masterSlaveDeterminationAck") SEQ, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("masterSlaveDeterminationReject") SEQ, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("terminalCapabilitySetAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("terminalCapabilitySetReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("openLogicalChannelAck") SEQ, 1, 2, 5, DECODE | EXT, + offsetof(ResponseMessage, openLogicalChannelAck), + _OpenLogicalChannelAck}, + {FNAME("openLogicalChannelReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("closeLogicalChannelAck") SEQ, 0, 1, 1, STOP | 
EXT, 0, NULL}, + {FNAME("requestChannelCloseAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestChannelCloseReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("multiplexEntrySendAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("multiplexEntrySendReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntryAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("requestMultiplexEntryReject") SEQ, 0, 2, 2, STOP | EXT, 0, + NULL}, + {FNAME("requestModeAck") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("requestModeReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("roundTripDelayResponse") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopAck") SEQ, 0, 1, 1, STOP | EXT, 0, NULL}, + {FNAME("maintenanceLoopReject") SEQ, 0, 2, 2, STOP | EXT, 0, NULL}, + {FNAME("communicationModeResponse") CHOICE, 0, 1, 1, STOP | EXT, 0, + NULL}, + {FNAME("conferenceResponse") CHOICE, 3, 8, 16, STOP | EXT, 0, NULL}, + {FNAME("multilinkResponse") CHOICE, 3, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("logicalChannelRateAcknowledge") SEQ, 0, 3, 3, STOP | EXT, 0, + NULL}, + {FNAME("logicalChannelRateReject") SEQ, 1, 4, 4, STOP | EXT, 0, NULL}, +}; + +static const struct field_t _MultimediaSystemControlMessage[] = { /* CHOICE */ + {FNAME("request") CHOICE, 4, 11, 15, DECODE | EXT, + offsetof(MultimediaSystemControlMessage, request), _RequestMessage}, + {FNAME("response") CHOICE, 5, 19, 24, DECODE | EXT, + offsetof(MultimediaSystemControlMessage, response), + _ResponseMessage}, + {FNAME("command") CHOICE, 3, 7, 12, STOP | EXT, 0, NULL}, + {FNAME("indication") CHOICE, 4, 14, 23, STOP | EXT, 0, NULL}, +}; + +static const struct field_t _H323_UU_PDU_h245Control[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 2, 4, 4, DECODE | OPEN | EXT, + sizeof(MultimediaSystemControlMessage), + _MultimediaSystemControlMessage} + , +}; + +static const struct field_t _H323_UU_PDU[] = { /* SEQUENCE */ + {FNAME("h323-message-body") CHOICE, 3, 7, 13, DECODE | EXT, + offsetof(H323_UU_PDU, h323_message_body), + _H323_UU_PDU_h323_message_body}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("h4501SupplementaryService") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + NULL}, + {FNAME("h245Tunneling") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("h245Control") SEQOF, SEMI, 0, 4, DECODE | OPT, + offsetof(H323_UU_PDU, h245Control), _H323_UU_PDU_h245Control}, + {FNAME("nonStandardControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("tunnelledSignallingMessage") SEQ, 2, 4, 4, STOP | EXT | OPT, + 0, NULL}, + {FNAME("provisionalRespToH245Tunneling") NUL, FIXD, 0, 0, STOP | OPT, + 0, NULL}, + {FNAME("stimulusControl") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _H323_UserInformation[] = { /* SEQUENCE */ + {FNAME("h323-uu-pdu") SEQ, 1, 2, 11, DECODE | EXT, + offsetof(H323_UserInformation, h323_uu_pdu), _H323_UU_PDU}, + {FNAME("user-data") SEQ, 0, 2, 2, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(GatekeeperRequest, rasAddress), _TransportAddress}, + {FNAME("endpointType") SEQ, 6, 8, 10, STOP | EXT, 0, 
NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("authenticationCapability") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("algorithmOIDs") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _GatekeeperConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(GatekeeperConfirm, rasAddress), _TransportAddress}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("authenticationMode") CHOICE, 3, 7, 8, STOP | EXT | OPT, 0, + NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("algorithmOID") OID, BYTE, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrity") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RegistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationRequest_rasAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationRequest_terminalAlias[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _RegistrationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("discoveryComplete") BOOL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationRequest, callSignalAddress), + _RegistrationRequest_callSignalAddress}, + {FNAME("rasAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationRequest, rasAddress), + _RegistrationRequest_rasAddress}, + {FNAME("terminalType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _RegistrationRequest_terminalAlias}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("endpointVendor") SEQ, 2, 3, 3, SKIP | EXT, 0, + _VendorIdentifier}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("timeToLive") INT, 
CONS, 1, 0, DECODE | OPT, + offsetof(RegistrationRequest, timeToLive), NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("keepAlive") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("additiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportsAltGK") NUL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("usageReportingCapability") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, + NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportedH248Packages") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("callCreditCapability") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("capacityReportingCapability") SEQ, 0, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RegistrationConfirm_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _RegistrationConfirm_terminalAlias[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _RegistrationConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(RegistrationConfirm, callSignalAddress), + _RegistrationConfirm_callSignalAddress}, + {FNAME("terminalAlias") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _RegistrationConfirm_terminalAlias}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, NULL}, + {FNAME("timeToLive") INT, CONS, 1, 0, DECODE | OPT, + offsetof(RegistrationConfirm, timeToLive), NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("preGrantedARQ") SEQ, 0, 4, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("maintainConnection") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportsAdditiveRegistration") NUL, FIXD, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("terminalAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureServerAlias") CHOICE, 1, 2, 7, STOP | EXT | OPT, 0, + NULL}, + {FNAME("capacityReportingSpec") SEQ, 0, 1, 1, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP 
| EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _UnregistrationRequest_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _UnregistrationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(UnregistrationRequest, callSignalAddress), + _UnregistrationRequest_callSignalAddress}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("reason") CHOICE, 2, 4, 5, STOP | EXT | OPT, 0, NULL}, + {FNAME("endpointAliasPattern") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("supportedPrefixes") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateGatekeeper") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _CallModel[] = { /* CHOICE */ + {FNAME("direct") NUL, FIXD, 0, 0, SKIP, 0, NULL}, + {FNAME("gatekeeperRouted") NUL, FIXD, 0, 0, SKIP, 0, NULL}, +}; + +static const struct field_t _AdmissionRequest_destinationInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest_destExtraCallInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest_srcInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _AdmissionRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callType") CHOICE, 2, 4, 4, SKIP | EXT, 0, _CallType}, + {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT | OPT, 0, _CallModel}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _AdmissionRequest_destinationInfo}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(AdmissionRequest, destCallSignalAddress), + _TransportAddress}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, + _AdmissionRequest_destExtraCallInfo}, + {FNAME("srcInfo") SEQOF, SEMI, 0, 0, SKIP, 0, + _AdmissionRequest_srcInfo}, + {FNAME("srcCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(AdmissionRequest, srcCallSignalAddress), _TransportAddress}, + {FNAME("bandWidth") INT, CONS, 0, 0, STOP, 0, NULL}, + {FNAME("callReferenceValue") INT, WORD, 0, 0, STOP, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("callServices") SEQ, 0, 8, 8, STOP | EXT | OPT, 0, NULL}, + {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, STOP, 0, NULL}, + {FNAME("activeMC") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("answerCall") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("callIdentifier") SEQ, 0, 1, 1, STOP | 
EXT, 0, NULL}, + {FNAME("srcAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destAlternatives") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("willSupplyUUIEs") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("callLinkage") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("gatewayDataRate") SEQ, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _AdmissionConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("bandWidth") INT, CONS, 0, 0, SKIP, 0, NULL}, + {FNAME("callModel") CHOICE, 1, 2, 2, SKIP | EXT, 0, _CallModel}, + {FNAME("destCallSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(AdmissionConfirm, destCallSignalAddress), + _TransportAddress}, + {FNAME("irrFrequency") INT, WORD, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("transportQOS") CHOICE, 2, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("willRespondToIRR") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("uuiesRequested") SEQ, 0, 9, 13, STOP | EXT, 0, NULL}, + {FNAME("language") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("useSpecifiedTransport") CHOICE, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("usageSpec") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _LocationRequest_destinationInfo[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 1, 2, 7, SKIP | EXT, 0, _AliasAddress}, +}; + +static const struct field_t _LocationRequest[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, SKIP, 0, + _LocationRequest_destinationInfo}, + {FNAME("nonStandardData") SEQ, 0, 
2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("replyAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(LocationRequest, replyAddress), _TransportAddress}, + {FNAME("sourceInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("canMapAlias") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("gatekeeperIdentifier") BMPSTR, 7, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("desiredProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("desiredTunnelledProtocol") SEQ, 1, 2, 2, STOP | EXT | OPT, 0, + NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("hopCount") INT, 8, 1, 0, STOP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, +}; + +static const struct field_t _LocationConfirm[] = { /* SEQUENCE */ + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("callSignalAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(LocationConfirm, callSignalAddress), _TransportAddress}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(LocationConfirm, rasAddress), _TransportAddress}, + {FNAME("nonStandardData") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("destinationInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destExtraCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("destinationType") SEQ, 6, 8, 10, STOP | EXT | OPT, 0, NULL}, + {FNAME("remoteExtensionAddress") SEQOF, SEMI, 0, 0, STOP | OPT, 0, + NULL}, + {FNAME("alternateEndpoints") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("alternateTransportAddresses") SEQ, 1, 1, 1, STOP | EXT | OPT, + 0, NULL}, + {FNAME("supportedProtocols") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("multipleCalls") BOOL, FIXD, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("featureSet") SEQ, 3, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("circuitInfo") SEQ, 3, 3, 3, STOP | EXT | OPT, 0, NULL}, + {FNAME("serviceControl") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _InfoRequestResponse_callSignalAddress[] = { /* SEQUENCE OF */ + {FNAME("item") CHOICE, 3, 7, 7, DECODE | EXT, + sizeof(TransportAddress), _TransportAddress} + , +}; + +static const struct field_t _InfoRequestResponse[] = { /* SEQUENCE */ + {FNAME("nonStandardData") SEQ, 0, 2, 2, SKIP | OPT, 0, + _NonStandardParameter}, + {FNAME("requestSeqNum") INT, WORD, 1, 0, SKIP, 0, NULL}, + {FNAME("endpointType") SEQ, 6, 8, 10, SKIP | EXT, 0, _EndpointType}, + {FNAME("endpointIdentifier") BMPSTR, 7, 1, 0, SKIP, 0, NULL}, + {FNAME("rasAddress") CHOICE, 3, 7, 7, DECODE | EXT, + offsetof(InfoRequestResponse, rasAddress), _TransportAddress}, + {FNAME("callSignalAddress") SEQOF, SEMI, 0, 10, DECODE, + offsetof(InfoRequestResponse, callSignalAddress), + _InfoRequestResponse_callSignalAddress}, + {FNAME("endpointAlias") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("perCallInfo") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("tokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("cryptoTokens") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, + {FNAME("integrityCheckValue") 
SEQ, 0, 2, 2, STOP | OPT, 0, NULL}, + {FNAME("needResponse") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("capacity") SEQ, 2, 2, 2, STOP | EXT | OPT, 0, NULL}, + {FNAME("irrStatus") CHOICE, 2, 4, 4, STOP | EXT | OPT, 0, NULL}, + {FNAME("unsolicited") BOOL, FIXD, 0, 0, STOP, 0, NULL}, + {FNAME("genericData") SEQOF, SEMI, 0, 0, STOP | OPT, 0, NULL}, +}; + +static const struct field_t _RasMessage[] = { /* CHOICE */ + {FNAME("gatekeeperRequest") SEQ, 4, 8, 18, DECODE | EXT, + offsetof(RasMessage, gatekeeperRequest), _GatekeeperRequest}, + {FNAME("gatekeeperConfirm") SEQ, 2, 5, 14, DECODE | EXT, + offsetof(RasMessage, gatekeeperConfirm), _GatekeeperConfirm}, + {FNAME("gatekeeperReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL}, + {FNAME("registrationRequest") SEQ, 3, 10, 31, DECODE | EXT, + offsetof(RasMessage, registrationRequest), _RegistrationRequest}, + {FNAME("registrationConfirm") SEQ, 3, 7, 24, DECODE | EXT, + offsetof(RasMessage, registrationConfirm), _RegistrationConfirm}, + {FNAME("registrationReject") SEQ, 2, 5, 11, STOP | EXT, 0, NULL}, + {FNAME("unregistrationRequest") SEQ, 3, 5, 15, DECODE | EXT, + offsetof(RasMessage, unregistrationRequest), _UnregistrationRequest}, + {FNAME("unregistrationConfirm") SEQ, 1, 2, 6, STOP | EXT, 0, NULL}, + {FNAME("unregistrationReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("admissionRequest") SEQ, 7, 16, 34, DECODE | EXT, + offsetof(RasMessage, admissionRequest), _AdmissionRequest}, + {FNAME("admissionConfirm") SEQ, 2, 6, 27, DECODE | EXT, + offsetof(RasMessage, admissionConfirm), _AdmissionConfirm}, + {FNAME("admissionReject") SEQ, 1, 3, 11, STOP | EXT, 0, NULL}, + {FNAME("bandwidthRequest") SEQ, 2, 7, 18, STOP | EXT, 0, NULL}, + {FNAME("bandwidthConfirm") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("bandwidthReject") SEQ, 1, 4, 9, STOP | EXT, 0, NULL}, + {FNAME("disengageRequest") SEQ, 1, 6, 19, STOP | EXT, 0, NULL}, + {FNAME("disengageConfirm") SEQ, 1, 2, 9, STOP | EXT, 0, NULL}, + {FNAME("disengageReject") SEQ, 1, 3, 8, STOP | EXT, 0, NULL}, + {FNAME("locationRequest") SEQ, 2, 5, 17, DECODE | EXT, + offsetof(RasMessage, locationRequest), _LocationRequest}, + {FNAME("locationConfirm") SEQ, 1, 4, 19, DECODE | EXT, + offsetof(RasMessage, locationConfirm), _LocationConfirm}, + {FNAME("locationReject") SEQ, 1, 3, 10, STOP | EXT, 0, NULL}, + {FNAME("infoRequest") SEQ, 2, 4, 15, STOP | EXT, 0, NULL}, + {FNAME("infoRequestResponse") SEQ, 3, 8, 16, DECODE | EXT, + offsetof(RasMessage, infoRequestResponse), _InfoRequestResponse}, + {FNAME("nonStandardMessage") SEQ, 0, 2, 7, STOP | EXT, 0, NULL}, + {FNAME("unknownMessageResponse") SEQ, 0, 1, 5, STOP | EXT, 0, NULL}, + {FNAME("requestInProgress") SEQ, 4, 6, 6, STOP | EXT, 0, NULL}, + {FNAME("resourcesAvailableIndicate") SEQ, 4, 9, 11, STOP | EXT, 0, + NULL}, + {FNAME("resourcesAvailableConfirm") SEQ, 4, 6, 7, STOP | EXT, 0, + NULL}, + {FNAME("infoRequestAck") SEQ, 4, 5, 5, STOP | EXT, 0, NULL}, + {FNAME("infoRequestNak") SEQ, 5, 7, 7, STOP | EXT, 0, NULL}, + {FNAME("serviceControlIndication") SEQ, 8, 10, 10, STOP | EXT, 0, + NULL}, + {FNAME("serviceControlResponse") SEQ, 7, 8, 8, STOP | EXT, 0, NULL}, +}; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c new file mode 100644 index 00000000..436b7cb7 --- /dev/null +++ b/net/netfilter/nf_conntrack_helper.c @@ -0,0 +1,342 @@ +/* Helper handling for netfilter. 
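+ *
+ * A helper parses the payload of a tracked control connection (FTP,
+ * IRC, the H.323 channels decoded by the tables above, ...) and sets
+ * up the expectations that let conntrack and NAT pick up the related
+ * data connections.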
*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/random.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rculist.h> +#include <linux/rtnetlink.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> + +static DEFINE_MUTEX(nf_ct_helper_mutex); +static struct hlist_head *nf_ct_helper_hash __read_mostly; +static unsigned int nf_ct_helper_hsize __read_mostly; +static unsigned int nf_ct_helper_count __read_mostly; + + +/* Stupid hash, but collision free for the default registrations of the + * helpers currently in the kernel. */ +static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) +{ + return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ + (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; +} + +static struct nf_conntrack_helper * +__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) +{ + struct nf_conntrack_helper *helper; + struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; + struct hlist_node *n; + unsigned int h; + + if (!nf_ct_helper_count) + return NULL; + + h = helper_hash(tuple); + hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) + return helper; + } + return NULL; +} + +struct nf_conntrack_helper * +__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + struct hlist_node *n; + unsigned int i; + + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(h->name, name) && + h->tuple.src.l3num == l3num && + h->tuple.dst.protonum == protonum) + return h; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); + +struct nf_conntrack_helper * +nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) +{ + struct nf_conntrack_helper *h; + + h = __nf_conntrack_helper_find(name, l3num, protonum); +#ifdef CONFIG_MODULES + if (h == NULL) { + if (request_module("nfct-helper-%s", name) == 0) + h = __nf_conntrack_helper_find(name, l3num, protonum); + } +#endif + if (h != NULL && !try_module_get(h->me)) + h = NULL; + + return h; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); + +struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) +{ + struct nf_conn_help *help; + + help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); + if (help) + INIT_HLIST_HEAD(&help->expectations); + else + pr_debug("failed to add helper extension area"); + return help; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); + +int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, + gfp_t flags) +{ + struct nf_conntrack_helper *helper = NULL; + struct nf_conn_help *help; + int ret = 0; + + if (tmpl != NULL) { + help = nfct_help(tmpl); + if 
(help != NULL) + helper = help->helper; + } + + help = nfct_help(ct); + if (helper == NULL) + helper = __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + if (helper == NULL) { + if (help) + RCU_INIT_POINTER(help->helper, NULL); + goto out; + } + + if (help == NULL) { + help = nf_ct_helper_ext_add(ct, flags); + if (help == NULL) { + ret = -ENOMEM; + goto out; + } + } else { + memset(&help->help, 0, sizeof(help->help)); + } + + rcu_assign_pointer(help->helper, helper); +out: + return ret; +} +EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); + +static inline int unhelp(struct nf_conntrack_tuple_hash *i, + const struct nf_conntrack_helper *me) +{ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); + struct nf_conn_help *help = nfct_help(ct); + + if (help && rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me) { + nf_conntrack_event(IPCT_HELPER, ct); + RCU_INIT_POINTER(help->helper, NULL); + } + return 0; +} + +void nf_ct_helper_destroy(struct nf_conn *ct) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; + + if (help) { + rcu_read_lock(); + helper = rcu_dereference(help->helper); + if (helper && helper->destroy) + helper->destroy(ct); + rcu_read_unlock(); + } +} + +static LIST_HEAD(nf_ct_helper_expectfn_list); + +void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_lock); + list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); + +void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) +{ + spin_lock_bh(&nf_conntrack_lock); + list_del_rcu(&n->head); + spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_name(const char *name) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (!strcmp(cur->name, name)) { + found = true; + break; + } + } + rcu_read_unlock(); + return found ? cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); + +struct nf_ct_helper_expectfn * +nf_ct_helper_expectfn_find_by_symbol(const void *symbol) +{ + struct nf_ct_helper_expectfn *cur; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { + if (cur->expectfn == symbol) { + found = true; + break; + } + } + rcu_read_unlock(); + return found ? 
cur : NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); + +int nf_conntrack_helper_register(struct nf_conntrack_helper *me) +{ + unsigned int h = helper_hash(&me->tuple); + + BUG_ON(me->expect_policy == NULL); + BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); + BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + + mutex_lock(&nf_ct_helper_mutex); + hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); + nf_ct_helper_count++; + mutex_unlock(&nf_ct_helper_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); + +static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, + struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_expect *exp; + const struct hlist_node *n, *next; + const struct hlist_nulls_node *nn; + unsigned int i; + + /* Get rid of expectations */ + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], hnode) { + struct nf_conn_help *help = nfct_help(exp->master); + if ((rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me || exp->helper == me) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } + } + + /* Get rid of expecteds, set helpers to NULL. */ + hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) + unhelp(h, me); + for (i = 0; i < net->ct.htable_size; i++) { + hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + unhelp(h, me); + } +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + struct net *net; + + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure nothing else is still using the helper unless it's a + * connection in the hash. + */ + synchronize_rcu(); + + rtnl_lock(); + spin_lock_bh(&nf_conntrack_lock); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); + spin_unlock_bh(&nf_conntrack_lock); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); + +static struct nf_ct_ext_type helper_extend __read_mostly = { + .len = sizeof(struct nf_conn_help), + .align = __alignof__(struct nf_conn_help), + .id = NF_CT_EXT_HELPER, +}; + +int nf_conntrack_helper_init(void) +{ + int err; + + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ + nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); + if (!nf_ct_helper_hash) + return -ENOMEM; + + err = nf_ct_extend_register(&helper_extend); + if (err < 0) + goto err1; + + return 0; + +err1: + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); + return err; +} + +void nf_conntrack_helper_fini(void) +{ + nf_ct_extend_unregister(&helper_extend); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); +} diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c new file mode 100644 index 00000000..4f9390b9 --- /dev/null +++ b/net/netfilter/nf_conntrack_irc.c @@ -0,0 +1,291 @@ +/* IRC extension for IP connection tracking, Version 1.21 + * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> + * based on RR's ip_conntrack_ftp.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/netfilter.h> +#include <linux/slab.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_irc.h> + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +static unsigned int max_dcc_channels = 8; +static unsigned int dcc_timeout __read_mostly = 300; +/* This is slow, but it's simple. --RR */ +static char *irc_buffer; +static DEFINE_SPINLOCK(irc_buffer_lock); + +unsigned int (*nf_nat_irc_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_irc_hook); + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_irc"); +MODULE_ALIAS_NFCT_HELPER("irc"); + +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of IRC servers"); +module_param(max_dcc_channels, uint, 0400); +MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per " + "IRC session"); +module_param(dcc_timeout, uint, 0400); +MODULE_PARM_DESC(dcc_timeout, "timeout (in seconds) for unestablished DCC channels"); + +static const char *const dccprotos[] = { + "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " +}; + +#define MINMATCHLEN 5 + +/* tries to get the ip_addr and port out of a dcc command + * return value: -1 on failure, 0 on success + * data pointer to first byte of DCC command data + * data_end pointer to last byte of dcc command data + * ip returns parsed ip of dcc command + * port returns parsed port of dcc command + * ad_beg_p returns pointer to first byte of addr data + * ad_end_p returns pointer to last byte of addr data + */ +static int parse_dcc(char *data, const char *data_end, __be32 *ip, + u_int16_t *port, char **ad_beg_p, char **ad_end_p) +{ + char *tmp; + + /* at least 12: "AAAAAAAA P\1\n" */ + while (*data++ != ' ') + if (data > data_end - 12) + return -1; + + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + + *ad_beg_p = data; + *ip = cpu_to_be32(simple_strtoul(data, &data, 10)); + + /* skip blanks between ip and port */ + while (*data == ' ') { + if (data >= data_end) + return -1; + data++; + } + + *port = simple_strtoul(data, &data, 10); + *ad_end_p = data; + + return 0; +} + +static int help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff; + const struct iphdr *iph; + const struct tcphdr *th; + struct tcphdr _tcph; + const char *data_limit; + char *data, *ib_ptr; + int dir = CTINFO2DIR(ctinfo); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + __be32 dcc_ip; + u_int16_t dcc_port; + __be16 port; + int i, ret = NF_ACCEPT; + char *addr_beg_p, *addr_end_p; + typeof(nf_nat_irc_hook) nf_nat_irc; + + /* If packet is coming from IRC server */ + if (dir == IP_CT_DIR_REPLY) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets.
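+ * Only IP_CT_ESTABLISHED / IP_CT_ESTABLISHED_REPLY pass the check
+ * below, i.e. the connection has already seen a reply.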
*/ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* Not a full tcp header? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? */ + dataoff = protoff + th->doff*4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + spin_lock_bh(&irc_buffer_lock); + ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, + irc_buffer); + BUG_ON(ib_ptr == NULL); + + data = ib_ptr; + data_limit = ib_ptr + skb->len - dataoff; + + /* strlen("\1DCC SENT t AAAAAAAA P\1\n") = 24, i.e. + * 5 ("\1DCC ") + MINMATCHLEN + strlen("t AAAAAAAA P\1\n") (= 14), + * hence the (19 + MINMATCHLEN) minimum below */ + while (data < data_limit - (19 + MINMATCHLEN)) { + if (memcmp(data, "\1DCC ", 5)) { + data++; + continue; + } + data += 5; + /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + + iph = ip_hdr(skb); + pr_debug("DCC found in master %pI4:%u %pI4:%u\n", + &iph->saddr, ntohs(th->source), + &iph->daddr, ntohs(th->dest)); + + for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { + if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { + /* no match */ + continue; + } + data += strlen(dccprotos[i]); + pr_debug("DCC %s detected\n", dccprotos[i]); + + /* we have at least + * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc(data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { + pr_debug("unable to parse dcc command\n"); + continue; + } + + pr_debug("DCC bound ip/port: %pI4:%u\n", + &dcc_ip, dcc_port); + + /* dcc_ip can be the internal OR external (NAT'ed) IP */ + tuple = &ct->tuplehash[dir].tuple; + if (tuple->src.u3.ip != dcc_ip && + tuple->dst.u3.ip != dcc_ip) { + if (net_ratelimit()) + printk(KERN_WARNING + "Forged DCC command from %pI4: %pI4:%u\n", + &tuple->src.u3.ip, + &dcc_ip, dcc_port); + continue; + } + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + tuple = &ct->tuplehash[!dir].tuple; + port = htons(dcc_port); + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + tuple->src.l3num, + NULL, &tuple->dst.u3, + IPPROTO_TCP, NULL, &port); + + nf_nat_irc = rcu_dereference(nf_nat_irc_hook); + if (nf_nat_irc && ct->status & IPS_NAT_MASK) + ret = nf_nat_irc(skb, ctinfo, + addr_beg_p - ib_ptr, + addr_end_p - addr_beg_p, + exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + goto out; + } + } + out: + spin_unlock_bh(&irc_buffer_lock); + return ret; +} + +static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; +static char irc_names[MAX_PORTS][sizeof("irc-65535")] __read_mostly; +static struct nf_conntrack_expect_policy irc_exp_policy; + +static void nf_conntrack_irc_fini(void); + +static int __init nf_conntrack_irc_init(void) +{ + int i, ret; + char *tmpname; + + if (max_dcc_channels < 1) { + printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n"); + return -EINVAL; + } + + irc_exp_policy.max_expected = max_dcc_channels; + irc_exp_policy.timeout = dcc_timeout; + + irc_buffer = kmalloc(65536, GFP_KERNEL); + if (!irc_buffer) + return -ENOMEM; + + /* If no port given, default to standard irc port */ + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; + + for (i = 0; i < ports_c; i++) { + irc[i].tuple.src.l3num = AF_INET; + irc[i].tuple.src.u.tcp.port = htons(ports[i]); + irc[i].tuple.dst.protonum = IPPROTO_TCP; + irc[i].expect_policy = &irc_exp_policy; + irc[i].me = THIS_MODULE; + irc[i].help = help; + + tmpname = &irc_names[i][0]; + if (ports[i] == IRC_PORT) + sprintf(tmpname, "irc"); + else + sprintf(tmpname, "irc-%u", ports[i]); + irc[i].name = tmpname; + + ret = nf_conntrack_helper_register(&irc[i]); + if (ret) { + printk(KERN_ERR "nf_ct_irc: failed to register helper " + "for pf: %u port: %u\n", + irc[i].tuple.src.l3num, ports[i]); + nf_conntrack_irc_fini(); + return ret; + } + } + return 0; +} + +/* This function is intentionally _NOT_ defined as __exit, because + * it is needed by the init function */ +static void nf_conntrack_irc_fini(void) +{ + int i; + + for (i = 0; i < ports_c; i++) + nf_conntrack_helper_unregister(&irc[i]); + kfree(irc_buffer); +} + +module_init(nf_conntrack_irc_init); +module_exit(nf_conntrack_irc_fini); diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c new file mode 100644 index 00000000..e7eb807f --- /dev/null +++ b/net/netfilter/nf_conntrack_l3proto_generic.c @@ -0,0 +1,74 @@ +/* + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * Based largely upon the original ip_conntrack code which + * had the following copyright information: + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + */ + +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <linux/sysctl.h> +#include <net/ip.h> + +#include <linux/netfilter_ipv4.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> + +static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return true; +} + +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memset(&tuple->src.u3, 0, sizeof(tuple->src.u3)); + memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3)); + + return true; +} + +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) +{ + /* Never track !!!
*/ + return -NF_ACCEPT; +} + + +struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = { + .l3proto = PF_UNSPEC, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .get_l4proto = generic_get_l4proto, +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic); diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c new file mode 100644 index 00000000..4c8f30a3 --- /dev/null +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -0,0 +1,71 @@ +/* + * NetBIOS name service broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* + * This helper tracks locally originating NetBIOS name service + * requests by issuing permanent expectations (valid until + * timing out) matching all reply connections from the + * destination network. The only NetBIOS specific thing is + * actually the port number. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/in.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +#define NMBD_PORT 137 + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("NetBIOS name service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_netbios_ns"); +MODULE_ALIAS_NFCT_HELPER("netbios_ns"); + +static unsigned int timeout __read_mostly = 3; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); +} + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "netbios-ns", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = netbios_ns_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_netbios_ns_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_netbios_ns_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_netbios_ns_init); +module_exit(nf_conntrack_netbios_ns_fini); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c new file mode 100644 index 00000000..ca7e8354 --- /dev/null +++ b/net/netfilter/nf_conntrack_netlink.c @@ -0,0 +1,2425 @@ +/* Connection tracking via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist <jschlst@samba.org> + * (C) 2002-2006 by Harald Welte <laforge@gnumonks.org> + * (C) 2003 by Patrick McHardy <kaber@trash.net> + * (C) 2005-2011 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc.
(www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/rculist.h> +#include <linux/rculist_nulls.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/security.h> +#include <linux/skbuff.h> +#include <linux/errno.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#ifdef CONFIG_NF_NAT_NEEDED +#include <net/netfilter/nf_nat_core.h> +#include <net/netfilter/nf_nat_protocol.h> +#endif + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.93"; + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l4proto *l4proto) +{ + int ret = 0; + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_PROTO | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + NLA_PUT_U8(skb, CTA_PROTO_NUM, tuple->dst.protonum); + + if (likely(l4proto->tuple_to_nlattr)) + ret = l4proto->tuple_to_nlattr(skb, tuple); + + nla_nest_end(skb, nest_parms); + + return ret; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples_ip(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto) +{ + int ret = 0; + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_IP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (likely(l3proto->tuple_to_nlattr)) + ret = l3proto->tuple_to_nlattr(skb, tuple); + + nla_nest_end(skb, nest_parms); + + return ret; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + int ret; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); + + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); + } + rcu_read_unlock(); + return ret; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_STATUS, htonl(ct->status)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +{ + long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; + + if (timeout < 0) + timeout = 0; + + NLA_PUT_BE32(skb, CTA_TIMEOUT, htonl(timeout)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int 
+ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nf_conntrack_l4proto *l4proto; + struct nlattr *nest_proto; + int ret; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (!l4proto->to_nlattr) + return 0; + + nest_proto = nla_nest_start(skb, CTA_PROTOINFO | NLA_F_NESTED); + if (!nest_proto) + goto nla_put_failure; + + ret = l4proto->to_nlattr(skb, nest_proto, ct); + + nla_nest_end(skb, nest_proto); + + return ret; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_helper; + const struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_helper *helper; + + if (!help) + return 0; + + helper = rcu_dereference(help->helper); + if (!helper) + goto out; + + nest_helper = nla_nest_start(skb, CTA_HELP | NLA_F_NESTED); + if (!nest_helper) + goto nla_put_failure; + NLA_PUT_STRING(skb, CTA_HELP_NAME, helper->name); + + if (helper->to_nlattr) + helper->to_nlattr(skb, ct); + + nla_nest_end(skb, nest_helper); +out: + return 0; + +nla_put_failure: + return -1; +} + +static int +dump_counters(struct sk_buff *skb, u64 pkts, u64 bytes, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nlattr *nest_count; + + nest_count = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)); + NLA_PUT_BE64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes)); + + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_dump_counters(struct sk_buff *skb, const struct nf_conn *ct, + enum ip_conntrack_dir dir, int type) +{ + struct nf_conn_counter *acct; + u64 pkts, bytes; + + acct = nf_conn_acct_find(ct); + if (!acct) + return 0; + + if (type == IPCTNL_MSG_CT_GET_CTRZERO) { + pkts = atomic64_xchg(&acct[dir].packets, 0); + bytes = atomic64_xchg(&acct[dir].bytes, 0); + } else { + pkts = atomic64_read(&acct[dir].packets); + bytes = atomic64_read(&acct[dir].bytes); + } + return dump_counters(skb, pkts, bytes, dir); +} + +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_count; + const struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (!tstamp) + return 0; + + nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)); + if (tstamp->stop != 0) { + NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)); + } + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_MARK, htonl(ct->mark)); + return 0; + +nla_put_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static inline int +ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_secctx; + int len, ret; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = -1; + nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED); + if (!nest_secctx) + goto nla_put_failure; + + NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx); + nla_nest_end(skb, nest_secctx); + + ret = 0; +nla_put_failure: 
+ security_release_secctx(secctx, len); + return ret; +} +#else +#define ctnetlink_dump_secctx(a, b) (0) +#endif + +#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) + +static inline int +ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + if (!(ct->status & IPS_EXPECTED)) + return 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_MASTER | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, master_tuple(ct)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +dump_nat_seq_adj(struct sk_buff *skb, const struct nf_nat_seq *natseq, int type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_BE32(skb, CTA_NAT_SEQ_CORRECTION_POS, + htonl(natseq->correction_pos)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_BEFORE, + htonl(natseq->offset_before)); + NLA_PUT_BE32(skb, CTA_NAT_SEQ_OFFSET_AFTER, + htonl(natseq->offset_after)); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_nat_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nf_nat_seq *natseq; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!(ct->status & IPS_SEQ_ADJUST) || !nat) + return 0; + + natseq = &nat->seq[IP_CT_DIR_ORIGINAL]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_ORIG) == -1) + return -1; + + natseq = &nat->seq[IP_CT_DIR_REPLY]; + if (dump_nat_seq_adj(skb, natseq, CTA_NAT_SEQ_ADJ_REPLY) == -1) + return -1; + + return 0; +} +#else +#define ctnetlink_dump_nat_seq_adj(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct)); + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +{ + NLA_PUT_BE32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use))); + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, + struct nf_conn *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; + unsigned int flags = pid ? 
NLM_F_MULTI : 0, event; + + event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) + NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL, type) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_secctx(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0 || + ctnetlink_dump_master(skb, ct) < 0 || + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static inline size_t +ctnetlink_proto_size(const struct nf_conn *ct) +{ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + size_t len = 0; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + len += l3proto->nla_size; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + len += l4proto->nla_size; + rcu_read_unlock(); + + return len; +} + +static inline size_t +ctnetlink_counters_size(const struct nf_conn *ct) +{ + if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) + return 0; + return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ + ; +} + +static inline int +ctnetlink_secctx_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_SECMARK + int len, ret; + + ret = security_secid_to_secctx(ct->secmark, NULL, &len); + if (ret) + return 0; + + return nla_total_size(0) /* CTA_SECCTX */ + + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + return 0; + return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else + return 0; +#endif +} + +static inline size_t +ctnetlink_nlmsg_size(const struct nf_conn *ct) +{ + return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ + + 3 * nla_total_size(0) /* CTA_TUPLE_IP */ + + 3 * nla_total_size(0) /* CTA_TUPLE_PROTO */ + + 3 * nla_total_size(sizeof(u_int8_t)) /* CTA_PROTO_NUM */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + + ctnetlink_counters_size(ct) + + ctnetlink_timestamp_size(ct) + + nla_total_size(sizeof(u_int32_t)) 
/* CTA_TIMEOUT */ + + nla_total_size(0) /* CTA_PROTOINFO */ + + nla_total_size(0) /* CTA_HELP */ + + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + + ctnetlink_secctx_size(ct) +#ifdef CONFIG_NF_NAT_NEEDED + + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ +#endif + + ctnetlink_proto_size(ct) + ; +} + +static int +ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) +{ + struct net *net; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; + struct nf_conn *ct = item->ct; + struct sk_buff *skb; + unsigned int type; + unsigned int flags = 0, group; + int err; + + /* ignore our fake conntrack entry */ + if (nf_ct_is_untracked(ct)) + return 0; + + if (events & (1 << IPCT_DESTROY)) { + type = IPCTNL_MSG_CT_DELETE; + group = NFNLGRP_CONNTRACK_DESTROY; + } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_NEW; + } else if (events) { + type = IPCTNL_MSG_CT_NEW; + group = NFNLGRP_CONNTRACK_UPDATE; + } else + return 0; + + net = nf_ct_net(ct); + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; + + skb = nlmsg_new(ctnetlink_nlmsg_size(ct), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = nf_ct_l3num(ct); + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + rcu_read_lock(); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + if (nf_ct_zone(ct)) + NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct))); + + if (ctnetlink_dump_id(skb, ct) < 0) + goto nla_put_failure; + + if (ctnetlink_dump_status(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_DESTROY)) { + if (ctnetlink_dump_counters(skb, ct, + IP_CT_DIR_ORIGINAL, type) < 0 || + ctnetlink_dump_counters(skb, ct, + IP_CT_DIR_REPLY, type) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) + goto nla_put_failure; + } else { + if (ctnetlink_dump_timeout(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_PROTOINFO) + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nla_put_failure; + + if ((events & (1 << IPCT_HELPER) || nfct_help(ct)) + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if ((events & (1 << IPCT_SECMARK) || ct->secmark) + && ctnetlink_dump_secctx(skb, ct) < 0) + goto nla_put_failure; +#endif + + if (events & (1 << IPCT_RELATED) && + ctnetlink_dump_master(skb, ct) < 0) + goto nla_put_failure; + + if (events & (1 << IPCT_NATSEQADJ) && + ctnetlink_dump_nat_seq_adj(skb, ct) < 0) + goto nla_put_failure; + } + +#ifdef CONFIG_NF_CONNTRACK_MARK + if ((events & (1 << IPCT_MARK) || ct->mark) + && ctnetlink_dump_mark(skb, ct) < 0) + goto nla_put_failure; +#endif + rcu_read_unlock(); + + nlmsg_end(skb, 
nlh); + err = nfnetlink_send(skb, net, item->pid, group, item->report, + GFP_ATOMIC); + if (err == -ENOBUFS || err == -EAGAIN) + return -ENOBUFS; + + return 0; + +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); +errout: + if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0) + return -ENOBUFS; + + return 0; +} +#endif /* CONFIG_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); + if (cb->data) + kfree(cb->data); + return 0; +} + +struct ctnetlink_dump_filter { + struct { + u_int32_t val; + u_int32_t mask; + } mark; +}; + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nf_conn *ct, *last; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u_int8_t l3proto = nfmsg->nfgen_family; + int res; +#ifdef CONFIG_NF_CONNTRACK_MARK + const struct ctnetlink_dump_filter *filter = cb->data; +#endif + + spin_lock_bh(&nf_conntrack_lock); + last = (struct nf_conn *)cb->args[1]; + for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { +restart: + hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], + hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = nf_ct_tuplehash_to_ctrack(h); + /* Dump entries of a given L3 protocol number. + * If it is not specified, ie. l3proto == 0, + * then dump everything. */ + if (l3proto && nf_ct_l3num(ct) != l3proto) + continue; + if (cb->args[1]) { + if (ct != last) + continue; + cb->args[1] = 0; + } +#ifdef CONFIG_NF_CONNTRACK_MARK + if (filter && !((ct->mark & filter->mark.mask) == + filter->mark.val)) { + continue; + } +#endif + rcu_read_lock(); + res = + ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + ct); + rcu_read_unlock(); + if (res < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + } +out: + spin_unlock_bh(&nf_conntrack_lock); + if (last) + nf_ct_put(last); + + return skb->len; +} + +static inline int +ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) +{ + struct nlattr *tb[CTA_IP_MAX+1]; + struct nf_conntrack_l3proto *l3proto; + int ret = 0; + + nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + + if (likely(l3proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_IP_MAX, + l3proto->nla_policy); + if (ret == 0) + ret = l3proto->nlattr_to_tuple(tb, tuple); + } + + rcu_read_unlock(); + + return ret; +} + +static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_NUM] = { .type = NLA_U8 }, +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) +{ + struct nlattr *tb[CTA_PROTO_MAX+1]; + struct nf_conntrack_l4proto *l4proto; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy); + if (ret < 0) + return ret; + + if (!tb[CTA_PROTO_NUM]) + return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); + + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + + if (likely(l4proto->nlattr_to_tuple)) { + ret = nla_validate_nested(attr, CTA_PROTO_MAX, + l4proto->nla_policy); + if (ret == 0) + ret = 
l4proto->nlattr_to_tuple(tb, tuple); + } + + rcu_read_unlock(); + + return ret; +} + +static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { + [CTA_TUPLE_IP] = { .type = NLA_NESTED }, + [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, +}; + +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + enum ctattr_type type, u_int8_t l3num) +{ + struct nlattr *tb[CTA_TUPLE_MAX+1]; + int err; + + memset(tuple, 0, sizeof(*tuple)); + + nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; + + tuple->src.l3num = l3num; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return 0; +} + +static int +ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) +{ + if (attr) +#ifdef CONFIG_NF_CONNTRACK_ZONES + *zone = ntohs(nla_get_be16(attr)); +#else + return -EOPNOTSUPP; +#endif + else + *zone = 0; + + return 0; +} + +static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { + [CTA_HELP_NAME] = { .type = NLA_NUL_STRING }, +}; + +static inline int +ctnetlink_parse_help(const struct nlattr *attr, char **helper_name) +{ + struct nlattr *tb[CTA_HELP_MAX+1]; + + nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); + + if (!tb[CTA_HELP_NAME]) + return -EINVAL; + + *helper_name = nla_data(tb[CTA_HELP_NAME]); + + return 0; +} + +static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { + [CTA_TUPLE_ORIG] = { .type = NLA_NESTED }, + [CTA_TUPLE_REPLY] = { .type = NLA_NESTED }, + [CTA_STATUS] = { .type = NLA_U32 }, + [CTA_PROTOINFO] = { .type = NLA_NESTED }, + [CTA_HELP] = { .type = NLA_NESTED }, + [CTA_NAT_SRC] = { .type = NLA_NESTED }, + [CTA_TIMEOUT] = { .type = NLA_U32 }, + [CTA_MARK] = { .type = NLA_U32 }, + [CTA_ID] = { .type = NLA_U32 }, + [CTA_NAT_DST] = { .type = NLA_NESTED }, + [CTA_TUPLE_MASTER] = { .type = NLA_NESTED }, + [CTA_ZONE] = { .type = NLA_U16 }, + [CTA_MARK_MASK] = { .type = NLA_U32 }, +}; + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else { + /* Flush the whole table */ + nf_conntrack_flush_report(net, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + return 0; + } + + if (err < 0) + return err; + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + + if (cda[CTA_ID]) { + u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); + if (id != (u32)(unsigned long)ct) { + nf_ct_put(ct); + return -ENOENT; + } + } + + if (del_timer(&ct->timeout)) { + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)) < 0) { + 
nf_ct_delete_from_lists(ct); + /* we failed to report the event, try later */ + nf_ct_insert_dying_list(ct); + nf_ct_put(ct); + return 0; + } + /* death_by_timeout would report the event again */ + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); + nf_ct_put(ct); + } + nf_ct_put(ct); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + struct sk_buff *skb2 = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_dump_table, + .done = ctnetlink_done, + }; +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + struct ctnetlink_dump_filter *filter; + + filter = kzalloc(sizeof(struct ctnetlink_dump_filter), + GFP_ATOMIC); + if (filter == NULL) + return -ENOMEM; + + filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); + filter->mark.mask = + ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + c.data = filter; + } +#endif + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + else if (cda[CTA_TUPLE_REPLY]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return -ENOENT; + + ct = nf_ct_tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + nf_ct_put(ct); + return -ENOMEM; + } + + rcu_read_lock(); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), ct); + rcu_read_unlock(); + nf_ct_put(ct); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + return 0; + +free: + kfree_skb(skb2); +out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? 
-ENOBUFS : err; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static int +ctnetlink_parse_nat_setup(struct nf_conn *ct, + enum nf_nat_manip_type manip, + const struct nlattr *attr) +{ + typeof(nfnetlink_parse_nat_setup_hook) parse_nat_setup; + + parse_nat_setup = rcu_dereference(nfnetlink_parse_nat_setup_hook); + if (!parse_nat_setup) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + nfnl_unlock(); + if (request_module("nf-nat-ipv4") < 0) { + nfnl_lock(); + rcu_read_lock(); + return -EOPNOTSUPP; + } + nfnl_lock(); + rcu_read_lock(); + if (nfnetlink_parse_nat_setup_hook) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + return parse_nat_setup(ct, manip, attr); +} +#endif + +static int +ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + unsigned long d; + unsigned int status = ntohl(nla_get_be32(cda[CTA_STATUS])); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EBUSY; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EBUSY; + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EBUSY; + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * nf_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + +static int +ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_NAT_NEEDED + int ret; + + if (cda[CTA_NAT_DST]) { + ret = ctnetlink_parse_nat_setup(ct, + NF_NAT_MANIP_DST, + cda[CTA_NAT_DST]); + if (ret < 0) + return ret; + } + if (cda[CTA_NAT_SRC]) { + ret = ctnetlink_parse_nat_setup(ct, + NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); + if (ret < 0) + return ret; + } + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static inline int +ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help = nfct_help(ct); + char *helpname = NULL; + int err; + + /* don't change helper of sibling connections */ + if (ct->master) + return -EBUSY; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + if (err < 0) + return err; + + if (!strcmp(helpname, "")) { + if (help && help->helper) { + /* we had a helper before ... 
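+ * an empty CTA_HELP_NAME means detach it: flush the
+ * expectations the old helper created, then clear it.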
*/ + nf_ct_remove_expectations(ct); + RCU_INIT_POINTER(help->helper, NULL); + } + + return 0; + } + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + spin_unlock_bh(&nf_conntrack_lock); + + if (request_module("nfct-helper-%s", helpname) < 0) { + spin_lock_bh(&nf_conntrack_lock); + return -EOPNOTSUPP; + } + + spin_lock_bh(&nf_conntrack_lock); + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) + return -EAGAIN; +#endif + return -EOPNOTSUPP; + } + + if (help) { + if (help->helper == helper) + return 0; + if (help->helper) + return -EBUSY; + /* need to zero data of old helper */ + memset(&help->help, 0, sizeof(help->help)); + } else { + /* we cannot set a helper for an existing conntrack */ + return -EOPNOTSUPP; + } + + rcu_assign_pointer(help->helper, helper); + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { + [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, + [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, +}; + +static inline int +ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) +{ + const struct nlattr *attr = cda[CTA_PROTOINFO]; + struct nlattr *tb[CTA_PROTOINFO_MAX+1]; + struct nf_conntrack_l4proto *l4proto; + int err = 0; + + nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + + rcu_read_lock(); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->from_nlattr) + err = l4proto->from_nlattr(tb, ct); + rcu_read_unlock(); + + return err; +} + +#ifdef CONFIG_NF_NAT_NEEDED +static const struct nla_policy nat_seq_policy[CTA_NAT_SEQ_MAX+1] = { + [CTA_NAT_SEQ_CORRECTION_POS] = { .type = NLA_U32 }, + [CTA_NAT_SEQ_OFFSET_BEFORE] = { .type = NLA_U32 }, + [CTA_NAT_SEQ_OFFSET_AFTER] = { .type = NLA_U32 }, +}; + +static inline int +change_nat_seq_adj(struct nf_nat_seq *natseq, const struct nlattr * const attr) +{ + struct nlattr *cda[CTA_NAT_SEQ_MAX+1]; + + nla_parse_nested(cda, CTA_NAT_SEQ_MAX, attr, nat_seq_policy); + + if (!cda[CTA_NAT_SEQ_CORRECTION_POS]) + return -EINVAL; + + natseq->correction_pos = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_CORRECTION_POS])); + + if (!cda[CTA_NAT_SEQ_OFFSET_BEFORE]) + return -EINVAL; + + natseq->offset_before = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_BEFORE])); + + if (!cda[CTA_NAT_SEQ_OFFSET_AFTER]) + return -EINVAL; + + natseq->offset_after = + ntohl(nla_get_be32(cda[CTA_NAT_SEQ_OFFSET_AFTER])); + + return 0; +} + +static int +ctnetlink_change_nat_seq_adj(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + int ret = 0; + struct nf_conn_nat *nat = nfct_nat(ct); + + if (!nat) + return 0; + + if (cda[CTA_NAT_SEQ_ADJ_ORIG]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_ORIGINAL], + cda[CTA_NAT_SEQ_ADJ_ORIG]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + if (cda[CTA_NAT_SEQ_ADJ_REPLY]) { + ret = change_nat_seq_adj(&nat->seq[IP_CT_DIR_REPLY], + cda[CTA_NAT_SEQ_ADJ_REPLY]); + if (ret < 0) + return ret; + + ct->status |= IPS_SEQ_ADJUST; + } + + return 0; +} +#endif + +static int +ctnetlink_change_conntrack(struct nf_conn 
*ct, + const struct nlattr * const cda[]) +{ + int err; + + /* only allow NAT changes and master assignment for new conntracks */ + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST] || cda[CTA_TUPLE_MASTER]) + return -EOPNOTSUPP; + + if (cda[CTA_HELP]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_PROTOINFO]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + return err; + } + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +#endif + +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) + return err; + } +#endif + + return 0; +} + +static struct nf_conn * +ctnetlink_create_conntrack(struct net *net, u16 zone, + const struct nlattr * const cda[], + struct nf_conntrack_tuple *otuple, + struct nf_conntrack_tuple *rtuple, + u8 u3) +{ + struct nf_conn *ct; + int err = -EINVAL; + struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; + + ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); + if (IS_ERR(ct)) + return ERR_PTR(-ENOMEM); + + if (!cda[CTA_TIMEOUT]) + goto err1; + ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + + rcu_read_lock(); + if (cda[CTA_HELP]) { + char *helpname = NULL; + + err = ctnetlink_parse_help(cda[CTA_HELP], &helpname); + if (err < 0) + goto err2; + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { + rcu_read_unlock(); +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto err1; + } + + rcu_read_lock(); + helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper) { + err = -EAGAIN; + goto err2; + } + rcu_read_unlock(); +#endif + err = -EOPNOTSUPP; + goto err1; + } else { + struct nf_conn_help *help; + + help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); + if (help == NULL) { + err = -ENOMEM; + goto err2; + } + + /* not in hash table yet so not strictly necessary */ + RCU_INIT_POINTER(help->helper, helper); + } + } else { + /* try an implicit helper assignment */ + err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); + if (err < 0) + goto err2; + } + + if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) { + err = ctnetlink_change_nat(ct, cda); + if (err < 0) + goto err2; + } + + nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); + nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); + /* we must add conntrack extensions before confirmation.
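+ * (nf_ct_ext_add() must not be used on a confirmed conntrack, so the
+ * acct/tstamp/ecache areas are attached before IPS_CONFIRMED is set).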
*/ + ct->status |= IPS_CONFIRMED; + + if (cda[CTA_STATUS]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err2; + } + +#ifdef CONFIG_NF_NAT_NEEDED + if (cda[CTA_NAT_SEQ_ADJ_ORIG] || cda[CTA_NAT_SEQ_ADJ_REPLY]) { + err = ctnetlink_change_nat_seq_adj(ct, cda); + if (err < 0) + goto err2; + } +#endif + + memset(&ct->proto, 0, sizeof(ct->proto)); + if (cda[CTA_PROTOINFO]) { + err = ctnetlink_change_protoinfo(ct, cda); + if (err < 0) + goto err2; + } + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (cda[CTA_MARK]) + ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); +#endif + + /* setup master conntrack: this is a confirmed expectation */ + if (cda[CTA_TUPLE_MASTER]) { + struct nf_conntrack_tuple master; + struct nf_conntrack_tuple_hash *master_h; + struct nf_conn *master_ct; + + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + if (err < 0) + goto err2; + + master_h = nf_conntrack_find_get(net, zone, &master); + if (master_h == NULL) { + err = -ENOENT; + goto err2; + } + master_ct = nf_ct_tuplehash_to_ctrack(master_h); + __set_bit(IPS_EXPECTED_BIT, &ct->status); + ct->master = master_ct; + } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); + + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + + rcu_read_unlock(); + + return ct; + +err2: + rcu_read_unlock(); +err1: + nf_conntrack_free(ct); + return ERR_PTR(err); +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_ORIG]) + h = nf_conntrack_find_get(net, zone, &otuple); + else if (cda[CTA_TUPLE_REPLY]) + h = nf_conntrack_find_get(net, zone, &rtuple); + + if (h == NULL) { + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) { + enum ip_conntrack_events events; + + ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + &rtuple, u3); + if (IS_ERR(ct)) + return PTR_ERR(ct); + + err = 0; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + events = IPCT_RELATED; + else + events = IPCT_NEW; + + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK) | events, + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_put(ct); + } + + return err; + } + /* implicit 'else' */ + + err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + spin_lock_bh(&nf_conntrack_lock); + err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); + if (err == 0) { + nf_conntrack_eventmask_report((1 << IPCT_REPLY) | + (1 << IPCT_ASSURED) | + (1 << IPCT_HELPER) | + (1 << IPCT_PROTOINFO) | + (1 << IPCT_NATSEQADJ) | + (1 << IPCT_MARK), + ct, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + } + } + + nf_ct_put(ct); + return err; +} + +/*********************************************************************** + * 
EXPECT + ***********************************************************************/ + +static inline int +ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nlattr *nest_parms; + + nest_parms = nla_nest_start(skb, type | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_mask(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple_mask *mask) +{ + int ret; + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple m; + struct nlattr *nest_parms; + + memset(&m, 0xFF, sizeof(m)); + memcpy(&m.src.u3, &mask->src.u3, sizeof(m.src.u3)); + m.src.u.all = mask->src.u.all; + m.dst.protonum = tuple->dst.protonum; + + nest_parms = nla_nest_start(skb, CTA_EXPECT_MASK | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + rcu_read_lock(); + l3proto = __nf_ct_l3proto_find(tuple->src.l3num); + ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto); + if (ret >= 0) { + l4proto = __nf_ct_l4proto_find(tuple->src.l3num, + tuple->dst.protonum); + ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); + } + rcu_read_unlock(); + + if (unlikely(ret < 0)) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct nf_conntrack_expect *exp) +{ + struct nf_conn *master = exp->master; + long timeout = ((long)exp->timeout.expires - (long)jiffies) / HZ; + struct nf_conn_help *help; +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *nest_parms; + struct nf_conntrack_tuple nat_tuple = {}; +#endif + struct nf_ct_helper_expectfn *expfn; + + if (timeout < 0) + timeout = 0; + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nla_put_failure; + if (ctnetlink_exp_dump_mask(skb, &exp->tuple, &exp->mask) < 0) + goto nla_put_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nla_put_failure; + +#ifdef CONFIG_NF_NAT_NEEDED + if (exp->saved_ip || exp->saved_proto.all) { + nest_parms = nla_nest_start(skb, CTA_EXPECT_NAT | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_BE32(skb, CTA_EXPECT_NAT_DIR, htonl(exp->dir)); + + nat_tuple.src.l3num = nf_ct_l3num(master); + nat_tuple.src.u3.ip = exp->saved_ip; + nat_tuple.dst.protonum = nf_ct_protonum(master); + nat_tuple.src.u = exp->saved_proto; + + if (ctnetlink_exp_dump_tuple(skb, &nat_tuple, + CTA_EXPECT_NAT_TUPLE) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest_parms); + } +#endif + NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); + NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); + NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); + NLA_PUT_BE32(skb, CTA_EXPECT_CLASS, htonl(exp->class)); + help = nfct_help(master); + if (help) { + struct nf_conntrack_helper *helper; + + helper = rcu_dereference(help->helper); + if (helper) + NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + } + expfn = nf_ct_helper_expectfn_find_by_symbol(exp->expectfn); + if (expfn != NULL) + NLA_PUT_STRING(skb, CTA_EXPECT_FN, expfn->name); + + return 0; + +nla_put_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, 
const struct nf_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = pid ? NLM_F_MULTI : 0; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) +{ + struct nf_conntrack_expect *exp = item->exp; + struct net *net = nf_ct_exp_net(exp); + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *skb; + unsigned int type, group; + int flags = 0; + + if (events & (1 << IPEXP_DESTROY)) { + type = IPCTNL_MSG_EXP_DELETE; + group = NFNLGRP_CONNTRACK_EXP_DESTROY; + } else if (events & (1 << IPEXP_NEW)) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_EXP_NEW; + } else + return 0; + + if (!item->report && !nfnetlink_has_listeners(net, group)) + return 0; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (skb == NULL) + goto errout; + + type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = nlmsg_put(skb, item->pid, 0, type, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = exp->tuple.src.l3num; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + rcu_read_lock(); + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nla_put_failure; + rcu_read_unlock(); + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); + return 0; + +nla_put_failure: + rcu_read_unlock(); + nlmsg_cancel(skb, nlh); +nlmsg_failure: + kfree_skb(skb); +errout: + nfnetlink_set_err(net, 0, 0, -ENOBUFS); + return 0; +} +#endif +static int ctnetlink_exp_done(struct netlink_callback *cb) +{ + if (cb->args[1]) + nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); + return 0; +} + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nf_conntrack_expect *exp, *last; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct hlist_node *n; + u_int8_t l3proto = nfmsg->nfgen_family; + + rcu_read_lock(); + last = (struct nf_conntrack_expect *)cb->args[1]; + for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { +restart: + hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], + hnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; + if (cb->args[1]) { + if (exp != last) + continue; + cb->args[1] = 0; + } + if (ctnetlink_exp_fill_info(skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + exp) < 0) { + if (!atomic_inc_not_zero(&exp->use)) + continue; + cb->args[1] = (unsigned long)exp; + goto out; + } + } + if (cb->args[1]) { + cb->args[1] = 0; + goto restart; + } + } +out: + rcu_read_unlock(); + if (last) + nf_ct_expect_put(last); + + return skb->len; +} + +static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { + [CTA_EXPECT_MASTER] = { .type = NLA_NESTED }, + [CTA_EXPECT_TUPLE] = { .type = NLA_NESTED }, + [CTA_EXPECT_MASK] = { .type = NLA_NESTED }, + [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, + [CTA_EXPECT_ID] = { .type = NLA_U32 }, + 
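A note on the dump path above: netlink dumps span multiple reads, so ctnetlink_exp_dump_table() keeps its cursor in the callback scratch space. cb->args[0] is the current hash bucket, and cb->args[1] pins the last entry that did not fit in the skb (holding a reference taken with atomic_inc_not_zero(), dropped again in ctnetlink_exp_done()). A condensed sketch of the idiom, with hypothetical my_* names rather than the exact ctnetlink code (the bucket-restart handling for entries freed mid-dump is elided):

	/* Sketch of the netlink dump-continuation idiom; my_obj, my_hash,
	 * MY_HSIZE, my_obj_hold() and my_fill() are placeholders. */
	static int my_dump(struct sk_buff *skb, struct netlink_callback *cb)
	{
		struct my_obj *obj, *last = (struct my_obj *)cb->args[1];

		for (; cb->args[0] < MY_HSIZE; cb->args[0]++) {
			list_for_each_entry(obj, &my_hash[cb->args[0]], node) {
				if (last) {			/* resuming? */
					if (obj != last)	/* skip to cursor */
						continue;
					last = NULL;
					cb->args[1] = 0;
				}
				if (my_fill(skb, obj) < 0) {	/* skb full */
					my_obj_hold(obj);	/* pin cursor */
					cb->args[1] = (unsigned long)obj;
					goto out;
				}
			}
		}
	out:
		return skb->len;	/* 0 means the dump is finished */
	}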
[CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, + [CTA_EXPECT_CLASS] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT] = { .type = NLA_NESTED }, + [CTA_EXPECT_FN] = { .type = NLA_NUL_STRING }, +}; + +static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct sk_buff *skb2; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnetlink_exp_dump_table, + .done = ctnetlink_exp_done, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + if (cda[CTA_EXPECT_TUPLE]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + else if (cda[CTA_EXPECT_MASTER]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = nf_ct_expect_find_get(net, zone, &tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); + return -ENOENT; + } + } + + err = -ENOMEM; + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + nf_ct_expect_put(exp); + goto out; + } + + rcu_read_lock(); + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + rcu_read_unlock(); + nf_ct_expect_put(exp); + if (err <= 0) + goto free; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + return 0; + +free: + kfree_skb(skb2); +out: + /* this avoids a loop in nfnetlink. */ + return err == -EAGAIN ? -ENOBUFS : err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple tuple; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct hlist_node *n, *next; + u_int8_t u3 = nfmsg->nfgen_family; + unsigned int i; + u16 zone; + int err; + + if (cda[CTA_EXPECT_TUPLE]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = nf_ct_expect_find_get(net, zone, &tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID]) { + __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); + if (ntohl(id) != (u32)(unsigned long)exp) { + nf_ct_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + spin_lock_bh(&nf_conntrack_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_lock); + /* have to put what we 'get' above. 
+ * after this line usage count == 0 */ + nf_ct_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME]) { + char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); + struct nf_conn_help *m_help; + + /* delete all expectations for this helper */ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], + hnode) { + m_help = nfct_help(exp->master); + if (!strcmp(m_help->helper->name, name) && + del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + } + } + spin_unlock_bh(&nf_conntrack_lock); + } else { + /* This basically means we have to flush everything*/ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, + &net->ct.expect_hash[i], + hnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + } + } + spin_unlock_bh(&nf_conntrack_lock); + } + + return 0; +} +static int +ctnetlink_change_expect(struct nf_conntrack_expect *x, + const struct nlattr * const cda[]) +{ + return -EOPNOTSUPP; +} + +static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { + [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, + [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, +}; + +static int +ctnetlink_parse_expect_nat(const struct nlattr *attr, + struct nf_conntrack_expect *exp, + u_int8_t u3) +{ +#ifdef CONFIG_NF_NAT_NEEDED + struct nlattr *tb[CTA_EXPECT_NAT_MAX+1]; + struct nf_conntrack_tuple nat_tuple = {}; + int err; + + nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy); + + if (!tb[CTA_EXPECT_NAT_DIR] || !tb[CTA_EXPECT_NAT_TUPLE]) + return -EINVAL; + + err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, + &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + if (err < 0) + return err; + + exp->saved_ip = nat_tuple.src.u3.ip; + exp->saved_proto = nat_tuple.src.u; + exp->dir = ntohl(nla_get_be32(tb[CTA_EXPECT_NAT_DIR])); + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int +ctnetlink_create_expect(struct net *net, u16 zone, + const struct nlattr * const cda[], + u_int8_t u3, + u32 pid, int report) +{ + struct nf_conntrack_tuple tuple, mask, master_tuple; + struct nf_conntrack_tuple_hash *h = NULL; + struct nf_conntrack_expect *exp; + struct nf_conn *ct; + struct nf_conn_help *help; + struct nf_conntrack_helper *helper = NULL; + u_int32_t class = 0; + int err = 0; + + /* caller guarantees that those three CTA_EXPECT_* exist */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = nf_conntrack_find_get(net, zone, &master_tuple); + if (!h) + return -ENOENT; + ct = nf_ct_tuplehash_to_ctrack(h); + + /* Look for helper of this expectation */ + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), + nf_ct_protonum(ct)); + if (helper == NULL) { +#ifdef CONFIG_MODULES + if (request_module("nfct-helper-%s", helpname) < 0) { + err = -EOPNOTSUPP; + goto out; + } + + helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + nf_ct_protonum(ct)); + 
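The helper lookup above, together with the CONFIG_MODULES branch that follows it, is the usual autoload dance: nothing is locked across request_module(), so after a successful load the handler does not continue with the freshly found helper. It returns -EAGAIN and lets nfnetlink replay the whole request against the now-complete state. A condensed sketch of the idiom (find_helper is an invented wrapper, not a kernel symbol):

	/* Minimal sketch of the autoload-and-retry idiom used here. */
	static struct nf_conntrack_helper *
	find_helper(const char *name, u16 l3num, u8 protonum, int *err)
	{
		struct nf_conntrack_helper *helper;

		helper = __nf_conntrack_helper_find(name, l3num, protonum);
		if (helper != NULL)
			return helper;
	#ifdef CONFIG_MODULES
		if (request_module("nfct-helper-%s", name) == 0 &&
		    __nf_conntrack_helper_find(name, l3num, protonum)) {
			/* helper exists now: have nfnetlink replay the
			 * request instead of finishing this pass */
			*err = -EAGAIN;
			return NULL;
		}
	#endif
		*err = -EOPNOTSUPP;	/* no such helper */
		return NULL;
	}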
if (helper) { + err = -EAGAIN; + goto out; + } +#endif + err = -EOPNOTSUPP; + goto out; + } + } + + if (cda[CTA_EXPECT_CLASS] && helper) { + class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); + if (class > helper->expect_class_max) { + err = -EINVAL; + goto out; + } + } + exp = nf_ct_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + help = nfct_help(ct); + if (!help) { + if (!cda[CTA_EXPECT_TIMEOUT]) { + err = -EINVAL; + goto out; + } + exp->timeout.expires = + jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; + + exp->flags = NF_CT_EXPECT_USERSPACE; + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags |= + ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + } + } else { + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; + } else + exp->flags = 0; + } + if (cda[CTA_EXPECT_FN]) { + const char *name = nla_data(cda[CTA_EXPECT_FN]); + struct nf_ct_helper_expectfn *expfn; + + expfn = nf_ct_helper_expectfn_find_by_name(name); + if (expfn == NULL) { + err = -EINVAL; + goto err_out; + } + exp->expectfn = expfn->expectfn; + } else + exp->expectfn = NULL; + + exp->class = class; + exp->master = ct; + exp->helper = helper; + memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); + memcpy(&exp->mask.src.u3, &mask.src.u3, sizeof(exp->mask.src.u3)); + exp->mask.src.u.all = mask.src.u.all; + + if (cda[CTA_EXPECT_NAT]) { + err = ctnetlink_parse_expect_nat(cda[CTA_EXPECT_NAT], + exp, u3); + if (err < 0) + goto err_out; + } + err = nf_ct_expect_related_report(exp, pid, report); +err_out: + nf_ct_expect_put(exp); +out: + nf_ct_put(nf_ct_tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + struct net *net = sock_net(ctnl); + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + struct nfgenmsg *nfmsg = nlmsg_data(nlh); + u_int8_t u3 = nfmsg->nfgen_family; + u16 zone; + int err; + + if (!cda[CTA_EXPECT_TUPLE] + || !cda[CTA_EXPECT_MASK] + || !cda[CTA_EXPECT_MASTER]) + return -EINVAL; + + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + if (err < 0) + return err; + + spin_lock_bh(&nf_conntrack_lock); + exp = __nf_ct_expect_find(net, zone, &tuple); + + if (!exp) { + spin_unlock_bh(&nf_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(net, zone, cda, + u3, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + } + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + spin_unlock_bh(&nf_conntrack_lock); + + return err; +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, +}; + +static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, +}; +#endif + +static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, + .policy = ct_nla_policy }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, + 
.policy = ct_nla_policy }, +}; + +static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy }, +}; + +static const struct nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .cb = ctnl_cb, +}; + +static const struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .cb = ctnl_exp_cb, +}; + +MODULE_ALIAS("ip_conntrack_netlink"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); + +static int __net_init ctnetlink_net_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int ret; + + ret = nf_conntrack_register_notifier(net, &ctnl_notifier); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register notifier.\n"); + goto err_out; + } + + ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register expect notifier.\n"); + goto err_unreg_notifier; + } +#endif + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +err_unreg_notifier: + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +err_out: + return ret; +#endif +} + +static void ctnetlink_net_exit(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); + nf_conntrack_unregister_notifier(net, &ctnl_notifier); +#endif +} + +static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + ctnetlink_net_exit(net); +} + +static struct pernet_operations ctnetlink_net_ops = { + .init = ctnetlink_net_init, + .exit_batch = ctnetlink_net_exit_batch, +}; + +static int __init ctnetlink_init(void) +{ + int ret; + + pr_info("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + + ret = register_pernet_subsys(&ctnetlink_net_ops); + if (ret < 0) { + pr_err("ctnetlink_init: cannot register pernet operations\n"); + goto err_unreg_exp_subsys; + } + + return 0; + +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + pr_info("ctnetlink: unregistering from nfnetlink.\n"); + + unregister_pernet_subsys(&ctnetlink_net_ops); + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c new file mode 100644 index 00000000..31d56b23 --- /dev/null +++ b/net/netfilter/nf_conntrack_pptp.c @@ -0,0 +1,632 @@ +/* + * Connection tracking support for PPTP (Point to Point Tunneling Protocol).
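Before going further into nf_conntrack_pptp.c, one detail of the module above is worth spelling out: both subsystems share the single NETLINK_NETFILTER protocol, and nfnetlink demultiplexes by packing the subsystem ID into the high byte of nlmsg_type and the per-subsystem message into the low byte. That is why the fill/event code ORs NFNL_SUBSYS_CTNETLINK_EXP << 8 into the message type, and why ctnl_cb[]/ctnl_exp_cb[] are indexed by the low byte only. Schematically (NFNL_SUBSYS_ID and NFNL_MSG_TYPE come from <linux/netfilter/nfnetlink.h>):

	/* nlmsg_type layout on NETLINK_NETFILTER: [ subsys : 8 | message : 8 ] */
	u16 type   = (NFNL_SUBSYS_CTNETLINK_EXP << 8) | IPCTNL_MSG_EXP_NEW;
	u8  subsys = NFNL_SUBSYS_ID(type);   /* == NFNL_SUBSYS_CTNETLINK_EXP   */
	u8  msg    = NFNL_MSG_TYPE(type);    /* == IPCTNL_MSG_EXP_NEW; indexes
					      * the subsystem's cb[] table     */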
+ * PPTP is a protocol for creating virtual private networks. + * It is a specification defined by Microsoft and some vendors + * working with Microsoft. PPTP is built on top of a modified + * version of the Internet Generic Routing Encapsulation Protocol. + * GRE is defined in RFC 1701 and RFC 1702. Documentation of + * PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + * Limitations: + * - We blindly assume that control connections are always + * established in PNS->PAC direction. This is a violation + * of RFC 2637 + * - We can only support one single call within each session + * TODO: + * - testing of incoming PPTP calls + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/tcp.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_conntrack_proto_gre.h> +#include <linux/netfilter/nf_conntrack_pptp.h> + +#define NF_CT_PPTP_VERSION "3.1" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("Netfilter connection tracking helper module for PPTP"); +MODULE_ALIAS("ip_conntrack_pptp"); +MODULE_ALIAS_NFCT_HELPER("pptp"); + +static DEFINE_SPINLOCK(nf_pptp_lock); + +int +(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); + +int +(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); + +void +(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig, + struct nf_conntrack_expect *expect_reply) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre); + +void +(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); + +#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) +/* PptpControlMessageType names */ +const char *const pptp_msg_name[] = { + "UNKNOWN_MESSAGE", + "START_SESSION_REQUEST", + "START_SESSION_REPLY", + "STOP_SESSION_REQUEST", + "STOP_SESSION_REPLY", + "ECHO_REQUEST", + "ECHO_REPLY", + "OUT_CALL_REQUEST", + "OUT_CALL_REPLY", + "IN_CALL_REQUEST", + "IN_CALL_REPLY", + "IN_CALL_CONNECT", + "CALL_CLEAR_REQUEST", + "CALL_DISCONNECT_NOTIFY", + "WAN_ERROR_NOTIFY", + "SET_LINK_INFO" +}; +EXPORT_SYMBOL(pptp_msg_name); +#endif + +#define SECS *HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS + +#define PPTP_GRE_TIMEOUT (10 MINS) +#define PPTP_GRE_STREAM_TIMEOUT (5 HOURS) + +static void pptp_expectfn(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct net *net = nf_ct_net(ct); + typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; + pr_debug("increasing timeouts\n"); + + /* increase timeout of GRE data channel conntrack entry */ + ct->proto.gre.timeout = PPTP_GRE_TIMEOUT; + ct->proto.gre.stream_timeout = PPTP_GRE_STREAM_TIMEOUT; + + /* Can you see how rusty this code is, compared with the pre-2.6.11 + * one?
That's what happened to my shiny newnat of 2002 ;( -HW */ + + rcu_read_lock(); + nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn); + if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK) + nf_nat_pptp_expectfn(ct, exp); + else { + struct nf_conntrack_tuple inv_t; + struct nf_conntrack_expect *exp_other; + + /* obviously this tuple inversion only works until you do NAT */ + nf_ct_invert_tuplepr(&inv_t, &exp->tuple); + pr_debug("trying to unexpect other dir: "); + nf_ct_dump_tuple(&inv_t); + + exp_other = nf_ct_expect_find_get(net, nf_ct_zone(ct), &inv_t); + if (exp_other) { + /* delete other expectation. */ + pr_debug("found\n"); + nf_ct_unexpect_related(exp_other); + nf_ct_expect_put(exp_other); + } else { + pr_debug("not found\n"); + } + } + rcu_read_unlock(); +} + +static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, + const struct nf_conntrack_tuple *t) +{ + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_expect *exp; + struct nf_conn *sibling; + u16 zone = nf_ct_zone(ct); + + pr_debug("trying to timeout ct or exp for tuple "); + nf_ct_dump_tuple(t); + + h = nf_conntrack_find_get(net, zone, t); + if (h) { + sibling = nf_ct_tuplehash_to_ctrack(h); + pr_debug("setting timeout of conntrack %p to 0\n", sibling); + sibling->proto.gre.timeout = 0; + sibling->proto.gre.stream_timeout = 0; + if (del_timer(&sibling->timeout)) + sibling->timeout.function((unsigned long)sibling); + nf_ct_put(sibling); + return 1; + } else { + exp = nf_ct_expect_find_get(net, zone, t); + if (exp) { + pr_debug("unexpect_related of expect %p\n", exp); + nf_ct_unexpect_related(exp); + nf_ct_expect_put(exp); + return 1; + } + } + return 0; +} + +/* timeout GRE data connections */ +static void pptp_destroy_siblings(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + const struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_tuple t; + + nf_ct_gre_keymap_destroy(ct); + + /* try original (pns->pac) tuple */ + memcpy(&t, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, sizeof(t)); + t.dst.protonum = IPPROTO_GRE; + t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; + t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout original pns->pac ct/exp\n"); + + /* try reply (pac->pns) tuple */ + memcpy(&t, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, sizeof(t)); + t.dst.protonum = IPPROTO_GRE; + t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; + t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; + if (!destroy_sibling_or_exp(net, ct, &t)) + pr_debug("failed to timeout reply pac->pns ct/exp\n"); +} + +/* expect GRE connections (PNS->PAC and PAC->PNS direction) */ +static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) +{ + struct nf_conntrack_expect *exp_orig, *exp_reply; + enum ip_conntrack_dir dir; + int ret = 1; + typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre; + + exp_orig = nf_ct_expect_alloc(ct); + if (exp_orig == NULL) + goto out; + + exp_reply = nf_ct_expect_alloc(ct); + if (exp_reply == NULL) + goto out_put_orig; + + /* original direction, PNS->PAC */ + dir = IP_CT_DIR_ORIGINAL; + nf_ct_expect_init(exp_orig, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &peer_callid, &callid); + exp_orig->expectfn = pptp_expectfn; + + /* reply direction, PAC->PNS */ + dir = IP_CT_DIR_REPLY; + nf_ct_expect_init(exp_reply, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + 
&ct->tuplehash[dir].tuple.src.u3, + &ct->tuplehash[dir].tuple.dst.u3, + IPPROTO_GRE, &callid, &peer_callid); + exp_reply->expectfn = pptp_expectfn; + + nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); + if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) + nf_nat_pptp_exp_gre(exp_orig, exp_reply); + if (nf_ct_expect_related(exp_orig) != 0) + goto out_put_both; + if (nf_ct_expect_related(exp_reply) != 0) + goto out_unexpect_orig; + + /* Add GRE keymap entries */ + if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0) + goto out_unexpect_both; + if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) { + nf_ct_gre_keymap_destroy(ct); + goto out_unexpect_both; + } + ret = 0; + +out_put_both: + nf_ct_expect_put(exp_reply); +out_put_orig: + nf_ct_expect_put(exp_orig); +out: + return ret; + +out_unexpect_both: + nf_ct_unexpect_related(exp_reply); +out_unexpect_orig: + nf_ct_unexpect_related(exp_orig); + goto out_put_both; +} + +static inline int +pptp_inbound_pkt(struct sk_buff *skb, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq, + unsigned int reqlen, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + u_int16_t msg; + __be16 cid = 0, pcid = 0; + typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; + + msg = ntohs(ctlh->messageType); + pr_debug("inbound control message %s\n", pptp_msg_name[msg]); + + switch (msg) { + case PPTP_START_SESSION_REPLY: + /* server confirms new control session */ + if (info->sstate < PPTP_SESSION_REQUESTED) + goto invalid; + if (pptpReq->srep.resultCode == PPTP_START_OK) + info->sstate = PPTP_SESSION_CONFIRMED; + else + info->sstate = PPTP_SESSION_ERROR; + break; + + case PPTP_STOP_SESSION_REPLY: + /* server confirms end of control session */ + if (info->sstate > PPTP_SESSION_STOPREQ) + goto invalid; + if (pptpReq->strep.resultCode == PPTP_STOP_OK) + info->sstate = PPTP_SESSION_NONE; + else + info->sstate = PPTP_SESSION_ERROR; + break; + + case PPTP_OUT_CALL_REPLY: + /* server accepted call, we now expect GRE frames */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + if (info->cstate != PPTP_CALL_OUT_REQ && + info->cstate != PPTP_CALL_OUT_CONF) + goto invalid; + + cid = pptpReq->ocack.callID; + pcid = pptpReq->ocack.peersCallID; + if (info->pns_call_id != pcid) + goto invalid; + pr_debug("%s, CID=%X, PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); + + if (pptpReq->ocack.resultCode == PPTP_OUTCALL_CONNECT) { + info->cstate = PPTP_CALL_OUT_CONF; + info->pac_call_id = cid; + exp_gre(ct, cid, pcid); + } else + info->cstate = PPTP_CALL_NONE; + break; + + case PPTP_IN_CALL_REQUEST: + /* server tells us about incoming call request */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + + cid = pptpReq->icreq.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->cstate = PPTP_CALL_IN_REQ; + info->pac_call_id = cid; + break; + + case PPTP_IN_CALL_CONNECT: + /* server tells us about incoming call established */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + if (info->cstate != PPTP_CALL_IN_REP && + info->cstate != PPTP_CALL_IN_CONF) + goto invalid; + + pcid = pptpReq->iccon.peersCallID; + cid = info->pac_call_id; + + if (info->pns_call_id != pcid) + goto invalid; + + pr_debug("%s, PCID=%X\n", pptp_msg_name[msg], ntohs(pcid)); + info->cstate = PPTP_CALL_IN_CONF; + + /* we expect a GRE connection from PAC to PNS */ + exp_gre(ct, cid, pcid); + break; + + 
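To make the call-ID bookkeeping above concrete: PPTP's enhanced GRE carries the receiver's call ID in the key field, so one call needs two expectations with mirrored keys. Suppose the PNS requested a call with call ID 7 and the PAC's OUT_CALL_REPLY answered with call ID 9; exp_gre(ct, cid, pcid) then primes conntrack for roughly the following two flows (illustrative values only, addresses are copied from the control connection):

	struct nf_conntrack_tuple t_orig = {}, t_reply = {};

	/* original direction, PNS -> PAC: packets carry the PAC's call ID */
	t_orig.dst.protonum  = IPPROTO_GRE;
	t_orig.src.u.gre.key = htons(7);	/* peer_callid (PNS side) */
	t_orig.dst.u.gre.key = htons(9);	/* callid (PAC side)      */

	/* reply direction, PAC -> PNS: packets carry the PNS's call ID */
	t_reply.dst.protonum  = IPPROTO_GRE;
	t_reply.src.u.gre.key = htons(9);
	t_reply.dst.u.gre.key = htons(7);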
case PPTP_CALL_DISCONNECT_NOTIFY: + /* server confirms disconnect */ + cid = pptpReq->disc.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->cstate = PPTP_CALL_NONE; + + /* untrack this call id, unexpect GRE packets */ + pptp_destroy_siblings(ct); + break; + + case PPTP_WAN_ERROR_NOTIFY: + case PPTP_SET_LINK_INFO: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* I don't have to explain these ;) */ + break; + + default: + goto invalid; + } + + nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); + if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) + return nf_nat_pptp_inbound(skb, ct, ctinfo, ctlh, pptpReq); + return NF_ACCEPT; + +invalid: + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + return NF_ACCEPT; +} + +static inline int +pptp_outbound_pkt(struct sk_buff *skb, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq, + unsigned int reqlen, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + u_int16_t msg; + __be16 cid = 0, pcid = 0; + typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; + + msg = ntohs(ctlh->messageType); + pr_debug("outbound control message %s\n", pptp_msg_name[msg]); + + switch (msg) { + case PPTP_START_SESSION_REQUEST: + /* client requests for new control session */ + if (info->sstate != PPTP_SESSION_NONE) + goto invalid; + info->sstate = PPTP_SESSION_REQUESTED; + break; + + case PPTP_STOP_SESSION_REQUEST: + /* client requests end of control session */ + info->sstate = PPTP_SESSION_STOPREQ; + break; + + case PPTP_OUT_CALL_REQUEST: + /* client initiating connection to server */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + info->cstate = PPTP_CALL_OUT_REQ; + /* track PNS call id */ + cid = pptpReq->ocreq.callID; + pr_debug("%s, CID=%X\n", pptp_msg_name[msg], ntohs(cid)); + info->pns_call_id = cid; + break; + + case PPTP_IN_CALL_REPLY: + /* client answers incoming call */ + if (info->cstate != PPTP_CALL_IN_REQ && + info->cstate != PPTP_CALL_IN_REP) + goto invalid; + + cid = pptpReq->icack.callID; + pcid = pptpReq->icack.peersCallID; + if (info->pac_call_id != pcid) + goto invalid; + pr_debug("%s, CID=%X PCID=%X\n", pptp_msg_name[msg], + ntohs(cid), ntohs(pcid)); + + if (pptpReq->icack.resultCode == PPTP_INCALL_ACCEPT) { + /* part two of the three-way handshake */ + info->cstate = PPTP_CALL_IN_REP; + info->pns_call_id = cid; + } else + info->cstate = PPTP_CALL_NONE; + break; + + case PPTP_CALL_CLEAR_REQUEST: + /* client requests hangup of call */ + if (info->sstate != PPTP_SESSION_CONFIRMED) + goto invalid; + /* FUTURE: iterate over all calls and check if + * call ID is valid. 
We don't do this without newnat, + * because we only know about last call */ + info->cstate = PPTP_CALL_CLEAR_REQ; + break; + + case PPTP_SET_LINK_INFO: + case PPTP_ECHO_REQUEST: + case PPTP_ECHO_REPLY: + /* I don't have to explain these ;) */ + break; + + default: + goto invalid; + } + + nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); + if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) + return nf_nat_pptp_outbound(skb, ct, ctinfo, ctlh, pptpReq); + return NF_ACCEPT; + +invalid: + pr_debug("invalid %s: type=%d cid=%u pcid=%u " + "cstate=%d sstate=%d pns_cid=%u pac_cid=%u\n", + msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] : pptp_msg_name[0], + msg, ntohs(cid), ntohs(pcid), info->cstate, info->sstate, + ntohs(info->pns_call_id), ntohs(info->pac_call_id)); + return NF_ACCEPT; +} + +static const unsigned int pptp_msg_size[] = { + [PPTP_START_SESSION_REQUEST] = sizeof(struct PptpStartSessionRequest), + [PPTP_START_SESSION_REPLY] = sizeof(struct PptpStartSessionReply), + [PPTP_STOP_SESSION_REQUEST] = sizeof(struct PptpStopSessionRequest), + [PPTP_STOP_SESSION_REPLY] = sizeof(struct PptpStopSessionReply), + [PPTP_OUT_CALL_REQUEST] = sizeof(struct PptpOutCallRequest), + [PPTP_OUT_CALL_REPLY] = sizeof(struct PptpOutCallReply), + [PPTP_IN_CALL_REQUEST] = sizeof(struct PptpInCallRequest), + [PPTP_IN_CALL_REPLY] = sizeof(struct PptpInCallReply), + [PPTP_IN_CALL_CONNECT] = sizeof(struct PptpInCallConnected), + [PPTP_CALL_CLEAR_REQUEST] = sizeof(struct PptpClearCallRequest), + [PPTP_CALL_DISCONNECT_NOTIFY] = sizeof(struct PptpCallDisconnectNotify), + [PPTP_WAN_ERROR_NOTIFY] = sizeof(struct PptpWanErrorNotify), + [PPTP_SET_LINK_INFO] = sizeof(struct PptpSetLinkInfo), +}; + +/* track caller id inside control connection, call expect_related */ +static int +conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) + +{ + int dir = CTINFO2DIR(ctinfo); + const struct nf_ct_pptp_master *info = &nfct_help(ct)->help.ct_pptp_info; + const struct tcphdr *tcph; + struct tcphdr _tcph; + const struct pptp_pkt_hdr *pptph; + struct pptp_pkt_hdr _pptph; + struct PptpControlHeader _ctlh, *ctlh; + union pptp_ctrl_union _pptpReq, *pptpReq; + unsigned int tcplen = skb->len - protoff; + unsigned int datalen, reqlen, nexthdr_off; + int oldsstate, oldcstate; + int ret; + u_int16_t msg; + + /* don't do any tracking before tcp handshake complete */ + if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + nexthdr_off = protoff; + tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); + BUG_ON(!tcph); + nexthdr_off += tcph->doff * 4; + datalen = tcplen - tcph->doff * 4; + + pptph = skb_header_pointer(skb, nexthdr_off, sizeof(_pptph), &_pptph); + if (!pptph) { + pr_debug("no full PPTP header, can't track\n"); + return NF_ACCEPT; + } + nexthdr_off += sizeof(_pptph); + datalen -= sizeof(_pptph); + + /* if it's not a control message we can't do anything with it */ + if (ntohs(pptph->packetType) != PPTP_PACKET_CONTROL || + ntohl(pptph->magicCookie) != PPTP_MAGIC_COOKIE) { + pr_debug("not a control packet\n"); + return NF_ACCEPT; + } + + ctlh = skb_header_pointer(skb, nexthdr_off, sizeof(_ctlh), &_ctlh); + if (!ctlh) + return NF_ACCEPT; + nexthdr_off += sizeof(_ctlh); + datalen -= sizeof(_ctlh); + + reqlen = datalen; + msg = ntohs(ctlh->messageType); + if (msg > 0 && msg <= PPTP_MSG_MAX && reqlen < pptp_msg_size[msg]) + return NF_ACCEPT; + if (reqlen > sizeof(*pptpReq)) + reqlen = sizeof(*pptpReq); + + 
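The checks just above are the whole input-validation story for this helper: msg is range-checked before it may index pptp_msg_size[], a control message shorter than the fixed part of its type is left untracked rather than parsed, and reqlen is clamped to sizeof(union pptp_ctrl_union) so the copy below can never overflow the on-stack _pptpReq. The skb_header_pointer() contract that makes this safe is, in condensed sketch form:

	/* Condensed sketch of what skb_header_pointer() guarantees:
	 * len bytes at offset, either in place or copied into buffer. */
	static inline void *header_pointer_sketch(const struct sk_buff *skb,
						  int offset, int len,
						  void *buffer)
	{
		if (skb_headlen(skb) - offset >= len)
			return skb->data + offset;	/* linear, no copy */
		if (skb_copy_bits(skb, offset, buffer, len) < 0)
			return NULL;			/* packet too short */
		return buffer;				/* copied to caller */
	}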
pptpReq = skb_header_pointer(skb, nexthdr_off, reqlen, &_pptpReq); + if (!pptpReq) + return NF_ACCEPT; + + oldsstate = info->sstate; + oldcstate = info->cstate; + + spin_lock_bh(&nf_pptp_lock); + + /* FIXME: We just blindly assume that the control connection is always + * established from PNS->PAC. However, RFC makes no guarantee */ + if (dir == IP_CT_DIR_ORIGINAL) + /* client -> server (PNS -> PAC) */ + ret = pptp_outbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ctinfo); + else + /* server -> client (PAC -> PNS) */ + ret = pptp_inbound_pkt(skb, ctlh, pptpReq, reqlen, ct, + ctinfo); + pr_debug("sstate: %d->%d, cstate: %d->%d\n", + oldsstate, info->sstate, oldcstate, info->cstate); + spin_unlock_bh(&nf_pptp_lock); + + return ret; +} + +static const struct nf_conntrack_expect_policy pptp_exp_policy = { + .max_expected = 2, + .timeout = 5 * 60, +}; + +/* control protocol helper */ +static struct nf_conntrack_helper pptp __read_mostly = { + .name = "pptp", + .me = THIS_MODULE, + .tuple.src.l3num = AF_INET, + .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), + .tuple.dst.protonum = IPPROTO_TCP, + .help = conntrack_pptp_help, + .destroy = pptp_destroy_siblings, + .expect_policy = &pptp_exp_policy, +}; + +static void nf_conntrack_pptp_net_exit(struct net *net) +{ + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations nf_conntrack_pptp_net_ops = { + .exit = nf_conntrack_pptp_net_exit, +}; + +static int __init nf_conntrack_pptp_init(void) +{ + int rv; + + rv = nf_conntrack_helper_register(&pptp); + if (rv < 0) + return rv; + rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); + if (rv < 0) + nf_conntrack_helper_unregister(&pptp); + return rv; +} + +static void __exit nf_conntrack_pptp_fini(void) +{ + nf_conntrack_helper_unregister(&pptp); + unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); +} + +module_init(nf_conntrack_pptp_init); +module_exit(nf_conntrack_pptp_fini); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c new file mode 100644 index 00000000..be3da2c8 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto.c @@ -0,0 +1,405 @@ +/* L3/L4 protocol support for nf_conntrack. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
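Backing up to the pptp object registered above: it is also a template for what any conntrack helper needs, namely a tuple pattern that new connections are matched against (here TCP with destination port PPTP_CONTROL_PORT, 1723), a ->help callback run on packets of matching connections, and an expect_policy capping outstanding expectations. A minimal skeleton for a hypothetical helper on TCP port 1234, against this same era of the API (the my_* names are invented):

	static int my_help(struct sk_buff *skb, unsigned int protoff,
			   struct nf_conn *ct, enum ip_conntrack_info ctinfo)
	{
		/* parse the control protocol here and call
		 * nf_ct_expect_related() for announced data channels */
		return NF_ACCEPT;
	}

	static const struct nf_conntrack_expect_policy my_exp_policy = {
		.max_expected	= 1,
		.timeout	= 3 * 60,	/* seconds */
	};

	static struct nf_conntrack_helper my_helper __read_mostly = {
		.name			= "my-proto",
		.me			= THIS_MODULE,
		.tuple.src.l3num	= AF_INET,
		.tuple.src.u.tcp.port	= cpu_to_be16(1234),
		.tuple.dst.protonum	= IPPROTO_TCP,
		.help			= my_help,
		.expect_policy		= &my_exp_policy,
	};

	/* paired with nf_conntrack_helper_register(&my_helper) on init
	 * and nf_conntrack_helper_unregister(&my_helper) on exit */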
+ */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_core.h> + +static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly; +struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_l3protos); + +static DEFINE_MUTEX(nf_ct_proto_mutex); + +#ifdef CONFIG_SYSCTL +static int +nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, + struct ctl_table *table, unsigned int *users) +{ + if (*header == NULL) { + *header = register_sysctl_paths(path, table); + if (*header == NULL) + return -ENOMEM; + } + if (users != NULL) + (*users)++; + return 0; +} + +static void +nf_ct_unregister_sysctl(struct ctl_table_header **header, + struct ctl_table *table, unsigned int *users) +{ + if (users != NULL && --*users > 0) + return; + + unregister_sysctl_table(*header); + *header = NULL; +} +#endif + +struct nf_conntrack_l4proto * +__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) +{ + if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) + return &nf_conntrack_l4proto_generic; + + return rcu_dereference(nf_ct_protos[l3proto][l4proto]); +} +EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct nf_conntrack_l3proto * +nf_ct_l3proto_find_get(u_int16_t l3proto) +{ + struct nf_conntrack_l3proto *p; + + rcu_read_lock(); + p = __nf_ct_l3proto_find(l3proto); + if (!try_module_get(p->me)) + p = &nf_conntrack_l3proto_generic; + rcu_read_unlock(); + + return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get); + +void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) +{ + module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_put); + +int +nf_ct_l3proto_try_module_get(unsigned short l3proto) +{ + int ret; + struct nf_conntrack_l3proto *p; + +retry: p = nf_ct_l3proto_find_get(l3proto); + if (p == &nf_conntrack_l3proto_generic) { + ret = request_module("nf_conntrack-%d", l3proto); + if (!ret) + goto retry; + + return -EPROTOTYPE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get); + +void nf_ct_l3proto_module_put(unsigned short l3proto) +{ + struct nf_conntrack_l3proto *p; + + /* rcu_read_lock not necessary since the caller holds a reference, but + * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find() + */ + rcu_read_lock(); + p = __nf_ct_l3proto_find(l3proto); + module_put(p->me); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put); + +struct nf_conntrack_l4proto * +nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +{ + struct nf_conntrack_l4proto *p; + + rcu_read_lock(); + p = __nf_ct_l4proto_find(l3num, l4num); + if (!try_module_get(p->me)) + p = &nf_conntrack_l4proto_generic; + rcu_read_unlock(); + + return p; +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_find_get); + +void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) +{ + module_put(p->me); +} +EXPORT_SYMBOL_GPL(nf_ct_l4proto_put); + +static int kill_l3proto(struct nf_conn *i, void *data) +{ + return nf_ct_l3num(i) == ((struct 
nf_conntrack_l3proto *)data)->l3proto; +} + +static int kill_l4proto(struct nf_conn *i, void *data) +{ + struct nf_conntrack_l4proto *l4proto; + l4proto = (struct nf_conntrack_l4proto *)data; + return nf_ct_protonum(i) == l4proto->l4proto && + nf_ct_l3num(i) == l4proto->l3proto; +} + +static int nf_ct_l3proto_register_sysctl(struct nf_conntrack_l3proto *l3proto) +{ + int err = 0; + +#ifdef CONFIG_SYSCTL + if (l3proto->ctl_table != NULL) { + err = nf_ct_register_sysctl(&l3proto->ctl_table_header, + l3proto->ctl_table_path, + l3proto->ctl_table, NULL); + } +#endif + return err; +} + +static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto) +{ +#ifdef CONFIG_SYSCTL + if (l3proto->ctl_table_header != NULL) + nf_ct_unregister_sysctl(&l3proto->ctl_table_header, + l3proto->ctl_table, NULL); +#endif +} + +int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) +{ + int ret = 0; + struct nf_conntrack_l3proto *old; + + if (proto->l3proto >= AF_MAX) + return -EBUSY; + + if (proto->tuple_to_nlattr && !proto->nlattr_tuple_size) + return -EINVAL; + + mutex_lock(&nf_ct_proto_mutex); + old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex)); + if (old != &nf_conntrack_l3proto_generic) { + ret = -EBUSY; + goto out_unlock; + } + + ret = nf_ct_l3proto_register_sysctl(proto); + if (ret < 0) + goto out_unlock; + + if (proto->nlattr_tuple_size) + proto->nla_size = 3 * proto->nlattr_tuple_size(); + + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); + +out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); + +void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) +{ + struct net *net; + + BUG_ON(proto->l3proto >= AF_MAX); + + mutex_lock(&nf_ct_proto_mutex); + BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != proto); + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], + &nf_conntrack_l3proto_generic); + nf_ct_l3proto_unregister_sysctl(proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); + + /* Remove all conntrack entries for this protocol */ + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l3proto, proto); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); + +static int nf_ct_l4proto_register_sysctl(struct nf_conntrack_l4proto *l4proto) +{ + int err = 0; + +#ifdef CONFIG_SYSCTL + if (l4proto->ctl_table != NULL) { + err = nf_ct_register_sysctl(l4proto->ctl_table_header, + nf_net_netfilter_sysctl_path, + l4proto->ctl_table, + l4proto->ctl_table_users); + if (err < 0) + goto out; + } +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + if (l4proto->ctl_compat_table != NULL) { + err = nf_ct_register_sysctl(&l4proto->ctl_compat_table_header, + nf_net_ipv4_netfilter_sysctl_path, + l4proto->ctl_compat_table, NULL); + if (err == 0) + goto out; + nf_ct_unregister_sysctl(l4proto->ctl_table_header, + l4proto->ctl_table, + l4proto->ctl_table_users); + } +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +out: +#endif /* CONFIG_SYSCTL */ + return err; +} + +static void nf_ct_l4proto_unregister_sysctl(struct nf_conntrack_l4proto *l4proto) +{ +#ifdef CONFIG_SYSCTL + if (l4proto->ctl_table_header != NULL && + *l4proto->ctl_table_header != NULL) + nf_ct_unregister_sysctl(l4proto->ctl_table_header, + l4proto->ctl_table, + l4proto->ctl_table_users); +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + if (l4proto->ctl_compat_table_header != NULL) +
nf_ct_unregister_sysctl(&l4proto->ctl_compat_table_header, + l4proto->ctl_compat_table, NULL); +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) +{ + int ret = 0; + + if (l4proto->l3proto >= PF_MAX) + return -EBUSY; + + if ((l4proto->to_nlattr && !l4proto->nlattr_size) + || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) + return -EINVAL; + + mutex_lock(&nf_ct_proto_mutex); + if (!nf_ct_protos[l4proto->l3proto]) { + /* l3proto may be loaded later. */ + struct nf_conntrack_l4proto __rcu **proto_array; + int i; + + proto_array = kmalloc(MAX_NF_CT_PROTO * + sizeof(struct nf_conntrack_l4proto *), + GFP_KERNEL); + if (proto_array == NULL) { + ret = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < MAX_NF_CT_PROTO; i++) + RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); + + /* Before making proto_array visible to lockless readers, + * we must make sure its content is committed to memory. + */ + smp_wmb(); + + nf_ct_protos[l4proto->l3proto] = proto_array; + } else if (rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != &nf_conntrack_l4proto_generic) { + ret = -EBUSY; + goto out_unlock; + } + + ret = nf_ct_l4proto_register_sysctl(l4proto); + if (ret < 0) + goto out_unlock; + + l4proto->nla_size = 0; + if (l4proto->nlattr_size) + l4proto->nla_size += l4proto->nlattr_size(); + if (l4proto->nlattr_tuple_size) + l4proto->nla_size += 3 * l4proto->nlattr_tuple_size(); + + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + l4proto); + +out_unlock: + mutex_unlock(&nf_ct_proto_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); + +void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) +{ + struct net *net; + + BUG_ON(l4proto->l3proto >= PF_MAX); + + mutex_lock(&nf_ct_proto_mutex); + BUG_ON(rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != l4proto); + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + &nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(l4proto); + mutex_unlock(&nf_ct_proto_mutex); + + synchronize_rcu(); + + /* Remove all conntrack entries for this protocol */ + rtnl_lock(); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); + +int nf_conntrack_proto_init(void) +{ + unsigned int i; + int err; + + err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); + if (err < 0) + return err; + + for (i = 0; i < AF_MAX; i++) + rcu_assign_pointer(nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); + return 0; +} + +void nf_conntrack_proto_fini(void) +{ + unsigned int i; + + nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) + kfree(nf_ct_protos[i]); +} diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c new file mode 100644 index 00000000..24fdce25 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -0,0 +1,975 @@ +/* + * DCCP connection tracking protocol helper + * + * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it
under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/dccp.h> +#include <linux/slab.h> + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter/nfnetlink_conntrack.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> + +/* Timeouts are based on values from RFC4340: + * + * - REQUEST: + * + * 8.1.2. Client Request + * + * A client MAY give up on its DCCP-Requests after some time + * (3 minutes, for example). + * + * - RESPOND: + * + * 8.1.3. Server Response + * + * It MAY also leave the RESPOND state for CLOSED after a timeout of + * not less than 4MSL (8 minutes); + * + * - PARTOPEN: + * + * 8.1.5. Handshake Completion + * + * If the client remains in PARTOPEN for more than 4MSL (8 minutes), + * it SHOULD reset the connection with Reset Code 2, "Aborted". + * + * - OPEN: + * + * The DCCP timestamp overflows after 11.9 hours. If the connection + * stays idle this long the sequence number won't be recognized + * as valid anymore. + * + * - CLOSEREQ/CLOSING: + * + * 8.3. Termination + * + * The retransmission timer should initially be set to go off in two + * round-trip times and should back off to not less than once every + * 64 seconds ... + * + * - TIMEWAIT: + * + * 4.3. States + * + * A server or client socket remains in this state for 2MSL (4 minutes) + * after the connection has been torn down, ... + */ + +#define DCCP_MSL (2 * 60 * HZ) + +static const char * const dccp_state_names[] = { + [CT_DCCP_NONE] = "NONE", + [CT_DCCP_REQUEST] = "REQUEST", + [CT_DCCP_RESPOND] = "RESPOND", + [CT_DCCP_PARTOPEN] = "PARTOPEN", + [CT_DCCP_OPEN] = "OPEN", + [CT_DCCP_CLOSEREQ] = "CLOSEREQ", + [CT_DCCP_CLOSING] = "CLOSING", + [CT_DCCP_TIMEWAIT] = "TIMEWAIT", + [CT_DCCP_IGNORE] = "IGNORE", + [CT_DCCP_INVALID] = "INVALID", +}; + +#define sNO CT_DCCP_NONE +#define sRQ CT_DCCP_REQUEST +#define sRS CT_DCCP_RESPOND +#define sPO CT_DCCP_PARTOPEN +#define sOP CT_DCCP_OPEN +#define sCR CT_DCCP_CLOSEREQ +#define sCG CT_DCCP_CLOSING +#define sTW CT_DCCP_TIMEWAIT +#define sIG CT_DCCP_IGNORE +#define sIV CT_DCCP_INVALID + +/* + * DCCP state transition table + * + * The assumption is the same as for TCP tracking: + * + * We are the man in the middle. All the packets go through us but might + * get lost in transit to the destination. It is assumed that the destination + * can't receive segments we haven't seen. + * + * The following states exist: + * + * NONE: Initial state, expecting Request + * REQUEST: Request seen, waiting for Response from server + * RESPOND: Response from server seen, waiting for Ack from client + * PARTOPEN: Ack after Response seen, waiting for packet other than Response, + * Reset or Sync from server + * OPEN: Packet other than Response, Reset or Sync seen + * CLOSEREQ: CloseReq from server seen, expecting Close from client + * CLOSING: Close seen, expecting Reset + * TIMEWAIT: Reset seen + * IGNORE: Not determinable whether packet is valid + * + * Some states exist only on one side of the connection: REQUEST, RESPOND, + * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to + * the one it was in before.
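Reading the table that follows takes three indices: the role this conntrack assigned to the packet's sender, the DCCP packet type, and the connection's current state; the entry is the next state, where sIG means leave the state alone and sIV marks an invalid transition. The consumer, dccp_packet() further down in this file, reduces to roughly the following (locking, logging and the role/handshake special cases elided):

	u8 role      = ct->proto.dccp.role[CTINFO2DIR(ctinfo)];
	u8 old_state = ct->proto.dccp.state;
	u8 new_state = dccp_state_table[role][dh->dccph_type][old_state];

	if (new_state == CT_DCCP_INVALID)	/* log and refuse the packet */
		return -NF_ACCEPT;
	if (new_state != CT_DCCP_IGNORE)	/* sIG: keep current state */
		ct->proto.dccp.state = new_state;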
+ * + * Packets are marked as ignored (sIG) if we don't know if they're valid + * (for example a reincarnation of a connection we didn't notice is dead + * already) and the server may send back a connection closing Reset or a + * Response. They're also used for Sync/SyncAck packets, which we don't + * care about. + */ +static const u_int8_t +dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { + [CT_DCCP_ROLE_CLIENT] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sRQ Regular Request + * sRQ -> sRQ Retransmitted Request or reincarnation + * sRS -> sRS Retransmitted Request (apparently Response + * got lost after we saw it) or reincarnation + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, might be response to ignored Request + * sRS -> sIG Ignore, might be response to ignored Request + * sPO -> sIG Ignore, might be response to ignored Request + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, reincarnation in reverse direction + * goes through sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) + * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN + * sOP -> sOP Regular ACK, remain in OPEN + * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) + * sOP -> sOP Regular Data packet + * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) + * sPO -> sPO Remain in PARTOPEN state + * sOP -> sOP Regular DataAck packet in OPEN state + * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) + * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * CLOSEREQ may only be sent by the server. + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sCG Client-initiated close + * sOP -> sCG Client-initiated close + * sCR -> sCG Close in response to CloseReq (8.3.)
+ * sCG -> sCG Retransmit + * sTW -> sIV Late retransmit, already in TIME_WAIT + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) + * sRS -> sTW Response received without Request + * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) + * sOP -> sTW Connection reset + * sCR -> sTW Connection reset + * sCG -> sTW Connection reset + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, + [CT_DCCP_ROLE_SERVER] = { + [DCCP_PKT_REQUEST] = { + /* + * sNO -> sIV Invalid + * sRQ -> sIG Ignore, conntrack might be out of sync + * sRS -> sIG Ignore, conntrack might be out of sync + * sPO -> sIG Ignore, conntrack might be out of sync + * sOP -> sIG Ignore, conntrack might be out of sync + * sCR -> sIG Ignore, conntrack might be out of sync + * sCG -> sIG Ignore, conntrack might be out of sync + * sTW -> sRQ Reincarnation, must reverse roles + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ + }, + [DCCP_PKT_RESPONSE] = { + /* + * sNO -> sIV Response without Request + * sRQ -> sRS Response to clients Request + * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) + * sPO -> sIG Response to an ignored Request or late retransmit + * sOP -> sIG Ignore, might be response to ignored Request + * sCR -> sIG Ignore, might be response to ignored Request + * sCG -> sIG Ignore, might be response to ignored Request + * sTW -> sIV Invalid, Request from client in sTW moves to sRQ + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV + }, + [DCCP_PKT_ACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Ack in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATA] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular Data packet in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_DATAACK] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP Enter OPEN state (8.1.5.) + * sOP -> sOP Regular DataAck in OPEN state + * sCR -> sIV Waiting for Close from client + * sCG -> sCG Data in CLOSING MAY be processed (8.3.) + * sTW -> sIV + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV + }, + [DCCP_PKT_CLOSEREQ] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) 
+ * sOP -> sCR CloseReq in OPEN state + * sCR -> sCR Retransmit + * sCG -> sCR Simultaneous close, client sends another Close + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV + }, + [DCCP_PKT_CLOSE] = { + /* + * sNO -> sIV No connection + * sRQ -> sIV No connection + * sRS -> sIV No connection + * sPO -> sOP -> sCG Move directly to CLOSING + * sOP -> sCG Move to CLOSING + * sCR -> sIV Close after CloseReq is invalid + * sCG -> sCG Retransmit + * sTW -> sIV Already closed + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV + }, + [DCCP_PKT_RESET] = { + /* + * sNO -> sIV No connection + * sRQ -> sTW Reset in response to Request + * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) + * sOP -> sTW + * sCR -> sTW + * sCG -> sTW + * sTW -> sIG Ignore (don't refresh timer) + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ + sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG + }, + [DCCP_PKT_SYNC] = { + /* + * We currently ignore Sync packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + [DCCP_PKT_SYNCACK] = { + /* + * We currently ignore SyncAck packets + * + * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ + sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG, + }, + }, +}; + +/* this module per-net specifics */ +static int dccp_net_id __read_mostly; +struct dccp_net { + int dccp_loose; + unsigned int dccp_timeout[CT_DCCP_MAX + 1]; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; + struct ctl_table *sysctl_table; +#endif +}; + +static inline struct dccp_net *dccp_pernet(struct net *net) +{ + return net_generic(net, dccp_net_id); +} + +static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct dccp_hdr _hdr, *dh; + + dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (dh == NULL) + return false; + + tuple->src.u.dccp.port = dh->dccph_sport; + tuple->dst.u.dccp.port = dh->dccph_dport; + return true; +} + +static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, + const struct nf_conntrack_tuple *tuple) +{ + inv->src.u.dccp.port = tuple->dst.u.dccp.port; + inv->dst.u.dccp.port = tuple->src.u.dccp.port; + return true; +} + +static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + struct net *net = nf_ct_net(ct); + struct dccp_net *dn; + struct dccp_hdr _dh, *dh; + const char *msg; + u_int8_t state; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + + state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; + switch (state) { + default: + dn = dccp_pernet(net); + if (dn->dccp_loose == 0) { + msg = "nf_ct_dccp: not picking up existing connection "; + goto out_invalid; + } + case CT_DCCP_REQUEST: + break; + case CT_DCCP_INVALID: + msg = "nf_ct_dccp: invalid state transition "; + goto out_invalid; + } + + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.state = CT_DCCP_NONE; + ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; + ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; + ct->proto.dccp.handshake_seq = 0; + return true; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); + return false; +} + +static u64 dccp_ack_seq(const struct dccp_hdr *dh) +{
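+ /* Editorial note: the DCCP acknowledgment number is 48 bits wide + * (RFC 4340), carried as a 16-bit high half and a 32-bit low half; + * the shift-and-add below recombines them into one host-order u64 + * (e.g. high 0x0001, low 0x00000002 gives 0x000100000002). The result + * is compared against ct->proto.dccp.handshake_seq in dccp_packet() to + * spot the Ack that completes the handshake. */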
+ const struct dccp_hdr_ack_bits *dhack; + + dhack = (void *)dh + __dccp_basic_hdr_len(dh); + return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + + ntohl(dhack->dccph_ack_nr_low); +} + +static unsigned int *dccp_get_timeouts(struct net *net) +{ + return dccp_pernet(net)->dccp_timeout; +} + +static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info ctinfo, + u_int8_t pf, unsigned int hooknum, + unsigned int *timeouts) +{ + struct net *net = nf_ct_net(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + struct dccp_hdr _dh, *dh; + u_int8_t type, old_state, new_state; + enum ct_dccp_roles role; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + BUG_ON(dh == NULL); + type = dh->dccph_type; + + if (type == DCCP_PKT_RESET && + !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + /* Tear down connection immediately if only reply is a RESET */ + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_ACCEPT; + } + + spin_lock_bh(&ct->lock); + + role = ct->proto.dccp.role[dir]; + old_state = ct->proto.dccp.state; + new_state = dccp_state_table[role][type][old_state]; + + switch (new_state) { + case CT_DCCP_REQUEST: + if (old_state == CT_DCCP_TIMEWAIT && + role == CT_DCCP_ROLE_SERVER) { + /* Reincarnation in the reverse direction: reopen and + * reverse client/server roles. */ + ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; + } + break; + case CT_DCCP_RESPOND: + if (old_state == CT_DCCP_REQUEST) + ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); + break; + case CT_DCCP_PARTOPEN: + if (old_state == CT_DCCP_RESPOND && + type == DCCP_PKT_ACK && + dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) + set_bit(IPS_ASSURED_BIT, &ct->status); + break; + case CT_DCCP_IGNORE: + /* + * Connection tracking might be out of sync, so we ignore + * packets that might establish a new connection and resync + * if the server responds with a valid Response.
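+ * For example (an editorial sketch): if we ignored a client Request + * (last_pkt == DCCP_PKT_REQUEST) and the very next packet is the + * server's Response in the other direction, the code below swaps the + * recorded roles, records the handshake sequence number and re-enters + * CT_DCCP_RESPOND, as if the new handshake had been tracked from the + * start.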
*/ + if (ct->proto.dccp.last_dir == !dir && + ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && + type == DCCP_PKT_RESPONSE) { + ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); + new_state = CT_DCCP_RESPOND; + break; + } + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid packet ignored "); + return NF_ACCEPT; + case CT_DCCP_INVALID: + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_dccp: invalid state transition "); + return -NF_ACCEPT; + } + + ct->proto.dccp.last_dir = dir; + ct->proto.dccp.last_pkt = type; + ct->proto.dccp.state = new_state; + spin_unlock_bh(&ct->lock); + + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); + + return NF_ACCEPT; +} + +static int dccp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) +{ + struct dccp_hdr _dh, *dh; + unsigned int dccp_len = skb->len - dataoff; + unsigned int cscov; + const char *msg; + + dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); + if (dh == NULL) { + msg = "nf_ct_dccp: short packet "; + goto out_invalid; + } + + if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || + dh->dccph_doff * 4 > dccp_len) { + msg = "nf_ct_dccp: truncated/malformed packet "; + goto out_invalid; + } + + cscov = dccp_len; + if (dh->dccph_cscov) { + cscov = (dh->dccph_cscov - 1) * 4; + if (cscov > dccp_len) { + msg = "nf_ct_dccp: bad checksum coverage "; + goto out_invalid; + } + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, + pf)) { + msg = "nf_ct_dccp: bad checksum "; + goto out_invalid; + } + + if (dh->dccph_type >= DCCP_PKT_INVALID) { + msg = "nf_ct_dccp: reserved packet type "; + goto out_invalid; + } + + return NF_ACCEPT; + +out_invalid: + if (LOG_INVALID(net, IPPROTO_DCCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); + return -NF_ACCEPT; +} + +static int dccp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.dccp.port), + ntohs(tuple->dst.u.dccp.port)); +} + +static int dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + return seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) +static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE, + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]); + NLA_PUT_BE64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, + cpu_to_be64(ct->proto.dccp.handshake_seq)); + nla_nest_end(skb, nest_parms); + spin_unlock_bh(&ct->lock); + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { + [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, +
[CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, +}; + +static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; + struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; + int err; + + if (!attr) + return 0; + + err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr, + dccp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_DCCP_STATE] || + !tb[CTA_PROTOINFO_DCCP_ROLE] || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || + nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { + return -EINVAL; + } + + spin_lock_bh(&ct->lock); + ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); + if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; + } else { + ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; + ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; + } + if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { + ct->proto.dccp.handshake_seq = + be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); + } + spin_unlock_bh(&ct->lock); + return 0; +} + +static int dccp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_DCCP */ + + nla_policy_len(dccp_nla_policy, CTA_PROTOINFO_DCCP_MAX + 1); +} + +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + struct dccp_net *dn = dccp_pernet(&init_net); + unsigned int *timeouts = data; + int i; + + /* set default DCCP timeouts. */ + for (i=0; i<CT_DCCP_MAX; i++) + timeouts[i] = dn->dccp_timeout[i]; + + /* there's a 1:1 mapping between attributes and protocol states. 
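+ * (for instance CTA_TIMEOUT_DCCP_REQUEST maps onto + * timeouts[CT_DCCP_REQUEST]); the loop below leans on the two enums + * staying in step, which is why the attribute index i can be used to + * index timeouts[] directly.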
*/ + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) + NLA_PUT_BE32(skb, i, htonl(timeouts[i] / HZ)); + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { + [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, + [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +/* template, data assigned later */ +static struct ctl_table dccp_sysctl_table[] = { + { + .procname = "nf_conntrack_dccp_timeout_request", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_respond", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_partopen", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_open", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closereq", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_closing", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_timeout_timewait", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_dccp_loose", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_conntrack_l4proto dccp_proto4 __read_mostly = { + .l3proto = AF_INET, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +}; + +static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = { + .l3proto = AF_INET6, + .l4proto = IPPROTO_DCCP, + .name = "dccp", + .pkt_to_tuple = dccp_pkt_to_tuple, + .invert_tuple = 
dccp_invert_tuple, + .new = dccp_new, + .packet = dccp_packet, + .get_timeouts = dccp_get_timeouts, + .error = dccp_error, + .print_tuple = dccp_print_tuple, + .print_conntrack = dccp_print_conntrack, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = dccp_to_nlattr, + .nlattr_size = dccp_nlattr_size, + .from_nlattr = nlattr_to_dccp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = dccp_timeout_nlattr_to_obj, + .obj_to_nlattr = dccp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_DCCP_MAX, + .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, + .nla_policy = dccp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +}; + +static __net_init int dccp_net_init(struct net *net) +{ + struct dccp_net *dn = dccp_pernet(net); + + /* default values */ + dn->dccp_loose = 1; + dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; + dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; + dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; + +#ifdef CONFIG_SYSCTL + dn->sysctl_table = kmemdup(dccp_sysctl_table, + sizeof(dccp_sysctl_table), GFP_KERNEL); + if (!dn->sysctl_table) + return -ENOMEM; + + dn->sysctl_table[0].data = &dn->dccp_timeout[CT_DCCP_REQUEST]; + dn->sysctl_table[1].data = &dn->dccp_timeout[CT_DCCP_RESPOND]; + dn->sysctl_table[2].data = &dn->dccp_timeout[CT_DCCP_PARTOPEN]; + dn->sysctl_table[3].data = &dn->dccp_timeout[CT_DCCP_OPEN]; + dn->sysctl_table[4].data = &dn->dccp_timeout[CT_DCCP_CLOSEREQ]; + dn->sysctl_table[5].data = &dn->dccp_timeout[CT_DCCP_CLOSING]; + dn->sysctl_table[6].data = &dn->dccp_timeout[CT_DCCP_TIMEWAIT]; + dn->sysctl_table[7].data = &dn->dccp_loose; + + dn->sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, dn->sysctl_table); + if (!dn->sysctl_header) { + kfree(dn->sysctl_table); + return -ENOMEM; + } +#endif + + return 0; +} + +static __net_exit void dccp_net_exit(struct net *net) +{ + struct dccp_net *dn = dccp_pernet(net); +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(dn->sysctl_header); + kfree(dn->sysctl_table); +#endif +} + +static struct pernet_operations dccp_net_ops = { + .init = dccp_net_init, + .exit = dccp_net_exit, + .id = &dccp_net_id, + .size = sizeof(struct dccp_net), +}; + +static int __init nf_conntrack_proto_dccp_init(void) +{ + int err; + + err = register_pernet_subsys(&dccp_net_ops); + if (err < 0) + goto err1; + + err = nf_conntrack_l4proto_register(&dccp_proto4); + if (err < 0) + goto err2; + + err = nf_conntrack_l4proto_register(&dccp_proto6); + if (err < 0) + goto err3; + return 0; + +err3: + nf_conntrack_l4proto_unregister(&dccp_proto4); +err2: + unregister_pernet_subsys(&dccp_net_ops); +err1: + return err; +} + +static void __exit nf_conntrack_proto_dccp_fini(void) +{ + unregister_pernet_subsys(&dccp_net_ops); + nf_conntrack_l4proto_unregister(&dccp_proto6); + nf_conntrack_l4proto_unregister(&dccp_proto4); +} + +module_init(nf_conntrack_proto_dccp_init); +module_exit(nf_conntrack_proto_dccp_fini); + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("DCCP connection tracking protocol helper"); +MODULE_LICENSE("GPL"); diff --git 
a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c new file mode 100644 index 00000000..835e24c5 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -0,0 +1,160 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/jiffies.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack_l4proto.h> + +static unsigned int nf_ct_generic_timeout __read_mostly = 600*HZ; + +static bool generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return true; +} + +static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return 0; +} + +static unsigned int *generic_get_timeouts(struct net *net) +{ + return &nf_ct_generic_timeout; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int generic_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeout) +{ + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + return true; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int generic_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeout = data; + + if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT]) + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GENERIC_TIMEOUT])) * HZ; + else { + /* Set default generic timeout. 
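+ (600 seconds here; the same variable also backs the + nf_conntrack_generic_timeout sysctl declared further down in this file).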
*/ + *timeout = nf_ct_generic_timeout; + } + + return 0; +} + +static int +generic_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_GENERIC_TIMEOUT, htonl(*timeout / HZ)); + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +generic_timeout_nla_policy[CTA_TIMEOUT_GENERIC_MAX+1] = { + [CTA_TIMEOUT_GENERIC_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *generic_sysctl_header; +static struct ctl_table generic_sysctl_table[] = { + { + .procname = "nf_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table generic_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_generic_timeout", + .data = &nf_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = +{ + .l3proto = PF_UNSPEC, + .l4proto = 255, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .packet = generic_packet, + .get_timeouts = generic_get_timeouts, + .new = generic_new, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = generic_timeout_nlattr_to_obj, + .obj_to_nlattr = generic_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_GENERIC_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = generic_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_header = &generic_sysctl_header, + .ctl_table = generic_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = generic_compat_sysctl_table, +#endif +#endif +}; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c new file mode 100644 index 00000000..659648c4 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -0,0 +1,416 @@ +/* + * ip_conntrack_proto_gre.c - Version 3.0 + * + * Connection tracking protocol helper module for GRE. + * + * GRE is a generic encapsulation protocol, which is generally not very + * suited for NAT, as it has no protocol-specific part as port numbers. + * + * It has an optional key field, which may help us distinguishing two + * connections between the same two hosts. + * + * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 + * + * PPTP is built on top of a modified version of GRE, and has a mandatory + * field called "CallID", which serves us for the same purpose as the key + * field in plain GRE. 
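+ * + * In tuple terms (an editorial sketch): the tracker keeps the Call ID seen + * in each direction in tuple->src.u.gre.key and tuple->dst.u.gre.key, so a + * PPTP data flow is distinguished much as a UDP flow is distinguished by + * its port pair.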
+ * + * Documentation about PPTP can be found in RFC 2637 + * + * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/seq_file.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <net/dst.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <linux/netfilter/nf_conntrack_proto_gre.h> +#include <linux/netfilter/nf_conntrack_pptp.h> + +enum grep_conntrack { + GRE_CT_UNREPLIED, + GRE_CT_REPLIED, + GRE_CT_MAX +}; + +static unsigned int gre_timeouts[GRE_CT_MAX] = { + [GRE_CT_UNREPLIED] = 30*HZ, + [GRE_CT_REPLIED] = 180*HZ, +}; + +static int proto_gre_net_id __read_mostly; +struct netns_proto_gre { + rwlock_t keymap_lock; + struct list_head keymap_list; +}; + +void nf_ct_gre_keymap_flush(struct net *net) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_ct_gre_keymap *km, *tmp; + + write_lock_bh(&net_gre->keymap_lock); + list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { + list_del(&km->list); + kfree(km); + } + write_unlock_bh(&net_gre->keymap_lock); +} +EXPORT_SYMBOL(nf_ct_gre_keymap_flush); + +static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, + const struct nf_conntrack_tuple *t) +{ + return km->tuple.src.l3num == t->src.l3num && + !memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) && + !memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) && + km->tuple.dst.protonum == t->dst.protonum && + km->tuple.dst.u.all == t->dst.u.all; +} + +/* look up the source key for a given tuple */ +static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_ct_gre_keymap *km; + __be16 key = 0; + + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t)) { + key = km->tuple.src.u.gre.key; + break; + } + } + read_unlock_bh(&net_gre->keymap_lock); + + pr_debug("lookup src key 0x%x for ", key); + nf_ct_dump_tuple(t); + + return key; +} + +/* add a single keymap entry, associate with specified master ct */ +int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, + struct nf_conntrack_tuple *t) +{ + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_conn_help *help = nfct_help(ct); + struct nf_ct_gre_keymap **kmp, *km; + + kmp = &help->help.ct_pptp_info.keymap[dir]; + if (*kmp) { + /* check whether it's a retransmission */ + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&net_gre->keymap_lock); + return 0; + } + } + read_unlock_bh(&net_gre->keymap_lock); + pr_debug("trying to override keymap_%s for ct %p\n", + dir == IP_CT_DIR_REPLY ? 
"reply" : "orig", ct); + return -EEXIST; + } + + km = kmalloc(sizeof(*km), GFP_ATOMIC); + if (!km) + return -ENOMEM; + memcpy(&km->tuple, t, sizeof(*t)); + *kmp = km; + + pr_debug("adding new entry %p: ", km); + nf_ct_dump_tuple(&km->tuple); + + write_lock_bh(&net_gre->keymap_lock); + list_add_tail(&km->list, &net_gre->keymap_list); + write_unlock_bh(&net_gre->keymap_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); + +/* destroy the keymap entries associated with specified master ct */ +void nf_ct_gre_keymap_destroy(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir; + + pr_debug("entering for ct %p\n", ct); + + write_lock_bh(&net_gre->keymap_lock); + for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { + if (help->help.ct_pptp_info.keymap[dir]) { + pr_debug("removing %p from list\n", + help->help.ct_pptp_info.keymap[dir]); + list_del(&help->help.ct_pptp_info.keymap[dir]->list); + kfree(help->help.ct_pptp_info.keymap[dir]); + help->help.ct_pptp_info.keymap[dir] = NULL; + } + } + write_unlock_bh(&net_gre->keymap_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); + +/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ + +/* invert gre part of tuple */ +static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->dst.u.gre.key = orig->src.u.gre.key; + tuple->src.u.gre.key = orig->dst.u.gre.key; + return true; +} + +/* gre hdr info to tuple */ +static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + struct net *net = dev_net(skb->dev ? skb->dev : skb_dst(skb)->dev); + const struct gre_hdr_pptp *pgrehdr; + struct gre_hdr_pptp _pgrehdr; + __be16 srckey; + const struct gre_hdr *grehdr; + struct gre_hdr _grehdr; + + /* first only delinearize old RFC1701 GRE header */ + grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); + if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { + /* try to behave like "nf_conntrack_proto_generic" */ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + return true; + } + + /* PPTP header is variable length, only need up to the call_id field */ + pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); + if (!pgrehdr) + return true; + + if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { + pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + return false; + } + + tuple->dst.u.gre.key = pgrehdr->call_id; + srckey = gre_keymap_lookup(net, tuple); + tuple->src.u.gre.key = srckey; + + return true; +} + +/* print gre part of tuple */ +static int gre_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "srckey=0x%x dstkey=0x%x ", + ntohs(tuple->src.u.gre.key), + ntohs(tuple->dst.u.gre.key)); +} + +/* print private data for conntrack */ +static int gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + return seq_printf(s, "timeout=%u, stream_timeout=%u ", + (ct->proto.gre.timeout / HZ), + (ct->proto.gre.stream_timeout / HZ)); +} + +static unsigned int *gre_get_timeouts(struct net *net) +{ + return gre_timeouts; +} + +/* Returns verdict for packet, and may modify conntrack */ +static int gre_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + /* If we've seen traffic both ways, this is a 
GRE connection. + * Extend timeout. */ + if (ct->status & IPS_SEEN_REPLY) { + nf_ct_refresh_acct(ct, ctinfo, skb, + ct->proto.gre.stream_timeout); + /* Also, more likely to be important, and not a probe. */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else + nf_ct_refresh_acct(ct, ctinfo, skb, + ct->proto.gre.timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + pr_debug(": "); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + + /* initialize to sane value. Ideally a conntrack helper + * (e.g. in case of pptp) is increasing them */ + ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; + ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; + + return true; +} + +/* Called when a conntrack entry has already been removed from the hashes + * and is about to be deleted from memory */ +static void gre_destroy(struct nf_conn *ct) +{ + struct nf_conn *master = ct->master; + pr_debug(" entering\n"); + + if (!master) + pr_debug("no master !?!\n"); + else + nf_ct_gre_keymap_destroy(master); +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeouts = data; + + /* set default timeouts for GRE. */ + timeouts[GRE_CT_UNREPLIED] = gre_timeouts[GRE_CT_UNREPLIED]; + timeouts[GRE_CT_REPLIED] = gre_timeouts[GRE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { + timeouts[GRE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_GRE_REPLIED]) { + timeouts[GRE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; + } + return 0; +} + +static int +gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_GRE_UNREPLIED, + htonl(timeouts[GRE_CT_UNREPLIED] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_GRE_REPLIED, + htonl(timeouts[GRE_CT_REPLIED] / HZ)); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { + [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +/* protocol helper struct */ +static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { + .l3proto = AF_INET, + .l4proto = IPPROTO_GRE, + .name = "gre", + .pkt_to_tuple = gre_pkt_to_tuple, + .invert_tuple = gre_invert_tuple, + .print_tuple = gre_print_tuple, + .print_conntrack = gre_print_conntrack, + .get_timeouts = gre_get_timeouts, + .packet = gre_packet, + .new = gre_new, + .destroy = gre_destroy, + .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = gre_timeout_nlattr_to_obj, + .obj_to_nlattr = gre_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_GRE_MAX, + .obj_size = sizeof(unsigned int) * GRE_CT_MAX, + .nla_policy = gre_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +}; + +static int 
proto_gre_net_init(struct net *net) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + + rwlock_init(&net_gre->keymap_lock); + INIT_LIST_HEAD(&net_gre->keymap_list); + + return 0; +} + +static void proto_gre_net_exit(struct net *net) +{ + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations proto_gre_net_ops = { + .init = proto_gre_net_init, + .exit = proto_gre_net_exit, + .id = &proto_gre_net_id, + .size = sizeof(struct netns_proto_gre), +}; + +static int __init nf_ct_proto_gre_init(void) +{ + int rv; + + rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); + if (rv < 0) + return rv; + rv = register_pernet_subsys(&proto_gre_net_ops); + if (rv < 0) + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + return rv; +} + +static void __exit nf_ct_proto_gre_fini(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + unregister_pernet_subsys(&proto_gre_net_ops); +} + +module_init(nf_ct_proto_gre_init); +module_exit(nf_ct_proto_gre_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c new file mode 100644 index 00000000..72b50885 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -0,0 +1,827 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *const sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { + [SCTP_CONNTRACK_CLOSED] = 10 SECS, + [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, + [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, + [SCTP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, +}; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. 
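+For example (an editorial sketch of the table below, read as +sctp_conntracks[dir][chunk type][old state]): an INIT in the original +direction takes sNO to sCW, the INIT_ACK reply leaves it in sCW, a +COOKIE_ECHO in the original direction moves it to sCE, and the COOKIE_ACK +reply finally moves it to sES (ESTABLISHED).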
Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things up when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. +*/ + +/* SCTP conntrack state transitions */ +static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct sctphdr *hp; + struct sctphdr _hdr; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return true; +} + +static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack.
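+ (just the current state name; it is read under ct->lock so a concurrent + update cannot be observed half-written).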
*/ +static int sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + enum sctp_conntrack state; + + spin_lock_bh(&ct->lock); + state = ct->proto.sctp.state; + spin_unlock_bh(&ct->lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ +for ((offset) = (dataoff) + sizeof(sctp_sctphdr_t), (count) = 0; \ + (offset) < (skb)->len && \ + ((sch) = skb_header_pointer((skb), (offset), sizeof(_sch), &(_sch))); \ + (offset) += (ntohs((sch)->length) + 3) & ~3, (count)++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + unsigned long *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + pr_debug("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK || + sch->type == SCTP_CID_SHUTDOWN_COMPLETE) + flag = 1; + + /* + * Cookie Ack/Echo chunks not the first OR + * Init / Init Ack / Shutdown compl chunks not the only chunks + * OR zero-length. + */ + if (((sch->type == SCTP_CID_COOKIE_ACK || + sch->type == SCTP_CID_COOKIE_ECHO || + flag) && + count != 0) || !sch->length) { + pr_debug("Basic checks failed\n"); + return 1; + } + + if (map) + set_bit(sch->type, map); + } + + pr_debug("Basic checks passed\n"); + return count == 0; +} + +static int sctp_new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + pr_debug("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + pr_debug("SCTP_CID_INIT\n"); + i = 0; + break; + case SCTP_CID_INIT_ACK: + pr_debug("SCTP_CID_INIT_ACK\n"); + i = 1; + break; + case SCTP_CID_ABORT: + pr_debug("SCTP_CID_ABORT\n"); + i = 2; + break; + case SCTP_CID_SHUTDOWN: + pr_debug("SCTP_CID_SHUTDOWN\n"); + i = 3; + break; + case SCTP_CID_SHUTDOWN_ACK: + pr_debug("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; + break; + case SCTP_CID_ERROR: + pr_debug("SCTP_CID_ERROR\n"); + i = 5; + break; + case SCTP_CID_COOKIE_ECHO: + pr_debug("SCTP_CID_COOKIE_ECHO\n"); + i = 6; + break; + case SCTP_CID_COOKIE_ACK: + pr_debug("SCTP_CID_COOKIE_ACK\n"); + i = 7; + break; + case SCTP_CID_SHUTDOWN_COMPLETE: + pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; + break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + pr_debug("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + pr_debug("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +static unsigned int *sctp_get_timeouts(struct net *net) +{ + return sctp_timeouts; +} + +/* Returns verdict for packet, or -NF_ACCEPT for invalid. 
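+ Verification tags (RFC 2960 sec. 8.5) are enforced here: most chunks must + carry the vtag recorded for their direction, while INIT (vtag must be 0), + ABORT and SHUTDOWN_COMPLETE (either direction's vtag, the latter subject + to the T flag) get the sec. 8.5.1 special cases handled in the loop below.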
*/ +static int sctp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + enum sctp_conntrack new_state, old_state; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u_int32_t offset, count; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + goto out; + + if (do_basic_checks(ct, skb, dataoff, map) != 0) + goto out; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; + } + + old_state = new_state = SCTP_CONNTRACK_NONE; + spin_lock_bh(&ct->lock); + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) + goto out_unlock; + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir]) + goto out_unlock; + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (sh->vtag != ct->proto.sctp.vtag[dir] && + sh->vtag != ct->proto.sctp.vtag[!dir] && + sch->flags & SCTP_CHUNK_FLAG_T) + goto out_unlock; + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (sh->vtag != ct->proto.sctp.vtag[dir]) + goto out_unlock; + } + + old_state = ct->proto.sctp.state; + new_state = sctp_new_state(dir, old_state, sch->type); + + /* Invalid */ + if (new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: Invalid dir=%i ctype=%u " + "conntrack=%u\n", + dir, sch->type, old_state); + goto out_unlock; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT || + sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + goto out_unlock; + pr_debug("Setting vtag %x for dir %d\n", + ih->init_tag, !dir); + ct->proto.sctp.vtag[!dir] = ih->init_tag; + } + + ct->proto.sctp.state = new_state; + if (old_state != new_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + } + spin_unlock_bh(&ct->lock); + + nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); + + if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED && + dir == IP_CT_DIR_REPLY && + new_state == SCTP_CONNTRACK_ESTABLISHED) { + pr_debug("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } + + return NF_ACCEPT; + +out_unlock: + spin_unlock_bh(&ct->lock); +out: + return -NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
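+ For out-of-the-blue packets (RFC 2960 sec. 8.4), anything carrying ABORT, + SHUTDOWN_COMPLETE or COOKIE_ACK is refused below; a lone SHUTDOWN_ACK is + picked up so that the expected SHUTDOWN_COMPLETE reply can be tracked, + taking the reply vtag from the packet itself.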
*/ +static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + enum sctp_conntrack new_state; + const struct sctphdr *sh; + struct sctphdr _sctph; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u_int32_t offset, count; + unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); + if (sh == NULL) + return false; + + if (do_basic_checks(ct, skb, dataoff, map) != 0) + return false; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) + return false; + + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); + new_state = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); + return false; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return false; + + pr_debug("Setting vtag %x for new conn\n", + ih->init_tag); + + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return false; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + pr_debug("Setting vtag %x for new conn OOTB\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + ct->proto.sctp.state = new_state; + } + + return true; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_SCTP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_U8(skb, CTA_PROTOINFO_SCTP_STATE, ct->proto.sctp.state); + + NLA_PUT_BE32(skb, + CTA_PROTOINFO_SCTP_VTAG_ORIGINAL, + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL]); + + NLA_PUT_BE32(skb, + CTA_PROTOINFO_SCTP_VTAG_REPLY, + ct->proto.sctp.vtag[IP_CT_DIR_REPLY]); + + spin_unlock_bh(&ct->lock); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy sctp_nla_policy[CTA_PROTOINFO_SCTP_MAX+1] = { + [CTA_PROTOINFO_SCTP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] = { .type = NLA_U32 }, + [CTA_PROTOINFO_SCTP_VTAG_REPLY] = { .type = NLA_U32 }, +}; + +static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *attr = cda[CTA_PROTOINFO_SCTP]; + struct nlattr *tb[CTA_PROTOINFO_SCTP_MAX+1]; + int err; + + /* updates may not contain the internal protocol info, skip parsing */ + if (!attr) + return 0; + + err = nla_parse_nested(tb, + CTA_PROTOINFO_SCTP_MAX, + attr, + sctp_nla_policy); + if (err < 0) + return err; + + if (!tb[CTA_PROTOINFO_SCTP_STATE] || + !tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL] || + 
!tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]) + return -EINVAL; + + spin_lock_bh(&ct->lock); + ct->proto.sctp.state = nla_get_u8(tb[CTA_PROTOINFO_SCTP_STATE]); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_ORIGINAL]); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = + nla_get_be32(tb[CTA_PROTOINFO_SCTP_VTAG_REPLY]); + spin_unlock_bh(&ct->lock); + + return 0; +} + +static int sctp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_SCTP */ + + nla_policy_len(sctp_nla_policy, CTA_PROTOINFO_SCTP_MAX + 1); +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int sctp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeouts = data; + int i; + + /* set default SCTP timeouts. */ + for (i=0; i<SCTP_CONNTRACK_MAX; i++) + timeouts[i] = sctp_timeouts[i]; + + /* there's a 1:1 mapping between attributes and protocol states. */ + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) { + if (tb[i]) { + timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; + } + } + return 0; +} + +static int +sctp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + int i; + + for (i=CTA_TIMEOUT_SCTP_UNSPEC+1; i<CTA_TIMEOUT_SCTP_MAX+1; i++) + NLA_PUT_BE32(skb, i, htonl(timeouts[i] / HZ)); + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { + [CTA_TIMEOUT_SCTP_CLOSED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_COOKIE_ECHOED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + + +#ifdef CONFIG_SYSCTL +static unsigned int sctp_sysctl_table_users; +static struct ctl_table_header *sctp_sysctl_header; +static struct ctl_table sctp_sysctl_table[] = { + { + .procname = "nf_conntrack_sctp_timeout_closed", + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_cookie_wait", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_cookie_echoed", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_established", + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_recd", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], + .maxlen 
= sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table sctp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &sctp_timeouts[SCTP_CONNTRACK_CLOSED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &sctp_timeouts[SCTP_CONNTRACK_COOKIE_ECHOED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_established", + .data = &sctp_timeouts[SCTP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &sctp_timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { + .l3proto = PF_INET, + .l4proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, + .new = sctp_new, + .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = sctp_to_nlattr, + .nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &sctp_sysctl_table_users, + .ctl_table_header = &sctp_sysctl_header, + .ctl_table = sctp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = sctp_compat_sysctl_table, +#endif +#endif +}; + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { + .l3proto = PF_INET6, + .l4proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .get_timeouts = sctp_get_timeouts, + .new = sctp_new, + .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = sctp_to_nlattr, + 
.nlattr_size = sctp_nlattr_size, + .from_nlattr = nlattr_to_sctp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = sctp_timeout_nlattr_to_obj, + .obj_to_nlattr = sctp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_SCTP_MAX, + .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, + .nla_policy = sctp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#endif +#ifdef CONFIG_SYSCTL + .ctl_table_users = &sctp_sysctl_table_users, + .ctl_table_header = &sctp_sysctl_header, + .ctl_table = sctp_sysctl_table, +#endif +}; + +static int __init nf_conntrack_proto_sctp_init(void) +{ + int ret; + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp4); + if (ret) { + pr_err("nf_conntrack_l4proto_sctp4: protocol register failed\n"); + goto out; + } + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_sctp6); + if (ret) { + pr_err("nf_conntrack_l4proto_sctp6: protocol register failed\n"); + goto cleanup_sctp4; + } + + return ret; + + cleanup_sctp4: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); + out: + return ret; +} + +static void __exit nf_conntrack_proto_sctp_fini(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_sctp4); +} + +module_init(nf_conntrack_proto_sctp_init); +module_exit(nf_conntrack_proto_sctp_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); +MODULE_ALIAS("ip_conntrack_proto_sctp"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c new file mode 100644 index 00000000..0d07a1dc --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -0,0 +1,1631 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <asm/unaligned.h> + +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +static int nf_ct_tcp_be_liberal __read_mostly = 0; + +/* If it is set to zero, we disable picking up already established + connections. */ +static int nf_ct_tcp_loose __read_mostly = 1; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. 
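+   (The shorter timer is tcp_timeouts[TCP_CONNTRACK_RETRANS], five minutes by +   default: once ct->proto.tcp.retrans reaches this limit, tcp_packet() caps +   the applied timeout at the RETRANS value instead of the per-state one.)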
*/ +static int nf_ct_tcp_max_retrans __read_mostly = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *const tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "SYN_SENT2", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, + [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, + [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ + [TCP_CONNTRACK_RETRANS] = 5 MINS, + [TCP_CONNTRACK_UNACK] = 5 MINS, +}; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sS2 TCP_CONNTRACK_SYN_SENT2 +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. + * + * The meanings of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection (RST) + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if we regard them as truly invalid packets + */ +static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122).
+ * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/* + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client might not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Might be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, +/* + * sNO -> sIV Never reached. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/* + * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open + * sSR -> sIG Retransmitted SYN/ACK, ignore it. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, +/* + * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK.
+ * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct tcphdr *hp; + struct tcphdr _hdr; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return true; +} + +static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) +{ + enum tcp_conntrack state; + + spin_lock_bh(&ct->lock); + state = ct->proto.tcp.state; + spin_unlock_bh(&ct->lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.sane.nl/events/sane2000/papers.html + http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid (s)ack: sack <= receiver.td_end + IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet + or ack in the case of packet without SACK option. + + The upper bound limit for a valid (s)ack is not ignored - + we don't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + unsigned int dataoff, + const struct tcphdr *tcph) +{ + /* XXX Should I use payload length field in IP/IPv6 header ? + * - YK */ + return (seq + len - dataoff - tcph->doff*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ?
(sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + const unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, + const struct tcphdr *tcph, __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + const unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED + && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode = *ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +#ifdef CONFIG_NF_NAT_NEEDED +static inline s16 nat_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); + + return get_offset != NULL ? get_offset(ct, dir, seq) : 0; +} +#define NAT_OFFSET(pf, ct, dir, seq) \ + (pf == NFPROTO_IPV4 ? 
nat_offset(ct, dir, seq) : 0) +#else +#define NAT_OFFSET(pf, ct, dir, seq) 0 +#endif + +static bool tcp_in_window(const struct nf_conn *ct, + struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u_int8_t pf) +{ + struct net *net = nf_ct_net(ct); + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; + __u32 seq, ack, sack, end, win, swin; + s16 receiver_offset; + bool res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, dataoff, tcph, &sack); + + /* Take into account NAT sequence number mangling */ + receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); + ack -= receiver_offset; + sack -= receiver_offset; + + pr_debug("tcp_in_window: START\n"); + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_maxwin == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn) { + /* + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + swin = win << sender->td_scale; + sender->td_maxwin = (swin == 0 ? 1 : swin); + sender->td_maxend = end + sender->td_maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->td_maxwin == 0) + receiver->td_end = receiver->td_maxend = sack; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, dataoff, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. 
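+ * A bare SYN is the common case here: with ack and sack faked to + * receiver->td_end, the (s)ack bounds III and IV below hold trivially, + * so such a segment is judged on its SEQ and window values alone.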
+ */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks that set ACK in RST packets as well, + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packet contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * a SYN). + */ + seq = end = sender->td_end; + + pr_debug("tcp_in_window: "); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); + pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); + + if (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) { + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + + /* + * Update receiver data. + */ + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end + && state->last_win == win) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win; + state->retrans = 0; + } + } + res = true; + } else { + res = false; + if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || + nf_ct_tcp_be_liberal) + res = true; + if (!res && LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ?
"BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + } + + pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +/* table of valid flag combinations - PUSH, ECE and CWR are always valid */ +static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| + TCPHDR_URG) + 1] = +{ + [TCPHDR_SYN] = 1, + [TCPHDR_SYN|TCPHDR_URG] = 1, + [TCPHDR_SYN|TCPHDR_ACK] = 1, + [TCPHDR_RST] = 1, + [TCPHDR_RST|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK] = 1, + [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, + [TCPHDR_ACK] = 1, + [TCPHDR_ACK|TCPHDR_URG] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + const struct tcphdr *th; + struct tcphdr _tcph; + unsigned int tcplen = skb->len - dataoff; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + */ + /* FIXME: Source route IP option packets --RR */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +static unsigned int *tcp_get_timeouts(struct net *net) +{ + return tcp_timeouts; +} + +/* Returns verdict for packet, or -1 for invalid. 
*/ +static int tcp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_tuple *tuple; + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + const struct tcphdr *th; + struct tcphdr _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + tuple = &ct->tuplehash[dir].tuple; + + switch (new_state) { + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + /* RFC 1122: "When a connection is closed actively, + * it MUST linger in TIME-WAIT state for a time 2xMSL + * (Maximum Segment Lifetime). However, it MAY accept + * a new SYN from the remote TCP to reopen the connection + * directly from TIME-WAIT state, if..." + * We ignore the conditions because we are in the + * TIME-WAIT state anyway. + * + * Handle aborted connections: we and the server + * think there is an existing connection but the client + * aborts it and starts a new one. + */ + if (((ct->proto.tcp.seen[dir].flags + | ct->proto.tcp.seen[!dir].flags) + & IP_CT_TCP_FLAG_CLOSE_INIT) + || (ct->proto.tcp.last_dir == dir + && ct->proto.tcp.last_index == TCP_RST_SET)) { + /* Attempt to reopen a closed/aborted connection. + * Delete this connection and look up again. */ + spin_unlock_bh(&ct->lock); + + /* Only repeat if we can actually remove the timer. + * Destruction may already be in progress in process + * context and we must give it a chance to terminate. + */ + if (nf_ct_kill(ct)) + return -NF_REPEAT; + return NF_DROP; + } + /* Fall through */ + case TCP_CONNTRACK_IGNORE: + /* Ignored packets: + * + * Our connection entry may be out of sync, so ignore + * packets which may signal the real connection between + * the client and the server. + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + * + * If the ignored packet is invalid, the receiver will send + * a RST we'll catch below. + */ + if (index == TCP_SYNACK_SET + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* b) This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We get in sync from the previously annotated + * values. + */ + old_state = TCP_CONNTRACK_SYN_SENT; + new_state = TCP_CONNTRACK_SYN_RECV; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = + ct->proto.tcp.last_win == 0 ? 
+ 1 : ct->proto.tcp.last_win; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = + ct->proto.tcp.last_wscale; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = + ct->proto.tcp.last_flags; + memset(&ct->proto.tcp.seen[dir], 0, + sizeof(struct ip_ct_tcp_state)); + break; + } + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + ct->proto.tcp.last_seq = ntohl(th->seq); + ct->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); + ct->proto.tcp.last_win = ntohs(th->window); + + /* a) This is a SYN in ORIGINAL. The client and the server + * may be in sync but we are not. In that case, we annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server ignores it. */ + if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { + struct ip_ct_tcp_state seen = {}; + + ct->proto.tcp.last_flags = + ct->proto.tcp.last_wscale = 0; + tcp_options(skb, dataoff, th, &seen); + if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + ct->proto.tcp.last_wscale = seen.td_scale; + } + if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_SACK_PERM; + } + } + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid packet ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), old_state); + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } + if (index == TCP_RST_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_ACK_SET)) + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* RST sent to invalid SYN or ACK we had let through + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. + * We skip window checking, because packet might ACK + * segments we ignored. */ + goto in_window; + } + /* Just fall through */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, + skb, dataoff, th, pf)) { + spin_unlock_bh(&ct->lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + + pr_debug("tcp_conntracks: "); + nf_ct_dump_tuple(tuple); + pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 
1 : 0), + old_state, new_state); + + ct->proto.tcp.state = new_state; + if (old_state != new_state + && new_state == TCP_CONNTRACK_FIN_WAIT) + ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && + timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) + timeout = timeouts[TCP_CONNTRACK_RETRANS]; + else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & + IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && + timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) + timeout = timeouts[TCP_CONNTRACK_UNACK]; + else + timeout = timeouts[new_state]; + spin_unlock_bh(&ct->lock); + + if (new_state != old_state) + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + /* If the only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + nf_ct_kill_acct(ct, ctinfo, skb); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see a valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } + nf_ct_refresh_acct(ct, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol is found. */ +static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + enum tcp_conntrack new_state; + const struct tcphdr *th; + struct tcphdr _tcph; + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; + + th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + pr_debug("nf_ct_tcp: invalid new deleting.\n"); + return false; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* SYN packet */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + } else if (nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return false; + } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet.
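+ * (This pickup path is only taken when nf_ct_tcp_loose is nonzero, + * i.e. the nf_conntrack_tcp_loose sysctl has been left at its + * default of 1; see the check above.)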
+ */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; + + /* We assume SACK and liberal window checking to handle + * window scaling */ + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; + } + + /* tcp_packet will set them */ + ct->proto.tcp.last_index = TCP_NONE_SET; + + pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return true; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, + struct nf_conn *ct) +{ + struct nlattr *nest_parms; + struct nf_ct_tcp_flags tmp = {}; + + spin_lock_bh(&ct->lock); + nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale); + + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale); + + tmp.flags = ct->proto.tcp.seen[0].flags; + NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, + sizeof(struct nf_ct_tcp_flags), &tmp); + + tmp.flags = ct->proto.tcp.seen[1].flags; + NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, + sizeof(struct nf_ct_tcp_flags), &tmp); + spin_unlock_bh(&ct->lock); + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + spin_unlock_bh(&ct->lock); + return -1; +} + +static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { + [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, + [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, + [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, +}; + +static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) +{ + struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; + struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; + int err; + + /* an update may not contain anything about the private + * protocol info; in that case skip the parsing */ + if (!pattr) + return 0; + + err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy); + if (err < 0) + return err; + + if (tb[CTA_PROTOINFO_TCP_STATE] && + nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) + return -EINVAL; + + spin_lock_bh(&ct->lock); + if (tb[CTA_PROTOINFO_TCP_STATE]) + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); + + if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { + struct nf_ct_tcp_flags *attr = + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); + ct->proto.tcp.seen[0].flags &= ~attr->mask; + ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; + } + + if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { + struct nf_ct_tcp_flags *attr = + nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); + ct->proto.tcp.seen[1].flags &= ~attr->mask; + ct->proto.tcp.seen[1].flags |= attr->flags
& attr->mask; + } + + if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && + tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && + ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && + ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.seen[0].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); + ct->proto.tcp.seen[1].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); + } + spin_unlock_bh(&ct->lock); + + return 0; +} + +static int tcp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_TCP */ + + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); +} + +static int tcp_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeouts = data; + int i; + + /* set default TCP timeouts. */ + for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) + timeouts[i] = tcp_timeouts[i]; + + if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { + timeouts[TCP_CONNTRACK_SYN_SENT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { + timeouts[TCP_CONNTRACK_SYN_RECV] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { + timeouts[TCP_CONNTRACK_ESTABLISHED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { + timeouts[TCP_CONNTRACK_FIN_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { + timeouts[TCP_CONNTRACK_CLOSE_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { + timeouts[TCP_CONNTRACK_LAST_ACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { + timeouts[TCP_CONNTRACK_TIME_WAIT] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_CLOSE]) { + timeouts[TCP_CONNTRACK_CLOSE] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { + timeouts[TCP_CONNTRACK_SYN_SENT2] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_RETRANS]) { + timeouts[TCP_CONNTRACK_RETRANS] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; + } + if (tb[CTA_TIMEOUT_TCP_UNACK]) { + timeouts[TCP_CONNTRACK_UNACK] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; + } + return 0; +} + +static int +tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_SYN_SENT, + htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_SYN_RECV, + htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, + htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, + htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, + htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_LAST_ACK, + htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, + htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_CLOSE, + htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, + 
htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_RETRANS, + htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_TCP_UNACK, + htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { + [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, + [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static unsigned int tcp_sysctl_table_users; +static struct ctl_table_header *tcp_sysctl_header; +static struct ctl_table tcp_sysctl_table[] = { + { + .procname = "nf_conntrack_tcp_timeout_syn_sent", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_syn_recv", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_established", + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_fin_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_close_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_last_ack", + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_time_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_close", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_max_retrans", + .data = &tcp_timeouts[TCP_CONNTRACK_RETRANS], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + .data = &tcp_timeouts[TCP_CONNTRACK_UNACK], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
proc_dointvec, + }, + { } +}; + +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table tcp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_tcp_timeout_syn_sent", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_recv", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_established", + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_fin_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_close_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_last_ack", + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_time_wait", + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_close", + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &tcp_timeouts[TCP_CONNTRACK_RETRANS], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_loose", + .data = &nf_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_tcp_be_liberal", + .data = &nf_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ip_conntrack_tcp_max_retrans", + .data = &nf_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, + .new = tcp_new, + .error = tcp_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + 
.obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy = tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &tcp_sysctl_table_users, + .ctl_table_header = &tcp_sysctl_header, + .ctl_table = tcp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = tcp_compat_sysctl_table, +#endif +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); + +struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .get_timeouts = tcp_get_timeouts, + .new = tcp_new, + .error = tcp_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, + .from_nlattr = nlattr_to_tcp, + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = tcp_timeout_nlattr_to_obj, + .obj_to_nlattr = tcp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_TCP_MAX, + .obj_size = sizeof(unsigned int) * + TCP_CONNTRACK_TIMEOUT_MAX, + .nla_policy = tcp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &tcp_sysctl_table_users, + .ctl_table_header = &tcp_sysctl_header, + .ctl_table = tcp_sysctl_table, +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c new file mode 100644 index 00000000..a9073dc1 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -0,0 +1,313 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/checksum.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/ipv4/nf_conntrack_ipv4.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> + +enum udp_conntrack { + UDP_CT_UNREPLIED, + UDP_CT_REPLIED, + UDP_CT_MAX +}; + +static unsigned int udp_timeouts[UDP_CT_MAX] = { + [UDP_CT_UNREPLIED] = 30*HZ, + [UDP_CT_REPLIED] = 180*HZ, +}; + +static bool udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct udphdr *hp; + struct udphdr _hdr; + + /* Actually only need first 8 bytes. 
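+   The source and destination ports are the first four bytes; the length +   and checksum fields that complete the 8-byte UDP header are not used +   for the tuple.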
*/ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return true; +} + +static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +static unsigned int *udp_get_timeouts(struct net *net) +{ + return udp_timeouts; +} + +/* Returns verdict for packet, and may modify the conntrack type */ +static int udp_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_REPLIED]); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_UNREPLIED]); + } + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol is found. */ +static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + return true; +} + +static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + * FIXME: Source route IP option packets --RR */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { + if (LOG_INVALID(net, IPPROTO_UDP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udp_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeouts = data; + + /* set default timeouts for UDP.
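+   These mirror udp_timeouts[] above: 30 seconds while unreplied, 180 +   seconds once traffic has been seen in both directions.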
*/ + timeouts[UDP_CT_UNREPLIED] = udp_timeouts[UDP_CT_UNREPLIED]; + timeouts[UDP_CT_REPLIED] = udp_timeouts[UDP_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDP_UNREPLIED]) { + timeouts[UDP_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDP_REPLIED]) { + timeouts[UDP_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDP_REPLIED])) * HZ; + } + return 0; +} + +static int +udp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_UDP_UNREPLIED, + htonl(timeouts[UDP_CT_UNREPLIED] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_UDP_REPLIED, + htonl(timeouts[UDP_CT_REPLIED] / HZ)); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udp_timeout_nla_policy[CTA_TIMEOUT_UDP_MAX+1] = { + [CTA_TIMEOUT_UDP_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDP_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static unsigned int udp_sysctl_table_users; +static struct ctl_table_header *udp_sysctl_header; +static struct ctl_table udp_sysctl_table[] = { + { + .procname = "nf_conntrack_udp_timeout", + .data = &udp_timeouts[UDP_CT_UNREPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_udp_timeout_stream", + .data = &udp_timeouts[UDP_CT_REPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT +static struct ctl_table udp_compat_sysctl_table[] = { + { + .procname = "ip_conntrack_udp_timeout", + .data = &udp_timeouts[UDP_CT_UNREPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_udp_timeout_stream", + .data = &udp_timeouts[UDP_CT_REPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .packet = udp_packet, + .get_timeouts = udp_get_timeouts, + .new = udp_new, + .error = udp_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udp_sysctl_table_users, + .ctl_table_header = &udp_sysctl_header, + .ctl_table = udp_sysctl_table, +#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT + .ctl_compat_table = udp_compat_sysctl_table, +#endif +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); + +struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = 
udp_print_tuple, + .packet = udp_packet, + .get_timeouts = udp_get_timeouts, + .new = udp_new, + .error = udp_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udp_timeout_nlattr_to_obj, + .obj_to_nlattr = udp_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDP_MAX, + .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, + .nla_policy = udp_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udp_sysctl_table_users, + .ctl_table_header = &udp_sysctl_header, + .ctl_table = udp_sysctl_table, +#endif +}; +EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c new file mode 100644 index 00000000..e0606392 --- /dev/null +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -0,0 +1,323 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/checksum.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_log.h> + +enum udplite_conntrack { + UDPLITE_CT_UNREPLIED, + UDPLITE_CT_REPLIED, + UDPLITE_CT_MAX +}; + +static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { + [UDPLITE_CT_UNREPLIED] = 30*HZ, + [UDPLITE_CT_REPLIED] = 180*HZ, +}; + +static bool udplite_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct udphdr *hp; + struct udphdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + return true; +} + +static bool udplite_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int udplite_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +static unsigned int *udplite_get_timeouts(struct net *net) +{ + return udplite_timeouts; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udplite_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeouts) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. 
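+ * Entries that reach ASSURED state are also exempt from early
+ * eviction when the conntrack table runs full.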
*/ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDPLITE_CT_REPLIED]); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDPLITE_CT_UNREPLIED]); + } + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + return true; +} + +static int udplite_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info *ctinfo, + u_int8_t pf, + unsigned int hooknum) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + unsigned int cscov; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: short packet "); + return -NF_ACCEPT; + } + + cscov = ntohs(hdr->len); + if (cscov == 0) + cscov = udplen; + else if (cscov < sizeof(*hdr) || cscov > udplen) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: invalid checksum coverage "); + return -NF_ACCEPT; + } + + /* UDPLITE mandates checksums */ + if (!hdr->check) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: checksum missing "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. */ + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, + pf)) { + if (LOG_INVALID(net, IPPROTO_UDPLITE)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_udplite: bad UDPLite checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int udplite_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeouts = data; + + /* set default timeouts for UDPlite. 
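+ * Same scheme as UDP above: userspace supplies seconds, stored as
+ * jiffies.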
*/ + timeouts[UDPLITE_CT_UNREPLIED] = udplite_timeouts[UDPLITE_CT_UNREPLIED]; + timeouts[UDPLITE_CT_REPLIED] = udplite_timeouts[UDPLITE_CT_REPLIED]; + + if (tb[CTA_TIMEOUT_UDPLITE_UNREPLIED]) { + timeouts[UDPLITE_CT_UNREPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_UNREPLIED])) * HZ; + } + if (tb[CTA_TIMEOUT_UDPLITE_REPLIED]) { + timeouts[UDPLITE_CT_REPLIED] = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_UDPLITE_REPLIED])) * HZ; + } + return 0; +} + +static int +udplite_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeouts = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_UDPLITE_UNREPLIED, + htonl(timeouts[UDPLITE_CT_UNREPLIED] / HZ)); + NLA_PUT_BE32(skb, CTA_TIMEOUT_UDPLITE_REPLIED, + htonl(timeouts[UDPLITE_CT_REPLIED] / HZ)); + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +udplite_timeout_nla_policy[CTA_TIMEOUT_UDPLITE_MAX+1] = { + [CTA_TIMEOUT_UDPLITE_UNREPLIED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_UDPLITE_REPLIED] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static unsigned int udplite_sysctl_table_users; +static struct ctl_table_header *udplite_sysctl_header; +static struct ctl_table udplite_sysctl_table[] = { + { + .procname = "nf_conntrack_udplite_timeout", + .data = &udplite_timeouts[UDPLITE_CT_UNREPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_udplite_timeout_stream", + .data = &udplite_timeouts[UDPLITE_CT_REPLIED], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = +{ + .l3proto = PF_INET, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, + .new = udplite_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udplite_sysctl_table_users, + .ctl_table_header = &udplite_sysctl_header, + .ctl_table = udplite_sysctl_table, +#endif +}; + +static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_UDPLITE, + .name = "udplite", + .pkt_to_tuple = udplite_pkt_to_tuple, + .invert_tuple = udplite_invert_tuple, + .print_tuple = udplite_print_tuple, + .packet = udplite_packet, + .get_timeouts = udplite_get_timeouts, + .new = udplite_new, + .error = udplite_error, +#if IS_ENABLED(CONFIG_NF_CT_NETLINK) + .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, + .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, + .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nla_policy = nf_ct_port_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + 
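+ /* identical to the IPv4 variant above */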
.nlattr_to_obj = udplite_timeout_nlattr_to_obj, + .obj_to_nlattr = udplite_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_UDPLITE_MAX, + .obj_size = sizeof(unsigned int) * + CTA_TIMEOUT_UDPLITE_MAX, + .nla_policy = udplite_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_users = &udplite_sysctl_table_users, + .ctl_table_header = &udplite_sysctl_header, + .ctl_table = udplite_sysctl_table, +#endif +}; + +static int __init nf_conntrack_proto_udplite_init(void) +{ + int err; + + err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite4); + if (err < 0) + goto err1; + err = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udplite6); + if (err < 0) + goto err2; + return 0; +err2: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +err1: + return err; +} + +static void __exit nf_conntrack_proto_udplite_exit(void) +{ + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udplite4); +} + +module_init(nf_conntrack_proto_udplite_init); +module_exit(nf_conntrack_proto_udplite_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c new file mode 100644 index 00000000..8501823b --- /dev/null +++ b/net/netfilter/nf_conntrack_sane.c @@ -0,0 +1,238 @@ +/* SANE connection tracking helper + * (SANE = Scanner Access Now Easy) + * For documentation about the SANE network protocol see + * http://www.sane-project.org/html/doc015.html + */ + +/* Copyright (C) 2007 Red Hat, Inc. + * Author: Michal Schmidt <mschmidt@redhat.com> + * Based on the FTP conntrack helper (net/netfilter/nf_conntrack_ftp.c): + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> + * (C) 2003 Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/in.h> +#include <linux/tcp.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <linux/netfilter/nf_conntrack_sane.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Schmidt <mschmidt@redhat.com>"); +MODULE_DESCRIPTION("SANE connection tracking helper"); +MODULE_ALIAS_NFCT_HELPER("sane"); + +static char *sane_buffer; + +static DEFINE_SPINLOCK(nf_sane_lock); + +#define MAX_PORTS 8 +static u_int16_t ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); + +struct sane_request { + __be32 RPC_code; +#define SANE_NET_START 7 /* RPC code */ + + __be32 handle; +}; + +struct sane_reply_net_start { + __be32 status; +#define SANE_STATUS_SUCCESS 0 + + __be16 zero; + __be16 port; + /* other fields aren't interesting for conntrack */ +}; + +static int help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const struct tcphdr *th; + struct tcphdr _tcph; + void *sb_ptr; + int ret = NF_ACCEPT; + int dir = CTINFO2DIR(ctinfo); + struct nf_ct_sane_master *ct_sane_info; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + struct sane_request *req; + struct sane_reply_net_start *reply; + + ct_sane_info = &nfct_help(ct)->help.ct_sane_info; + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* Not a full tcp header? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? */ + dataoff = protoff + th->doff * 4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + datalen = skb->len - dataoff; + + spin_lock_bh(&nf_sane_lock); + sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); + BUG_ON(sb_ptr == NULL); + + if (dir == IP_CT_DIR_ORIGINAL) { + if (datalen != sizeof(struct sane_request)) + goto out; + + req = sb_ptr; + if (req->RPC_code != htonl(SANE_NET_START)) { + /* Not an interesting command */ + ct_sane_info->state = SANE_STATE_NORMAL; + goto out; + } + + /* We're interested in the next reply */ + ct_sane_info->state = SANE_STATE_START_REQUESTED; + goto out; + } + + /* Is it a reply to an uninteresting command? */ + if (ct_sane_info->state != SANE_STATE_START_REQUESTED) + goto out; + + /* It's a reply to SANE_NET_START. */ + ct_sane_info->state = SANE_STATE_NORMAL; + + if (datalen < sizeof(struct sane_reply_net_start)) { + pr_debug("nf_ct_sane: NET_START reply too short\n"); + goto out; + } + + reply = sb_ptr; + if (reply->status != htonl(SANE_STATUS_SUCCESS)) { + /* saned refused the command */ + pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n", + ntohl(reply->status)); + goto out; + } + + /* Invalid saned reply? Ignore it. */ + if (reply->zero != 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_TCP, NULL, &reply->port); + + pr_debug("nf_ct_sane: expect: "); + nf_ct_dump_tuple(&exp->tuple); + + /* Can't expect this? Best to drop packet now. 
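+ * (nf_ct_expect_related() typically fails when a conflicting
+ * expectation already exists or the helper's expectation limit
+ * has been reached.)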
*/ + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + + nf_ct_expect_put(exp); + +out: + spin_unlock_bh(&nf_sane_lock); + return ret; +} + +static struct nf_conntrack_helper sane[MAX_PORTS][2] __read_mostly; +static char sane_names[MAX_PORTS][2][sizeof("sane-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy sane_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +/* don't make this __exit, since it's called from __init ! */ +static void nf_conntrack_sane_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) { + pr_debug("nf_ct_sane: unregistering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_helper_unregister(&sane[i][j]); + } + } + + kfree(sane_buffer); +} + +static int __init nf_conntrack_sane_init(void) +{ + int i, j = -1, ret = 0; + char *tmpname; + + sane_buffer = kmalloc(65536, GFP_KERNEL); + if (!sane_buffer) + return -ENOMEM; + + if (ports_c == 0) + ports[ports_c++] = SANE_PORT; + + /* FIXME should be configurable whether IPv4 and IPv6 connections + are tracked or not - YK */ + for (i = 0; i < ports_c; i++) { + sane[i][0].tuple.src.l3num = PF_INET; + sane[i][1].tuple.src.l3num = PF_INET6; + for (j = 0; j < 2; j++) { + sane[i][j].tuple.src.u.tcp.port = htons(ports[i]); + sane[i][j].tuple.dst.protonum = IPPROTO_TCP; + sane[i][j].expect_policy = &sane_exp_policy; + sane[i][j].me = THIS_MODULE; + sane[i][j].help = help; + tmpname = &sane_names[i][j][0]; + if (ports[i] == SANE_PORT) + sprintf(tmpname, "sane"); + else + sprintf(tmpname, "sane-%d", ports[i]); + sane[i][j].name = tmpname; + + pr_debug("nf_ct_sane: registering helper for pf: %d " + "port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + ret = nf_conntrack_helper_register(&sane[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_sane: failed to " + "register helper for pf: %d port: %d\n", + sane[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_sane_fini(); + return ret; + } + } + } + + return 0; +} + +module_init(nf_conntrack_sane_init); +module_exit(nf_conntrack_sane_fini); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c new file mode 100644 index 00000000..93faf6a3 --- /dev/null +++ b/net/netfilter/nf_conntrack_sip.c @@ -0,0 +1,1610 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_conntrack_ftp.c and other modules. + * (C) 2007 United Security Providers + * (C) 2007, 2008 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/tcp.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <linux/netfilter/nf_conntrack_sip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP connection tracking helper"); +MODULE_ALIAS("ip_conntrack_sip"); +MODULE_ALIAS_NFCT_HELPER("sip"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of SIP servers"); + +static unsigned int sip_timeout __read_mostly = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +static int sip_direct_signalling __read_mostly = 1; +module_param(sip_direct_signalling, int, 0600); +MODULE_PARM_DESC(sip_direct_signalling, "expect incoming calls from registrar " + "only (default 1)"); + +static int sip_direct_media __read_mostly = 1; +module_param(sip_direct_media, int, 0600); +MODULE_PARM_DESC(sip_direct_media, "Expect Media streams between signalling " + "endpoints only (default 1)"); + +unsigned int (*nf_nat_sip_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_hook); + +void (*nf_nat_sip_seq_adjust_hook)(struct sk_buff *skb, s16 off) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_seq_adjust_hook); + +unsigned int (*nf_nat_sip_expect_hook)(struct sk_buff *skb, + unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + struct nf_conntrack_expect *exp, + unsigned int matchoff, + unsigned int matchlen) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sip_expect_hook); + +unsigned int (*nf_nat_sdp_addr_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int sdpoff, + enum sdp_header_types type, + enum sdp_header_types term, + const union nf_inet_addr *addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_addr_hook); + +unsigned int (*nf_nat_sdp_port_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int matchoff, + unsigned int matchlen, + u_int16_t port) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_port_hook); + +unsigned int (*nf_nat_sdp_session_hook)(struct sk_buff *skb, + unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + unsigned int sdpoff, + const union nf_inet_addr *addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_session_hook); + +unsigned int (*nf_nat_sdp_media_hook)(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, + unsigned int *datalen, + struct nf_conntrack_expect *rtp_exp, + struct nf_conntrack_expect *rtcp_exp, + unsigned int mediaoff, + unsigned int medialen, + union nf_inet_addr *rtp_addr) + __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_sdp_media_hook); + +static int string_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = 0; + + while (dptr < limit && isalpha(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int digits_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ 
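+ /* Count the run of decimal digits at dptr; used for numeric
+ * headers such as CSeq and Expires, and for port numbers. */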
+ int len = 0; + while (dptr < limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int iswordc(const char c) +{ + if (isalnum(c) || c == '!' || c == '"' || c == '%' || + (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' || + c == '{' || c == '}' || c == '~') + return 1; + return 0; +} + +static int word_len(const char *dptr, const char *limit) +{ + int len = 0; + while (dptr < limit && iswordc(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int callid_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len, domain_len; + + len = word_len(dptr, limit); + dptr += len; + if (!len || dptr == limit || *dptr != '@') + return len; + dptr++; + len++; + + domain_len = word_len(dptr, limit); + if (!domain_len) + return 0; + return len + domain_len; +} + +/* get media type + port length */ +static int media_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len = string_len(ct, dptr, limit, shift); + + dptr += len; + if (dptr >= limit || *dptr != ' ') + return 0; + len++; + dptr++; + + return len + digits_len(ct, dptr, limit, shift); +} + +static int parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret = 0; + + if (!ct) + return 0; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0 || end == cp) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int epaddr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(ct, dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + const char *start = dptr; + int s = *shift; + + /* Search for @, but stop at the end of the line. + * We are inside a sip: URI, so we don't need to worry about + * continuation lines. */ + while (dptr < limit && + *dptr != '@' && *dptr != '\r' && *dptr != '\n') { + (*shift)++; + dptr++; + } + + if (dptr < limit && *dptr == '@') { + dptr++; + (*shift)++; + } else { + dptr = start; + *shift = s; + } + + return epaddr_len(ct, dptr, limit, shift); +} + +/* Parse a SIP request line of the form: + * + * Request-Line = Method SP Request-URI SP SIP-Version CRLF + * + * and return the offset and length of the address contained in the Request-URI. 
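+ *
+ * Example (a hypothetical request; the address is from the
+ * RFC 5737 documentation range):
+ *
+ *   INVITE sip:user@192.0.2.10:5060 SIP/2.0
+ *
+ * Here matchoff/matchlen cover "192.0.2.10:5060" and *port is set
+ * to htons(5060).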
+ */ +int ct_sip_parse_request(const struct nf_conn *ct, + const char *dptr, unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) +{ + const char *start = dptr, *limit = dptr + datalen, *end; + unsigned int mlen; + unsigned int p; + int shift = 0; + + /* Skip method and following whitespace */ + mlen = string_len(ct, dptr, limit, NULL); + if (!mlen) + return 0; + dptr += mlen; + if (++dptr >= limit) + return 0; + + /* Find SIP URI */ + for (; dptr < limit - strlen("sip:"); dptr++) { + if (*dptr == '\r' || *dptr == '\n') + return -1; + if (strnicmp(dptr, "sip:", strlen("sip:")) == 0) { + dptr += strlen("sip:"); + break; + } + } + if (!skp_epaddr_len(ct, dptr, limit, &shift)) + return 0; + dptr += shift; + + if (!parse_addr(ct, dptr, &end, addr, limit)) + return -1; + if (end < limit && *end == ':') { + end++; + p = simple_strtoul(end, (char **)&end, 10); + if (p < 1024 || p > 65535) + return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (end == dptr) + return 0; + *matchoff = dptr - start; + *matchlen = end - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_request); + +/* SIP header parsing: SIP headers are located at the beginning of a line, but + * may span several lines, in which case the continuation lines begin with a + * whitespace character. RFC 2543 allows lines to be terminated with CR, LF or + * CRLF, RFC 3261 allows only CRLF, we support both. + * + * Headers are followed by (optionally) whitespace, a colon, again (optionally) + * whitespace and the values. Whitespace in this context means any amount of + * tabs, spaces and continuation lines, which are treated as a single whitespace + * character. + * + * Some headers may appear multiple times. A comma separated list of values is + * equivalent to multiple headers. + */ +static const struct sip_header ct_sip_hdrs[] = { + [SIP_HDR_CSEQ] = SIP_HDR("CSeq", NULL, NULL, digits_len), + [SIP_HDR_FROM] = SIP_HDR("From", "f", "sip:", skp_epaddr_len), + [SIP_HDR_TO] = SIP_HDR("To", "t", "sip:", skp_epaddr_len), + [SIP_HDR_CONTACT] = SIP_HDR("Contact", "m", "sip:", skp_epaddr_len), + [SIP_HDR_VIA_UDP] = SIP_HDR("Via", "v", "UDP ", epaddr_len), + [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len), + [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len), + [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len), + [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len), +}; + +static const char *sip_follow_continuation(const char *dptr, const char *limit) +{ + /* Walk past newline */ + if (++dptr >= limit) + return NULL; + + /* Skip '\n' in CR LF */ + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + return NULL; + } + + /* Continuation line? 
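+ * (RFC 3261 line folding: a header line that begins with a space
+ * or tab continues the previous line.)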
*/ + if (*dptr != ' ' && *dptr != '\t') + return NULL; + + /* skip leading whitespace */ + for (; dptr < limit; dptr++) { + if (*dptr != ' ' && *dptr != '\t') + break; + } + return dptr; +} + +static const char *sip_skip_whitespace(const char *dptr, const char *limit) +{ + for (; dptr < limit; dptr++) { + if (*dptr == ' ') + continue; + if (*dptr != '\r' && *dptr != '\n') + break; + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + return NULL; + } + return dptr; +} + +/* Search within a SIP header value, dealing with continuation lines */ +static const char *ct_sip_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') { + dptr = sip_follow_continuation(dptr, limit); + if (dptr == NULL) + break; + continue; + } + + if (strnicmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +int ct_sip_get_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; + } + + /* Skip continuation lines */ + if (*dptr == ' ' || *dptr == '\t') + continue; + + /* Find header. Compact headers must be followed by a + * non-alphabetic character to avoid mismatches. */ + if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else if (hdr->cname && limit - dptr >= hdr->clen + 1 && + strnicmp(dptr, hdr->cname, hdr->clen) == 0 && + !isalpha(*(dptr + hdr->clen))) + dptr += hdr->clen; + else + continue; + + /* Find and skip colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + if (*dptr != ':' || ++dptr >= limit) + break; + + /* Skip whitespace after colon */ + dptr = sip_skip_whitespace(dptr, limit); + if (dptr == NULL) + break; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sip_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff = dptr - start + shift; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ct_sip_get_header); + +/* Get next header field in a list of comma separated values */ +static int ct_sip_next_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sip_hdrs[type]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + dptr += dataoff; + + dptr = ct_sip_header_search(dptr, limit, ",", strlen(",")); + if (!dptr) + return 0; + + dptr = ct_sip_header_search(dptr, limit, hdr->search, hdr->slen); + if (!dptr) + return 0; + dptr += hdr->slen; + + *matchoff = dptr - start; + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff += shift; + return 1; +} + +/* Walk through headers until a parsable one is found or no header of the + * given type is left. 
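+ * Headers that fail to parse (return value -1) are skipped by
+ * advancing dataoff past them and retrying.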
*/ +static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen) +{ + int ret; + + if (in_header && *in_header) { + while (1) { + ret = ct_sip_next_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + return ret; + if (ret == 0) + break; + dataoff += *matchoff; + } + *in_header = 0; + } + + while (1) { + ret = ct_sip_get_header(ct, dptr, dataoff, datalen, + type, matchoff, matchlen); + if (ret > 0) + break; + if (ret == 0) + return ret; + dataoff += *matchoff; + } + + if (in_header) + *in_header = 1; + return 1; +} + +/* Locate a SIP header, parse the URI and return the offset and length of + * the address as well as the address and port themselves. A stream of + * headers can be parsed by handing in a non-NULL datalen and in_header + * pointer. + */ +int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, + unsigned int *dataoff, unsigned int datalen, + enum sip_header_types type, int *in_header, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr, __be16 *port) +{ + const char *c, *limit = dptr + datalen; + unsigned int p; + int ret; + + ret = ct_sip_walk_headers(ct, dptr, dataoff ? *dataoff : 0, datalen, + type, in_header, matchoff, matchlen); + WARN_ON(ret < 0); + if (ret == 0) + return ret; + + if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + return -1; + if (*c == ':') { + c++; + p = simple_strtoul(c, (char **)&c, 10); + if (p < 1024 || p > 65535) + return -1; + *port = htons(p); + } else + *port = htons(SIP_PORT); + + if (dataoff) + *dataoff = c - dptr; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_header_uri); + +static int ct_sip_parse_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen) +{ + const char *limit = dptr + datalen; + const char *start; + const char *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + start += strlen(name); + + end = ct_sip_header_search(start, limit, ";", strlen(";")); + if (!end) + end = limit; + + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} + +/* Parse address from header parameter and return address, offset and length */ +int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr) +{ + const char *limit = dptr + datalen; + const char *start, *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + if (!parse_addr(ct, start, &end, addr, limit)) + return 0; + *matchoff = start - dptr; + *matchlen = end - start; + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_address_param); + +/* Parse numerical header parameter and return value, offset and length */ +int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + const char *name, + unsigned int *matchoff, unsigned int *matchlen, + unsigned int *val) +{ + 
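+ /* Parses parameters such as "expires=3600": *val receives the
+ * number; matchoff/matchlen, when non-NULL, receive its position
+ * within dptr. */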
const char *limit = dptr + datalen; + const char *start; + char *end; + + limit = ct_sip_header_search(dptr + dataoff, limit, ",", strlen(",")); + if (!limit) + limit = dptr + datalen; + + start = ct_sip_header_search(dptr + dataoff, limit, name, strlen(name)); + if (!start) + return 0; + + start += strlen(name); + *val = simple_strtoul(start, &end, 0); + if (start == end) + return 0; + if (matchoff && matchlen) { + *matchoff = start - dptr; + *matchlen = end - start; + } + return 1; +} +EXPORT_SYMBOL_GPL(ct_sip_parse_numerical_param); + +static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + u8 *proto) +{ + unsigned int matchoff, matchlen; + + if (ct_sip_parse_param(ct, dptr, dataoff, datalen, "transport=", + &matchoff, &matchlen)) { + if (!strnicmp(dptr + matchoff, "TCP", strlen("TCP"))) + *proto = IPPROTO_TCP; + else if (!strnicmp(dptr + matchoff, "UDP", strlen("UDP"))) + *proto = IPPROTO_UDP; + else + return 0; + + if (*proto != nf_ct_protonum(ct)) + return 0; + } else + *proto = nf_ct_protonum(ct); + + return 1; +} + +/* SDP header parsing: a SDP session description contains an ordered set of + * headers, starting with a section containing general session parameters, + * optionally followed by multiple media descriptions. + * + * SDP headers always start at the beginning of a line. According to RFC 2327: + * "The sequence CRLF (0x0d0a) is used to end a record, although parsers should + * be tolerant and also accept records terminated with a single newline + * character". We handle both cases. + */ +static const struct sip_header ct_sdp_hdrs[] = { + [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), +}; + +/* Linear string search within SDP header values */ +static const char *ct_sdp_header_search(const char *dptr, const char *limit, + const char *needle, unsigned int len) +{ + for (limit -= len; dptr < limit; dptr++) { + if (*dptr == '\r' || *dptr == '\n') + break; + if (strncmp(dptr, needle, len) == 0) + return dptr; + } + return NULL; +} + +/* Locate a SDP header (optionally a substring within the header value), + * optionally stopping at the first occurrence of the term header, parse + * it and return the offset and length of the data we're interested in. 
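+ *
+ * Example: with type SDP_HDR_CONNECTION_IP4 the line
+ *
+ *   c=IN IP4 192.0.2.20
+ *
+ * matches, and matchoff/matchlen cover "192.0.2.20".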
+ */ +int ct_sip_get_sdp_header(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen) +{ + const struct sip_header *hdr = &ct_sdp_hdrs[type]; + const struct sip_header *thdr = &ct_sdp_hdrs[term]; + const char *start = dptr, *limit = dptr + datalen; + int shift = 0; + + for (dptr += dataoff; dptr < limit; dptr++) { + /* Find beginning of line */ + if (*dptr != '\r' && *dptr != '\n') + continue; + if (++dptr >= limit) + break; + if (*(dptr - 1) == '\r' && *dptr == '\n') { + if (++dptr >= limit) + break; + } + + if (term != SDP_HDR_UNSPEC && + limit - dptr >= thdr->len && + strnicmp(dptr, thdr->name, thdr->len) == 0) + break; + else if (limit - dptr >= hdr->len && + strnicmp(dptr, hdr->name, hdr->len) == 0) + dptr += hdr->len; + else + continue; + + *matchoff = dptr - start; + if (hdr->search) { + dptr = ct_sdp_header_search(dptr, limit, hdr->search, + hdr->slen); + if (!dptr) + return -1; + dptr += hdr->slen; + } + + *matchlen = hdr->match_len(ct, dptr, limit, &shift); + if (!*matchlen) + return -1; + *matchoff = dptr - start + shift; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ct_sip_get_sdp_header); + +static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, + unsigned int dataoff, unsigned int datalen, + enum sdp_header_types type, + enum sdp_header_types term, + unsigned int *matchoff, unsigned int *matchlen, + union nf_inet_addr *addr) +{ + int ret; + + ret = ct_sip_get_sdp_header(ct, dptr, dataoff, datalen, type, term, + matchoff, matchlen); + if (ret <= 0) + return ret; + + if (!parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) + return -1; + return 1; +} + +static int refresh_signalling_expectation(struct nf_conn *ct, + union nf_inet_addr *addr, + u8 proto, __be16 port, + unsigned int expires) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + int found = 0; + + spin_lock_bh(&nf_conntrack_lock); + hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + if (exp->class != SIP_EXPECT_SIGNALLING || + !nf_inet_addr_cmp(&exp->tuple.dst.u3, addr) || + exp->tuple.dst.protonum != proto || + exp->tuple.dst.u.udp.port != port) + continue; + if (!del_timer(&exp->timeout)) + continue; + exp->flags &= ~NF_CT_EXPECT_INACTIVE; + exp->timeout.expires = jiffies + expires * HZ; + add_timer(&exp->timeout); + found = 1; + break; + } + spin_unlock_bh(&nf_conntrack_lock); + return found; +} + +static void flush_expectations(struct nf_conn *ct, bool media) +{ + struct nf_conn_help *help = nfct_help(ct); + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + spin_lock_bh(&nf_conntrack_lock); + hlist_for_each_entry_safe(exp, n, next, &help->expectations, lnode) { + if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) + continue; + if (!del_timer(&exp->timeout)) + continue; + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + if (!media) + break; + } + spin_unlock_bh(&nf_conntrack_lock); +} + +static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + union nf_inet_addr *daddr, __be16 port, + enum sip_expectation_classes class, + unsigned int mediaoff, unsigned int medialen) +{ + struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct net *net = nf_ct_net(ct); + enum 
ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + union nf_inet_addr *saddr; + struct nf_conntrack_tuple tuple; + int direct_rtp = 0, skip_expect = 0, ret = NF_DROP; + u_int16_t base_port; + __be16 rtp_port, rtcp_port; + typeof(nf_nat_sdp_port_hook) nf_nat_sdp_port; + typeof(nf_nat_sdp_media_hook) nf_nat_sdp_media; + + saddr = NULL; + if (sip_direct_media) { + if (!nf_inet_addr_cmp(daddr, &ct->tuplehash[dir].tuple.src.u3)) + return NF_ACCEPT; + saddr = &ct->tuplehash[!dir].tuple.src.u3; + } + + /* We need to check whether the registration exists before attempting + * to register it since we can see the same media description multiple + * times on different connections in case multiple endpoints receive + * the same call. + * + * RTP optimization: if we find a matching media channel expectation + * and both the expectation and this connection are SNATed, we assume + * both sides can reach each other directly and use the final + * destination address from the expectation. We still need to keep + * the NATed expectations for media that might arrive from the + * outside, and additionally need to expect the direct RTP stream + * in case it passes through us even without NAT. + */ + memset(&tuple, 0, sizeof(tuple)); + if (saddr) + tuple.src.u3 = *saddr; + tuple.src.l3num = nf_ct_l3num(ct); + tuple.dst.protonum = IPPROTO_UDP; + tuple.dst.u3 = *daddr; + tuple.dst.u.udp.port = port; + + rcu_read_lock(); + do { + exp = __nf_ct_expect_find(net, nf_ct_zone(ct), &tuple); + + if (!exp || exp->master == ct || + nfct_help(exp->master)->helper != nfct_help(ct)->helper || + exp->class != class) + break; +#ifdef CONFIG_NF_NAT_NEEDED + if (exp->tuple.src.l3num == AF_INET && !direct_rtp && + (exp->saved_ip != exp->tuple.dst.u3.ip || + exp->saved_proto.udp.port != exp->tuple.dst.u.udp.port) && + ct->status & IPS_NAT_MASK) { + daddr->ip = exp->saved_ip; + tuple.dst.u3.ip = exp->saved_ip; + tuple.dst.u.udp.port = exp->saved_proto.udp.port; + direct_rtp = 1; + } else +#endif + skip_expect = 1; + } while (!skip_expect); + rcu_read_unlock(); + + base_port = ntohs(tuple.dst.u.udp.port) & ~1; + rtp_port = htons(base_port); + rtcp_port = htons(base_port + 1); + + if (direct_rtp) { + nf_nat_sdp_port = rcu_dereference(nf_nat_sdp_port_hook); + if (nf_nat_sdp_port && + !nf_nat_sdp_port(skb, dataoff, dptr, datalen, + mediaoff, medialen, ntohs(rtp_port))) + goto err1; + } + + if (skip_expect) + return NF_ACCEPT; + + rtp_exp = nf_ct_expect_alloc(ct); + if (rtp_exp == NULL) + goto err1; + nf_ct_expect_init(rtp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtp_port); + + rtcp_exp = nf_ct_expect_alloc(ct); + if (rtcp_exp == NULL) + goto err2; + nf_ct_expect_init(rtcp_exp, class, nf_ct_l3num(ct), saddr, daddr, + IPPROTO_UDP, NULL, &rtcp_port); + + nf_nat_sdp_media = rcu_dereference(nf_nat_sdp_media_hook); + if (nf_nat_sdp_media && ct->status & IPS_NAT_MASK && !direct_rtp) + ret = nf_nat_sdp_media(skb, dataoff, dptr, datalen, + rtp_exp, rtcp_exp, + mediaoff, medialen, daddr); + else { + if (nf_ct_expect_related(rtp_exp) == 0) { + if (nf_ct_expect_related(rtcp_exp) != 0) + nf_ct_unexpect_related(rtp_exp); + else + ret = NF_ACCEPT; + } + } + nf_ct_expect_put(rtcp_exp); +err2: + nf_ct_expect_put(rtp_exp); +err1: + return ret; +} + +static const struct sdp_media_type sdp_media_types[] = { + SDP_MEDIA_TYPE("audio ", SIP_EXPECT_AUDIO), + SDP_MEDIA_TYPE("video ", SIP_EXPECT_VIDEO), + SDP_MEDIA_TYPE("image ", SIP_EXPECT_IMAGE), +}; + +static const struct sdp_media_type *sdp_media_type(const char *dptr, + unsigned int 
matchoff, + unsigned int matchlen) +{ + const struct sdp_media_type *t; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sdp_media_types); i++) { + t = &sdp_media_types[i]; + if (matchlen < t->len || + strncmp(dptr + matchoff, t->name, t->len)) + continue; + return t; + } + return NULL; +} + +static int process_sdp(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + unsigned int mediaoff, medialen; + unsigned int sdpoff; + unsigned int caddr_len, maddr_len; + unsigned int i; + union nf_inet_addr caddr, maddr, rtp_addr; + unsigned int port; + enum sdp_header_types c_hdr; + const struct sdp_media_type *t; + int ret = NF_ACCEPT; + typeof(nf_nat_sdp_addr_hook) nf_nat_sdp_addr; + typeof(nf_nat_sdp_session_hook) nf_nat_sdp_session; + + nf_nat_sdp_addr = rcu_dereference(nf_nat_sdp_addr_hook); + c_hdr = nf_ct_l3num(ct) == AF_INET ? SDP_HDR_CONNECTION_IP4 : + SDP_HDR_CONNECTION_IP6; + + /* Find beginning of session description */ + if (ct_sip_get_sdp_header(ct, *dptr, 0, *datalen, + SDP_HDR_VERSION, SDP_HDR_UNSPEC, + &matchoff, &matchlen) <= 0) + return NF_ACCEPT; + sdpoff = matchoff; + + /* The connection information is contained in the session description + * and/or once per media description. The first media description marks + * the end of the session description. */ + caddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, sdpoff, *datalen, + c_hdr, SDP_HDR_MEDIA, + &matchoff, &matchlen, &caddr) > 0) + caddr_len = matchlen; + + mediaoff = sdpoff; + for (i = 0; i < ARRAY_SIZE(sdp_media_types); ) { + if (ct_sip_get_sdp_header(ct, *dptr, mediaoff, *datalen, + SDP_HDR_MEDIA, SDP_HDR_UNSPEC, + &mediaoff, &medialen) <= 0) + break; + + /* Get media type and port number. A media port value of zero + * indicates an inactive stream. */ + t = sdp_media_type(*dptr, mediaoff, medialen); + if (!t) { + mediaoff += medialen; + continue; + } + mediaoff += t->len; + medialen -= t->len; + + port = simple_strtoul(*dptr + mediaoff, NULL, 10); + if (port == 0) + continue; + if (port < 1024 || port > 65535) + return NF_DROP; + + /* The media description overrides the session description. 
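+ * (A per-media "c=" line, when present, takes precedence over the
+ * session-level one, per RFC 2327.)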
*/ + maddr_len = 0; + if (ct_sip_parse_sdp_addr(ct, *dptr, mediaoff, *datalen, + c_hdr, SDP_HDR_MEDIA, + &matchoff, &matchlen, &maddr) > 0) { + maddr_len = matchlen; + memcpy(&rtp_addr, &maddr, sizeof(rtp_addr)); + } else if (caddr_len) + memcpy(&rtp_addr, &caddr, sizeof(rtp_addr)); + else + return NF_DROP; + + ret = set_expected_rtp_rtcp(skb, dataoff, dptr, datalen, + &rtp_addr, htons(port), t->class, + mediaoff, medialen); + if (ret != NF_ACCEPT) + return ret; + + /* Update media connection address if present */ + if (maddr_len && nf_nat_sdp_addr && ct->status & IPS_NAT_MASK) { + ret = nf_nat_sdp_addr(skb, dataoff, dptr, datalen, + mediaoff, c_hdr, SDP_HDR_MEDIA, + &rtp_addr); + if (ret != NF_ACCEPT) + return ret; + } + i++; + } + + /* Update session connection and owner addresses */ + nf_nat_sdp_session = rcu_dereference(nf_nat_sdp_session_hook); + if (nf_nat_sdp_session && ct->status & IPS_NAT_MASK) + ret = nf_nat_sdp_session(skb, dataoff, dptr, datalen, sdpoff, + &rtp_addr); + + return ret; +} +static int process_invite_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_update_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_prack_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + + if ((code >= 100 && code <= 199) || + (code >= 200 && code <= 299)) + return process_sdp(skb, dataoff, dptr, datalen, cseq); + else if (help->help.ct_sip_info.invite_cseq == cseq) + flush_expectations(ct, true); + return NF_ACCEPT; +} + +static int process_invite_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + unsigned int ret; + + flush_expectations(ct, true); + ret = process_sdp(skb, dataoff, dptr, datalen, cseq); + if (ret == NF_ACCEPT) + help->help.ct_sip_info.invite_cseq = cseq; + return ret; +} + +static int process_bye_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + flush_expectations(ct, true); + return NF_ACCEPT; +} + +/* Parse a REGISTER request and create a permanent expectation for incoming + * signalling connections. 
The expectation is marked inactive and is activated + * when receiving a response indicating success from the registrar. + */ +static int process_register_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned int matchoff, matchlen; + struct nf_conntrack_expect *exp; + union nf_inet_addr *saddr, daddr; + __be16 port; + u8 proto; + unsigned int expires = 0; + int ret; + typeof(nf_nat_sip_expect_hook) nf_nat_sip_expect; + + /* Expected connections can not register again. */ + if (ct->status & IPS_EXPECTED) + return NF_ACCEPT; + + /* We must check the expiration time: a value of zero signals the + * registrar to release the binding. We'll remove our expectation + * when receiving the new bindings in the response, but we don't + * want to create new ones. + * + * The expiration time may be contained in Expires: header, the + * Contact: header parameters or the URI parameters. + */ + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); + + ret = ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, + SIP_HDR_CONTACT, NULL, + &matchoff, &matchlen, &daddr, &port); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + return NF_ACCEPT; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &daddr)) + return NF_ACCEPT; + + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, *datalen, + &proto) == 0) + return NF_ACCEPT; + + if (ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, *datalen, + "expires=", NULL, NULL, &expires) < 0) + return NF_DROP; + + if (expires == 0) { + ret = NF_ACCEPT; + goto store_cseq; + } + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return NF_DROP; + + saddr = NULL; + if (sip_direct_signalling) + saddr = &ct->tuplehash[!dir].tuple.src.u3; + + nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), + saddr, &daddr, proto, NULL, &port); + exp->timeout.expires = sip_timeout * HZ; + exp->helper = nfct_help(ct)->helper; + exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; + + nf_nat_sip_expect = rcu_dereference(nf_nat_sip_expect_hook); + if (nf_nat_sip_expect && ct->status & IPS_NAT_MASK) + ret = nf_nat_sip_expect(skb, dataoff, dptr, datalen, exp, + matchoff, matchlen); + else { + if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + nf_ct_expect_put(exp); + +store_cseq: + if (ret == NF_ACCEPT) + help->help.ct_sip_info.register_cseq = cseq; + return ret; +} + +static int process_register_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int cseq, unsigned int code) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct nf_conn_help *help = nfct_help(ct); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + union nf_inet_addr addr; + __be16 port; + u8 proto; + unsigned int matchoff, matchlen, coff = 0; + unsigned int expires = 0; + int in_contact = 0, ret; + + /* According to RFC 3261, "UAs MUST NOT send a new registration until + * they have received a final response from the registrar for the + * previous one or the previous REGISTER request has timed out". 
+ * + * However, some servers fail to detect retransmissions and send late + * responses, so we store the sequence number of the last valid + * request and compare it here. + */ + if (help->help.ct_sip_info.register_cseq != cseq) + return NF_ACCEPT; + + if (code >= 100 && code <= 199) + return NF_ACCEPT; + if (code < 200 || code > 299) + goto flush; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_EXPIRES, + &matchoff, &matchlen) > 0) + expires = simple_strtoul(*dptr + matchoff, NULL, 10); + + while (1) { + unsigned int c_expires = expires; + + ret = ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, + SIP_HDR_CONTACT, &in_contact, + &matchoff, &matchlen, + &addr, &port); + if (ret < 0) + return NF_DROP; + else if (ret == 0) + break; + + /* We don't support third-party registrations */ + if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &addr)) + continue; + + if (ct_sip_parse_transport(ct, *dptr, matchoff + matchlen, + *datalen, &proto) == 0) + continue; + + ret = ct_sip_parse_numerical_param(ct, *dptr, + matchoff + matchlen, + *datalen, "expires=", + NULL, NULL, &c_expires); + if (ret < 0) + return NF_DROP; + if (c_expires == 0) + break; + if (refresh_signalling_expectation(ct, &addr, proto, port, + c_expires)) + return NF_ACCEPT; + } + +flush: + flush_expectations(ct, false); + return NF_ACCEPT; +} + +static const struct sip_handler sip_handlers[] = { + SIP_HANDLER("INVITE", process_invite_request, process_invite_response), + SIP_HANDLER("UPDATE", process_sdp, process_update_response), + SIP_HANDLER("ACK", process_sdp, NULL), + SIP_HANDLER("PRACK", process_sdp, process_prack_response), + SIP_HANDLER("BYE", process_bye_request, NULL), + SIP_HANDLER("REGISTER", process_register_request, process_register_response), +}; + +static int process_sip_response(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen, matchend; + unsigned int code, cseq, i; + + if (*datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + code = simple_strtoul(*dptr + strlen("SIP/2.0 "), NULL, 10); + if (!code) + return NF_DROP; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) + return NF_DROP; + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) + return NF_DROP; + matchend = matchoff + matchlen + 1; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->response == NULL) + continue; + if (*datalen < matchend + handler->len || + strnicmp(*dptr + matchend, handler->method, handler->len)) + continue; + return handler->response(skb, dataoff, dptr, datalen, + cseq, code); + } + return NF_ACCEPT; +} + +static int process_sip_request(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + unsigned int matchoff, matchlen; + unsigned int cseq, i; + + for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + + handler = &sip_handlers[i]; + if (handler->request == NULL) + continue; + if (*datalen < handler->len || + strnicmp(*dptr, handler->method, handler->len)) + continue; + + if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, + &matchoff, &matchlen) <= 0) + return NF_DROP; + cseq = simple_strtoul(*dptr + matchoff, NULL, 10); + if (!cseq) + return NF_DROP; + + return 
handler->request(skb, dataoff, dptr, datalen, cseq); + } + return NF_ACCEPT; +} + +static int process_sip_msg(struct sk_buff *skb, struct nf_conn *ct, + unsigned int dataoff, const char **dptr, + unsigned int *datalen) +{ + typeof(nf_nat_sip_hook) nf_nat_sip; + int ret; + + if (strnicmp(*dptr, "SIP/2.0 ", strlen("SIP/2.0 ")) != 0) + ret = process_sip_request(skb, dataoff, dptr, datalen); + else + ret = process_sip_response(skb, dataoff, dptr, datalen); + + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + nf_nat_sip = rcu_dereference(nf_nat_sip_hook); + if (nf_nat_sip && !nf_nat_sip(skb, dataoff, dptr, datalen)) + ret = NF_DROP; + } + + return ret; +} + +static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + struct tcphdr *th, _tcph; + unsigned int dataoff, datalen; + unsigned int matchoff, matchlen, clen; + unsigned int msglen, origlen; + const char *dptr, *end; + s16 diff, tdiff = 0; + int ret = NF_ACCEPT; + bool term; + typeof(nf_nat_sip_seq_adjust_hook) nf_nat_sip_seq_adjust; + + if (ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) + return NF_ACCEPT; + + /* No Data ? */ + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + dataoff = protoff + th->doff * 4; + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + while (1) { + if (ct_sip_get_header(ct, dptr, 0, datalen, + SIP_HDR_CONTENT_LENGTH, + &matchoff, &matchlen) <= 0) + break; + + clen = simple_strtoul(dptr + matchoff, (char **)&end, 10); + if (dptr + matchoff == end) + break; + + term = false; + for (; end + strlen("\r\n\r\n") <= dptr + datalen; end++) { + if (end[0] == '\r' && end[1] == '\n' && + end[2] == '\r' && end[3] == '\n') { + term = true; + break; + } + } + if (!term) + break; + end += strlen("\r\n\r\n") + clen; + + msglen = origlen = end - dptr; + if (msglen > datalen) + return NF_DROP; + + ret = process_sip_msg(skb, ct, dataoff, &dptr, &msglen); + if (ret != NF_ACCEPT) + break; + diff = msglen - origlen; + tdiff += diff; + + dataoff += msglen; + dptr += msglen; + datalen = datalen + diff - msglen; + } + + if (ret == NF_ACCEPT && ct->status & IPS_NAT_MASK) { + nf_nat_sip_seq_adjust = rcu_dereference(nf_nat_sip_seq_adjust_hook); + if (nf_nat_sip_seq_adjust) + nf_nat_sip_seq_adjust(skb, tdiff); + } + + return ret; +} + +static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + + /* No Data ? 
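+ * A datagram that carries nothing beyond the UDP header cannot hold + * a SIP message, so such packets are passed through untouched.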
*/ + dataoff = protoff + sizeof(struct udphdr); + if (dataoff >= skb->len) + return NF_ACCEPT; + + nf_ct_refresh(ct, skb, sip_timeout * HZ); + + if (unlikely(skb_linearize(skb))) + return NF_DROP; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + if (datalen < strlen("SIP/2.0 200")) + return NF_ACCEPT; + + return process_sip_msg(skb, ct, dataoff, &dptr, &datalen); +} + +static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; +static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { + [SIP_EXPECT_SIGNALLING] = { + .name = "signalling", + .max_expected = 1, + .timeout = 3 * 60, + }, + [SIP_EXPECT_AUDIO] = { + .name = "audio", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_VIDEO] = { + .name = "video", + .max_expected = 2 * IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, + [SIP_EXPECT_IMAGE] = { + .name = "image", + .max_expected = IP_CT_DIR_MAX, + .timeout = 3 * 60, + }, +}; + +static void nf_conntrack_sip_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + if (sip[i][j].me == NULL) + continue; + nf_conntrack_helper_unregister(&sip[i][j]); + } + } +} + +static int __init nf_conntrack_sip_init(void) +{ + int i, j, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + memset(&sip[i], 0, sizeof(sip[i])); + + sip[i][0].tuple.src.l3num = AF_INET; + sip[i][0].tuple.dst.protonum = IPPROTO_UDP; + sip[i][0].help = sip_help_udp; + sip[i][1].tuple.src.l3num = AF_INET; + sip[i][1].tuple.dst.protonum = IPPROTO_TCP; + sip[i][1].help = sip_help_tcp; + + sip[i][2].tuple.src.l3num = AF_INET6; + sip[i][2].tuple.dst.protonum = IPPROTO_UDP; + sip[i][2].help = sip_help_udp; + sip[i][3].tuple.src.l3num = AF_INET6; + sip[i][3].tuple.dst.protonum = IPPROTO_TCP; + sip[i][3].help = sip_help_tcp; + + for (j = 0; j < ARRAY_SIZE(sip[i]); j++) { + sip[i][j].tuple.src.u.udp.port = htons(ports[i]); + sip[i][j].expect_policy = sip_exp_policy; + sip[i][j].expect_class_max = SIP_EXPECT_MAX; + sip[i][j].me = THIS_MODULE; + + tmpname = &sip_names[i][j][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%u", i); + sip[i][j].name = tmpname; + + pr_debug("port #%u: %u\n", i, ports[i]); + + ret = nf_conntrack_helper_register(&sip[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_sip: failed to register" + " helper for pf: %u port: %u\n", + sip[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_sip_fini(); + return ret; + } + } + } + return 0; +} + +module_init(nf_conntrack_sip_init); +module_exit(nf_conntrack_sip_fini); diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 00000000..6e545e26 --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,77 @@ +/* + * SNMP service broadcast connection tracking helper + * + * (c) 2011 Jiri Olsa <jolsa@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/in.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +#define SNMP_PORT 161 + +MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>"); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + typeof(nf_nat_snmp_hook) nf_nat_snmp; + + nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + + nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); + if (nf_nat_snmp && ct->status & IPS_NAT_MASK) + return nf_nat_snmp(skb, protoff, ct, ctinfo); + + return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "snmp", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = snmp_conntrack_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c new file mode 100644 index 00000000..885f5ab9 --- /dev/null +++ b/net/netfilter/nf_conntrack_standalone.c @@ -0,0 +1,590 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#include <linux/netdevice.h> +#include <linux/security.h> +#include <net/net_namespace.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_acct.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <linux/rculist_nulls.h> + +MODULE_LICENSE("GPL"); + +#ifdef CONFIG_NF_CONNTRACK_PROCFS +int +print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *l4proto) +{ + return l3proto->print_tuple(s, tuple) || l4proto->print_tuple(s, tuple); +} +EXPORT_SYMBOL_GPL(print_tuple); + +struct ct_iter_state { + struct seq_net_private p; + unsigned int bucket; + u_int64_t time_now; +}; + +static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + struct hlist_nulls_node *n; + + for (st->bucket = 0; + st->bucket < net->ct.htable_size; + st->bucket++) { + n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + if (!is_a_nulls(n)) + return n; + } + return NULL; +} + +static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, + struct hlist_nulls_node *head) +{ + struct net *net = seq_file_net(seq); + struct ct_iter_state *st = seq->private; + + head = rcu_dereference(hlist_nulls_next_rcu(head)); + while (is_a_nulls(head)) { + if (likely(get_nulls_value(head) == st->bucket)) { + if (++st->bucket >= net->ct.htable_size) + return NULL; + } + head = rcu_dereference( + hlist_nulls_first_rcu( + &net->ct.hash[st->bucket])); + } + return head; +} + +static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_nulls_node *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? 
NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct ct_iter_state *st = seq->private; + + st->time_now = ktime_to_ns(ktime_get_real()); + rcu_read_lock(); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + int ret; + u32 len; + char *secctx; + + ret = security_secid_to_secctx(ct->secmark, &secctx, &len); + if (ret) + return 0; + + ret = seq_printf(s, "secctx=%s ", secctx); + + security_release_secctx(secctx, len); + return ret; +} +#else +static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + struct ct_iter_state *st = s->private; + struct nf_conn_tstamp *tstamp; + s64 delta_time; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + delta_time = st->time_now - tstamp->start; + if (delta_time > 0) + delta_time = div_s64(delta_time, NSEC_PER_SEC); + else + delta_time = 0; + + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); + } + return 0; +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + +/* return 0 on success, 1 in case of error */ +static int ct_seq_show(struct seq_file *s, void *v) +{ + struct nf_conntrack_tuple_hash *hash = v; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash); + const struct nf_conntrack_l3proto *l3proto; + const struct nf_conntrack_l4proto *l4proto; + int ret = 0; + + NF_CT_ASSERT(ct); + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; + + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + goto release; + + l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); + NF_CT_ASSERT(l3proto); + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + NF_CT_ASSERT(l4proto); + + ret = -ENOSPC; + if (seq_printf(s, "%-8s %u %-8s %u %ld ", + l3proto->name, nf_ct_l3num(ct), + l4proto->name, nf_ct_protonum(ct), + timer_pending(&ct->timeout) + ? 
(long)(ct->timeout.expires - jiffies)/HZ : 0) != 0) + goto release; + + if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct)) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) + goto release; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) + if (seq_printf(s, "[UNREPLIED] ")) + goto release; + + if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + l3proto, l4proto)) + goto release; + + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) + goto release; + + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + if (seq_printf(s, "[ASSURED] ")) + goto release; + +#if defined(CONFIG_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%u ", ct->mark)) + goto release; +#endif + + if (ct_show_secctx(s, ct)) + goto release; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) + goto release; +#endif + + if (ct_show_delta_time(s, ct)) + goto release; + + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) + goto release; + + ret = 0; +release: + nf_ct_put(ct); + return ret; +} + +static const struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_seq_ops, + sizeof(struct ct_iter_state)); +} + +static const struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu + 1; + return per_cpu_ptr(net->ct.stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); + const struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete, + st->search_restart + ); + return 0; +} + +static const struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations 
ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + struct proc_dir_entry *pde; + + pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + if (!pde) + goto out_nf_conntrack; + + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + &ct_cpu_seq_fops); + if (!pde) + goto out_stat_nf_conntrack; + return 0; + +out_stat_nf_conntrack: + proc_net_remove(net, "nf_conntrack"); +out_nf_conntrack: + return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ + remove_proc_entry("nf_conntrack", net->proc_net_stat); + proc_net_remove(net, "nf_conntrack"); +} +#else +static int nf_conntrack_standalone_init_proc(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_proc(struct net *net) +{ +} +#endif /* CONFIG_NF_CONNTRACK_PROCFS */ + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *nf_ct_netfilter_header; + +static ctl_table nf_ct_sysctl_table[] = { + { + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_count", + .data = &init_net.ct.count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_buckets", + .data = &init_net.ct.htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_checksum", + .data = &init_net.ct.sysctl_checksum, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_log_invalid", + .data = &init_net.ct.sysctl_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .procname = "nf_conntrack_expect_max", + .data = &nf_ct_expect_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +#define NET_NF_CONNTRACK_MAX 2089 + +static ctl_table nf_ct_netfilter_table[] = { + { + .procname = "nf_conntrack_max", + .data = &nf_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_path nf_ct_path[] = { + { .procname = "net", }, + { } +}; + +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) { + nf_ct_netfilter_header = + register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) + goto out; + } + + table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out_kmemdup; + + table[1].data = &net->ct.count; + table[2].data = &net->ct.htable_size; + table[3].data = &net->ct.sysctl_checksum; + table[4].data = &net->ct.sysctl_log_invalid; + + net->ct.sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.sysctl_header) + goto out_unregister_netfilter; + + return 0; + +out_unregister_netfilter: + kfree(table); +out_kmemdup: + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); +out: + printk(KERN_ERR "nf_conntrack: can't register to 
sysctl.\n"); + return -ENOMEM; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); + table = net->ct.sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_standalone_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_standalone_fini_sysctl(struct net *net) +{ +} +#endif /* CONFIG_SYSCTL */ + +static int nf_conntrack_net_init(struct net *net) +{ + int ret; + + ret = nf_conntrack_init(net); + if (ret < 0) + goto out_init; + ret = nf_conntrack_standalone_init_proc(net); + if (ret < 0) + goto out_proc; + net->ct.sysctl_checksum = 1; + net->ct.sysctl_log_invalid = 0; + ret = nf_conntrack_standalone_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + return 0; + +out_sysctl: + nf_conntrack_standalone_fini_proc(net); +out_proc: + nf_conntrack_cleanup(net); +out_init: + return ret; +} + +static void nf_conntrack_net_exit(struct net *net) +{ + nf_conntrack_standalone_fini_sysctl(net); + nf_conntrack_standalone_fini_proc(net); + nf_conntrack_cleanup(net); +} + +static struct pernet_operations nf_conntrack_net_ops = { + .init = nf_conntrack_net_init, + .exit = nf_conntrack_net_exit, +}; + +static int __init nf_conntrack_standalone_init(void) +{ + return register_pernet_subsys(&nf_conntrack_net_ops); +} + +static void __exit nf_conntrack_standalone_fini(void) +{ + unregister_pernet_subsys(&nf_conntrack_net_ops); +} + +module_init(nf_conntrack_standalone_init); +module_exit(nf_conntrack_standalone_fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_conntrack(void) +{ +} +EXPORT_SYMBOL_GPL(need_conntrack); diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c new file mode 100644 index 00000000..75466fd7 --- /dev/null +++ b/net/netfilter/nf_conntrack_tftp.c @@ -0,0 +1,153 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/nf_conntrack_tftp.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("TFTP connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ip_conntrack_tftp"); +MODULE_ALIAS_NFCT_HELPER("tftp"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static unsigned int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "Port numbers of TFTP servers"); + +unsigned int (*nf_nat_tftp_hook)(struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_expect *exp) __read_mostly; +EXPORT_SYMBOL_GPL(nf_nat_tftp_hook); + +static int tftp_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + const struct tftphdr *tfh; + struct tftphdr _tftph; + struct nf_conntrack_expect *exp; + struct nf_conntrack_tuple *tuple; + unsigned int ret = NF_ACCEPT; + typeof(nf_nat_tftp_hook) nf_nat_tftp; + + tfh = skb_header_pointer(skb, protoff + sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) + return NF_ACCEPT; + + switch (ntohs(tfh->opcode)) { + case TFTP_OPCODE_READ: + case TFTP_OPCODE_WRITE: + /* RRQ and WRQ work the same way */ + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), + &tuple->src.u3, &tuple->dst.u3, + IPPROTO_UDP, NULL, &tuple->dst.u.udp.port); + + pr_debug("expect: "); + nf_ct_dump_tuple(&exp->tuple); + + nf_nat_tftp = rcu_dereference(nf_nat_tftp_hook); + if (nf_nat_tftp && ct->status & IPS_NAT_MASK) + ret = nf_nat_tftp(skb, ctinfo, exp); + else if (nf_ct_expect_related(exp) != 0) + ret = NF_DROP; + nf_ct_expect_put(exp); + break; + case TFTP_OPCODE_DATA: + case TFTP_OPCODE_ACK: + pr_debug("Data/ACK opcode\n"); + break; + case TFTP_OPCODE_ERROR: + pr_debug("Error opcode\n"); + break; + default: + pr_debug("Unknown opcode\n"); + } + return ret; +} + +static struct nf_conntrack_helper tftp[MAX_PORTS][2] __read_mostly; +static char tftp_names[MAX_PORTS][2][sizeof("tftp-65535")] __read_mostly; + +static const struct nf_conntrack_expect_policy tftp_exp_policy = { + .max_expected = 1, + .timeout = 5 * 60, +}; + +static void nf_conntrack_tftp_fini(void) +{ + int i, j; + + for (i = 0; i < ports_c; i++) { + for (j = 0; j < 2; j++) + nf_conntrack_helper_unregister(&tftp[i][j]); + } +} + +static int __init nf_conntrack_tftp_init(void) +{ + int i, j, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; + + for (i = 0; i < ports_c; i++) { + memset(&tftp[i], 0, sizeof(tftp[i])); + + tftp[i][0].tuple.src.l3num = AF_INET; + tftp[i][1].tuple.src.l3num = AF_INET6; + for (j = 0; j < 2; j++) { + tftp[i][j].tuple.dst.protonum = IPPROTO_UDP; + tftp[i][j].tuple.src.u.udp.port = htons(ports[i]); + tftp[i][j].expect_policy = &tftp_exp_policy; + tftp[i][j].me = THIS_MODULE; + tftp[i][j].help = tftp_help; + + tmpname = &tftp_names[i][j][0]; + if (ports[i] == TFTP_PORT) +
sprintf(tmpname, "tftp"); + else + sprintf(tmpname, "tftp-%u", i); + tftp[i][j].name = tmpname; + + ret = nf_conntrack_helper_register(&tftp[i][j]); + if (ret) { + printk(KERN_ERR "nf_ct_tftp: failed to register" + " helper for pf: %u port: %u\n", + tftp[i][j].tuple.src.l3num, ports[i]); + nf_conntrack_tftp_fini(); + return ret; + } + } + } + return 0; +} + +module_init(nf_conntrack_tftp_init); +module_exit(nf_conntrack_tftp_fini); diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c new file mode 100644 index 00000000..a878ce5b --- /dev/null +++ b/net/netfilter/nf_conntrack_timeout.c @@ -0,0 +1,60 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include <linux/types.h> +#include <linux/netfilter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/stddef.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timeout.h> + +struct ctnl_timeout * +(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); + +void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); + +static struct nf_ct_ext_type timeout_extend __read_mostly = { + .len = sizeof(struct nf_conn_timeout), + .align = __alignof__(struct nf_conn_timeout), + .id = NF_CT_EXT_TIMEOUT, +}; + +int nf_conntrack_timeout_init(struct net *net) +{ + int ret = 0; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&timeout_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_timeout: Unable to register " + "timeout extension.\n"); + return ret; + } + } + + return 0; +} + +void nf_conntrack_timeout_fini(struct net *net) +{ + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&timeout_extend); +} diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 00000000..e8d27afb --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,120 @@ +/* + * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
+ */ + +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timestamp.h> + +static bool nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { + { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { + .len = sizeof(struct nf_conn_tstamp), + .align = __alignof__(struct nf_conn_tstamp), + .id = NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_tstamp; + + net->ct.tstamp_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.tstamp_sysctl_header) { + printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.tstamp_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_init(struct net *net) +{ + int ret; + + net->ct.sysctl_tstamp = nf_ct_tstamp; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&tstamp_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_tstamp: Unable to register " + "extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_tstamp_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_tstamp_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +} diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h new file mode 100644 index 00000000..770f7643 --- /dev/null +++ b/net/netfilter/nf_internals.h @@ -0,0 +1,38 @@ +#ifndef _NF_INTERNALS_H +#define _NF_INTERNALS_H + +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(KERN_DEBUG format , ## args) +#else +#define NFDEBUG(format, args...) 
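+/* Expands to nothing when CONFIG_NETFILTER_DEBUG is not set. */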
+#endif + + +/* core.c */ +extern unsigned int nf_iterate(struct list_head *head, + struct sk_buff *skb, + unsigned int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh); + +/* nf_queue.c */ +extern int nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum); +extern int __init netfilter_queue_init(void); + +/* nf_log.c */ +extern int __init netfilter_log_init(void); + +#endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c new file mode 100644 index 00000000..957374a2 --- /dev/null +++ b/net/netfilter/nf_log.c @@ -0,0 +1,318 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/seq_file.h> +#include <net/protocol.h> +#include <net/netfilter/nf_log.h> + +#include "nf_internals.h" + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 +#define NFLOGGER_NAME_LEN 64 + +static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; +static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; +static DEFINE_MUTEX(nf_log_mutex); + +static struct nf_logger *__find_logger(int pf, const char *str_logger) +{ + struct nf_logger *t; + + list_for_each_entry(t, &nf_loggers_l[pf], list[pf]) { + if (!strnicmp(str_logger, t->name, strlen(t->name))) + return t; + } + + return NULL; +} + +/* return EEXIST if the same logger is registered, 0 on success. */ +int nf_log_register(u_int8_t pf, struct nf_logger *logger) +{ + const struct nf_logger *llog; + int i; + + if (pf >= ARRAY_SIZE(nf_loggers)) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(logger->list); i++) + INIT_LIST_HEAD(&logger->list[i]); + + mutex_lock(&nf_log_mutex); + + if (pf == NFPROTO_UNSPEC) { + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + list_add_tail(&(logger->list[i]), &(nf_loggers_l[i])); + } else { + /* register at end of list to honor first register win */ + list_add_tail(&logger->list[pf], &nf_loggers_l[pf]); + llog = rcu_dereference_protected(nf_loggers[pf], + lockdep_is_held(&nf_log_mutex)); + if (llog == NULL) + rcu_assign_pointer(nf_loggers[pf], logger); + } + + mutex_unlock(&nf_log_mutex); + + return 0; +} +EXPORT_SYMBOL(nf_log_register); + +void nf_log_unregister(struct nf_logger *logger) +{ + const struct nf_logger *c_logger; + int i; + + mutex_lock(&nf_log_mutex); + for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { + c_logger = rcu_dereference_protected(nf_loggers[i], + lockdep_is_held(&nf_log_mutex)); + if (c_logger == logger) + RCU_INIT_POINTER(nf_loggers[i], NULL); + list_del(&logger->list[i]); + } + mutex_unlock(&nf_log_mutex); + + synchronize_rcu(); +} +EXPORT_SYMBOL(nf_log_unregister); + +int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger) +{ + if (pf >= ARRAY_SIZE(nf_loggers)) + return -EINVAL; + mutex_lock(&nf_log_mutex); + if (__find_logger(pf, logger->name) == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[pf], logger); + mutex_unlock(&nf_log_mutex); + return 0; +} +EXPORT_SYMBOL(nf_log_bind_pf); + +void nf_log_unbind_pf(u_int8_t pf) +{ + if (pf >= ARRAY_SIZE(nf_loggers)) + return; + mutex_lock(&nf_log_mutex); + RCU_INIT_POINTER(nf_loggers[pf], NULL); + mutex_unlock(&nf_log_mutex);
+} +EXPORT_SYMBOL(nf_log_unbind_pf); + +void nf_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + const struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(nf_loggers[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_packet); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&nf_log_mutex); + + if (*pos >= ARRAY_SIZE(nf_loggers)) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= ARRAY_SIZE(nf_loggers)) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + mutex_unlock(&nf_log_mutex); +} + +static int seq_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + const struct nf_logger *logger; + struct nf_logger *t; + int ret; + + logger = rcu_dereference_protected(nf_loggers[*pos], + lockdep_is_held(&nf_log_mutex)); + + if (!logger) + ret = seq_printf(s, "%2lld NONE (", *pos); + else + ret = seq_printf(s, "%2lld %s (", *pos, logger->name); + + if (ret < 0) + return ret; + + list_for_each_entry(t, &nf_loggers_l[*pos], list[*pos]) { + ret = seq_printf(s, "%s", t->name); + if (ret < 0) + return ret; + if (&t->list[*pos] != nf_loggers_l[*pos].prev) { + ret = seq_printf(s, ","); + if (ret < 0) + return ret; + } + } + + return seq_printf(s, ")\n"); +} + +static const struct seq_operations nflog_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nflog_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nflog_seq_ops); +} + +static const struct file_operations nflog_file_ops = { + .owner = THIS_MODULE, + .open = nflog_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +#endif /* PROC_FS */ + +#ifdef CONFIG_SYSCTL +static struct ctl_path nf_log_sysctl_path[] = { + { .procname = "net", }, + { .procname = "netfilter", }, + { .procname = "nf_log", }, + { } +}; + +static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; +static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; +static struct ctl_table_header *nf_log_dir_header; + +static int nf_log_proc_dostring(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + const struct nf_logger *logger; + char buf[NFLOGGER_NAME_LEN]; + size_t size = *lenp; + int r = 0; + int tindex = (unsigned long)table->extra1; + + if (write) { + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, buffer, size)) + return -EFAULT; + + if (!strcmp(buf, "NONE")) { + nf_log_unbind_pf(tindex); + return 0; + } + mutex_lock(&nf_log_mutex); + logger = __find_logger(tindex, buf); + if (logger == NULL) { + mutex_unlock(&nf_log_mutex); + return -ENOENT; + } + rcu_assign_pointer(nf_loggers[tindex], logger); + mutex_unlock(&nf_log_mutex); + } else { + mutex_lock(&nf_log_mutex); + logger = rcu_dereference_protected(nf_loggers[tindex], + lockdep_is_held(&nf_log_mutex)); + if (!logger) + table->data = "NONE"; + else + table->data = logger->name; + r = proc_dostring(table, write, buffer, lenp, ppos); + mutex_unlock(&nf_log_mutex); + } + + return r; +} + 
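+/* + * Writing a logger name to /proc/sys/net/netfilter/nf_log/<pf> binds that + * logger for the given protocol family; writing "NONE" unbinds it. An + * illustrative session (assuming the ipt_LOG logger module is loaded; + * pf 2 is NFPROTO_IPV4): + * + * echo ipt_LOG > /proc/sys/net/netfilter/nf_log/2 + * cat /proc/sys/net/netfilter/nf_log/2 + */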
+static __init int netfilter_log_sysctl_init(void) +{ + int i; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) { + snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i); + nf_log_sysctl_table[i].procname = + nf_log_sysctl_fnames[i-NFPROTO_UNSPEC]; + nf_log_sysctl_table[i].data = NULL; + nf_log_sysctl_table[i].maxlen = + NFLOGGER_NAME_LEN * sizeof(char); + nf_log_sysctl_table[i].mode = 0644; + nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring; + nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i; + } + + nf_log_dir_header = register_sysctl_paths(nf_log_sysctl_path, + nf_log_sysctl_table); + if (!nf_log_dir_header) + return -ENOMEM; + + return 0; +} +#else +static __init int netfilter_log_sysctl_init(void) +{ + return 0; +} +#endif /* CONFIG_SYSCTL */ + +int __init netfilter_log_init(void) +{ + int i, r; +#ifdef CONFIG_PROC_FS + if (!proc_create("nf_log", S_IRUGO, + proc_net_netfilter, &nflog_file_ops)) + return -1; +#endif + + /* Errors will trigger a panic; unrolling on error is unnecessary. */ + r = netfilter_log_sysctl_init(); + if (r < 0) + return r; + + for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&(nf_loggers_l[i])); + + return 0; +} diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c new file mode 100644 index 00000000..ce60cf0f --- /dev/null +++ b/net/netfilter/nf_queue.c @@ -0,0 +1,420 @@ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <net/protocol.h> +#include <net/netfilter/nf_queue.h> +#include <net/dst.h> + +#include "nf_internals.h" + +/* + * A queue handler may be registered for each protocol. Each is protected by + * a long-term mutex. The handler must provide an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly; + +static DEFINE_MUTEX(queue_handler_mutex); + +/* return EBUSY when somebody else is registered, return EEXIST if the + * same handler is registered, return 0 in case of success.
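+ * Only one queue handler can be bound to a protocol family at a time; + * a second, different handler gets -EBUSY until the first one is gone.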
*/ +int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +{ + int ret; + const struct nf_queue_handler *old; + + if (pf >= ARRAY_SIZE(queue_handler)) + return -EINVAL; + + mutex_lock(&queue_handler_mutex); + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old == qh) + ret = -EEXIST; + else if (old) + ret = -EBUSY; + else { + rcu_assign_pointer(queue_handler[pf], qh); + ret = 0; + } + mutex_unlock(&queue_handler_mutex); + + return ret; +} +EXPORT_SYMBOL(nf_register_queue_handler); + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) +{ + const struct nf_queue_handler *old; + + if (pf >= ARRAY_SIZE(queue_handler)) + return -EINVAL; + + mutex_lock(&queue_handler_mutex); + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old && old != qh) { + mutex_unlock(&queue_handler_mutex); + return -EINVAL; + } + + RCU_INIT_POINTER(queue_handler[pf], NULL); + mutex_unlock(&queue_handler_mutex); + + synchronize_rcu(); + + return 0; +} +EXPORT_SYMBOL(nf_unregister_queue_handler); + +void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) +{ + u_int8_t pf; + + mutex_lock(&queue_handler_mutex); + for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { + if (rcu_dereference_protected( + queue_handler[pf], + lockdep_is_held(&queue_handler_mutex) + ) == qh) + RCU_INIT_POINTER(queue_handler[pf], NULL); + } + mutex_unlock(&queue_handler_mutex); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(nf_unregister_queue_handlers); + +static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +{ + /* Release those devices we held, or Alexey will kill me. */ + if (entry->indev) + dev_put(entry->indev); + if (entry->outdev) + dev_put(entry->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge; + + if (nf_bridge->physindev) + dev_put(nf_bridge->physindev); + if (nf_bridge->physoutdev) + dev_put(nf_bridge->physoutdev); + } +#endif + /* Drop reference to owner of hook which queued us. */ + module_put(entry->elem->owner); +} + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +static int __nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + int status = -ENOENT; + struct nf_queue_entry *entry = NULL; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev; + struct net_device *physoutdev; +#endif + const struct nf_afinfo *afinfo; + const struct nf_queue_handler *qh; + + /* QUEUE == DROP if no one is waiting, to be safe. */ + rcu_read_lock(); + + qh = rcu_dereference(queue_handler[pf]); + if (!qh) { + status = -ESRCH; + goto err_unlock; + } + + afinfo = nf_get_afinfo(pf); + if (!afinfo) + goto err_unlock; + + entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); + if (!entry) { + status = -ENOMEM; + goto err_unlock; + } + + *entry = (struct nf_queue_entry) { + .skb = skb, + .elem = list_entry(elem, struct nf_hook_ops, list), + .pf = pf, + .hook = hook, + .indev = indev, + .outdev = outdev, + .okfn = okfn, + }; + + /* If it's going away, ignore hook. 
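+ * try_module_get() fails once the owning module has begun to unload, + * and then nobody would be left to reinject the queued packet.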
 */ + if (!try_module_get(entry->elem->owner)) { + status = -ECANCELED; + goto err_unlock; + } + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) + dev_hold(indev); + if (outdev) + dev_hold(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + physindev = skb->nf_bridge->physindev; + if (physindev) + dev_hold(physindev); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev) + dev_hold(physoutdev); + } +#endif + skb_dst_force(skb); + afinfo->saveroute(skb, entry); + status = qh->outfn(entry, queuenum); + + rcu_read_unlock(); + + if (status < 0) { + nf_queue_entry_release_refs(entry); + goto err; + } + + return 0; + +err_unlock: + rcu_read_unlock(); +err: + kfree(entry); + return status; +} + +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to the MAC header + * before calling skb_gso_segment(). Otherwise, the original MAC header is + * lost and segmented skbs will be sent to the wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + +int nf_queue(struct sk_buff *skb, + struct list_head *elem, + u_int8_t pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + unsigned int queuenum) +{ + struct sk_buff *segs; + int err = -EINVAL; + unsigned int queued; + + if (!skb_is_gso(skb)) + return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + queuenum); + + switch (pf) { + case NFPROTO_IPV4: + skb->protocol = htons(ETH_P_IP); + break; + case NFPROTO_IPV6: + skb->protocol = htons(ETH_P_IPV6); + break; + } + + nf_bridge_adjust_skb_data(skb); + segs = skb_gso_segment(skb, 0); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to mean + * 'ignore this hook'. + */ + if (IS_ERR(segs)) + goto out_err; + queued = 0; + err = 0; + do { + struct sk_buff *nskb = segs->next; + + segs->next = NULL; + if (err == 0) { + nf_bridge_adjust_segmented_data(segs); + err = __nf_queue(segs, elem, pf, hook, indev, + outdev, okfn, queuenum); + } + if (err == 0) + queued++; + else + kfree_skb(segs); + segs = nskb; + } while (segs); + + if (queued) { + kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); + return err; +} + +void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) +{ + struct sk_buff *skb = entry->skb; + struct list_head *elem = &entry->elem->list; + const struct nf_afinfo *afinfo; + int err; + + rcu_read_lock(); + + nf_queue_entry_release_refs(entry); + + /* Continue traversal iff userspace said ok...
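+ * An NF_REPEAT verdict steps back one element so the hook that queued + * the packet runs again; afterwards it is treated like NF_ACCEPT.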
 */ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + afinfo = nf_get_afinfo(entry->pf); + if (!afinfo || afinfo->reroute(skb, entry) < 0) + verdict = NF_DROP; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[entry->pf][entry->hook], + skb, entry->hook, + entry->indev, entry->outdev, &elem, + entry->okfn, INT_MIN); + } + + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + case NF_STOP: + local_bh_disable(); + entry->okfn(skb); + local_bh_enable(); + break; + case NF_QUEUE: + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + break; + case NF_STOLEN: + break; + default: + kfree_skb(skb); + } + rcu_read_unlock(); + kfree(entry); +} +EXPORT_SYMBOL(nf_reinject); + +#ifdef CONFIG_PROC_FS +static void *seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(queue_handler)) + return NULL; + + return pos; +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= ARRAY_SIZE(queue_handler)) + return NULL; + + return pos; +} + +static void seq_stop(struct seq_file *s, void *v) +{ + +} + +static int seq_show(struct seq_file *s, void *v) +{ + int ret; + loff_t *pos = v; + const struct nf_queue_handler *qh; + + rcu_read_lock(); + qh = rcu_dereference(queue_handler[*pos]); + if (!qh) + ret = seq_printf(s, "%2lld NONE\n", *pos); + else + ret = seq_printf(s, "%2lld %s\n", *pos, qh->name); + rcu_read_unlock(); + + return ret; +} + +static const struct seq_operations nfqueue_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nfqueue_seq_ops); +} + +static const struct file_operations nfqueue_file_ops = { + .owner = THIS_MODULE, + .open = nfqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* PROC_FS */ + + +int __init netfilter_queue_init(void) +{ +#ifdef CONFIG_PROC_FS + if (!proc_create("nf_queue", S_IRUGO, + proc_net_netfilter, &nfqueue_file_ops)) + return -1; +#endif + return 0; +} + diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c new file mode 100644 index 00000000..f042ae52 --- /dev/null +++ b/net/netfilter/nf_sockopt.c @@ -0,0 +1,169 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/mutex.h> +#include <net/sock.h> + +#include "nf_internals.h" + +/* Sockopts are only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DEFINE_MUTEX(nf_sockopt_mutex); +static LIST_HEAD(nf_sockopts); + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive).
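+ * The ranges are half-open, [optmin, optmax); a registration whose get + * or set range overlaps an existing one for the same pf fails with + * -EBUSY.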
*/ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct nf_sockopt_ops *ops; + int ret = 0; + + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each_entry(ops, &nf_sockopts, list) { + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + mutex_unlock(&nf_sockopt_mutex); + return ret; +} +EXPORT_SYMBOL(nf_register_sockopt); + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + mutex_lock(&nf_sockopt_mutex); + list_del(®->list); + mutex_unlock(&nf_sockopt_mutex); +} +EXPORT_SYMBOL(nf_unregister_sockopt); + +static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, + int val, int get) +{ + struct nf_sockopt_ops *ops; + + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(ops, &nf_sockopts, list) { + if (ops->pf == pf) { + if (!try_module_get(ops->owner)) + goto out_nosup; + + if (get) { + if (val >= ops->get_optmin && + val < ops->get_optmax) + goto out; + } else { + if (val >= ops->set_optmin && + val < ops->set_optmax) + goto out; + } + module_put(ops->owner); + } + } +out_nosup: + ops = ERR_PTR(-ENOPROTOOPT); +out: + mutex_unlock(&nf_sockopt_mutex); + return ops; +} + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, + char __user *opt, int *len, int get) +{ + struct nf_sockopt_ops *ops; + int ret; + + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) + ret = ops->get(sk, val, opt, len); + else + ret = ops->set(sk, val, opt, *len); + + module_put(ops->owner); + return ret; +} + +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + unsigned int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(nf_setsockopt); + +int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(nf_getsockopt); + +#ifdef CONFIG_COMPAT +static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, + char __user *opt, int *len, int get) +{ + struct nf_sockopt_ops *ops; + int ret; + + ops = nf_sockopt_find(sk, pf, val, get); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + if (get) { + if (ops->compat_get) + ret = ops->compat_get(sk, val, opt, len); + else + ret = ops->get(sk, val, opt, len); + } else { + if (ops->compat_set) + ret = ops->compat_set(sk, val, opt, *len); + else + ret = ops->set(sk, val, opt, *len); + } + + module_put(ops->owner); + return ret; +} + +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, + int val, char __user *opt, unsigned int len) +{ + return compat_nf_sockopt(sk, pf, val, opt, &len, 0); +} +EXPORT_SYMBOL(compat_nf_setsockopt); + +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, + int val, char __user *opt, int *len) +{ + return compat_nf_sockopt(sk, pf, val, opt, len, 1); +} +EXPORT_SYMBOL(compat_nf_getsockopt); +#endif diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c new file mode 100644 index 00000000..474d621c --- /dev/null +++ b/net/netfilter/nf_tproxy_core.c @@ -0,0 +1,62 @@ 
+/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2007 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> + +#include <linux/net.h> +#include <linux/if.h> +#include <linux/netdevice.h> +#include <net/udp.h> +#include <net/netfilter/nf_tproxy_core.h> + + +static void +nf_tproxy_destructor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + skb->sk = NULL; + skb->destructor = NULL; + + if (sk) + sock_put(sk); +} + +/* consumes sk */ +void +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + /* assigning tw sockets complicates things; most + * skb->sk->X checks would have to test sk->sk_state first */ + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = nf_tproxy_destructor; +} +EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); + +static int __init nf_tproxy_init(void) +{ + pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); + pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); + return 0; +} + +module_init(nf_tproxy_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs"); +MODULE_DESCRIPTION("Transparent proxy support core routines"); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c new file mode 100644 index 00000000..e6ddde16 --- /dev/null +++ b/net/netfilter/nfnetlink.c @@ -0,0 +1,244 @@ +/* Netfilter messages via netlink socket. Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist <jschlst@samba.org>, + * (C) 2002-2005 by Harald Welte <laforge@gnumonks.org> + * (C) 2005,2007 by Pablo Neira Ayuso <pablo@netfilter.org> + * + * Initial netfilter messages via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. 
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <asm/uaccess.h> +#include <net/sock.h> +#include <net/netlink.h> +#include <linux/init.h> + +#include <linux/netlink.h> +#include <linux/netfilter/nfnetlink.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); + +static char __initdata nfversion[] = "0.30"; + +static const struct nfnetlink_subsystem __rcu *subsys_table[NFNL_SUBSYS_COUNT]; +static DEFINE_MUTEX(nfnl_mutex); + +void nfnl_lock(void) +{ + mutex_lock(&nfnl_mutex); +} +EXPORT_SYMBOL_GPL(nfnl_lock); + +void nfnl_unlock(void) +{ + mutex_unlock(&nfnl_mutex); +} +EXPORT_SYMBOL_GPL(nfnl_unlock); + +int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) +{ + nfnl_lock(); + if (subsys_table[n->subsys_id]) { + nfnl_unlock(); + return -EBUSY; + } + rcu_assign_pointer(subsys_table[n->subsys_id], n); + nfnl_unlock(); + + return 0; +} +EXPORT_SYMBOL_GPL(nfnetlink_subsys_register); + +int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) +{ + nfnl_lock(); + subsys_table[n->subsys_id] = NULL; + nfnl_unlock(); + synchronize_rcu(); + return 0; +} +EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); + +static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +{ + u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + + if (subsys_id >= NFNL_SUBSYS_COUNT) + return NULL; + + return rcu_dereference(subsys_table[subsys_id]); +} + +static inline const struct nfnl_callback * +nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) +{ + u_int8_t cb_id = NFNL_MSG_TYPE(type); + + if (cb_id >= ss->cb_count) + return NULL; + + return &ss->cb[cb_id]; +} + +int nfnetlink_has_listeners(struct net *net, unsigned int group) +{ + return netlink_has_listeners(net->nfnl, group); +} +EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); + +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, + unsigned group, int echo, gfp_t flags) +{ + return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); +} +EXPORT_SYMBOL_GPL(nfnetlink_send); + +int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +{ + return netlink_set_err(net->nfnl, pid, group, error); +} +EXPORT_SYMBOL_GPL(nfnetlink_set_err); + +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +{ + return netlink_unicast(net->nfnl, skb, pid, flags); +} +EXPORT_SYMBOL_GPL(nfnetlink_unicast); + +/* Process one complete nfnetlink message. 
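+ * The subsystem is selected by the upper byte of nlmsg_type + * (NFNL_SUBSYS_ID) and the callback within that subsystem by the lower + * byte (NFNL_MSG_TYPE).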
*/ +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + const struct nfnl_callback *nc; + const struct nfnetlink_subsystem *ss; + int type, err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* All the messages must at least contain nfgenmsg */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg))) + return 0; + + type = nlh->nlmsg_type; +replay: + rcu_read_lock(); + ss = nfnetlink_get_subsys(type); + if (!ss) { +#ifdef CONFIG_MODULES + rcu_read_unlock(); + request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type)); + rcu_read_lock(); + ss = nfnetlink_get_subsys(type); + if (!ss) +#endif + { + rcu_read_unlock(); + return -EINVAL; + } + } + + nc = nfnetlink_find_client(type, ss); + if (!nc) { + rcu_read_unlock(); + return -EINVAL; + } + + { + int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg)); + u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *attr = (void *)nlh + min_len; + int attrlen = nlh->nlmsg_len - min_len; + + err = nla_parse(cda, ss->cb[cb_id].attr_count, + attr, attrlen, ss->cb[cb_id].policy); + if (err < 0) + return err; + + if (nc->call_rcu) { + err = nc->call_rcu(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + rcu_read_unlock(); + } else { + rcu_read_unlock(); + nfnl_lock(); + if (rcu_dereference_protected( + subsys_table[NFNL_SUBSYS_ID(type)], + lockdep_is_held(&nfnl_mutex)) != ss || + nfnetlink_find_client(type, ss) != nc) + err = -EAGAIN; + else + err = nc->call(net->nfnl, skb, nlh, + (const struct nlattr **)cda); + nfnl_unlock(); + } + if (err == -EAGAIN) + goto replay; + return err; + } +} + +static void nfnetlink_rcv(struct sk_buff *skb) +{ + netlink_rcv_skb(skb, &nfnetlink_rcv_msg); +} + +static int __net_init nfnetlink_net_init(struct net *net) +{ + struct sock *nfnl; + + nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, NFNLGRP_MAX, + nfnetlink_rcv, NULL, THIS_MODULE); + if (!nfnl) + return -ENOMEM; + net->nfnl_stash = nfnl; + rcu_assign_pointer(net->nfnl, nfnl); + return 0; +} + +static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) +{ + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) + RCU_INIT_POINTER(net->nfnl, NULL); + synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) + netlink_kernel_release(net->nfnl_stash); +} + +static struct pernet_operations nfnetlink_net_ops = { + .init = nfnetlink_net_init, + .exit_batch = nfnetlink_net_exit_batch, +}; + +static int __init nfnetlink_init(void) +{ + pr_info("Netfilter messages via NETLINK v%s.\n", nfversion); + return register_pernet_subsys(&nfnetlink_net_ops); +} + +static void __exit nfnetlink_exit(void) +{ + pr_info("Removing netfilter NETLINK layer.\n"); + unregister_pernet_subsys(&nfnetlink_net_ops); +} +module_init(nfnetlink_init); +module_exit(nfnetlink_exit); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c new file mode 100644 index 00000000..d98c868c --- /dev/null +++ b/net/netfilter/nfnetlink_acct.c @@ -0,0 +1,363 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). 
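+ *
+ * Usage sketch for the accessors exported at the bottom of this file
+ * (illustrative only; "my-counter" is a made-up object name):
+ *
+ *   struct nf_acct *acct = nfnl_acct_find_get("my-counter");
+ *
+ *   if (acct != NULL) {
+ *           nfnl_acct_update(skb, acct);  (one packet, skb->len bytes)
+ *           nfnl_acct_put(acct);
+ *   }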
+ */ +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/atomic.h> +#include <linux/netlink.h> +#include <linux/rculist.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_acct.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); + +static LIST_HEAD(nfnl_acct_list); + +struct nf_acct { + atomic64_t pkts; + atomic64_t bytes; + struct list_head head; + atomic_t refcnt; + char name[NFACCT_NAME_MAX]; + struct rcu_head rcu_head; +}; + +static int +nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + struct nf_acct *nfacct, *matching = NULL; + char *acct_name; + + if (!tb[NFACCT_NAME]) + return -EINVAL; + + acct_name = nla_data(tb[NFACCT_NAME]); + + list_for_each_entry(nfacct, &nfnl_acct_list, head) { + if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) + continue; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + matching = nfacct; + break; + } + + if (matching) { + if (nlh->nlmsg_flags & NLM_F_REPLACE) { + /* reset counters if you request a replacement. */ + atomic64_set(&matching->pkts, 0); + atomic64_set(&matching->bytes, 0); + return 0; + } + return -EBUSY; + } + + nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); + if (nfacct == NULL) + return -ENOMEM; + + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); + + if (tb[NFACCT_BYTES]) { + atomic64_set(&nfacct->bytes, + be64_to_cpu(nla_get_u64(tb[NFACCT_BYTES]))); + } + if (tb[NFACCT_PKTS]) { + atomic64_set(&nfacct->pkts, + be64_to_cpu(nla_get_u64(tb[NFACCT_PKTS]))); + } + atomic_set(&nfacct->refcnt, 1); + list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + return 0; +} + +static int +nfnl_acct_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, + int event, struct nf_acct *acct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = pid ? 
NLM_F_MULTI : 0; + u64 pkts, bytes; + + event |= NFNL_SUBSYS_ACCT << 8; + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + NLA_PUT_STRING(skb, NFACCT_NAME, acct->name); + + if (type == NFNL_MSG_ACCT_GET_CTRZERO) { + pkts = atomic64_xchg(&acct->pkts, 0); + bytes = atomic64_xchg(&acct->bytes, 0); + } else { + pkts = atomic64_read(&acct->pkts); + bytes = atomic64_read(&acct->bytes); + } + NLA_PUT_BE64(skb, NFACCT_PKTS, cpu_to_be64(pkts)); + NLA_PUT_BE64(skb, NFACCT_BYTES, cpu_to_be64(bytes)); + NLA_PUT_BE32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))); + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nf_acct *cur, *last; + + if (cb->args[2]) + return 0; + + last = (struct nf_acct *)cb->args[1]; + if (cb->args[1]) + cb->args[1] = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + if (last && cur != last) + continue; + + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + break; + } + } + if (!cb->args[1]) + cb->args[2] = 1; + rcu_read_unlock(); + return skb->len; +} + +static int +nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + int ret = -ENOENT; + struct nf_acct *cur; + char *acct_name; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = nfnl_acct_dump, + }; + return netlink_dump_start(nfnl, skb, nlh, &c); + } + + if (!tb[NFACCT_NAME]) + return -EINVAL; + acct_name = nla_data(tb[NFACCT_NAME]); + + list_for_each_entry(cur, &nfnl_acct_list, head) { + struct sk_buff *skb2; + + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) + continue; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).pid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? -ENOBUFS : ret; + } + return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int nfnl_acct_try_del(struct nf_acct *cur) +{ + int ret = 0; + + /* we want to avoid races with nfnl_acct_find_get. */ + if (atomic_dec_and_test(&cur->refcnt)) { + /* We are protected by nfnl mutex. */ + list_del_rcu(&cur->head); + kfree_rcu(cur, rcu_head); + } else { + /* still in use, restore reference counter. 
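+ *
+ * (The dec-then-reinc pattern is safe here: we hold the nfnl
+ * mutex, and readers in nfnl_acct_find_get() only take references
+ * via atomic_inc_not_zero(), so an object whose count has reached
+ * zero can never be revived.)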
*/ + atomic_inc(&cur->refcnt); + ret = -EBUSY; + } + return ret; +} + +static int +nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) +{ + char *acct_name; + struct nf_acct *cur; + int ret = -ENOENT; + + if (!tb[NFACCT_NAME]) { + list_for_each_entry(cur, &nfnl_acct_list, head) + nfnl_acct_try_del(cur); + + return 0; + } + acct_name = nla_data(tb[NFACCT_NAME]); + + list_for_each_entry(cur, &nfnl_acct_list, head) { + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) + continue; + + ret = nfnl_acct_try_del(cur); + if (ret < 0) + return ret; + + break; + } + return ret; +} + +static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { + [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, + [NFACCT_BYTES] = { .type = NLA_U64 }, + [NFACCT_PKTS] = { .type = NLA_U64 }, +}; + +static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { + [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy }, +}; + +static const struct nfnetlink_subsystem nfnl_acct_subsys = { + .name = "acct", + .subsys_id = NFNL_SUBSYS_ACCT, + .cb_count = NFNL_MSG_ACCT_MAX, + .cb = nfnl_acct_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); + +struct nf_acct *nfnl_acct_find_get(const char *acct_name) +{ + struct nf_acct *cur, *acct = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) + continue; + + if (!try_module_get(THIS_MODULE)) + goto err; + + if (!atomic_inc_not_zero(&cur->refcnt)) { + module_put(THIS_MODULE); + goto err; + } + + acct = cur; + break; + } +err: + rcu_read_unlock(); + return acct; +} +EXPORT_SYMBOL_GPL(nfnl_acct_find_get); + +void nfnl_acct_put(struct nf_acct *acct) +{ + atomic_dec(&acct->refcnt); + module_put(THIS_MODULE); +} +EXPORT_SYMBOL_GPL(nfnl_acct_put); + +void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) +{ + atomic64_inc(&nfacct->pkts); + atomic64_add(skb->len, &nfacct->bytes); +} +EXPORT_SYMBOL_GPL(nfnl_acct_update); + +static int __init nfnl_acct_init(void) +{ + int ret; + + pr_info("nfnl_acct: registering with nfnetlink.\n"); + ret = nfnetlink_subsys_register(&nfnl_acct_subsys); + if (ret < 0) { + pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); + goto err_out; + } + return 0; +err_out: + return ret; +} + +static void __exit nfnl_acct_exit(void) +{ + struct nf_acct *cur, *tmp; + + pr_info("nfnl_acct: unregistering from nfnetlink.\n"); + nfnetlink_subsys_unregister(&nfnl_acct_subsys); + + list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { + list_del_rcu(&cur->head); + /* We are sure that our objects have no clients at this point, + * it's safe to release them all without checking refcnt. */ + kfree_rcu(cur, rcu_head); + } +} + +module_init(nfnl_acct_init); +module_exit(nfnl_acct_exit); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c new file mode 100644 index 00000000..2b9e79f5 --- /dev/null +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -0,0 +1,430 @@ +/* + * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2012 by Vyatta Inc. 
<http://www.vyatta.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/rculist.h>
+#include <linux/rculist_nulls.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/security.h>
+#include <linux/skbuff.h>
+#include <linux/errno.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_cttimeout.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
+
+static LIST_HEAD(cttimeout_list);
+
+static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
+ [CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING },
+ [CTA_TIMEOUT_L3PROTO] = { .type = NLA_U16 },
+ [CTA_TIMEOUT_L4PROTO] = { .type = NLA_U8 },
+ [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED },
+};
+
+static int
+ctnl_timeout_parse_policy(struct ctnl_timeout *timeout,
+ struct nf_conntrack_l4proto *l4proto,
+ const struct nlattr *attr)
+{
+ int ret = 0;
+
+ if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
+ struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
+
+ nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
+ attr, l4proto->ctnl_timeout.nla_policy);
+
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, &timeout->data);
+ }
+ return ret;
+}
+
+static int
+cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
+{
+ __u16 l3num;
+ __u8 l4num;
+ struct nf_conntrack_l4proto *l4proto;
+ struct ctnl_timeout *timeout, *matching = NULL;
+ char *name;
+ int ret;
+
+ if (!cda[CTA_TIMEOUT_NAME] ||
+ !cda[CTA_TIMEOUT_L3PROTO] ||
+ !cda[CTA_TIMEOUT_L4PROTO] ||
+ !cda[CTA_TIMEOUT_DATA])
+ return -EINVAL;
+
+ name = nla_data(cda[CTA_TIMEOUT_NAME]);
+ l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
+ l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
+
+ list_for_each_entry(timeout, &cttimeout_list, head) {
+ if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
+ continue;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ matching = timeout;
+ break;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ /* This protocol is not supported, skip. */
+ if (l4proto->l4proto != l4num) {
+ ret = -EOPNOTSUPP;
+ goto err_proto_put;
+ }
+
+ if (matching) {
+ if (nlh->nlmsg_flags & NLM_F_REPLACE) {
+ /* You cannot replace one timeout policy by another of
+ * different kind, sorry.
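+ * (e.g. an existing TCP policy cannot be turned into a UDP
+ * one in place; this is what the l3num/l4proto comparison
+ * just below enforces, so delete and re-add instead).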
+ */ + if (matching->l3num != l3num || + matching->l4proto->l4proto != l4num) { + ret = -EINVAL; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(matching, l4proto, + cda[CTA_TIMEOUT_DATA]); + return ret; + } + ret = -EBUSY; + goto err_proto_put; + } + + timeout = kzalloc(sizeof(struct ctnl_timeout) + + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); + if (timeout == NULL) { + ret = -ENOMEM; + goto err_proto_put; + } + + ret = ctnl_timeout_parse_policy(timeout, l4proto, + cda[CTA_TIMEOUT_DATA]); + if (ret < 0) + goto err; + + strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); + timeout->l3num = l3num; + timeout->l4proto = l4proto; + atomic_set(&timeout->refcnt, 1); + list_add_tail_rcu(&timeout->head, &cttimeout_list); + + return 0; +err: + kfree(timeout); +err_proto_put: + nf_ct_l4proto_put(l4proto); + return ret; +} + +static int +ctnl_timeout_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, + int event, struct ctnl_timeout *timeout) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned int flags = pid ? NLM_F_MULTI : 0; + struct nf_conntrack_l4proto *l4proto = timeout->l4proto; + + event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); + if (nlh == NULL) + goto nlmsg_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + NLA_PUT_STRING(skb, CTA_TIMEOUT_NAME, timeout->name); + NLA_PUT_BE16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)); + NLA_PUT_U8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto); + NLA_PUT_BE32(skb, CTA_TIMEOUT_USE, + htonl(atomic_read(&timeout->refcnt))); + + if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { + struct nlattr *nest_parms; + int ret; + + nest_parms = nla_nest_start(skb, + CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); + if (ret < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + } + + nlmsg_end(skb, nlh); + return skb->len; + +nlmsg_failure: +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -1; +} + +static int +ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ctnl_timeout *cur, *last; + + if (cb->args[2]) + return 0; + + last = (struct ctnl_timeout *)cb->args[1]; + if (cb->args[1]) + cb->args[1] = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(cur, &cttimeout_list, head) { + if (last && cur != last) + continue; + + if (ctnl_timeout_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur) < 0) { + cb->args[1] = (unsigned long)cur; + break; + } + } + if (!cb->args[1]) + cb->args[2] = 1; + rcu_read_unlock(); + return skb->len; +} + +static int +cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + int ret = -ENOENT; + char *name; + struct ctnl_timeout *cur; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = ctnl_timeout_dump, + }; + return netlink_dump_start(ctnl, skb, nlh, &c); + } + + if (!cda[CTA_TIMEOUT_NAME]) + return -EINVAL; + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + struct sk_buff *skb2; + + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (skb2 == NULL) { + ret = -ENOMEM; + break; + } + + ret = 
ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, + NFNL_MSG_TYPE(nlh->nlmsg_type), + IPCTNL_MSG_TIMEOUT_NEW, cur); + if (ret <= 0) { + kfree_skb(skb2); + break; + } + ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, + MSG_DONTWAIT); + if (ret > 0) + ret = 0; + + /* this avoids a loop in nfnetlink. */ + return ret == -EAGAIN ? -ENOBUFS : ret; + } + return ret; +} + +/* try to delete object, fail if it is still in use. */ +static int ctnl_timeout_try_del(struct ctnl_timeout *timeout) +{ + int ret = 0; + + /* we want to avoid races with nf_ct_timeout_find_get. */ + if (atomic_dec_and_test(&timeout->refcnt)) { + /* We are protected by nfnl mutex. */ + list_del_rcu(&timeout->head); + nf_ct_l4proto_put(timeout->l4proto); + kfree_rcu(timeout, rcu_head); + } else { + /* still in use, restore reference counter. */ + atomic_inc(&timeout->refcnt); + ret = -EBUSY; + } + return ret; +} + +static int +cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const cda[]) +{ + char *name; + struct ctnl_timeout *cur; + int ret = -ENOENT; + + if (!cda[CTA_TIMEOUT_NAME]) { + list_for_each_entry(cur, &cttimeout_list, head) + ctnl_timeout_try_del(cur); + + return 0; + } + name = nla_data(cda[CTA_TIMEOUT_NAME]); + + list_for_each_entry(cur, &cttimeout_list, head) { + if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + ret = ctnl_timeout_try_del(cur); + if (ret < 0) + return ret; + + break; + } + return ret; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static struct ctnl_timeout *ctnl_timeout_find_get(const char *name) +{ + struct ctnl_timeout *timeout, *matching = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(timeout, &cttimeout_list, head) { + if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) + continue; + + if (!try_module_get(THIS_MODULE)) + goto err; + + if (!atomic_inc_not_zero(&timeout->refcnt)) { + module_put(THIS_MODULE); + goto err; + } + matching = timeout; + break; + } +err: + rcu_read_unlock(); + return matching; +} + +static void ctnl_timeout_put(struct ctnl_timeout *timeout) +{ + atomic_dec(&timeout->refcnt); + module_put(THIS_MODULE); +} +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + +static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { + [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy }, +}; + +static const struct nfnetlink_subsystem cttimeout_subsys = { + .name = "conntrack_timeout", + .subsys_id = NFNL_SUBSYS_CTNETLINK_TIMEOUT, + .cb_count = IPCTNL_MSG_TIMEOUT_MAX, + .cb = cttimeout_cb, +}; + +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); + +static int __init cttimeout_init(void) +{ + int ret; + + ret = nfnetlink_subsys_register(&cttimeout_subsys); + if (ret < 0) { + pr_err("cttimeout_init: cannot register cttimeout with " + "nfnetlink.\n"); + goto err_out; + } +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get); + RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put); +#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ + return 0; + +err_out: + return ret; +} + +static void __exit cttimeout_exit(void) +{ + struct ctnl_timeout *cur, *tmp; + + 
pr_info("cttimeout: unregistering from nfnetlink.\n");
+
+ nfnetlink_subsys_unregister(&cttimeout_subsys);
+ list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
+ list_del_rcu(&cur->head);
+ /* We are sure that our objects have no clients at this point,
+ * it's safe to release them all without checking refcnt.
+ */
+ nf_ct_l4proto_put(cur->l4proto);
+ kfree_rcu(cur, rcu_head);
+ }
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
+ RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
+#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
+}
+
+module_init(cttimeout_init);
+module_exit(cttimeout_exit);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
new file mode 100644
index 00000000..66b2c54c
--- /dev/null
+++ b/net/netfilter/nfnetlink_log.c
@@ -0,0 +1,1012 @@
+/*
+ * This is a module which is used for logging packets to userspace via
+ * nfnetlink.
+ *
+ * (C) 2005 by Harald Welte <laforge@netfilter.org>
+ *
+ * Based on the old ipv4-only ipt_ULOG.c:
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_log.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netfilter/nf_log.h>
+#include <net/netfilter/nfnetlink_log.h>
+
+#include <linux/atomic.h>
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include "../bridge/br_private.h"
+#endif
+
+#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
+#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
+#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
+#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */
+
+#define PRINTR(x, args...) do { if (net_ratelimit()) \
+ printk(x, ## args); } while (0)
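+
+/*
+ * Illustrative context (a sketch that assumes the iptables xt_NFLOG
+ * target, which lives outside this file): packets normally reach
+ * nfulnl_log_packet() below through the nf_log infrastructure, e.g.
+ *
+ *   iptables -A INPUT -p tcp --dport 22 -j NFLOG --nflog-group 5
+ *
+ * hands matching packets to the nfulnl_instance with group_num == 5,
+ * where they are batched into one netlink skb and pushed to the
+ * userspace listener bound to that group.
+ */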
+
+struct nfulnl_instance {
+ struct hlist_node hlist; /* global list of instances */
+ spinlock_t lock;
+ atomic_t use; /* use count */
+
+ unsigned int qlen; /* number of nlmsgs in skb */
+ struct sk_buff *skb; /* pre-allocated skb */
+ struct timer_list timer;
+ int peer_pid; /* PID of the peer process */
+
+ /* configurable parameters */
+ unsigned int flushtimeout; /* timeout until queue flush */
+ unsigned int nlbufsiz; /* netlink buffer allocation size */
+ unsigned int qthreshold; /* threshold of the queue */
+ u_int32_t copy_range;
+ u_int32_t seq; /* instance-local sequential counter */
+ u_int16_t group_num; /* number of this queue */
+ u_int16_t flags;
+ u_int8_t copy_mode;
+ struct rcu_head rcu;
+};
+
+static DEFINE_SPINLOCK(instances_lock);
+static atomic_t global_seq;
+
+#define INSTANCE_BUCKETS 16
+static struct hlist_head instance_table[INSTANCE_BUCKETS];
+static unsigned int hash_init;
+
+static inline u_int8_t instance_hashfn(u_int16_t group_num)
+{
+ return ((group_num & 0xff) % INSTANCE_BUCKETS);
+}
+
+static struct nfulnl_instance *
+__instance_lookup(u_int16_t group_num)
+{
+ struct hlist_head *head;
+ struct hlist_node *pos;
+ struct nfulnl_instance *inst;
+
+ head = &instance_table[instance_hashfn(group_num)];
+ hlist_for_each_entry_rcu(inst, pos, head, hlist) {
+ if (inst->group_num == group_num)
+ return inst;
+ }
+ return NULL;
+}
+
+static inline void
+instance_get(struct nfulnl_instance *inst)
+{
+ atomic_inc(&inst->use);
+}
+
+static struct nfulnl_instance *
+instance_lookup_get(u_int16_t group_num)
+{
+ struct nfulnl_instance *inst;
+
+ rcu_read_lock_bh();
+ inst = __instance_lookup(group_num);
+ if (inst && !atomic_inc_not_zero(&inst->use))
+ inst = NULL;
+ rcu_read_unlock_bh();
+
+ return inst;
+}
+
+static void nfulnl_instance_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct nfulnl_instance, rcu));
+ module_put(THIS_MODULE);
+}
+
+static void
+instance_put(struct nfulnl_instance *inst)
+{
+ if (inst && atomic_dec_and_test(&inst->use))
+ call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
+}
+
+static void nfulnl_timer(unsigned long data);
+
+static struct nfulnl_instance *
+instance_create(u_int16_t group_num, int pid)
+{
+ struct nfulnl_instance *inst;
+ int err;
+
+ spin_lock_bh(&instances_lock);
+ if (__instance_lookup(group_num)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
+ inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
+ if (!inst) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ if (!try_module_get(THIS_MODULE)) {
+ kfree(inst);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ INIT_HLIST_NODE(&inst->hlist);
+ spin_lock_init(&inst->lock);
+ /* needs to be two, since we _put() after creation */
+ atomic_set(&inst->use, 2);
+
+ setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+
+ inst->peer_pid = pid;
+ inst->group_num = group_num;
+
+ inst->qthreshold = NFULNL_QTHRESH_DEFAULT;
+ inst->flushtimeout = NFULNL_TIMEOUT_DEFAULT;
+ inst->nlbufsiz = NFULNL_NLBUFSIZ_DEFAULT;
+ inst->copy_mode = NFULNL_COPY_PACKET;
+ inst->copy_range = NFULNL_COPY_RANGE_MAX;
+
+ hlist_add_head_rcu(&inst->hlist,
+ &instance_table[instance_hashfn(group_num)]);
+
+ spin_unlock_bh(&instances_lock);
+
+ return inst;
+
+out_unlock:
+ spin_unlock_bh(&instances_lock);
+ return ERR_PTR(err);
+}
+
+static void __nfulnl_flush(struct nfulnl_instance *inst);
+
+/* called with BH disabled */
+static void
+__instance_destroy(struct nfulnl_instance *inst)
+{
+ /* first pull it out of the global list */
+ 
hlist_del_rcu(&inst->hlist); + + /* then flush all pending packets from skb */ + + spin_lock(&inst->lock); + + /* lockless readers wont be able to use us */ + inst->copy_mode = NFULNL_COPY_DISABLED; + + if (inst->skb) + __nfulnl_flush(inst); + spin_unlock(&inst->lock); + + /* and finally put the refcount */ + instance_put(inst); +} + +static inline void +instance_destroy(struct nfulnl_instance *inst) +{ + spin_lock_bh(&instances_lock); + __instance_destroy(inst); + spin_unlock_bh(&instances_lock); +} + +static int +nfulnl_set_mode(struct nfulnl_instance *inst, u_int8_t mode, + unsigned int range) +{ + int status = 0; + + spin_lock_bh(&inst->lock); + + switch (mode) { + case NFULNL_COPY_NONE: + case NFULNL_COPY_META: + inst->copy_mode = mode; + inst->copy_range = 0; + break; + + case NFULNL_COPY_PACKET: + inst->copy_mode = mode; + inst->copy_range = min_t(unsigned int, + range, NFULNL_COPY_RANGE_MAX); + break; + + default: + status = -EINVAL; + break; + } + + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz) +{ + int status; + + spin_lock_bh(&inst->lock); + if (nlbufsiz < NFULNL_NLBUFSIZ_DEFAULT) + status = -ERANGE; + else if (nlbufsiz > 131072) + status = -ERANGE; + else { + inst->nlbufsiz = nlbufsiz; + status = 0; + } + spin_unlock_bh(&inst->lock); + + return status; +} + +static int +nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout) +{ + spin_lock_bh(&inst->lock); + inst->flushtimeout = timeout; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh) +{ + spin_lock_bh(&inst->lock); + inst->qthreshold = qthresh; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static int +nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) +{ + spin_lock_bh(&inst->lock); + inst->flags = flags; + spin_unlock_bh(&inst->lock); + + return 0; +} + +static struct sk_buff * +nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +{ + struct sk_buff *skb; + unsigned int n; + + /* alloc skb which should be big enough for a whole multipart + * message. 
WARNING: has to be <= 128k due to slab restrictions */ + + n = max(inst_size, pkt_size); + skb = alloc_skb(n, GFP_ATOMIC); + if (!skb) { + if (n > pkt_size) { + /* try to allocate only as much as we need for current + * packet */ + + skb = alloc_skb(pkt_size, GFP_ATOMIC); + if (!skb) + pr_err("nfnetlink_log: can't even alloc %u bytes\n", + pkt_size); + } + } + + return skb; +} + +static int +__nfulnl_send(struct nfulnl_instance *inst) +{ + int status = -1; + + if (inst->qlen > 1) + NLMSG_PUT(inst->skb, 0, 0, + NLMSG_DONE, + sizeof(struct nfgenmsg)); + + status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_pid, + MSG_DONTWAIT); + + inst->qlen = 0; + inst->skb = NULL; + +nlmsg_failure: + return status; +} + +static void +__nfulnl_flush(struct nfulnl_instance *inst) +{ + /* timer holds a reference */ + if (del_timer(&inst->timer)) + instance_put(inst); + if (inst->skb) + __nfulnl_send(inst); +} + +static void +nfulnl_timer(unsigned long data) +{ + struct nfulnl_instance *inst = (struct nfulnl_instance *)data; + + spin_lock_bh(&inst->lock); + if (inst->skb) + __nfulnl_send(inst); + spin_unlock_bh(&inst->lock); + instance_put(inst); +} + +/* This is an inline function, we don't really care about a long + * list of arguments */ +static inline int +__build_packet_message(struct nfulnl_instance *inst, + const struct sk_buff *skb, + unsigned int data_len, + u_int8_t pf, + unsigned int hooknum, + const struct net_device *indev, + const struct net_device *outdev, + const char *prefix, unsigned int plen) +{ + struct nfulnl_msg_packet_hdr pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + sk_buff_data_t old_tail = inst->skb->tail; + + nlh = NLMSG_PUT(inst->skb, 0, 0, + NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(inst->group_num); + + pmsg.hw_protocol = skb->protocol; + pmsg.hook = hooknum; + + NLA_PUT(inst->skb, NFULA_PACKET_HDR, sizeof(pmsg), &pmsg); + + if (prefix) + NLA_PUT(inst->skb, NFULA_PREFIX, plen, prefix); + + if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physindev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSINDEV, + htonl(skb->nf_bridge->physindev->ifindex)); + } +#endif + } + + if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); +#else + if (pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + 
htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + } else { + /* Case 2: indev is a bridge group, we need to look + * for physical device (when called from ipv4) */ + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (skb->nf_bridge && skb->nf_bridge->physoutdev) + NLA_PUT_BE32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, + htonl(skb->nf_bridge->physoutdev->ifindex)); + } +#endif + } + + if (skb->mark) + NLA_PUT_BE32(inst->skb, NFULA_MARK, htonl(skb->mark)); + + if (indev && skb->dev && + skb->mac_header != skb->network_header) { + struct nfulnl_msg_packet_hw phw; + int len = dev_parse_header(skb, phw.hw_addr); + if (len > 0) { + phw.hw_addrlen = htons(len); + NLA_PUT(inst->skb, NFULA_HWADDR, sizeof(phw), &phw); + } + } + + if (indev && skb_mac_header_was_set(skb)) { + NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); + NLA_PUT_BE16(inst->skb, NFULA_HWLEN, + htons(skb->dev->hard_header_len)); + NLA_PUT(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, + skb_mac_header(skb)); + } + + if (skb->tstamp.tv64) { + struct nfulnl_msg_packet_timestamp ts; + struct timeval tv = ktime_to_timeval(skb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); + + NLA_PUT(inst->skb, NFULA_TIMESTAMP, sizeof(ts), &ts); + } + + /* UID */ + if (skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) { + struct file *file = skb->sk->sk_socket->file; + __be32 uid = htonl(file->f_cred->fsuid); + __be32 gid = htonl(file->f_cred->fsgid); + /* need to unlock here since NLA_PUT may goto */ + read_unlock_bh(&skb->sk->sk_callback_lock); + NLA_PUT_BE32(inst->skb, NFULA_UID, uid); + NLA_PUT_BE32(inst->skb, NFULA_GID, gid); + } else + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* local sequence number */ + if (inst->flags & NFULNL_CFG_F_SEQ) + NLA_PUT_BE32(inst->skb, NFULA_SEQ, htonl(inst->seq++)); + + /* global sequence number */ + if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) + NLA_PUT_BE32(inst->skb, NFULA_SEQ_GLOBAL, + htonl(atomic_inc_return(&global_seq))); + + if (data_len) { + struct nlattr *nla; + int size = nla_attr_size(data_len); + + if (skb_tailroom(inst->skb) < nla_total_size(data_len)) { + printk(KERN_WARNING "nfnetlink_log: no tailroom!\n"); + goto nlmsg_failure; + } + + nla = (struct nlattr *)skb_put(inst->skb, nla_total_size(data_len)); + nla->nla_type = NFULA_PAYLOAD; + nla->nla_len = size; + + if (skb_copy_bits(skb, 0, nla_data(nla), data_len)) + BUG(); + } + + nlh->nlmsg_len = inst->skb->tail - old_tail; + return 0; + +nlmsg_failure: +nla_put_failure: + PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n"); + return -1; +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_ULOG, + .u = { + .ulog = { + .copy_len = 0xffff, + .group = 0, + .qthreshold = 1, + }, + }, +}; + +/* log handler for internal netfilter logging api */ +void +nfulnl_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *li_user, + const char *prefix) +{ + unsigned int size, data_len; + struct nfulnl_instance *inst; + const struct nf_loginfo *li; + unsigned int qthreshold; + unsigned int plen; + + if (li_user && li_user->type == NF_LOG_TYPE_ULOG) + li = li_user; + else + li = &default_loginfo; + + inst = instance_lookup_get(li->u.ulog.group); + if (!inst) + return; + + plen = 0; + if (prefix) + 
plen = strlen(prefix) + 1; + + /* FIXME: do we want to make the size calculation conditional based on + * what is actually present? way more branches and checks, but more + * memory efficient... */ + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(u_int32_t)) /* uid */ + + nla_total_size(sizeof(u_int32_t)) /* gid */ + + nla_total_size(plen) /* prefix */ + + nla_total_size(sizeof(struct nfulnl_msg_packet_hw)) + + nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp)); + + if (in && skb_mac_header_was_set(skb)) { + size += nla_total_size(skb->dev->hard_header_len) + + nla_total_size(sizeof(u_int16_t)) /* hwtype */ + + nla_total_size(sizeof(u_int16_t)); /* hwlen */ + } + + spin_lock_bh(&inst->lock); + + if (inst->flags & NFULNL_CFG_F_SEQ) + size += nla_total_size(sizeof(u_int32_t)); + if (inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) + size += nla_total_size(sizeof(u_int32_t)); + + qthreshold = inst->qthreshold; + /* per-rule qthreshold overrides per-instance */ + if (li->u.ulog.qthreshold) + if (qthreshold > li->u.ulog.qthreshold) + qthreshold = li->u.ulog.qthreshold; + + + switch (inst->copy_mode) { + case NFULNL_COPY_META: + case NFULNL_COPY_NONE: + data_len = 0; + break; + + case NFULNL_COPY_PACKET: + if (inst->copy_range == 0 + || inst->copy_range > skb->len) + data_len = skb->len; + else + data_len = inst->copy_range; + + size += nla_total_size(data_len); + break; + + case NFULNL_COPY_DISABLED: + default: + goto unlock_and_release; + } + + if (inst->skb && + size > skb_tailroom(inst->skb) - sizeof(struct nfgenmsg)) { + /* either the queue len is too high or we don't have + * enough room in the skb left. flush to userspace. 
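+ *
+ * (Batching recap, summarizing this function: messages accumulate
+ * in inst->skb until qthreshold messages are queued, the flush
+ * timer fires, or, as here, the next message would not fit.)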
*/ + __nfulnl_flush(inst); + } + + if (!inst->skb) { + inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + if (!inst->skb) + goto alloc_failure; + } + + inst->qlen++; + + __build_packet_message(inst, skb, data_len, pf, + hooknum, in, out, prefix, plen); + + if (inst->qlen >= qthreshold) + __nfulnl_flush(inst); + /* timer_pending always called within inst->lock, so there + * is no chance of a race here */ + else if (!timer_pending(&inst->timer)) { + instance_get(inst); + inst->timer.expires = jiffies + (inst->flushtimeout*HZ/100); + add_timer(&inst->timer); + } + +unlock_and_release: + spin_unlock_bh(&inst->lock); + instance_put(inst); + return; + +alloc_failure: + /* FIXME: statistics */ + goto unlock_and_release; +} +EXPORT_SYMBOL_GPL(nfulnl_log_packet); + +static int +nfulnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this pid */ + spin_lock_bh(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfulnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if ((net_eq(n->net, &init_net)) && + (n->pid == inst->peer_pid)) + __instance_destroy(inst); + } + } + spin_unlock_bh(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfulnl_rtnl_notifier = { + .notifier_call = nfulnl_rcv_nl_event, +}; + +static int +nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static struct nf_logger nfulnl_logger __read_mostly = { + .name = "nfnetlink_log", + .logfn = &nfulnl_log_packet, + .me = THIS_MODULE, +}; + +static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { + [NFULA_CFG_CMD] = { .len = sizeof(struct nfulnl_msg_config_cmd) }, + [NFULA_CFG_MODE] = { .len = sizeof(struct nfulnl_msg_config_mode) }, + [NFULA_CFG_TIMEOUT] = { .type = NLA_U32 }, + [NFULA_CFG_QTHRESH] = { .type = NLA_U32 }, + [NFULA_CFG_NLBUFSIZ] = { .type = NLA_U32 }, + [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, +}; + +static int +nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfula[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t group_num = ntohs(nfmsg->res_id); + struct nfulnl_instance *inst; + struct nfulnl_msg_config_cmd *cmd = NULL; + int ret = 0; + + if (nfula[NFULA_CFG_CMD]) { + u_int8_t pf = nfmsg->nfgen_family; + cmd = nla_data(nfula[NFULA_CFG_CMD]); + + /* Commands without queue context */ + switch (cmd->command) { + case NFULNL_CFG_CMD_PF_BIND: + return nf_log_bind_pf(pf, &nfulnl_logger); + case NFULNL_CFG_CMD_PF_UNBIND: + nf_log_unbind_pf(pf); + return 0; + } + } + + inst = instance_lookup_get(group_num); + if (inst && inst->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto out_put; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFULNL_CFG_CMD_BIND: + if (inst) { + ret = -EBUSY; + goto out_put; + } + + inst = instance_create(group_num, + NETLINK_CB(skb).pid); + if (IS_ERR(inst)) { + ret = PTR_ERR(inst); + goto out; + } + break; + case NFULNL_CFG_CMD_UNBIND: + if (!inst) { + ret = -ENODEV; + goto out; + } + + instance_destroy(inst); + goto out_put; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfula[NFULA_CFG_MODE]) { + struct nfulnl_msg_config_mode *params; + params = 
nla_data(nfula[NFULA_CFG_MODE]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_mode(inst, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfula[NFULA_CFG_TIMEOUT]) { + __be32 timeout = nla_get_be32(nfula[NFULA_CFG_TIMEOUT]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_timeout(inst, ntohl(timeout)); + } + + if (nfula[NFULA_CFG_NLBUFSIZ]) { + __be32 nlbufsiz = nla_get_be32(nfula[NFULA_CFG_NLBUFSIZ]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_nlbufsiz(inst, ntohl(nlbufsiz)); + } + + if (nfula[NFULA_CFG_QTHRESH]) { + __be32 qthresh = nla_get_be32(nfula[NFULA_CFG_QTHRESH]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_qthresh(inst, ntohl(qthresh)); + } + + if (nfula[NFULA_CFG_FLAGS]) { + __be16 flags = nla_get_be16(nfula[NFULA_CFG_FLAGS]); + + if (!inst) { + ret = -ENODEV; + goto out; + } + nfulnl_set_flags(inst, ntohs(flags)); + } + +out_put: + instance_put(inst); +out: + return ret; +} + +static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { + [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, + .attr_count = NFULA_MAX, }, + [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy }, +}; + +static const struct nfnetlink_subsystem nfulnl_subsys = { + .name = "log", + .subsys_id = NFNL_SUBSYS_ULOG, + .cb_count = NFULNL_MSG_MAX, + .cb = nfulnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct iter_state *st) +{ + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + } + return NULL; +} + +static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) +{ + h = rcu_dereference_bh(hlist_next_rcu(h)); + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); + } + return h; +} + +static struct hlist_node *get_idx(struct iter_state *st, loff_t pos) +{ + struct hlist_node *head; + head = get_first(st); + + if (head) + while (pos && (head = get_next(st, head))) + pos--; + return pos ? 
NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu_bh) +{ + rcu_read_lock_bh(); + return get_idx(seq->private, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s->private, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfulnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %6d %2d\n", + inst->group_num, + inst->peer_pid, inst->qlen, + inst->copy_mode, inst->copy_range, + inst->flushtimeout, atomic_read(&inst->use)); +} + +static const struct seq_operations nful_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nful_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &nful_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nful_file_ops = { + .owner = THIS_MODULE, + .open = nful_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int __init nfnetlink_log_init(void) +{ + int i, status = -ENOMEM; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + /* it's not really all that important to have a random value, so + * we can do this from the init function, even if there hasn't + * been that much entropy yet */ + get_random_bytes(&hash_init, sizeof(hash_init)); + + netlink_register_notifier(&nfulnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfulnl_subsys); + if (status < 0) { + printk(KERN_ERR "log: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger); + if (status < 0) { + printk(KERN_ERR "log: failed to register logger\n"); + goto cleanup_subsys; + } + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_log", 0440, + proc_net_netfilter, &nful_file_ops)) + goto cleanup_logger; +#endif + return status; + +#ifdef CONFIG_PROC_FS +cleanup_logger: + nf_log_unregister(&nfulnl_logger); +#endif +cleanup_subsys: + nfnetlink_subsys_unregister(&nfulnl_subsys); +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfulnl_rtnl_notifier); + return status; +} + +static void __exit nfnetlink_log_fini(void) +{ + nf_log_unregister(&nfulnl_logger); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_log", proc_net_netfilter); +#endif + nfnetlink_subsys_unregister(&nfulnl_subsys); + netlink_unregister_notifier(&nfulnl_rtnl_notifier); +} + +MODULE_DESCRIPTION("netfilter userspace logging"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG); + +module_init(nfnetlink_log_init); +module_exit(nfnetlink_log_fini); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c new file mode 100644 index 00000000..a80b0cb0 --- /dev/null +++ b/net/netfilter/nfnetlink_queue.c @@ -0,0 +1,1028 @@ +/* + * This is a module which is used for queueing packets and communicating with + * userspace via nfnetlink. 
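+ *
+ * (Illustrative context, assuming the iptables xt_NFQUEUE target, which
+ * lives outside this file: a rule such as
+ *
+ *   iptables -A FORWARD -j NFQUEUE --queue-num 0
+ *
+ * steers packets into the instance for queue number 0, and a userspace
+ * program, e.g. one built on libnetfilter_queue, must then return a
+ * verdict such as NF_ACCEPT or NF_DROP for every queued packet id.)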
+ * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * (C) 2007 by Patrick McHardy <kaber@trash.net> + * + * Based on the old ipv4-only ip_queue.c: + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_queue.h> +#include <linux/list.h> +#include <net/sock.h> +#include <net/netfilter/nf_queue.h> + +#include <linux/atomic.h> + +#ifdef CONFIG_BRIDGE_NETFILTER +#include "../bridge/br_private.h" +#endif + +#define NFQNL_QMAX_DEFAULT 1024 + +struct nfqnl_instance { + struct hlist_node hlist; /* global list of queues */ + struct rcu_head rcu; + + int peer_pid; + unsigned int queue_maxlen; + unsigned int copy_range; + unsigned int queue_dropped; + unsigned int queue_user_dropped; + + + u_int16_t queue_num; /* number of this queue */ + u_int8_t copy_mode; +/* + * Following fields are dirtied for each queued packet, + * keep them in same cache line if possible. + */ + spinlock_t lock; + unsigned int queue_total; + unsigned int id_sequence; /* 'sequence' of pkt ids */ + struct list_head queue_list; /* packets in queue */ +}; + +typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); + +static DEFINE_SPINLOCK(instances_lock); + +#define INSTANCE_BUCKETS 16 +static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly; + +static inline u_int8_t instance_hashfn(u_int16_t queue_num) +{ + return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS; +} + +static struct nfqnl_instance * +instance_lookup(u_int16_t queue_num) +{ + struct hlist_head *head; + struct hlist_node *pos; + struct nfqnl_instance *inst; + + head = &instance_table[instance_hashfn(queue_num)]; + hlist_for_each_entry_rcu(inst, pos, head, hlist) { + if (inst->queue_num == queue_num) + return inst; + } + return NULL; +} + +static struct nfqnl_instance * +instance_create(u_int16_t queue_num, int pid) +{ + struct nfqnl_instance *inst; + unsigned int h; + int err; + + spin_lock(&instances_lock); + if (instance_lookup(queue_num)) { + err = -EEXIST; + goto out_unlock; + } + + inst = kzalloc(sizeof(*inst), GFP_ATOMIC); + if (!inst) { + err = -ENOMEM; + goto out_unlock; + } + + inst->queue_num = queue_num; + inst->peer_pid = pid; + inst->queue_maxlen = NFQNL_QMAX_DEFAULT; + inst->copy_range = 0xfffff; + inst->copy_mode = NFQNL_COPY_NONE; + spin_lock_init(&inst->lock); + INIT_LIST_HEAD(&inst->queue_list); + + if (!try_module_get(THIS_MODULE)) { + err = -EAGAIN; + goto out_free; + } + + h = instance_hashfn(queue_num); + hlist_add_head_rcu(&inst->hlist, &instance_table[h]); + + spin_unlock(&instances_lock); + + return inst; + +out_free: + kfree(inst); +out_unlock: + spin_unlock(&instances_lock); + return ERR_PTR(err); +} + +static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, + unsigned long data); + +static void +instance_destroy_rcu(struct rcu_head *head) +{ + struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, + rcu); + + 
nfqnl_flush(inst, NULL, 0); + kfree(inst); + module_put(THIS_MODULE); +} + +static void +__instance_destroy(struct nfqnl_instance *inst) +{ + hlist_del_rcu(&inst->hlist); + call_rcu(&inst->rcu, instance_destroy_rcu); +} + +static void +instance_destroy(struct nfqnl_instance *inst) +{ + spin_lock(&instances_lock); + __instance_destroy(inst); + spin_unlock(&instances_lock); +} + +static inline void +__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; +} + +static void +__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) +{ + list_del(&entry->list); + queue->queue_total--; +} + +static struct nf_queue_entry * +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue->lock); + + list_for_each_entry(i, &queue->queue_list, list) { + if (i->id == id) { + entry = i; + break; + } + } + + if (entry) + __dequeue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + + return entry; +} + +static void +nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + spin_lock_bh(&queue->lock); + list_for_each_entry_safe(entry, next, &queue->queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue->queue_total--; + nf_reinject(entry, NF_DROP); + } + } + spin_unlock_bh(&queue->lock); +} + +static struct sk_buff * +nfqnl_build_packet_message(struct nfqnl_instance *queue, + struct nf_queue_entry *entry, + __be32 **packet_id_ptr) +{ + sk_buff_data_t old_tail; + size_t size; + size_t data_len = 0; + struct sk_buff *skb; + struct nlattr *nla; + struct nfqnl_msg_packet_hdr *pmsg; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct sk_buff *entskb = entry->skb; + struct net_device *indev; + struct net_device *outdev; + + size = NLMSG_SPACE(sizeof(struct nfgenmsg)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#ifdef CONFIG_BRIDGE_NETFILTER + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + + nla_total_size(sizeof(u_int32_t)) /* ifindex */ +#endif + + nla_total_size(sizeof(u_int32_t)) /* mark */ + + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + + outdev = entry->outdev; + + switch ((enum nfqnl_config_mode)ACCESS_ONCE(queue->copy_mode)) { + case NFQNL_COPY_META: + case NFQNL_COPY_NONE: + break; + + case NFQNL_COPY_PACKET: + if (entskb->ip_summed == CHECKSUM_PARTIAL && + skb_checksum_help(entskb)) + return NULL; + + data_len = ACCESS_ONCE(queue->copy_range); + if (data_len == 0 || data_len > entskb->len) + data_len = entskb->len; + + size += nla_total_size(data_len); + break; + } + + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, + NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + nfmsg->nfgen_family = entry->pf; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(queue->queue_num); + + nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); + pmsg = nla_data(nla); + pmsg->hw_protocol = entskb->protocol; + pmsg->hook = entry->hook; + *packet_id_ptr = &pmsg->packet_id; + + indev = entry->indev; + if (indev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, 
htonl(indev->ifindex)); +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: indev is physical input device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(indev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(br_port_get_rcu(indev)->br->dev->ifindex)); + } else { + /* Case 2: indev is bridge group, we need to look for + * physical device (when called from ipv4) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_INDEV, + htonl(indev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physindev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSINDEV, + htonl(entskb->nf_bridge->physindev->ifindex)); + } +#endif + } + + if (outdev) { +#ifndef CONFIG_BRIDGE_NETFILTER + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)); +#else + if (entry->pf == PF_BRIDGE) { + /* Case 1: outdev is physical output device, we need to + * look for bridge group (when called from + * netfilter_bridge) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(outdev->ifindex)); + /* this is the bridge group "brX" */ + /* rcu_read_lock()ed by __nf_queue */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(br_port_get_rcu(outdev)->br->dev->ifindex)); + } else { + /* Case 2: outdev is bridge group, we need to look for + * physical output device (when called from ipv4) */ + NLA_PUT_BE32(skb, NFQA_IFINDEX_OUTDEV, + htonl(outdev->ifindex)); + if (entskb->nf_bridge && entskb->nf_bridge->physoutdev) + NLA_PUT_BE32(skb, NFQA_IFINDEX_PHYSOUTDEV, + htonl(entskb->nf_bridge->physoutdev->ifindex)); + } +#endif + } + + if (entskb->mark) + NLA_PUT_BE32(skb, NFQA_MARK, htonl(entskb->mark)); + + if (indev && entskb->dev && + entskb->mac_header != entskb->network_header) { + struct nfqnl_msg_packet_hw phw; + int len = dev_parse_header(entskb, phw.hw_addr); + if (len) { + phw.hw_addrlen = htons(len); + NLA_PUT(skb, NFQA_HWADDR, sizeof(phw), &phw); + } + } + + if (entskb->tstamp.tv64) { + struct nfqnl_msg_packet_timestamp ts; + struct timeval tv = ktime_to_timeval(entskb->tstamp); + ts.sec = cpu_to_be64(tv.tv_sec); + ts.usec = cpu_to_be64(tv.tv_usec); + + NLA_PUT(skb, NFQA_TIMESTAMP, sizeof(ts), &ts); + } + + if (data_len) { + struct nlattr *nla; + int sz = nla_attr_size(data_len); + + if (skb_tailroom(skb) < nla_total_size(data_len)) { + printk(KERN_WARNING "nf_queue: no tailroom!\n"); + goto nlmsg_failure; + } + + nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len)); + nla->nla_type = NFQA_PAYLOAD; + nla->nla_len = sz; + + if (skb_copy_bits(entskb, 0, nla_data(nla), data_len)) + BUG(); + } + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: +nla_put_failure: + if (skb) + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_ERR "nf_queue: error creating packet message\n"); + return NULL; +} + +static int +nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + struct sk_buff *nskb; + struct nfqnl_instance *queue; + int err = -ENOBUFS; + __be32 *packet_id_ptr; + + /* rcu_read_lock()ed by nf_hook_slow() */ + queue = instance_lookup(queuenum); + if (!queue) { + err = -ESRCH; + goto err_out; + } + + if (queue->copy_mode == NFQNL_COPY_NONE) { + err = -EINVAL; + goto err_out; + } + + nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr); + if (nskb == NULL) { + err = -ENOMEM; + goto err_out; + } + spin_lock_bh(&queue->lock); + + if (!queue->peer_pid) { + err = -EINVAL; + goto err_out_free_nskb; + } + if 
(queue->queue_total >= queue->queue_maxlen) { + queue->queue_dropped++; + if (net_ratelimit()) + printk(KERN_WARNING "nf_queue: full at %d entries, " + "dropping packets(s).\n", + queue->queue_total); + goto err_out_free_nskb; + } + entry->id = ++queue->id_sequence; + *packet_id_ptr = htonl(entry->id); + + /* nfnetlink_unicast will either free the nskb or add it to a socket */ + err = nfnetlink_unicast(nskb, &init_net, queue->peer_pid, MSG_DONTWAIT); + if (err < 0) { + queue->queue_user_dropped++; + goto err_out_unlock; + } + + __enqueue_entry(queue, entry); + + spin_unlock_bh(&queue->lock); + return 0; + +err_out_free_nskb: + kfree_skb(nskb); +err_out_unlock: + spin_unlock_bh(&queue->lock); +err_out: + return err; +} + +static int +nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e) +{ + struct sk_buff *nskb; + int diff; + + diff = data_len - e->skb->len; + if (diff < 0) { + if (pskb_trim(e->skb, data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "nf_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, data, data_len); + e->skb->ip_summed = CHECKSUM_NONE; + return 0; +} + +static int +nfqnl_set_mode(struct nfqnl_instance *queue, + unsigned char mode, unsigned int range) +{ + int status = 0; + + spin_lock_bh(&queue->lock); + switch (mode) { + case NFQNL_COPY_NONE: + case NFQNL_COPY_META: + queue->copy_mode = mode; + queue->copy_range = 0; + break; + + case NFQNL_COPY_PACKET: + queue->copy_mode = mode; + /* we're using struct nlattr which has 16bit nla_len */ + if (range > 0xffff) + queue->copy_range = 0xffff; + else + queue->copy_range = range; + break; + + default: + status = -EINVAL; + + } + spin_unlock_bh(&queue->lock); + + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +/* drop all packets with either indev or outdev == ifindex from all queue + * instances */ +static void +nfqnl_dev_drop(int ifindex) +{ + int i; + + rcu_read_lock(); + + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_rcu(inst, tmp, head, hlist) + nfqnl_flush(inst, dev_cmp, ifindex); + } + + rcu_read_unlock(); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static int +nfqnl_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_dev_notifier = { + .notifier_call = 
nfqnl_rcv_dev_event, +}; + +static int +nfqnl_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { + int i; + + /* destroy all instances for this pid */ + spin_lock(&instances_lock); + for (i = 0; i < INSTANCE_BUCKETS; i++) { + struct hlist_node *tmp, *t2; + struct nfqnl_instance *inst; + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { + if ((n->net == &init_net) && + (n->pid == inst->peer_pid)) + __instance_destroy(inst); + } + } + spin_unlock(&instances_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block nfqnl_rtnl_notifier = { + .notifier_call = nfqnl_rcv_nl_event, +}; + +static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, + [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, +}; + +static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { + [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, + [NFQA_MARK] = { .type = NLA_U32 }, +}; + +static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlpid) +{ + struct nfqnl_instance *queue; + + queue = instance_lookup(queue_num); + if (!queue) + return ERR_PTR(-ENODEV); + + if (queue->peer_pid != nlpid) + return ERR_PTR(-EPERM); + + return queue; +} + +static struct nfqnl_msg_verdict_hdr* +verdicthdr_get(const struct nlattr * const nfqa[]) +{ + struct nfqnl_msg_verdict_hdr *vhdr; + unsigned int verdict; + + if (!nfqa[NFQA_VERDICT_HDR]) + return NULL; + + vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); + verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; + if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) + return NULL; + return vhdr; +} + +static int nfq_id_after(unsigned int id, unsigned int max) +{ + return (int)(id - max) > 0; +} + +static int +nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + struct nf_queue_entry *entry, *tmp; + unsigned int verdict, maxid; + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + LIST_HEAD(batch_list); + u16 queue_num = ntohs(nfmsg->res_id); + + queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + maxid = ntohl(vhdr->id); + + spin_lock_bh(&queue->lock); + + list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { + if (nfq_id_after(entry->id, maxid)) + break; + __dequeue_entry(queue, entry); + list_add_tail(&entry->list, &batch_list); + } + + spin_unlock_bh(&queue->lock); + + if (list_empty(&batch_list)) + return -ENOENT; + + list_for_each_entry_safe(entry, tmp, &batch_list, list) { + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + nf_reinject(entry, verdict); + } + return 0; +} + +static int +nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + + struct nfqnl_msg_verdict_hdr *vhdr; + struct nfqnl_instance *queue; + unsigned int verdict; + struct nf_queue_entry *entry; + + queue = instance_lookup(queue_num); + if (!queue) + + queue = 
verdict_instance_lookup(queue_num, NETLINK_CB(skb).pid); + if (IS_ERR(queue)) + return PTR_ERR(queue); + + vhdr = verdicthdr_get(nfqa); + if (!vhdr) + return -EINVAL; + + verdict = ntohl(vhdr->verdict); + + entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + if (entry == NULL) + return -ENOENT; + + if (nfqa[NFQA_PAYLOAD]) { + if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), + nla_len(nfqa[NFQA_PAYLOAD]), entry) < 0) + verdict = NF_DROP; + } + + if (nfqa[NFQA_MARK]) + entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); + + nf_reinject(entry, verdict); + return 0; +} + +static int +nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + return -ENOTSUPP; +} + +static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { + [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, + [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, +}; + +static const struct nf_queue_handler nfqh = { + .name = "nf_queue", + .outfn = &nfqnl_enqueue_packet, +}; + +static int +nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nfqa[]) +{ + struct nfgenmsg *nfmsg = NLMSG_DATA(nlh); + u_int16_t queue_num = ntohs(nfmsg->res_id); + struct nfqnl_instance *queue; + struct nfqnl_msg_config_cmd *cmd = NULL; + int ret = 0; + + if (nfqa[NFQA_CFG_CMD]) { + cmd = nla_data(nfqa[NFQA_CFG_CMD]); + + /* Commands without queue context - might sleep */ + switch (cmd->command) { + case NFQNL_CFG_CMD_PF_BIND: + return nf_register_queue_handler(ntohs(cmd->pf), + &nfqh); + case NFQNL_CFG_CMD_PF_UNBIND: + return nf_unregister_queue_handler(ntohs(cmd->pf), + &nfqh); + } + } + + rcu_read_lock(); + queue = instance_lookup(queue_num); + if (queue && queue->peer_pid != NETLINK_CB(skb).pid) { + ret = -EPERM; + goto err_out_unlock; + } + + if (cmd != NULL) { + switch (cmd->command) { + case NFQNL_CFG_CMD_BIND: + if (queue) { + ret = -EBUSY; + goto err_out_unlock; + } + queue = instance_create(queue_num, NETLINK_CB(skb).pid); + if (IS_ERR(queue)) { + ret = PTR_ERR(queue); + goto err_out_unlock; + } + break; + case NFQNL_CFG_CMD_UNBIND: + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + instance_destroy(queue); + break; + case NFQNL_CFG_CMD_PF_BIND: + case NFQNL_CFG_CMD_PF_UNBIND: + break; + default: + ret = -ENOTSUPP; + break; + } + } + + if (nfqa[NFQA_CFG_PARAMS]) { + struct nfqnl_msg_config_params *params; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + params = nla_data(nfqa[NFQA_CFG_PARAMS]); + nfqnl_set_mode(queue, params->copy_mode, + ntohl(params->copy_range)); + } + + if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { + __be32 *queue_maxlen; + + if (!queue) { + ret = -ENODEV; + goto err_out_unlock; + } + queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); + spin_lock_bh(&queue->lock); + queue->queue_maxlen = ntohl(*queue_maxlen); + spin_unlock_bh(&queue->lock); + } + +err_out_unlock: + rcu_read_unlock(); + return ret; +} + +static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { + [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, + .attr_count = NFQA_MAX, }, + [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy }, + [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy }, + [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy }, +}; + +static 
const struct nfnetlink_subsystem nfqnl_subsys = { + .name = "nf_queue", + .subsys_id = NFNL_SUBSYS_QUEUE, + .cb_count = NFQNL_MSG_MAX, + .cb = nfqnl_cb, +}; + +#ifdef CONFIG_PROC_FS +struct iter_state { + unsigned int bucket; +}; + +static struct hlist_node *get_first(struct seq_file *seq) +{ + struct iter_state *st = seq->private; + + if (!st) + return NULL; + + for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { + if (!hlist_empty(&instance_table[st->bucket])) + return instance_table[st->bucket].first; + } + return NULL; +} + +static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) +{ + struct iter_state *st = seq->private; + + h = h->next; + while (!h) { + if (++st->bucket >= INSTANCE_BUCKETS) + return NULL; + + h = instance_table[st->bucket].first; + } + return h; +} + +static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) +{ + struct hlist_node *head; + head = get_first(seq); + + if (head) + while (pos && (head = get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *seq_start(struct seq_file *seq, loff_t *pos) + __acquires(instances_lock) +{ + spin_lock(&instances_lock); + return get_idx(seq, *pos); +} + +static void *seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return get_next(s, v); +} + +static void seq_stop(struct seq_file *s, void *v) + __releases(instances_lock) +{ + spin_unlock(&instances_lock); +} + +static int seq_show(struct seq_file *s, void *v) +{ + const struct nfqnl_instance *inst = v; + + return seq_printf(s, "%5d %6d %5d %1d %5d %5d %5d %8d %2d\n", + inst->queue_num, + inst->peer_pid, inst->queue_total, + inst->copy_mode, inst->copy_range, + inst->queue_dropped, inst->queue_user_dropped, + inst->id_sequence, 1); +} + +static const struct seq_operations nfqnl_seq_ops = { + .start = seq_start, + .next = seq_next, + .stop = seq_stop, + .show = seq_show, +}; + +static int nfqnl_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &nfqnl_seq_ops, + sizeof(struct iter_state)); +} + +static const struct file_operations nfqnl_file_ops = { + .owner = THIS_MODULE, + .open = nfqnl_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ + +static int __init nfnetlink_queue_init(void) +{ + int i, status = -ENOMEM; + + for (i = 0; i < INSTANCE_BUCKETS; i++) + INIT_HLIST_HEAD(&instance_table[i]); + + netlink_register_notifier(&nfqnl_rtnl_notifier); + status = nfnetlink_subsys_register(&nfqnl_subsys); + if (status < 0) { + printk(KERN_ERR "nf_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + if (!proc_create("nfnetlink_queue", 0440, + proc_net_netfilter, &nfqnl_file_ops)) + goto cleanup_subsys; +#endif + + register_netdevice_notifier(&nfqnl_dev_notifier); + return status; + +#ifdef CONFIG_PROC_FS +cleanup_subsys: + nfnetlink_subsys_unregister(&nfqnl_subsys); +#endif +cleanup_netlink_notifier: + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + return status; +} + +static void __exit nfnetlink_queue_fini(void) +{ + nf_unregister_queue_handlers(&nfqh); + unregister_netdevice_notifier(&nfqnl_dev_notifier); +#ifdef CONFIG_PROC_FS + remove_proc_entry("nfnetlink_queue", proc_net_netfilter); +#endif + nfnetlink_subsys_unregister(&nfqnl_subsys); + netlink_unregister_notifier(&nfqnl_rtnl_notifier); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +MODULE_DESCRIPTION("netfilter packet queue handler"); +MODULE_AUTHOR("Harald Welte 
<laforge@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); + +module_init(nfnetlink_queue_init); +module_exit(nfnetlink_queue_fini); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c new file mode 100644 index 00000000..8d987c35 --- /dev/null +++ b/net/netfilter/x_tables.c @@ -0,0 +1,1395 @@ +/* + * x_tables core - Backend for {ip,ip6,arp}_tables + * + * Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org> + * + * Based on existing ip_tables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/audit.h> +#include <net/net_namespace.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_arp.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_arp/arp_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); + +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +struct compat_delta { + unsigned int offset; /* offset in kernel */ + int delta; /* delta in 32bit user land */ +}; + +struct xt_af { + struct mutex mutex; + struct list_head match; + struct list_head target; +#ifdef CONFIG_COMPAT + struct mutex compat_mutex; + struct compat_delta *compat_tab; + unsigned int number; /* number of slots in compat_tab[] */ + unsigned int cur; /* number of used slots in compat_tab[] */ +#endif +}; + +static struct xt_af *xt; + +static const char *const xt_prefix[NFPROTO_NUMPROTO] = { + [NFPROTO_UNSPEC] = "x", + [NFPROTO_IPV4] = "ip", + [NFPROTO_ARP] = "arp", + [NFPROTO_BRIDGE] = "eb", + [NFPROTO_IPV6] = "ip6", +}; + +/* Allow this many total (re)entries. */ +static const unsigned int xt_jumpstack_multiplier = 2; + +/* Registration hooks for targets. 
*/ +int +xt_register_target(struct xt_target *target) +{ + u_int8_t af = target->family; + int ret; + + ret = mutex_lock_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + list_add(&target->list, &xt[af].target); + mutex_unlock(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL(xt_register_target); + +void +xt_unregister_target(struct xt_target *target) +{ + u_int8_t af = target->family; + + mutex_lock(&xt[af].mutex); + list_del(&target->list); + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_target); + +int +xt_register_targets(struct xt_target *target, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = xt_register_target(&target[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + xt_unregister_targets(target, i); + return err; +} +EXPORT_SYMBOL(xt_register_targets); + +void +xt_unregister_targets(struct xt_target *target, unsigned int n) +{ + while (n-- > 0) + xt_unregister_target(&target[n]); +} +EXPORT_SYMBOL(xt_unregister_targets); + +int +xt_register_match(struct xt_match *match) +{ + u_int8_t af = match->family; + int ret; + + ret = mutex_lock_interruptible(&xt[af].mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &xt[af].match); + mutex_unlock(&xt[af].mutex); + + return ret; +} +EXPORT_SYMBOL(xt_register_match); + +void +xt_unregister_match(struct xt_match *match) +{ + u_int8_t af = match->family; + + mutex_lock(&xt[af].mutex); + list_del(&match->list); + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_match); + +int +xt_register_matches(struct xt_match *match, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = xt_register_match(&match[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + xt_unregister_matches(match, i); + return err; +} +EXPORT_SYMBOL(xt_register_matches); + +void +xt_unregister_matches(struct xt_match *match, unsigned int n) +{ + while (n-- > 0) + xt_unregister_match(&match[n]); +} +EXPORT_SYMBOL(xt_unregister_matches); + + +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use. + */ + +/* Find match, grabs ref. Returns ERR_PTR() on error. */ +struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) +{ + struct xt_match *m; + int err = -ENOENT; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + mutex_unlock(&xt[af].mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_match(NFPROTO_UNSPEC, name, revision); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_match); + +struct xt_match * +xt_request_find_match(uint8_t nfproto, const char *name, uint8_t revision) +{ + struct xt_match *match; + + match = xt_find_match(nfproto, name, revision); + if (IS_ERR(match)) { + request_module("%st_%s", xt_prefix[nfproto], name); + match = xt_find_match(nfproto, name, revision); + } + + return match; +} +EXPORT_SYMBOL_GPL(xt_request_find_match); + +/* Find target, grabs ref. Returns ERR_PTR() on error. 
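
xt_request_find_match() and xt_request_find_target() fall back to request_module() when an extension is not yet registered; the module name is simply the per-family prefix from xt_prefix[] plus "t_" plus the extension name. The name construction in standalone userspace form, using the stock "limit" match as the example:

    #include <stdio.h>

    int main(void)
    {
            char buf[64];

            /* "%st_%s": family prefix ("ip" for NFPROTO_IPV4, per
             * xt_prefix[] above) + "t_" + extension name. */
            snprintf(buf, sizeof(buf), "%st_%s", "ip", "limit");
            printf("%s\n", buf);  /* "ipt_limit": what request_module() loads */
            return 0;
    }
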
*/ +struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) +{ + struct xt_target *t; + int err = -ENOENT; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_target(NFPROTO_UNSPEC, name, revision); + + return ERR_PTR(err); +} +EXPORT_SYMBOL(xt_find_target); + +struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) +{ + struct xt_target *target; + + target = xt_find_target(af, name, revision); + if (IS_ERR(target)) { + request_module("%st_%s", xt_prefix[af], name); + target = xt_find_target(af, name, revision); + } + + return target; +} +EXPORT_SYMBOL_GPL(xt_request_find_target); + +static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) +{ + const struct xt_match *m; + int have_rev = 0; + + list_for_each_entry(m, &xt[af].match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + + if (af != NFPROTO_UNSPEC && !have_rev) + return match_revfn(NFPROTO_UNSPEC, name, revision, bestp); + + return have_rev; +} + +static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) +{ + const struct xt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &xt[af].target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + + if (af != NFPROTO_UNSPEC && !have_rev) + return target_revfn(NFPROTO_UNSPEC, name, revision, bestp); + + return have_rev; +} + +/* Returns true or false (if no such extension at all) */ +int xt_find_revision(u8 af, const char *name, u8 revision, int target, + int *err) +{ + int have_rev, best = -1; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) { + *err = -EINTR; + return 1; + } + if (target == 1) + have_rev = target_revfn(af, name, revision, &best); + else + have_rev = match_revfn(af, name, revision, &best); + mutex_unlock(&xt[af].mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; +} +EXPORT_SYMBOL_GPL(xt_find_revision); + +static char *textify_hooks(char *buf, size_t size, unsigned int mask) +{ + static const char *const names[] = { + "PREROUTING", "INPUT", "FORWARD", + "OUTPUT", "POSTROUTING", "BROUTING", + }; + unsigned int i; + char *p = buf; + bool np = false; + int res; + + *p = '\0'; + for (i = 0; i < ARRAY_SIZE(names); ++i) { + if (!(mask & (1 << i))) + continue; + res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]); + if (res > 0) { + size -= res; + p += res; + } + np = true; + } + + return buf; +} + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) +{ + int ret; + + if (XT_ALIGN(par->match->matchsize) != size && + par->match->matchsize != -1) { + /* + * ebt_among is exempt from centralized matchsize checking + * because it uses a dynamic-size data set. 
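
textify_hooks() above renders a hook bitmask as the familiar slash-separated chain names for the error messages in xt_check_match()/xt_check_target(). The same logic compiles unchanged in userspace, which makes the behavior easy to verify:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Userspace copy of the textify_hooks() logic above. */
    static char *textify_hooks(char *buf, size_t size, unsigned int mask)
    {
            static const char *const names[] = {
                    "PREROUTING", "INPUT", "FORWARD",
                    "OUTPUT", "POSTROUTING", "BROUTING",
            };
            unsigned int i;
            char *p = buf;
            bool np = false;
            int res;

            *p = '\0';
            for (i = 0; i < sizeof(names) / sizeof(names[0]); ++i) {
                    if (!(mask & (1 << i)))
                            continue;
                    res = snprintf(p, size, "%s%s", np ? "/" : "", names[i]);
                    if (res > 0) {
                            size -= res;
                            p += res;
                    }
                    np = true;
            }
            return buf;
    }

    int main(void)
    {
            char buf[64];

            /* bits 0 and 1 set -> "PREROUTING/INPUT" */
            printf("%s\n", textify_hooks(buf, sizeof(buf), (1 << 0) | (1 << 1)));
            return 0;
    }
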
+ */ + pr_err("%s_tables: %s.%u match: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->match->name, + par->match->revision, + XT_ALIGN(par->match->matchsize), size); + return -EINVAL; + } + if (par->match->table != NULL && + strcmp(par->match->table, par->table) != 0) { + pr_err("%s_tables: %s match: only valid in %s table, not %s\n", + xt_prefix[par->family], par->match->name, + par->match->table, par->table); + return -EINVAL; + } + if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s match: used from hooks %s, but only " + "valid from %s\n", + xt_prefix[par->family], par->match->name, + textify_hooks(used, sizeof(used), par->hook_mask), + textify_hooks(allow, sizeof(allow), par->match->hooks)); + return -EINVAL; + } + if (par->match->proto && (par->match->proto != proto || inv_proto)) { + pr_err("%s_tables: %s match: only valid for protocol %u\n", + xt_prefix[par->family], par->match->name, + par->match->proto); + return -EINVAL; + } + if (par->match->checkentry != NULL) { + ret = par->match->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. */ + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_match); + +#ifdef CONFIG_COMPAT +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) +{ + struct xt_af *xp = &xt[af]; + + if (!xp->compat_tab) { + if (!xp->number) + return -EINVAL; + xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); + if (!xp->compat_tab) + return -ENOMEM; + xp->cur = 0; + } + + if (xp->cur >= xp->number) + return -EINVAL; + + if (xp->cur) + delta += xp->compat_tab[xp->cur - 1].delta; + xp->compat_tab[xp->cur].offset = offset; + xp->compat_tab[xp->cur].delta = delta; + xp->cur++; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_add_offset); + +void xt_compat_flush_offsets(u_int8_t af) +{ + if (xt[af].compat_tab) { + vfree(xt[af].compat_tab); + xt[af].compat_tab = NULL; + xt[af].number = 0; + xt[af].cur = 0; + } +} +EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); + +int xt_compat_calc_jump(u_int8_t af, unsigned int offset) +{ + struct compat_delta *tmp = xt[af].compat_tab; + int mid, left = 0, right = xt[af].cur - 1; + + while (left <= right) { + mid = (left + right) >> 1; + if (offset > tmp[mid].offset) + left = mid + 1; + else if (offset < tmp[mid].offset) + right = mid - 1; + else + return mid ? tmp[mid - 1].delta : 0; + } + return left ? tmp[left - 1].delta : 0; +} +EXPORT_SYMBOL_GPL(xt_compat_calc_jump); + +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ + xt[af].number = number; + xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); + +int xt_compat_match_offset(const struct xt_match *match) +{ + u_int16_t csize = match->compatsize ? 
: match->matchsize; + return XT_ALIGN(match->matchsize) - COMPAT_XT_ALIGN(csize); +} +EXPORT_SYMBOL_GPL(xt_compat_match_offset); + +int xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, + unsigned int *size) +{ + const struct xt_match *match = m->u.kernel.match; + struct compat_xt_entry_match *cm = (struct compat_xt_entry_match *)m; + int pad, off = xt_compat_match_offset(match); + u_int16_t msize = cm->u.user.match_size; + + m = *dstptr; + memcpy(m, cm, sizeof(*cm)); + if (match->compat_from_user) + match->compat_from_user(m->data, cm->data); + else + memcpy(m->data, cm->data, msize - sizeof(*cm)); + pad = XT_ALIGN(match->matchsize) - match->matchsize; + if (pad > 0) + memset(m->data + match->matchsize, 0, pad); + + msize += off; + m->u.user.match_size = msize; + + *size += off; + *dstptr += msize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_match_from_user); + +int xt_compat_match_to_user(const struct xt_entry_match *m, + void __user **dstptr, unsigned int *size) +{ + const struct xt_match *match = m->u.kernel.match; + struct compat_xt_entry_match __user *cm = *dstptr; + int off = xt_compat_match_offset(match); + u_int16_t msize = m->u.user.match_size - off; + + if (copy_to_user(cm, m, sizeof(*cm)) || + put_user(msize, &cm->u.user.match_size) || + copy_to_user(cm->u.user.name, m->u.kernel.match->name, + strlen(m->u.kernel.match->name) + 1)) + return -EFAULT; + + if (match->compat_to_user) { + if (match->compat_to_user((void __user *)cm->data, m->data)) + return -EFAULT; + } else { + if (copy_to_user(cm->data, m->data, msize - sizeof(*cm))) + return -EFAULT; + } + + *size -= off; + *dstptr += msize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_match_to_user); +#endif /* CONFIG_COMPAT */ + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u_int8_t proto, bool inv_proto) +{ + int ret; + + if (XT_ALIGN(par->target->targetsize) != size) { + pr_err("%s_tables: %s.%u target: invalid size " + "%u (kernel) != (user) %u\n", + xt_prefix[par->family], par->target->name, + par->target->revision, + XT_ALIGN(par->target->targetsize), size); + return -EINVAL; + } + if (par->target->table != NULL && + strcmp(par->target->table, par->table) != 0) { + pr_err("%s_tables: %s target: only valid in %s table, not %s\n", + xt_prefix[par->family], par->target->name, + par->target->table, par->table); + return -EINVAL; + } + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { + char used[64], allow[64]; + + pr_err("%s_tables: %s target: used from hooks %s, but only " + "usable from %s\n", + xt_prefix[par->family], par->target->name, + textify_hooks(used, sizeof(used), par->hook_mask), + textify_hooks(allow, sizeof(allow), par->target->hooks)); + return -EINVAL; + } + if (par->target->proto && (par->target->proto != proto || inv_proto)) { + pr_err("%s_tables: %s target: only valid for protocol %u\n", + xt_prefix[par->family], par->target->name, + par->target->proto); + return -EINVAL; + } + if (par->target->checkentry != NULL) { + ret = par->target->checkentry(par); + if (ret < 0) + return ret; + else if (ret > 0) + /* Flag up potential errors. */ + return -EIO; + } + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_target); + +#ifdef CONFIG_COMPAT +int xt_compat_target_offset(const struct xt_target *target) +{ + u_int16_t csize = target->compatsize ? 
: target->targetsize; + return XT_ALIGN(target->targetsize) - COMPAT_XT_ALIGN(csize); +} +EXPORT_SYMBOL_GPL(xt_compat_target_offset); + +void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, + unsigned int *size) +{ + const struct xt_target *target = t->u.kernel.target; + struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t; + int pad, off = xt_compat_target_offset(target); + u_int16_t tsize = ct->u.user.target_size; + + t = *dstptr; + memcpy(t, ct, sizeof(*ct)); + if (target->compat_from_user) + target->compat_from_user(t->data, ct->data); + else + memcpy(t->data, ct->data, tsize - sizeof(*ct)); + pad = XT_ALIGN(target->targetsize) - target->targetsize; + if (pad > 0) + memset(t->data + target->targetsize, 0, pad); + + tsize += off; + t->u.user.target_size = tsize; + + *size += off; + *dstptr += tsize; +} +EXPORT_SYMBOL_GPL(xt_compat_target_from_user); + +int xt_compat_target_to_user(const struct xt_entry_target *t, + void __user **dstptr, unsigned int *size) +{ + const struct xt_target *target = t->u.kernel.target; + struct compat_xt_entry_target __user *ct = *dstptr; + int off = xt_compat_target_offset(target); + u_int16_t tsize = t->u.user.target_size - off; + + if (copy_to_user(ct, t, sizeof(*ct)) || + put_user(tsize, &ct->u.user.target_size) || + copy_to_user(ct->u.user.name, t->u.kernel.target->name, + strlen(t->u.kernel.target->name) + 1)) + return -EFAULT; + + if (target->compat_to_user) { + if (target->compat_to_user((void __user *)ct->data, t->data)) + return -EFAULT; + } else { + if (copy_to_user(ct->data, t->data, tsize - sizeof(*ct))) + return -EFAULT; + } + + *size -= off; + *dstptr += tsize; + return 0; +} +EXPORT_SYMBOL_GPL(xt_compat_target_to_user); +#endif + +struct xt_table_info *xt_alloc_table_info(unsigned int size) +{ + struct xt_table_info *newinfo; + int cpu; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) + return NULL; + + newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_possible_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + xt_free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} +EXPORT_SYMBOL(xt_alloc_table_info); + +void xt_free_table_info(struct xt_table_info *info) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + + if (info->jumpstack != NULL) { + if (sizeof(void *) * info->stacksize > PAGE_SIZE) { + for_each_possible_cpu(cpu) + vfree(info->jumpstack[cpu]); + } else { + for_each_possible_cpu(cpu) + kfree(info->jumpstack[cpu]); + } + } + + if (sizeof(void **) * nr_cpu_ids > PAGE_SIZE) + vfree(info->jumpstack); + else + kfree(info->jumpstack); + + free_percpu(info->stackptr); + + kfree(info); +} +EXPORT_SYMBOL(xt_free_table_info); + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. 
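
The compat conversion helpers above all revolve around XT_ALIGN()-style rounding: each match or target payload is padded up to the kernel's native alignment and the pad is zero-filled. The arithmetic in isolation, assuming an 8-byte alignment and a 14-byte payload purely for illustration:

    #include <stdio.h>

    /* Round x up to the power-of-two boundary a (the __ALIGN_KERNEL idiom). */
    #define ALIGN_TO(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            unsigned long matchsize = 14;   /* illustrative payload size */
            unsigned long aligned = ALIGN_TO(matchsize, 8);

            /* xt_compat_match_from_user() zero-fills this pad after copying */
            printf("aligned=%lu pad=%lu\n", aligned, aligned - matchsize);
            return 0;       /* prints "aligned=16 pad=2" */
    }
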
*/ +struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, + const char *name) +{ + struct xt_table *t; + + if (mutex_lock_interruptible(&xt[af].mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &net->xt.tables[af], list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + mutex_unlock(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_find_table_lock); + +void xt_table_unlock(struct xt_table *table) +{ + mutex_unlock(&xt[table->af].mutex); +} +EXPORT_SYMBOL_GPL(xt_table_unlock); + +#ifdef CONFIG_COMPAT +void xt_compat_lock(u_int8_t af) +{ + mutex_lock(&xt[af].compat_mutex); +} +EXPORT_SYMBOL_GPL(xt_compat_lock); + +void xt_compat_unlock(u_int8_t af) +{ + mutex_unlock(&xt[af].compat_mutex); +} +EXPORT_SYMBOL_GPL(xt_compat_unlock); +#endif + +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + +static int xt_jumpstack_alloc(struct xt_table_info *i) +{ + unsigned int size; + int cpu; + + i->stackptr = alloc_percpu(unsigned int); + if (i->stackptr == NULL) + return -ENOMEM; + + size = sizeof(void **) * nr_cpu_ids; + if (size > PAGE_SIZE) + i->jumpstack = vzalloc(size); + else + i->jumpstack = kzalloc(size, GFP_KERNEL); + if (i->jumpstack == NULL) + return -ENOMEM; + + i->stacksize *= xt_jumpstack_multiplier; + size = sizeof(void *) * i->stacksize; + for_each_possible_cpu(cpu) { + if (size > PAGE_SIZE) + i->jumpstack[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + else + i->jumpstack[cpu] = kmalloc_node(size, + GFP_KERNEL, cpu_to_node(cpu)); + if (i->jumpstack[cpu] == NULL) + /* + * Freeing will be done later on by the callers. The + * chain is: xt_replace_table -> __do_replace -> + * do_replace -> xt_free_table_info. + */ + return -ENOMEM; + } + + return 0; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, + unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + int ret; + + ret = xt_jumpstack_alloc(newinfo); + if (ret < 0) { + *error = ret; + return NULL; + } + + /* Do the substitution. */ + local_bh_disable(); + private = table->private; + + /* Check inside lock: is the old number correct? */ + if (num_counters != private->number) { + pr_debug("num_counters != table->private->number (%u/%u)\n", + num_counters, private->number); + local_bh_enable(); + *error = -EAGAIN; + return NULL; + } + + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* + * Even though table entries have now been swapped, other CPU's + * may still be using the old entries. This is okay, because + * resynchronization happens because of the locking done + * during the get_counters() routine. + */ + local_bh_enable(); + +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + table->name, table->af, + private->number); + audit_log_end(ab); + } + } +#endif + + return private; +} +EXPORT_SYMBOL_GPL(xt_replace_table); + +struct xt_table *xt_register_table(struct net *net, + const struct xt_table *input_table, + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo) +{ + int ret; + struct xt_table_info *private; + struct xt_table *t, *table; + + /* Don't add one object to multiple lists. 
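
xt_replace_table() publishes the new table under local_bh_disable() and hands the previous xt_table_info back to the caller, which is expected to harvest the old counters and free the old entries. Roughly the caller sequence used by __do_replace() in net/ipv4/netfilter/ip_tables.c; this is a sketch only, and get_counters() stands in for that file's protocol-specific counter walk:

    /* Sketch of the expected ownership hand-off, modeled on ip_tables'
     * __do_replace(); not a drop-in function. */
    static int replace_sketch(struct xt_table *t, struct xt_table_info *newinfo,
                              unsigned int num_counters,
                              struct xt_counters *counters)
    {
            struct xt_table_info *oldinfo;
            int ret = 0;

            oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
            if (!oldinfo)
                    return ret;  /* newinfo was not installed; caller frees it */

            /* The swap is visible everywhere now, so the old counters are
             * stable and can be copied out. */
            get_counters(oldinfo, counters);

            xt_free_table_info(oldinfo);    /* old entries and jumpstack */
            return 0;
    }
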
*/ + table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); + if (!table) { + ret = -ENOMEM; + goto out; + } + + ret = mutex_lock_interruptible(&xt[table->af].mutex); + if (ret != 0) + goto out_free; + + /* Don't autoload: we'd eat our tail... */ + list_for_each_entry(t, &net->xt.tables[table->af], list) { + if (strcmp(t->name, table->name) == 0) { + ret = -EEXIST; + goto unlock; + } + } + + /* Simplifies replace_table code. */ + table->private = bootstrap; + + if (!xt_replace_table(table, 0, newinfo, &ret)) + goto unlock; + + private = table->private; + pr_debug("table->private->number = %u\n", private->number); + + /* save number of initial entries */ + private->initial_entries = private->number; + + list_add(&table->list, &net->xt.tables[table->af]); + mutex_unlock(&xt[table->af].mutex); + return table; + + unlock: + mutex_unlock(&xt[table->af].mutex); +out_free: + kfree(table); +out: + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(xt_register_table); + +void *xt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + + mutex_lock(&xt[table->af].mutex); + private = table->private; + list_del(&table->list); + mutex_unlock(&xt[table->af].mutex); + kfree(table); + + return private; +} +EXPORT_SYMBOL_GPL(xt_unregister_table); + +#ifdef CONFIG_PROC_FS +struct xt_names_priv { + struct seq_net_private p; + u_int8_t af; +}; +static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; + + mutex_lock(&xt[af].mutex); + return seq_list_start(&net->xt.tables[af], *pos); +} + +static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct xt_names_priv *priv = seq->private; + struct net *net = seq_file_net(seq); + u_int8_t af = priv->af; + + return seq_list_next(v, &net->xt.tables[af], pos); +} + +static void xt_table_seq_stop(struct seq_file *seq, void *v) +{ + struct xt_names_priv *priv = seq->private; + u_int8_t af = priv->af; + + mutex_unlock(&xt[af].mutex); +} + +static int xt_table_seq_show(struct seq_file *seq, void *v) +{ + struct xt_table *table = list_entry(v, struct xt_table, list); + + if (strlen(table->name)) + return seq_printf(seq, "%s\n", table->name); + else + return 0; +} + +static const struct seq_operations xt_table_seq_ops = { + .start = xt_table_seq_start, + .next = xt_table_seq_next, + .stop = xt_table_seq_stop, + .show = xt_table_seq_show, +}; + +static int xt_table_open(struct inode *inode, struct file *file) +{ + int ret; + struct xt_names_priv *priv; + + ret = seq_open_net(inode, file, &xt_table_seq_ops, + sizeof(struct xt_names_priv)); + if (!ret) { + priv = ((struct seq_file *)file->private_data)->private; + priv->af = (unsigned long)PDE(inode)->data; + } + return ret; +} + +static const struct file_operations xt_table_ops = { + .owner = THIS_MODULE, + .open = xt_table_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +/* + * Traverse state for ip{,6}_{tables,matches} for helping crossing + * the multi-AF mutexes. 
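
The seq_file plumbing above is what backs the per-family table listings such as /proc/net/ip_tables_names (the file names are assembled in xt_proto_init() below from xt_prefix[] plus the FORMAT_TABLES suffix). Reading one back is plain line-oriented I/O:

    #include <stdio.h>

    int main(void)
    {
            /* One registered table name per line, e.g. "filter", "mangle". */
            FILE *f = fopen("/proc/net/ip_tables_names", "r");
            char line[64];

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }
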
+ */ +struct nf_mttg_trav { + struct list_head *head, *curr; + uint8_t class, nfproto; +}; + +enum { + MTTG_TRAV_INIT, + MTTG_TRAV_NFP_UNSPEC, + MTTG_TRAV_NFP_SPEC, + MTTG_TRAV_DONE, +}; + +static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, + bool is_target) +{ + static const uint8_t next_class[] = { + [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, + [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, + }; + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_INIT: + trav->class = MTTG_TRAV_NFP_UNSPEC; + mutex_lock(&xt[NFPROTO_UNSPEC].mutex); + trav->head = trav->curr = is_target ? + &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match; + break; + case MTTG_TRAV_NFP_UNSPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + mutex_lock(&xt[trav->nfproto].mutex); + trav->head = trav->curr = is_target ? + &xt[trav->nfproto].target : &xt[trav->nfproto].match; + trav->class = next_class[trav->class]; + break; + case MTTG_TRAV_NFP_SPEC: + trav->curr = trav->curr->next; + if (trav->curr != trav->head) + break; + /* fallthru, _stop will unlock */ + default: + return NULL; + } + + if (ppos != NULL) + ++*ppos; + return trav; +} + +static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, + bool is_target) +{ + struct nf_mttg_trav *trav = seq->private; + unsigned int j; + + trav->class = MTTG_TRAV_INIT; + for (j = 0; j < *pos; ++j) + if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL) + return NULL; + return trav; +} + +static void xt_mttg_seq_stop(struct seq_file *seq, void *v) +{ + struct nf_mttg_trav *trav = seq->private; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + mutex_unlock(&xt[NFPROTO_UNSPEC].mutex); + break; + case MTTG_TRAV_NFP_SPEC: + mutex_unlock(&xt[trav->nfproto].mutex); + break; + } +} + +static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, false); +} + +static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + return xt_mttg_seq_next(seq, v, ppos, false); +} + +static int xt_match_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_match *match; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + match = list_entry(trav->curr, struct xt_match, list); + return (*match->name == '\0') ? 
0 : + seq_printf(seq, "%s\n", match->name); + } + return 0; +} + +static const struct seq_operations xt_match_seq_ops = { + .start = xt_match_seq_start, + .next = xt_match_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_match_seq_show, +}; + +static int xt_match_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nf_mttg_trav *trav; + int ret; + + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; + + ret = seq_open(file, &xt_match_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; + } + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; +} + +static const struct file_operations xt_match_ops = { + .owner = THIS_MODULE, + .open = xt_match_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) +{ + return xt_mttg_seq_start(seq, pos, true); +} + +static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + return xt_mttg_seq_next(seq, v, ppos, true); +} + +static int xt_target_seq_show(struct seq_file *seq, void *v) +{ + const struct nf_mttg_trav *trav = seq->private; + const struct xt_target *target; + + switch (trav->class) { + case MTTG_TRAV_NFP_UNSPEC: + case MTTG_TRAV_NFP_SPEC: + if (trav->curr == trav->head) + return 0; + target = list_entry(trav->curr, struct xt_target, list); + return (*target->name == '\0') ? 0 : + seq_printf(seq, "%s\n", target->name); + } + return 0; +} + +static const struct seq_operations xt_target_seq_ops = { + .start = xt_target_seq_start, + .next = xt_target_seq_next, + .stop = xt_mttg_seq_stop, + .show = xt_target_seq_show, +}; + +static int xt_target_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nf_mttg_trav *trav; + int ret; + + trav = kmalloc(sizeof(*trav), GFP_KERNEL); + if (trav == NULL) + return -ENOMEM; + + ret = seq_open(file, &xt_target_seq_ops); + if (ret < 0) { + kfree(trav); + return ret; + } + + seq = file->private_data; + seq->private = trav; + trav->nfproto = (unsigned long)PDE(inode)->data; + return 0; +} + +static const struct file_operations xt_target_ops = { + .owner = THIS_MODULE, + .open = xt_target_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#define FORMAT_TABLES "_tables_names" +#define FORMAT_MATCHES "_tables_matches" +#define FORMAT_TARGETS "_tables_targets" + +#endif /* CONFIG_PROC_FS */ + +/** + * xt_hook_link - set up hooks for a new table + * @table: table with metadata needed to set up hooks + * @fn: Hook function + * + * This function will take care of creating and registering the necessary + * Netfilter hooks for XT tables. 
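
xt_hook_link() below sizes its nf_hook_ops array with hweight32(), the population count of the table's valid_hooks mask, and then walks the mask bit by bit to recover each hook number. The same loop in standalone form, with GCC's __builtin_popcount standing in for hweight32():

    #include <stdio.h>

    int main(void)
    {
            /* e.g. a table valid in hooks 0 (PRE_ROUTING) and 4 (POST_ROUTING) */
            unsigned int hook_mask = (1u << 0) | (1u << 4);
            unsigned int num_hooks = __builtin_popcount(hook_mask);
            unsigned int i, hooknum;

            for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0;
                 hook_mask >>= 1, ++hooknum) {
                    if (!(hook_mask & 1))
                            continue;
                    printf("ops[%u] -> hooknum %u\n", i, hooknum);
                    ++i;
            }
            return 0;
    }
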
+ */ +struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +{ + unsigned int hook_mask = table->valid_hooks; + uint8_t i, num_hooks = hweight32(hook_mask); + uint8_t hooknum; + struct nf_hook_ops *ops; + int ret; + + ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + for (i = 0, hooknum = 0; i < num_hooks && hook_mask != 0; + hook_mask >>= 1, ++hooknum) { + if (!(hook_mask & 1)) + continue; + ops[i].hook = fn; + ops[i].owner = table->me; + ops[i].pf = table->af; + ops[i].hooknum = hooknum; + ops[i].priority = table->priority; + ++i; + } + + ret = nf_register_hooks(ops, num_hooks); + if (ret < 0) { + kfree(ops); + return ERR_PTR(ret); + } + + return ops; +} +EXPORT_SYMBOL_GPL(xt_hook_link); + +/** + * xt_hook_unlink - remove hooks for a table + * @ops: nf_hook_ops array as returned by nf_hook_link + * @hook_mask: the very same mask that was passed to nf_hook_link + */ +void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) +{ + nf_unregister_hooks(ops, hweight32(table->valid_hooks)); + kfree(ops); +} +EXPORT_SYMBOL_GPL(xt_hook_unlink); + +int xt_proto_init(struct net *net, u_int8_t af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + struct proc_dir_entry *proc; +#endif + + if (af >= ARRAY_SIZE(xt_prefix)) + return -EINVAL; + + +#ifdef CONFIG_PROC_FS + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops, + (void *)(unsigned long)af); + if (!proc) + goto out; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops, + (void *)(unsigned long)af); + if (!proc) + goto out_remove_tables; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops, + (void *)(unsigned long)af); + if (!proc) + goto out_remove_matches; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +out_remove_matches: + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(net, buf); + +out_remove_tables: + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(net, buf); +out: + return -1; +#endif +} +EXPORT_SYMBOL_GPL(xt_proto_init); + +void xt_proto_fini(struct net *net, u_int8_t af) +{ +#ifdef CONFIG_PROC_FS + char buf[XT_FUNCTION_MAXNAMELEN]; + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TABLES, sizeof(buf)); + proc_net_remove(net, buf); + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_TARGETS, sizeof(buf)); + proc_net_remove(net, buf); + + strlcpy(buf, xt_prefix[af], sizeof(buf)); + strlcat(buf, FORMAT_MATCHES, sizeof(buf)); + proc_net_remove(net, buf); +#endif /*CONFIG_PROC_FS*/ +} +EXPORT_SYMBOL_GPL(xt_proto_fini); + +static int __net_init xt_net_init(struct net *net) +{ + int i; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) + INIT_LIST_HEAD(&net->xt.tables[i]); + return 0; +} + +static struct pernet_operations xt_net_ops = { + .init = xt_net_init, +}; + +static int __init xt_init(void) +{ + unsigned int i; + int rv; + + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } + + xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); + if (!xt) + return -ENOMEM; + + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + mutex_init(&xt[i].mutex); +#ifdef CONFIG_COMPAT + 
mutex_init(&xt[i].compat_mutex); + xt[i].compat_tab = NULL; +#endif + INIT_LIST_HEAD(&xt[i].target); + INIT_LIST_HEAD(&xt[i].match); + } + rv = register_pernet_subsys(&xt_net_ops); + if (rv < 0) + kfree(xt); + return rv; +} + +static void __exit xt_fini(void) +{ + unregister_pernet_subsys(&xt_net_ops); + kfree(xt); +} + +module_init(xt_init); +module_exit(xt_fini); + diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 00000000..ba928240 --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,228 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/audit.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_AUDIT.h> +#include <linux/netfilter_bridge/ebtables.h> +#include <net/ipv6.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, + unsigned int proto, unsigned int offset) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + const __be16 *pptr; + __be16 _ports[2]; + + pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); + if (pptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " sport=%hu dport=%hu", + ntohs(pptr[0]), ntohs(pptr[1])); + } + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: { + const u8 *iptr; + u8 _ih[2]; + + iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); + if (iptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", + iptr[0], iptr[1]); + + } + break; + } +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", + &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + + if (ntohs(ih->frag_off) & IP_OFFSET) { + audit_log_format(ab, " frag=1"); + return; + } + + audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + __be16 frag_off; + int offset; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + nexthdr = ih->nexthdr; + offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), + &nexthdr, &frag_off); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + if (offset) + audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) 
+{ + const struct xt_audit_info *info = par->targinfo; + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (ab == NULL) + goto errout; + + audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", + info->type, par->hooknum, skb->len, + par->in ? par->in->name : "?", + par->out ? par->out->name : "?"); + + if (skb->mark) + audit_log_format(ab, " mark=%#x", skb->mark); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (par->family == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case __constant_htons(ETH_P_IP): + audit_ip4(ab, skb); + break; + + case __constant_htons(ETH_P_IPV6): + audit_ip6(ab, skb); + break; + } + } + } + + switch (par->family) { + case NFPROTO_IPV4: + audit_ip4(ab, skb); + break; + + case NFPROTO_IPV6: + audit_ip6(ab, skb); + break; + } + +#ifdef CONFIG_NETWORK_SECMARK + if (skb->secmark) + audit_log_secctx(ab, skb->secmark); +#endif + + audit_log_end(ab); + +errout: + return XT_CONTINUE; +} + +static unsigned int +audit_tg_ebt(struct sk_buff *skb, const struct xt_action_param *par) +{ + audit_tg(skb, par); + return EBT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + + if (info->type > XT_AUDIT_TYPE_MAX) { + pr_info("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); + return -ERANGE; + } + + return 0; +} + +static struct xt_target audit_tg_reg[] __read_mostly = { + { + .name = "AUDIT", + .family = NFPROTO_UNSPEC, + .target = audit_tg, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, + { + .name = "AUDIT", + .family = NFPROTO_BRIDGE, + .target = audit_tg_ebt, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, + }, +}; + +static int __init audit_tg_init(void) +{ + return xt_register_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +static void __exit audit_tg_exit(void) +{ + xt_unregister_targets(audit_tg_reg, ARRAY_SIZE(audit_tg_reg)); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c new file mode 100644 index 00000000..0f642ef8 --- /dev/null +++ b/net/netfilter/xt_CHECKSUM.c @@ -0,0 +1,70 @@ +/* iptables module for the packet checksum mangling + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * (C) 2010 Red Hat, Inc. + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CHECKSUM.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michael S. 
Tsirkin <mst@redhat.com>"); +MODULE_DESCRIPTION("Xtables: checksum modification"); +MODULE_ALIAS("ipt_CHECKSUM"); +MODULE_ALIAS("ip6t_CHECKSUM"); + +static unsigned int +checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb_checksum_help(skb); + + return XT_CONTINUE; +} + +static int checksum_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_CHECKSUM_info *einfo = par->targinfo; + + if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { + pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + return -EINVAL; + } + if (!einfo->operation) { + pr_info("no CHECKSUM operation enabled\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target checksum_tg_reg __read_mostly = { + .name = "CHECKSUM", + .family = NFPROTO_UNSPEC, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, +}; + +static int __init checksum_tg_init(void) +{ + return xt_register_target(&checksum_tg_reg); +} + +static void __exit checksum_tg_exit(void) +{ + xt_unregister_target(&checksum_tg_reg); +} + +module_init(checksum_tg_init); +module_exit(checksum_tg_exit); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c new file mode 100644 index 00000000..af9c4dad --- /dev/null +++ b/net/netfilter/xt_CLASSIFY.c @@ -0,0 +1,73 @@ +/* + * This is a module which is used for setting the skb->priority field + * of an skb for qdisc classification. + */ + +/* (C) 2001-2002 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
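
checksum_tg_check() above accepts exactly one operation flag, XT_CHECKSUM_OP_FILL (0x01 in xt_CHECKSUM.h): unknown bits fail, and so does an empty mask. That reject-unknown-then-reject-empty pattern recurs across xtables checkentry functions; in isolation:

    #include <stdio.h>

    #define XT_CHECKSUM_OP_FILL     0x01    /* value from xt_CHECKSUM.h */

    /* Mirrors checksum_tg_check(); -1 stands in for -EINVAL. */
    static int check_op(unsigned char operation)
    {
            if (operation & ~XT_CHECKSUM_OP_FILL)
                    return -1;      /* unsupported operation bit set */
            if (!operation)
                    return -1;      /* no operation enabled */
            return 0;
    }

    int main(void)
    {
            printf("%d %d %d\n", check_op(XT_CHECKSUM_OP_FILL),
                   check_op(0), check_op(0x02));        /* prints "0 -1 -1" */
            return 0;
    }
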
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CLASSIFY.h> +#include <linux/netfilter_arp.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Qdisc classification"); +MODULE_ALIAS("ipt_CLASSIFY"); +MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY"); + +static unsigned int +classify_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_classify_target_info *clinfo = par->targinfo; + + skb->priority = clinfo->priority; + return XT_CONTINUE; +} + +static struct xt_target classify_tg_reg[] __read_mostly = { + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_ARP, + .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +}; + +static int __init classify_tg_init(void) +{ + return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); +} + +static void __exit classify_tg_exit(void) +{ + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); +} + +module_init(classify_tg_init); +module_exit(classify_tg_exit); diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c new file mode 100644 index 00000000..e04dc282 --- /dev/null +++ b/net/netfilter/xt_CONNSECMARK.c @@ -0,0 +1,143 @@ +/* + * This module is used to copy security markings from packets + * to connections, and restore security markings from connections + * back to packets. This would normally be performed in conjunction + * with the SECMARK target and state match. + * + * Based somewhat on CONNMARK: + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * + * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CONNSECMARK.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); +MODULE_DESCRIPTION("Xtables: target for copying between connection and security mark"); +MODULE_ALIAS("ipt_CONNSECMARK"); +MODULE_ALIAS("ip6t_CONNSECMARK"); + +/* + * If the packet has a security mark and the connection does not, copy + * the security mark from the packet to the connection. 
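
The CLASSIFY target above stores a qdisc class handle in skb->priority; iptables' --set-class MAJOR:MINOR option packs the two 16-bit halves the way tc handles are encoded (cf. TC_H_MAKE() in pkt_sched.h). The packing as plain arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    /* tc-style handle: major in the upper 16 bits, minor in the lower 16. */
    static uint32_t tc_make_handle(uint16_t major, uint16_t minor)
    {
            return ((uint32_t)major << 16) | minor;
    }

    int main(void)
    {
            /* "--set-class 1:12" (fields are hexadecimal, as tc prints them) */
            printf("0x%08x\n", tc_make_handle(0x1, 0x12)); /* 0x00010012 */
            return 0;
    }
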
+ */ +static void secmark_save(const struct sk_buff *skb) +{ + if (skb->secmark) { + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !ct->secmark) { + ct->secmark = skb->secmark; + nf_conntrack_event_cache(IPCT_SECMARK, ct); + } + } +} + +/* + * If packet has no security mark, and the connection does, restore the + * security mark from the connection to the packet. + */ +static void secmark_restore(struct sk_buff *skb) +{ + if (!skb->secmark) { + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + + ct = nf_ct_get(skb, &ctinfo); + if (ct && ct->secmark) + skb->secmark = ct->secmark; + } +} + +static unsigned int +connsecmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connsecmark_target_info *info = par->targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + secmark_save(skb); + break; + + case CONNSECMARK_RESTORE: + secmark_restore(skb); + break; + + default: + BUG(); + } + + return XT_CONTINUE; +} + +static int connsecmark_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_connsecmark_target_info *info = par->targinfo; + int ret; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + switch (info->mode) { + case CONNSECMARK_SAVE: + case CONNSECMARK_RESTORE: + break; + + default: + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; + } + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target connsecmark_tg_reg __read_mostly = { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, +}; + +static int __init connsecmark_tg_init(void) +{ + return xt_register_target(&connsecmark_tg_reg); +} + +static void __exit connsecmark_tg_exit(void) +{ + xt_unregister_target(&connsecmark_tg_reg); +} + +module_init(connsecmark_tg_init); +module_exit(connsecmark_tg_exit); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c new file mode 100644 index 00000000..3746d8b9 --- /dev/null +++ b/net/netfilter/xt_CT.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2010 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
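+ *
+ * Example usage (editor's illustration, not part of the original patch):
+ *   iptables -t raw -A PREROUTING -p tcp --dport 21 -j CT --helper ftp
+ * attaches a conntrack template carrying the FTP helper; --notrack,
+ * --zone and, in revision 1, --timeout are the other options handled
+ * below.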
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CT.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_zones.h> + +static unsigned int xt_ct_target_v0(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + + /* Previously seen (loopback)? Ignore. */ + if (skb->nfct != NULL) + return XT_CONTINUE; + + atomic_inc(&ct->ct_general.use); + skb->nfct = &ct->ct_general; + skb->nfctinfo = IP_CT_NEW; + + return XT_CONTINUE; +} + +static unsigned int xt_ct_target_v1(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct xt_ct_target_info_v1 *info = par->targinfo; + struct nf_conn *ct = info->ct; + + /* Previously seen (loopback)? Ignore. */ + if (skb->nfct != NULL) + return XT_CONTINUE; + + atomic_inc(&ct->ct_general.use); + skb->nfct = &ct->ct_general; + skb->nfctinfo = IP_CT_NEW; + + return XT_CONTINUE; +} + +static u8 xt_ct_find_proto(const struct xt_tgchk_param *par) +{ + if (par->family == NFPROTO_IPV4) { + const struct ipt_entry *e = par->entryinfo; + + if (e->ip.invflags & IPT_INV_PROTO) + return 0; + return e->ip.proto; + } else if (par->family == NFPROTO_IPV6) { + const struct ip6t_entry *e = par->entryinfo; + + if (e->ipv6.invflags & IP6T_INV_PROTO) + return 0; + return e->ipv6.proto; + } else + return 0; +} + +static int xt_ct_tg_check_v0(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conntrack_tuple t; + struct nf_conn_help *help; + struct nf_conn *ct; + int ret = 0; + u8 proto; + + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + if (info->flags & XT_CT_NOTRACK) { + ct = nf_ct_untracked_get(); + atomic_inc(&ct->ct_general.use); + goto out; + } + +#ifndef CONFIG_NF_CONNTRACK_ZONES + if (info->zone) + goto err1; +#endif + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + goto err1; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); + ret = PTR_ERR(ct); + if (IS_ERR(ct)) + goto err2; + + ret = 0; + if ((info->ct_events || info->exp_events) && + !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, + GFP_KERNEL)) + goto err3; + + if (info->helper[0]) { + ret = -ENOENT; + proto = xt_ct_find_proto(par); + if (!proto) { + pr_info("You must specify a L4 protocol, " + "and not use inversions on it.\n"); + goto err3; + } + + ret = -ENOMEM; + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (help == NULL) + goto err3; + + ret = -ENOENT; + help->helper = nf_conntrack_helper_try_module_get(info->helper, + par->family, + proto); + if (help->helper == NULL) { + pr_info("No such helper \"%s\"\n", info->helper); + goto err3; + } + } + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); +out: + info->ct = ct; + return 0; + +err3: + nf_conntrack_free(ct); +err2: + nf_ct_l3proto_module_put(par->family); +err1: + return ret; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +static void __xt_ct_tg_timeout_put(struct ctnl_timeout 
*timeout) +{ + typeof(nf_ct_timeout_put_hook) timeout_put; + + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + if (timeout_put) + timeout_put(timeout); +} +#endif + +static int xt_ct_tg_check_v1(const struct xt_tgchk_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + struct nf_conntrack_tuple t; + struct nf_conn_help *help; + struct nf_conn *ct; + int ret = 0; + u8 proto; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct ctnl_timeout *timeout; +#endif + if (info->flags & ~XT_CT_NOTRACK) + return -EINVAL; + + if (info->flags & XT_CT_NOTRACK) { + ct = nf_ct_untracked_get(); + atomic_inc(&ct->ct_general.use); + goto out; + } + +#ifndef CONFIG_NF_CONNTRACK_ZONES + if (info->zone) + goto err1; +#endif + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + goto err1; + + memset(&t, 0, sizeof(t)); + ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); + ret = PTR_ERR(ct); + if (IS_ERR(ct)) + goto err2; + + ret = 0; + if ((info->ct_events || info->exp_events) && + !nf_ct_ecache_ext_add(ct, info->ct_events, info->exp_events, + GFP_KERNEL)) + goto err3; + + if (info->helper[0]) { + ret = -ENOENT; + proto = xt_ct_find_proto(par); + if (!proto) { + pr_info("You must specify a L4 protocol, " + "and not use inversions on it.\n"); + goto err3; + } + + ret = -ENOMEM; + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (help == NULL) + goto err3; + + ret = -ENOENT; + help->helper = nf_conntrack_helper_try_module_get(info->helper, + par->family, + proto); + if (help->helper == NULL) { + pr_info("No such helper \"%s\"\n", info->helper); + goto err3; + } + } + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + if (info->timeout[0]) { + typeof(nf_ct_timeout_find_get_hook) timeout_find_get; + struct nf_conn_timeout *timeout_ext; + + rcu_read_lock(); + timeout_find_get = + rcu_dereference(nf_ct_timeout_find_get_hook); + + if (timeout_find_get) { + const struct ipt_entry *e = par->entryinfo; + struct nf_conntrack_l4proto *l4proto; + + if (e->ip.invflags & IPT_INV_PROTO) { + ret = -EINVAL; + pr_info("You cannot use inversion on " + "L4 protocol\n"); + goto err4; + } + timeout = timeout_find_get(info->timeout); + if (timeout == NULL) { + ret = -ENOENT; + pr_info("No such timeout policy \"%s\"\n", + info->timeout); + goto err4; + } + if (timeout->l3num != par->family) { + ret = -EINVAL; + pr_info("Timeout policy `%s' can only be " + "used by L3 protocol number %d\n", + info->timeout, timeout->l3num); + goto err5; + } + /* Make sure the timeout policy matches any existing + * protocol tracker, otherwise default to generic. 
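+ * (Editor's illustration, not part of the original patch: a timeout
+ * policy defined over nfnetlink_cttimeout for TCP can only be attached
+ * by a TCP rule, e.g.
+ *   iptables -t raw -A PREROUTING -p tcp -j CT --timeout my-tcp)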
+ */ + l4proto = __nf_ct_l4proto_find(par->family, + e->ip.proto); + if (timeout->l4proto->l4proto != l4proto->l4proto) { + ret = -EINVAL; + pr_info("Timeout policy `%s' can only be " + "used by L4 protocol number %d\n", + info->timeout, + timeout->l4proto->l4proto); + goto err5; + } + timeout_ext = nf_ct_timeout_ext_add(ct, timeout, + GFP_ATOMIC); + if (timeout_ext == NULL) { + ret = -ENOMEM; + goto err5; + } + } else { + ret = -ENOENT; + pr_info("Timeout policy base is empty\n"); + goto err4; + } + rcu_read_unlock(); + } +#endif + + __set_bit(IPS_TEMPLATE_BIT, &ct->status); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); +out: + info->ct = ct; + return 0; + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT +err5: + __xt_ct_tg_timeout_put(timeout); +err4: + rcu_read_unlock(); +#endif +err3: + nf_conntrack_free(ct); +err2: + nf_ct_l3proto_module_put(par->family); +err1: + return ret; +} + +static void xt_ct_tg_destroy_v0(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info *info = par->targinfo; + struct nf_conn *ct = info->ct; + struct nf_conn_help *help; + + if (!nf_ct_is_untracked(ct)) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + } + nf_ct_put(info->ct); +} + +static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + struct xt_ct_target_info_v1 *info = par->targinfo; + struct nf_conn *ct = info->ct; + struct nf_conn_help *help; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + struct nf_conn_timeout *timeout_ext; + typeof(nf_ct_timeout_put_hook) timeout_put; +#endif + if (!nf_ct_is_untracked(ct)) { + help = nfct_help(ct); + if (help) + module_put(help->helper->me); + + nf_ct_l3proto_module_put(par->family); + +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + rcu_read_lock(); + timeout_put = rcu_dereference(nf_ct_timeout_put_hook); + + if (timeout_put) { + timeout_ext = nf_ct_timeout_find(ct); + if (timeout_ext) + timeout_put(timeout_ext->timeout); + } + rcu_read_unlock(); +#endif + } + nf_ct_put(info->ct); +} + +static struct xt_target xt_ct_tg_reg[] __read_mostly = { + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .targetsize = sizeof(struct xt_ct_target_info), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_UNSPEC, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +}; + +static int __init xt_ct_tg_init(void) +{ + return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); +} + +static void __exit xt_ct_tg_exit(void) +{ + xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); +} + +module_init(xt_ct_tg_init); +module_exit(xt_ct_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: connection tracking target"); +MODULE_ALIAS("ipt_CT"); +MODULE_ALIAS("ip6t_CT"); diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c new file mode 100644 index 00000000..ae827165 --- /dev/null +++ b/net/netfilter/xt_DSCP.c @@ -0,0 +1,164 @@ +/* x_tables module for setting the IPv4/IPv6 DSCP field, Version 1.8 + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * based on ipt_FTOS.c (C) 2000 by Matthew G. 
Marsh <mgm@paktronix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * See RFC2474 for a description of the DSCP field within the IP Header. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/dsfield.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_DSCP.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_DSCP"); +MODULE_ALIAS("ip6t_DSCP"); +MODULE_ALIAS("ipt_TOS"); +MODULE_ALIAS("ip6t_TOS"); + +static unsigned int +dscp_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_DSCP_info *dinfo = par->targinfo; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + + if (dscp != dinfo->dscp) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + + ipv4_change_dsfield(ip_hdr(skb), (__u8)(~XT_DSCP_MASK), + dinfo->dscp << XT_DSCP_SHIFT); + + } + return XT_CONTINUE; +} + +static unsigned int +dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_DSCP_info *dinfo = par->targinfo; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + + if (dscp != dinfo->dscp) { + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + return NF_DROP; + + ipv6_change_dsfield(ipv6_hdr(skb), (__u8)(~XT_DSCP_MASK), + dinfo->dscp << XT_DSCP_SHIFT); + } + return XT_CONTINUE; +} + +static int dscp_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_DSCP_info *info = par->targinfo; + + if (info->dscp > XT_DSCP_MAX) { + pr_info("dscp %x out of range\n", info->dscp); + return -EDOM; + } + return 0; +} + +static unsigned int +tos_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tos_target_info *info = par->targinfo; + struct iphdr *iph = ip_hdr(skb); + u_int8_t orig, nv; + + orig = ipv4_get_dsfield(iph); + nv = (orig & ~info->tos_mask) ^ info->tos_value; + + if (orig != nv) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + iph = ip_hdr(skb); + ipv4_change_dsfield(iph, 0, nv); + } + + return XT_CONTINUE; +} + +static unsigned int +tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tos_target_info *info = par->targinfo; + struct ipv6hdr *iph = ipv6_hdr(skb); + u_int8_t orig, nv; + + orig = ipv6_get_dsfield(iph); + nv = (orig & ~info->tos_mask) ^ info->tos_value; + + if (orig != nv) { + if (!skb_make_writable(skb, sizeof(struct iphdr))) + return NF_DROP; + iph = ipv6_hdr(skb); + ipv6_change_dsfield(iph, 0, nv); + } + + return XT_CONTINUE; +} + +static struct xt_target dscp_tg_reg[] __read_mostly = { + { + .name = "DSCP", + .family = NFPROTO_IPV4, + .checkentry = dscp_tg_check, + .target = dscp_tg, + .targetsize = sizeof(struct xt_DSCP_info), + .table = "mangle", + .me = THIS_MODULE, + }, + { + .name = "DSCP", + .family = NFPROTO_IPV6, + .checkentry = dscp_tg_check, + .target = dscp_tg6, + .targetsize = sizeof(struct xt_DSCP_info), + .table = "mangle", + .me = THIS_MODULE, + }, + { + .name = "TOS", + .revision = 1, + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tos_tg, + .targetsize = sizeof(struct xt_tos_target_info), + .me = THIS_MODULE, + }, + { + .name = "TOS", + .revision = 1, + .family = NFPROTO_IPV6, + 
.table = "mangle", + .target = tos_tg6, + .targetsize = sizeof(struct xt_tos_target_info), + .me = THIS_MODULE, + }, +}; + +static int __init dscp_tg_init(void) +{ + return xt_register_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); +} + +static void __exit dscp_tg_exit(void) +{ + xt_unregister_targets(dscp_tg_reg, ARRAY_SIZE(dscp_tg_reg)); +} + +module_init(dscp_tg_init); +module_exit(dscp_tg_exit); diff --git a/net/netfilter/xt_HL.c b/net/netfilter/xt_HL.c new file mode 100644 index 00000000..1535e87e --- /dev/null +++ b/net/netfilter/xt_HL.c @@ -0,0 +1,169 @@ +/* + * TTL modification target for IP tables + * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> + * + * Hop Limit modification target for ip6tables + * Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/checksum.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_TTL.h> +#include <linux/netfilter_ipv6/ip6t_HL.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); +MODULE_LICENSE("GPL"); + +static unsigned int +ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = par->targinfo; + int new_ttl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + iph = ip_hdr(skb); + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + csum_replace2(&iph->check, htons(iph->ttl << 8), + htons(new_ttl << 8)); + iph->ttl = new_ttl; + } + + return XT_CONTINUE; +} + +static unsigned int +hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ip6h; + const struct ip6t_HL_info *info = par->targinfo; + int new_hl; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_SET: + new_hl = info->hop_limit; + break; + case IP6T_HL_INC: + new_hl = ip6h->hop_limit + info->hop_limit; + if (new_hl > 255) + new_hl = 255; + break; + case IP6T_HL_DEC: + new_hl = ip6h->hop_limit - info->hop_limit; + if (new_hl < 0) + new_hl = 0; + break; + default: + new_hl = ip6h->hop_limit; + break; + } + + ip6h->hop_limit = new_hl; + + return XT_CONTINUE; +} + +static int ttl_tg_check(const struct xt_tgchk_param *par) +{ + const struct ipt_TTL_info *info = par->targinfo; + + if (info->mode > IPT_TTL_MAXMODE) { + pr_info("TTL: invalid or unknown mode %u\n", info->mode); + return -EINVAL; + } + if (info->mode != IPT_TTL_SET && info->ttl == 0) + return -EINVAL; + return 0; +} + +static int hl_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_HL_info *info = par->targinfo; + + if (info->mode > IP6T_HL_MAXMODE) { + pr_info("invalid or unknown mode %u\n", info->mode); + return -EINVAL; + } + if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { + pr_info("increment/decrement 
does not " + "make sense with value 0\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target hl_tg_reg[] __read_mostly = { + { + .name = "TTL", + .revision = 0, + .family = NFPROTO_IPV4, + .target = ttl_tg, + .targetsize = sizeof(struct ipt_TTL_info), + .table = "mangle", + .checkentry = ttl_tg_check, + .me = THIS_MODULE, + }, + { + .name = "HL", + .revision = 0, + .family = NFPROTO_IPV6, + .target = hl_tg6, + .targetsize = sizeof(struct ip6t_HL_info), + .table = "mangle", + .checkentry = hl_tg6_check, + .me = THIS_MODULE, + }, +}; + +static int __init hl_tg_init(void) +{ + return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); +} + +static void __exit hl_tg_exit(void) +{ + xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); +} + +module_init(hl_tg_init); +module_exit(hl_tg_exit); +MODULE_ALIAS("ipt_TTL"); +MODULE_ALIAS("ip6t_HL"); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c new file mode 100644 index 00000000..f4ba8634 --- /dev/null +++ b/net/netfilter/xt_IDLETIMER.c @@ -0,0 +1,379 @@ +/* + * linux/net/netfilter/xt_IDLETIMER.c + * + * Netfilter module to trigger a timer when packet matches. + * After timer expires a kevent will be sent. + * + * Copyright (C) 2004, 2010 Nokia Corporation + * + * Written by Timo Teras <ext-timo.teras@nokia.com> + * + * Converted to x_tables and reworked for upstream inclusion + * by Luciano Coelho <luciano.coelho@nokia.com> + * + * Contact: Luciano Coelho <luciano.coelho@nokia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_IDLETIMER.h> +#include <linux/kdev_t.h> +#include <linux/kobject.h> +#include <linux/skbuff.h> +#include <linux/workqueue.h> +#include <linux/sysfs.h> +#include <net/net_namespace.h> + +struct idletimer_tg_attr { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); +}; + +struct idletimer_tg { + struct list_head entry; + struct timer_list timer; + struct work_struct work; + + struct kobject *kobj; + struct idletimer_tg_attr attr; + + unsigned int refcnt; + bool send_nl_msg; + bool active; +}; + +static LIST_HEAD(idletimer_tg_list); +static DEFINE_MUTEX(list_mutex); + +static struct kobject *idletimer_tg_kobj; + +static void notify_netlink_uevent(const char *label, struct idletimer_tg *timer) +{ + char label_msg[NLMSG_MAX_SIZE]; + char state_msg[NLMSG_MAX_SIZE]; + char *envp[] = { label_msg, state_msg, NULL }; + int res; + + res = snprintf(label_msg, NLMSG_MAX_SIZE, "LABEL=%s", + label); + if (NLMSG_MAX_SIZE <= res) { + pr_err("message too long (%d)", res); + return; + } + res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s", + timer->active ? 
"active" : "inactive"); + if (NLMSG_MAX_SIZE <= res) { + pr_err("message too long (%d)", res); + return; + } + pr_debug("putting nlmsg: <%s> <%s>\n", label_msg, state_msg); + kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp); + return; + + +} + +static +struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) +{ + struct idletimer_tg *entry; + + BUG_ON(!label); + + list_for_each_entry(entry, &idletimer_tg_list, entry) { + if (!strcmp(label, entry->attr.attr.name)) + return entry; + } + + return NULL; +} + +static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct idletimer_tg *timer; + unsigned long expires = 0; + unsigned long now = jiffies; + + mutex_lock(&list_mutex); + + timer = __idletimer_tg_find_by_label(attr->name); + if (timer) + expires = timer->timer.expires; + + mutex_unlock(&list_mutex); + + if (time_after(expires, now)) + return sprintf(buf, "%u\n", + jiffies_to_msecs(expires - now) / 1000); + + if (timer->send_nl_msg) + return sprintf(buf, "0 %d\n", + jiffies_to_msecs(now - expires) / 1000); + else + return sprintf(buf, "0\n"); +} + +static void idletimer_tg_work(struct work_struct *work) +{ + struct idletimer_tg *timer = container_of(work, struct idletimer_tg, + work); + + sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); + + if (timer->send_nl_msg) + notify_netlink_uevent(timer->attr.attr.name, timer); +} + +static void idletimer_tg_expired(unsigned long data) +{ + struct idletimer_tg *timer = (struct idletimer_tg *) data; + + pr_debug("timer %s expired\n", timer->attr.attr.name); + + timer->active = false; + schedule_work(&timer->work); +} + +static int idletimer_tg_create(struct idletimer_tg_info *info) +{ + int ret; + + info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + if (!info->timer) { + ret = -ENOMEM; + goto out; + } + + info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); + if (!info->timer->attr.attr.name) { + ret = -ENOMEM; + goto out_free_timer; + } + info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.show = idletimer_tg_show; + + ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); + if (ret < 0) { + pr_debug("couldn't add file to sysfs"); + goto out_free_attr; + } + + list_add(&info->timer->entry, &idletimer_tg_list); + + setup_timer(&info->timer->timer, idletimer_tg_expired, + (unsigned long) info->timer); + info->timer->refcnt = 1; + info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true; + info->timer->active = true; + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + + INIT_WORK(&info->timer->work, idletimer_tg_work); + + return 0; + +out_free_attr: + kfree(info->timer->attr.attr.name); +out_free_timer: + kfree(info->timer); +out: + return ret; +} + +/* + * The actual xt_tables plugin. 
+ */ +static unsigned int idletimer_tg_target(struct sk_buff *skb, + const struct xt_action_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + unsigned long now = jiffies; + + pr_debug("resetting timer %s, timeout period %u\n", + info->label, info->timeout); + + BUG_ON(!info->timer); + + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n", + info->label, info->timer->timer.expires, now); + } + + /* TODO: Avoid modifying timers on each packet */ + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + now); + + return XT_CONTINUE; +} + +static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct idletimer_tg_info *info = par->targinfo; + int ret; + unsigned long now = jiffies; + + pr_debug("checkentry targinfo %s\n", info->label); + + if (info->timeout == 0) { + pr_debug("timeout value is zero\n"); + return -EINVAL; + } + + if (info->label[0] == '\0' || + strnlen(info->label, + MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { + pr_debug("label is empty or not nul-terminated\n"); + return -EINVAL; + } + + mutex_lock(&list_mutex); + + info->timer = __idletimer_tg_find_by_label(info->label); + if (info->timer) { + info->timer->refcnt++; + info->timer->active = true; + + if (time_before(info->timer->timer.expires, now)) { + schedule_work(&info->timer->work); + pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n", + info->timer->timer.expires, now); + } + + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + now); + + pr_debug("increased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } else { + ret = idletimer_tg_create(info); + if (ret < 0) { + pr_debug("failed to create timer\n"); + mutex_unlock(&list_mutex); + return ret; + } + } + + mutex_unlock(&list_mutex); + + return 0; +} + +static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct idletimer_tg_info *info = par->targinfo; + + pr_debug("destroy targinfo %s\n", info->label); + + mutex_lock(&list_mutex); + + if (--info->timer->refcnt == 0) { + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); + del_timer_sync(&info->timer->timer); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); + } else { + pr_debug("decreased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } + + mutex_unlock(&list_mutex); +} + +static struct xt_target idletimer_tg __read_mostly = { + .name = "IDLETIMER", + .revision = 1, + .family = NFPROTO_UNSPEC, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, +}; + +static struct class *idletimer_tg_class; + +static struct device *idletimer_tg_device; + +static int __init idletimer_tg_init(void) +{ + int err; + + idletimer_tg_class = class_create(THIS_MODULE, "xt_idletimer"); + err = PTR_ERR(idletimer_tg_class); + if (IS_ERR(idletimer_tg_class)) { + pr_debug("couldn't register device class\n"); + goto out; + } + + idletimer_tg_device = device_create(idletimer_tg_class, NULL, + MKDEV(0, 0), NULL, "timers"); + err = PTR_ERR(idletimer_tg_device); + if (IS_ERR(idletimer_tg_device)) { + pr_debug("couldn't register system device\n"); + goto out_class; + } + + idletimer_tg_kobj = &idletimer_tg_device->kobj; 
+ + err = xt_register_target(&idletimer_tg); + if (err < 0) { + pr_debug("couldn't register xt target\n"); + goto out_dev; + } + + return 0; +out_dev: + device_destroy(idletimer_tg_class, MKDEV(0, 0)); +out_class: + class_destroy(idletimer_tg_class); +out: + return err; +} + +static void __exit idletimer_tg_exit(void) +{ + xt_unregister_target(&idletimer_tg); + + device_destroy(idletimer_tg_class, MKDEV(0, 0)); + class_destroy(idletimer_tg_class); +} + +module_init(idletimer_tg_init); +module_exit(idletimer_tg_exit); + +MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); +MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); +MODULE_DESCRIPTION("Xtables: idle time monitor"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); +MODULE_ALIAS("arpt_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c new file mode 100644 index 00000000..993de2ba --- /dev/null +++ b/net/netfilter/xt_LED.c @@ -0,0 +1,215 @@ +/* + * xt_LED.c - netfilter target to make LEDs blink upon packet matches + * + * Copyright (C) 2008 Adam Nielsen <a.nielsen@shikadi.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301 USA. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/slab.h> +#include <linux/leds.h> +#include <linux/mutex.h> + +#include <linux/netfilter/xt_LED.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>"); +MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED"); + +static LIST_HEAD(xt_led_triggers); +static DEFINE_MUTEX(xt_led_mutex); + +/* + * This is declared in here (the kernel module) only, to avoid having these + * dependencies in userspace code. This is what xt_led_info.internal_data + * points to. + */ +struct xt_led_info_internal { + struct list_head list; + int refcnt; + char *trigger_id; + struct led_trigger netfilter_led_trigger; + struct timer_list timer; +}; + +static unsigned int +led_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + /* + * If "always blink" is enabled, and there's still some time until the + * LED will switch off, briefly switch it off now. 
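+ * (Editor's note, not part of the original patch: this is the behaviour
+ * selected by --led-always-blink, so back-to-back packets still produce
+ * visible blinks rather than one continuous "on" period.)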
+ */ + if ((ledinfo->delay > 0) && ledinfo->always_blink && + timer_pending(&ledinternal->timer)) + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_FULL); + + /* If there's a positive delay, start/update the timer */ + if (ledinfo->delay > 0) { + mod_timer(&ledinternal->timer, + jiffies + msecs_to_jiffies(ledinfo->delay)); + + /* Otherwise if there was no delay given, blink as fast as possible */ + } else if (ledinfo->delay == 0) { + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); + } + + /* else the delay is negative, which means switch on and stay on */ + + return XT_CONTINUE; +} + +static void led_timeout_callback(unsigned long data) +{ + struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data; + + led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); +} + +static struct xt_led_info_internal *led_trigger_lookup(const char *name) +{ + struct xt_led_info_internal *ledinternal; + + list_for_each_entry(ledinternal, &xt_led_triggers, list) { + if (!strcmp(name, ledinternal->netfilter_led_trigger.name)) { + return ledinternal; + } + } + return NULL; +} + +static int led_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal; + int err; + + if (ledinfo->id[0] == '\0') { + pr_info("No 'id' parameter given.\n"); + return -EINVAL; + } + + mutex_lock(&xt_led_mutex); + + ledinternal = led_trigger_lookup(ledinfo->id); + if (ledinternal) { + ledinternal->refcnt++; + goto out; + } + + err = -ENOMEM; + ledinternal = kzalloc(sizeof(struct xt_led_info_internal), GFP_KERNEL); + if (!ledinternal) + goto exit_mutex_only; + + ledinternal->trigger_id = kstrdup(ledinfo->id, GFP_KERNEL); + if (!ledinternal->trigger_id) + goto exit_internal_alloc; + + ledinternal->refcnt = 1; + ledinternal->netfilter_led_trigger.name = ledinternal->trigger_id; + + err = led_trigger_register(&ledinternal->netfilter_led_trigger); + if (err) { + pr_warning("led_trigger_register() failed\n"); + if (err == -EEXIST) + pr_warning("Trigger name is already in use.\n"); + goto exit_alloc; + } + + /* See if we need to set up a timer */ + if (ledinfo->delay > 0) + setup_timer(&ledinternal->timer, led_timeout_callback, + (unsigned long)ledinternal); + + list_add_tail(&ledinternal->list, &xt_led_triggers); + +out: + mutex_unlock(&xt_led_mutex); + + ledinfo->internal_data = ledinternal; + + return 0; + +exit_alloc: + kfree(ledinternal->trigger_id); + +exit_internal_alloc: + kfree(ledinternal); + +exit_mutex_only: + mutex_unlock(&xt_led_mutex); + + return err; +} + +static void led_tg_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_led_info *ledinfo = par->targinfo; + struct xt_led_info_internal *ledinternal = ledinfo->internal_data; + + mutex_lock(&xt_led_mutex); + + if (--ledinternal->refcnt) { + mutex_unlock(&xt_led_mutex); + return; + } + + list_del(&ledinternal->list); + + if (ledinfo->delay > 0) + del_timer_sync(&ledinternal->timer); + + led_trigger_unregister(&ledinternal->netfilter_led_trigger); + + mutex_unlock(&xt_led_mutex); + + kfree(ledinternal->trigger_id); + kfree(ledinternal); +} + +static struct xt_target led_tg_reg __read_mostly = { + .name = "LED", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, +}; + +static int __init led_tg_init(void) +{ + return 
xt_register_target(&led_tg_reg); +} + +static void __exit led_tg_exit(void) +{ + xt_unregister_target(&led_tg_reg); +} + +module_init(led_tg_init); +module_exit(led_tg_exit); diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c new file mode 100644 index 00000000..ff5f75fd --- /dev/null +++ b/net/netfilter/xt_LOG.c @@ -0,0 +1,925 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_LOG.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/xt_log.h> + +static struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 5, + .logflags = NF_LOG_MASK, + }, + }, +}; + +static int dump_udp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + sb_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + sb_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + +out: + return 0; +} + +static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, + u8 proto, int fragment, unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + sb_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & XT_LOG_TCPSEQ) + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); + + /* Max length: 13 "WINDOW=65535 " */ + sb_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + sb_add(m, "CWR "); + if (th->ece) + sb_add(m, "ECE "); + if (th->urg) + sb_add(m, "URG "); + if (th->ack) + sb_add(m, "ACK "); + if (th->psh) + sb_add(m, "PSH "); + if (th->rst) + sb_add(m, "RST "); + if (th->syn) + sb_add(m, "SYN "); + if (th->fin) + sb_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if 
((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + const u_int8_t *op; + unsigned int i; + unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + sb_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + + sb_add(m, ") "); + } + + return 0; +} + +/* One level of recursion won't kill us */ +static void dump_ipv4_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph; + const struct iphdr *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + sb_add(m, "SRC=%pI4 DST=%pI4 ", + &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + sb_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + sb_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + sb_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & XT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + const unsigned char *op; + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + sb_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff+ih->ihl*4)) + return; + break; + case IPPROTO_ICMP: { + struct icmphdr _icmph; + const struct icmphdr *ich; + static const size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + sb_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - 
ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + sb_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + sb_add(m, "["); + dump_ipv4_packet(m, info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + sb_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ + sb_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + sb_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + skb->sk->sk_socket->file->f_cred->fsuid, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void dump_ipv4_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + 
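/* Editor's note, not part of the original patch: with the LOG target's
+ * --log-macdecode flag this prints decoded Ethernet fields; otherwise
+ * it falls back to a raw hex dump of the link-layer header. */
+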
struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + sb_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + sb_add(m, ":%02x", *p); + } + sb_add(m, " "); +} + +static void +log_packet_common(struct sbuff *m, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + const struct net_device *physindev; + const struct net_device *physoutdev; + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) + sb_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) + sb_add(m, "PHYSOUT=%s ", physoutdev->name); + } +#endif +} + + +static void +ipt_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(m, loginfo, skb, 0); + + sb_close(m); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +/* One level of recursion won't kill us */ +static void dump_ipv6_packet(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + sb_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) 
" */ + if (logflags & XT_LOG_IPOPT) + sb_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + sb_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + sb_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + sb_add(m, "INCOMPLETE "); + + sb_add(m, "ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & XT_LOG_IPOPT) + sb_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & XT_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + sb_add(m, "AH "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (logflags & XT_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + sb_add(m, "ESP "); + + if (fragment) { + sb_add(m, ")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + sb_add(m, "SPI=0x%x )", ntohl(eh->spi)); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + sb_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & XT_LOG_IPOPT) + sb_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (dump_tcp_header(m, skb, currenthdr, fragment, ptr, + logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + sb_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + sb_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + sb_add(m, "["); + dump_ipv6_packet(m, info, skb, + ptr + 
sizeof(_icmp6h), 0); + sb_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + sb_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & XT_LOG_UID) && recurse && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + skb->sk->sk_socket->file->f_cred->fsuid, + skb->sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!recurse && skb->mark) + sb_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv6_mac_header(struct sbuff *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & XT_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + sb_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p != NULL) { + sb_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + sb_add(m, ":%02x", *p++); + } + sb_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else + sb_add(m, " "); +} + +static void +ip6t_log_packet(u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct sbuff *m = sb_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix); + + if (in != NULL) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1); + + sb_close(m); +} +#endif + +static unsigned int +log_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; + + if (par->family == NFPROTO_IPV4) + ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + else if (par->family == NFPROTO_IPV6) + ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, + par->out, &li, loginfo->prefix); +#endif + else + WARN_ON_ONCE(1); + + return XT_CONTINUE; +} + +static int log_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_log_info *loginfo = par->targinfo; + + if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) + return -EINVAL; + + if (loginfo->level >= 8) { + pr_debug("level %u >= 8\n", loginfo->level); + return -EINVAL; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + pr_debug("prefix is not null-terminated\n"); + return -EINVAL; + } + + return 0; +} + +static struct 
xt_target log_tg_regs[] __read_mostly = { + { + .name = "LOG", + .family = NFPROTO_IPV4, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LOG", + .family = NFPROTO_IPV6, + .target = log_tg, + .targetsize = sizeof(struct xt_log_info), + .checkentry = log_tg_check, + .me = THIS_MODULE, + }, +#endif +}; + +static struct nf_logger ipt_log_logger __read_mostly = { + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static struct nf_logger ip6t_log_logger __read_mostly = { + .name = "ip6t_LOG", + .logfn = &ip6t_log_packet, + .me = THIS_MODULE, +}; +#endif + +static int __init log_tg_init(void) +{ + int ret; + + ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); + if (ret < 0) + return ret; + + nf_log_register(NFPROTO_IPV4, &ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_register(NFPROTO_IPV6, &ip6t_log_logger); +#endif + return 0; +} + +static void __exit log_tg_exit(void) +{ + nf_log_unregister(&ipt_log_logger); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + nf_log_unregister(&ip6t_log_logger); +#endif + xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs)); +} + +module_init(log_tg_init); +module_exit(log_tg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); +MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); +MODULE_ALIAS("ipt_LOG"); +MODULE_ALIAS("ip6t_LOG"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c new file mode 100644 index 00000000..a17dd0f5 --- /dev/null +++ b/net/netfilter/xt_NFLOG.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
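+ *
+ * NFLOG is the nfnetlink-based successor to the syslog LOG target
+ * above: both attach to rules the same way, but this target hands the
+ * skb to nfulnl_log_packet() for delivery to a netlink multicast
+ * group, where a listener built on something like libnetfilter_log
+ * can consume it. Illustrative rules (a sketch, not part of this patch):
+ *
+ *	iptables -A INPUT -p tcp --dport 22 -j LOG --log-prefix "ssh: "
+ *	iptables -A INPUT -p tcp --dport 22 -j NFLOG --nflog-group 1 \
+ *		--nflog-prefix "ssh: "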
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_NFLOG.h> +#include <net/netfilter/nf_log.h> +#include <net/netfilter/nfnetlink_log.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NFLOG"); +MODULE_ALIAS("ip6t_NFLOG"); + +static unsigned int +nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_nflog_info *info = par->targinfo; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_ULOG; + li.u.ulog.copy_len = info->len; + li.u.ulog.group = info->group; + li.u.ulog.qthreshold = info->threshold; + + nfulnl_log_packet(par->family, par->hooknum, skb, par->in, + par->out, &li, info->prefix); + return XT_CONTINUE; +} + +static int nflog_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_nflog_info *info = par->targinfo; + + if (info->flags & ~XT_NFLOG_MASK) + return -EINVAL; + if (info->prefix[sizeof(info->prefix) - 1] != '\0') + return -EINVAL; + return 0; +} + +static struct xt_target nflog_tg_reg __read_mostly = { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nflog_tg_check, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, +}; + +static int __init nflog_tg_init(void) +{ + return xt_register_target(&nflog_tg_reg); +} + +static void __exit nflog_tg_exit(void) +{ + xt_unregister_target(&nflog_tg_reg); +} + +module_init(nflog_tg_init); +module_exit(nflog_tg_exit); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c new file mode 100644 index 00000000..95237c89 --- /dev/null +++ b/net/netfilter/xt_NFQUEUE.c @@ -0,0 +1,160 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
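+ *
+ * A queued packet stays in the kernel until userspace issues a
+ * verdict for it. A minimal consumer, sketched here assuming
+ * libnetfilter_queue and queue number 0 (not part of this patch):
+ *
+ *	static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
+ *		      struct nfq_data *nfa, void *data)
+ *	{
+ *		u_int32_t id = 0;
+ *		struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
+ *
+ *		if (ph)
+ *			id = ntohl(ph->packet_id);
+ *		return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
+ *	}
+ *
+ *	h = nfq_open();
+ *	qh = nfq_create_queue(h, 0, &cb, NULL);
+ *	nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);
+ *	fd = nfq_fd(h);
+ *	while ((rv = recv(fd, buf, sizeof(buf), 0)) >= 0)
+ *		nfq_handle_packet(h, buf, rv);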
+ * + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_NFQUEUE.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: packet forwarding to netlink"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NFQUEUE"); +MODULE_ALIAS("ip6t_NFQUEUE"); +MODULE_ALIAS("arpt_NFQUEUE"); + +static u32 jhash_initval __read_mostly; +static bool rnd_inited __read_mostly; + +static unsigned int +nfqueue_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info *tinfo = par->targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static u32 hash_v4(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + __be32 ipaddr; + + /* packets in either direction go into same queue */ + ipaddr = iph->saddr ^ iph->daddr; + + return jhash_2words((__force u32)ipaddr, iph->protocol, jhash_initval); +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static u32 hash_v6(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __be32 addr[4]; + + addr[0] = ip6h->saddr.s6_addr32[0] ^ ip6h->daddr.s6_addr32[0]; + addr[1] = ip6h->saddr.s6_addr32[1] ^ ip6h->daddr.s6_addr32[1]; + addr[2] = ip6h->saddr.s6_addr32[2] ^ ip6h->daddr.s6_addr32[2]; + addr[3] = ip6h->saddr.s6_addr32[3] ^ ip6h->daddr.s6_addr32[3]; + + return jhash2((__force u32 *)addr, ARRAY_SIZE(addr), jhash_initval); +} +#endif + +static unsigned int +nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v1 *info = par->targinfo; + u32 queue = info->queuenum; + + if (info->queues_total > 1) { + if (par->family == NFPROTO_IPV4) + queue = (((u64) hash_v4(skb) * info->queues_total) >> + 32) + queue; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + else if (par->family == NFPROTO_IPV6) + queue = (((u64) hash_v6(skb) * info->queues_total) >> + 32) + queue; +#endif + } + return NF_QUEUE_NR(queue); +} + +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; + unsigned int ret = nfqueue_tg_v1(skb, par); + + if (info->bypass) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; + u32 maxid; + + if (unlikely(!rnd_inited)) { + get_random_bytes(&jhash_initval, sizeof(jhash_initval)); + rnd_inited = true; + } + if (info->queues_total == 0) { + pr_err("NFQUEUE: number of total queues is 0\n"); + return -EINVAL; + } + maxid = info->queues_total - 1 + info->queuenum; + if (maxid > 0xffff) { + pr_err("NFQUEUE: number of queues (%u) out of range (got %u)\n", + info->queues_total, maxid); + return -ERANGE; + } + if (par->target->revision == 2 && info->bypass > 1) + return -EINVAL; + return 0; +} + +static struct xt_target nfqueue_tg_reg[] __read_mostly = { + { + .name = "NFQUEUE", + .family = NFPROTO_UNSPEC, + .target = nfqueue_tg, + .targetsize = sizeof(struct xt_NFQ_info), + .me = THIS_MODULE, + }, + { + .name = "NFQUEUE", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v1, + .targetsize = sizeof(struct xt_NFQ_info_v1), + .me = THIS_MODULE, + }, + { + .name = "NFQUEUE", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v2, + 
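+ /*
+ * Revision 0 above sends everything to a single queue; revision 1
+ * adds --queue-balance, spreading flows with the multiply-shift in
+ * nfqueue_tg_v1(): ((u64)hash * queues_total) >> 32 always lies in
+ * [0, queues_total), e.g. a hash of 0x80000000 across 4 queues picks
+ * queue 2, and both flow directions hash identically because source
+ * and destination addresses are XORed first. Revision 2 adds
+ * --queue-bypass, letting packets pass instead of being dropped when
+ * no userspace listener is attached to the queue.
+ */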
.targetsize = sizeof(struct xt_NFQ_info_v2), + .me = THIS_MODULE, + }, +}; + +static int __init nfqueue_tg_init(void) +{ + return xt_register_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); +} + +static void __exit nfqueue_tg_exit(void) +{ + xt_unregister_targets(nfqueue_tg_reg, ARRAY_SIZE(nfqueue_tg_reg)); +} + +module_init(nfqueue_tg_init); +module_exit(nfqueue_tg_exit); diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c new file mode 100644 index 00000000..9d782181 --- /dev/null +++ b/net/netfilter/xt_NOTRACK.c @@ -0,0 +1,53 @@ +/* This is a module which is used for setting up fake conntracks + * on packets so that they are not seen by the conntrack/NAT code. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_conntrack.h> + +MODULE_DESCRIPTION("Xtables: Disabling connection tracking for packets"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_NOTRACK"); +MODULE_ALIAS("ip6t_NOTRACK"); + +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->nfct != NULL) + return XT_CONTINUE; + + /* Attach fake conntrack entry. + If there is a real ct entry corresponding to this packet, + it'll hang around till timing out. We don't deal with it + for performance reasons. JK */ + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + + return XT_CONTINUE; +} + +static struct xt_target notrack_tg_reg __read_mostly = { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, +}; + +static int __init notrack_tg_init(void) +{ + return xt_register_target(&notrack_tg_reg); +} + +static void __exit notrack_tg_exit(void) +{ + xt_unregister_target(&notrack_tg_reg); +} + +module_init(notrack_tg_init); +module_exit(notrack_tg_exit); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c new file mode 100644 index 00000000..f264032b --- /dev/null +++ b/net/netfilter/xt_RATEEST.c @@ -0,0 +1,195 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> +#include <linux/jhash.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <net/gen_stats.h> +#include <net/netlink.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_RATEEST.h> +#include <net/netfilter/xt_rateest.h> + +static DEFINE_MUTEX(xt_rateest_mutex); + +#define RATEEST_HSIZE 16 +static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; +static unsigned int jhash_rnd __read_mostly; +static bool rnd_inited __read_mostly; + +static unsigned int xt_rateest_hash(const char *name) +{ + return jhash(name, FIELD_SIZEOF(struct xt_rateest, name), jhash_rnd) & + (RATEEST_HSIZE - 1); +} + +static void xt_rateest_hash_insert(struct xt_rateest *est) +{ + unsigned int h; + + h = xt_rateest_hash(est->name); + hlist_add_head(&est->list, &rateest_hash[h]); +} + +struct xt_rateest *xt_rateest_lookup(const char *name) +{ + struct xt_rateest *est; + struct hlist_node *n; + unsigned int h; + + h = xt_rateest_hash(name); + mutex_lock(&xt_rateest_mutex); + hlist_for_each_entry(est, n, &rateest_hash[h], list) { + if (strcmp(est->name, name) == 0) { + est->refcnt++; + mutex_unlock(&xt_rateest_mutex); + return est; + } + } + mutex_unlock(&xt_rateest_mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(xt_rateest_lookup); + +void xt_rateest_put(struct xt_rateest *est) +{ + mutex_lock(&xt_rateest_mutex); + if (--est->refcnt == 0) { + hlist_del(&est->list); + gen_kill_estimator(&est->bstats, &est->rstats); + /* + * gen_estimator est_timer() might access est->lock or bstats, + * wait a RCU grace period before freeing 'est' + */ + kfree_rcu(est, rcu); + } + mutex_unlock(&xt_rateest_mutex); +} +EXPORT_SYMBOL_GPL(xt_rateest_put); + +static unsigned int +xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_rateest_target_info *info = par->targinfo; + struct gnet_stats_basic_packed *stats = &info->est->bstats; + + spin_lock_bh(&info->est->lock); + stats->bytes += skb->len; + stats->packets++; + spin_unlock_bh(&info->est->lock); + + return XT_CONTINUE; +} + +static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct xt_rateest_target_info *info = par->targinfo; + struct xt_rateest *est; + struct { + struct nlattr opt; + struct gnet_estimator est; + } cfg; + int ret; + + if (unlikely(!rnd_inited)) { + get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); + rnd_inited = true; + } + + est = xt_rateest_lookup(info->name); + if (est) { + /* + * If estimator parameters are specified, they must match the + * existing estimator. 
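+ * For example (a hypothetical rule set), two rules can feed one
+ * shared estimator as long as their parameters agree:
+ *
+ *	iptables -t mangle -A POSTROUTING -o eth0 -j RATEEST \
+ *		--rateest-name uplink --rateest-interval 250ms \
+ *		--rateest-ewmalog 2
+ *	iptables -t mangle -A POSTROUTING -o ppp0 -j RATEEST \
+ *		--rateest-name uplink --rateest-interval 250ms \
+ *		--rateest-ewmalog 2
+ *
+ * The second insertion finds "uplink" in the hash, takes a reference
+ * and reuses it; had the intervals differed, the check below would
+ * return -EINVAL.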
+ */ + if ((!info->interval && !info->ewma_log) || + (info->interval != est->params.interval || + info->ewma_log != est->params.ewma_log)) { + xt_rateest_put(est); + return -EINVAL; + } + info->est = est; + return 0; + } + + ret = -ENOMEM; + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (!est) + goto err1; + + strlcpy(est->name, info->name, sizeof(est->name)); + spin_lock_init(&est->lock); + est->refcnt = 1; + est->params.interval = info->interval; + est->params.ewma_log = info->ewma_log; + + cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); + cfg.opt.nla_type = TCA_STATS_RATE_EST; + cfg.est.interval = info->interval; + cfg.est.ewma_log = info->ewma_log; + + ret = gen_new_estimator(&est->bstats, &est->rstats, + &est->lock, &cfg.opt); + if (ret < 0) + goto err2; + + info->est = est; + xt_rateest_hash_insert(est); + return 0; + +err2: + kfree(est); +err1: + return ret; +} + +static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_rateest_target_info *info = par->targinfo; + + xt_rateest_put(info->est); +} + +static struct xt_target xt_rateest_tg_reg __read_mostly = { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .me = THIS_MODULE, +}; + +static int __init xt_rateest_tg_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(rateest_hash); i++) + INIT_HLIST_HEAD(&rateest_hash[i]); + + return xt_register_target(&xt_rateest_tg_reg); +} + +static void __exit xt_rateest_tg_fini(void) +{ + xt_unregister_target(&xt_rateest_tg_reg); +} + + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: packet rate estimator"); +MODULE_ALIAS("ipt_RATEEST"); +MODULE_ALIAS("ip6t_RATEEST"); +module_init(xt_rateest_tg_init); +module_exit(xt_rateest_tg_fini); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c new file mode 100644 index 00000000..9faf5e05 --- /dev/null +++ b/net/netfilter/xt_SECMARK.c @@ -0,0 +1,147 @@ +/* + * Module for modifying the secmark field of the skb, for use by + * security subsystems. + * + * Based on the nfmark match by: + * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
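+ *
+ * A typical SELinux-flavoured deployment (illustrative, with a
+ * hypothetical context string) marks packets in the "security" table:
+ *
+ *	iptables -t security -A INPUT -p tcp --dport 80 \
+ *		-j SECMARK --selctx system_u:object_r:http_packet_t:s0
+ *
+ * checkentry_lsm() resolves the context string to a secid once, at
+ * rule insertion, so the per-packet path only copies a u32 into
+ * skb->secmark.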
+ * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/security.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_SECMARK.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); +MODULE_DESCRIPTION("Xtables: packet security mark modification"); +MODULE_ALIAS("ipt_SECMARK"); +MODULE_ALIAS("ip6t_SECMARK"); + +#define PFX "SECMARK: " + +static u8 mode; + +static unsigned int +secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + u32 secmark = 0; + const struct xt_secmark_target_info *info = par->targinfo; + + BUG_ON(info->mode != mode); + + switch (mode) { + case SECMARK_MODE_SEL: + secmark = info->secid; + break; + default: + BUG(); + } + + skb->secmark = secmark; + return XT_CONTINUE; +} + +static int checkentry_lsm(struct xt_secmark_target_info *info) +{ + int err; + + info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; + info->secid = 0; + + err = security_secctx_to_secid(info->secctx, strlen(info->secctx), + &info->secid); + if (err) { + if (err == -EINVAL) + pr_info("invalid security context \'%s\'\n", info->secctx); + return err; + } + + if (!info->secid) { + pr_info("unable to map security context \'%s\'\n", info->secctx); + return -ENOENT; + } + + err = security_secmark_relabel_packet(info->secid); + if (err) { + pr_info("unable to obtain relabeling permission\n"); + return err; + } + + security_secmark_refcount_inc(); + return 0; +} + +static int secmark_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_secmark_target_info *info = par->targinfo; + int err; + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { + pr_info("target only valid in the \'mangle\' " + "or \'security\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + if (mode && mode != info->mode) { + pr_info("mode already set to %hu cannot mix with " + "rules for mode %hu\n", mode, info->mode); + return -EINVAL; + } + + switch (info->mode) { + case SECMARK_MODE_SEL: + break; + default: + pr_info("invalid mode: %hu\n", info->mode); + return -EINVAL; + } + + err = checkentry_lsm(info); + if (err) + return err; + + if (!mode) + mode = info->mode; + return 0; +} + +static void secmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + switch (mode) { + case SECMARK_MODE_SEL: + security_secmark_refcount_dec(); + } +} + +static struct xt_target secmark_tg_reg __read_mostly = { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, + .target = secmark_tg, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, +}; + +static int __init secmark_tg_init(void) +{ + return xt_register_target(&secmark_tg_reg); +} + +static void __exit secmark_tg_exit(void) +{ + xt_unregister_target(&secmark_tg_reg); +} + +module_init(secmark_tg_init); +module_exit(secmark_tg_exit); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c new file mode 100644 index 00000000..190ad37c --- /dev/null +++ b/net/netfilter/xt_TCPMSS.c @@ -0,0 +1,320 @@ +/* + * This is a module which is used for setting the MSS option in TCP packets. + * + * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
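+ *
+ * The canonical use is MSS clamping for links with a reduced MTU,
+ * e.g. PPPoE (an illustrative rule, not part of this patch):
+ *
+ *	iptables -t mangle -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+ *		-j TCPMSS --clamp-mss-to-pmtu
+ *
+ * or with an explicit value via --set-mss. The checkentry functions
+ * below reject rules without a SYN match, since the MSS option is
+ * only carried by connection-setup segments.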
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/gfp.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <net/dst.h> +#include <net/flow.h> +#include <net/ipv6.h> +#include <net/route.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter/xt_TCPMSS.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); +MODULE_ALIAS("ipt_TCPMSS"); +MODULE_ALIAS("ip6t_TCPMSS"); + +static inline unsigned int +optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) + return 1; + else + return opt[offset+1]; +} + +static int +tcpmss_mangle_packet(struct sk_buff *skb, + const struct xt_tcpmss_info *info, + unsigned int in_mtu, + unsigned int tcphoff, + unsigned int minlen) +{ + struct tcphdr *tcph; + unsigned int tcplen, i; + __be16 oldval; + u16 newmss; + u8 *opt; + + if (!skb_make_writable(skb, skb->len)) + return -1; + + tcplen = skb->len - tcphoff; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + + /* Header cannot be larger than the packet */ + if (tcplen < tcph->doff*4) + return -1; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU) { + if (dst_mtu(skb_dst(skb)) <= minlen) { + if (net_ratelimit()) + pr_err("unknown or invalid path-MTU (%u)\n", + dst_mtu(skb_dst(skb))); + return -1; + } + if (in_mtu <= minlen) { + if (net_ratelimit()) + pr_err("unknown or invalid path-MTU (%u)\n", + in_mtu); + return -1; + } + newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; + } else + newmss = info->mss; + + opt = (u_int8_t *)tcph; + for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)) { + if (opt[i] == TCPOPT_MSS && tcph->doff*4 - i >= TCPOLEN_MSS && + opt[i+1] == TCPOLEN_MSS) { + u_int16_t oldmss; + + oldmss = (opt[i+2] << 8) | opt[i+3]; + + /* Never increase MSS, even when setting it, as + * doing so results in problems for hosts that rely + * on MSS being set correctly. + */ + if (oldmss <= newmss) + return 0; + + opt[i+2] = (newmss & 0xff00) >> 8; + opt[i+3] = newmss & 0x00ff; + + inet_proto_csum_replace2(&tcph->check, skb, + htons(oldmss), htons(newmss), + 0); + return 0; + } + } + + /* There is data after the header so the option can't be added + without moving it, and doing so may make the SYN packet + itself too large. Accept the packet unmodified instead. */ + if (tcplen > tcph->doff*4) + return 0; + + /* + * MSS Option not found ?! add it.. 
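+ *
+ * At this point tcplen equals tcph->doff * 4 (a SYN carrying options
+ * but no payload), so the sequence below is safe: reserve TCPOLEN_MSS
+ * bytes of tailroom, shift the existing options up by four bytes,
+ * write TCPOPT_MSS/TCPOLEN_MSS and the new value into the freed
+ * space, then patch the checksum three times: for the larger TCP
+ * length, for the four inserted option bytes, and for the changed
+ * data offset in the header word that carries doff.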
+ */ + if (skb_tailroom(skb) < TCPOLEN_MSS) { + if (pskb_expand_head(skb, 0, + TCPOLEN_MSS - skb_tailroom(skb), + GFP_ATOMIC)) + return -1; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + } + + skb_put(skb, TCPOLEN_MSS); + + opt = (u_int8_t *)tcph + sizeof(struct tcphdr); + memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + + inet_proto_csum_replace2(&tcph->check, skb, + htons(tcplen), htons(tcplen + TCPOLEN_MSS), 1); + opt[0] = TCPOPT_MSS; + opt[1] = TCPOLEN_MSS; + opt[2] = (newmss & 0xff00) >> 8; + opt[3] = newmss & 0x00ff; + + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); + + oldval = ((__be16 *)tcph)[6]; + tcph->doff += TCPOLEN_MSS/4; + inet_proto_csum_replace2(&tcph->check, skb, + oldval, ((__be16 *)tcph)[6], 0); + return TCPOLEN_MSS; +} + +static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, + unsigned int family) +{ + struct flowi fl; + const struct nf_afinfo *ai; + struct rtable *rt = NULL; + u_int32_t mtu = ~0U; + + if (family == PF_INET) { + struct flowi4 *fl4 = &fl.u.ip4; + memset(fl4, 0, sizeof(*fl4)); + fl4->daddr = ip_hdr(skb)->saddr; + } else { + struct flowi6 *fl6 = &fl.u.ip6; + + memset(fl6, 0, sizeof(*fl6)); + fl6->daddr = ipv6_hdr(skb)->saddr; + } + rcu_read_lock(); + ai = nf_get_afinfo(family); + if (ai != NULL) + ai->route(&init_net, (struct dst_entry **)&rt, &fl, false); + rcu_read_unlock(); + + if (rt != NULL) { + mtu = dst_mtu(&rt->dst); + dst_release(&rt->dst); + } + return mtu; +} + +static unsigned int +tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct iphdr *iph = ip_hdr(skb); + __be16 newlen; + int ret; + + ret = tcpmss_mangle_packet(skb, par->targinfo, + tcpmss_reverse_mtu(skb, PF_INET), + iph->ihl * 4, + sizeof(*iph) + sizeof(struct tcphdr)); + if (ret < 0) + return NF_DROP; + if (ret > 0) { + iph = ip_hdr(skb); + newlen = htons(ntohs(iph->tot_len) + ret); + csum_replace2(&iph->check, iph->tot_len, newlen); + iph->tot_len = newlen; + } + return XT_CONTINUE; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static unsigned int +tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + u8 nexthdr; + __be16 frag_off; + int tcphoff; + int ret; + + nexthdr = ipv6h->nexthdr; + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); + if (tcphoff < 0) + return NF_DROP; + ret = tcpmss_mangle_packet(skb, par->targinfo, + tcpmss_reverse_mtu(skb, PF_INET6), + tcphoff, + sizeof(*ipv6h) + sizeof(struct tcphdr)); + if (ret < 0) + return NF_DROP; + if (ret > 0) { + ipv6h = ipv6_hdr(skb); + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret); + } + return XT_CONTINUE; +} +#endif + +/* Must specify -p tcp --syn */ +static inline bool find_syn_match(const struct xt_entry_match *m) +{ + const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; + + if (strcmp(m->u.kernel.match->name, "tcp") == 0 && + tcpinfo->flg_cmp & TCPHDR_SYN && + !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) + return true; + + return false; +} + +static int tcpmss_tg4_check(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + const struct ipt_entry *e = par->entryinfo; + const struct xt_entry_match *ematch; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info("path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + 
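+ /*
+ * The loop below insists on an explicit SYN match in the same rule,
+ * e.g. "-p tcp --syn" or "--tcp-flags SYN,RST SYN"; without one the
+ * target would also fire on established-flow segments, which never
+ * carry an MSS option.
+ */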
xt_ematch_foreach(ematch, e) + if (find_syn_match(ematch)) + return 0; + pr_info("Only works on TCP SYN packets\n"); + return -EINVAL; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static int tcpmss_tg6_check(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; + const struct xt_entry_match *ematch; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info("path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + xt_ematch_foreach(ematch, e) + if (find_syn_match(ematch)) + return 0; + pr_info("Only works on TCP SYN packets\n"); + return -EINVAL; +} +#endif + +static struct xt_target tcpmss_tg_reg[] __read_mostly = { + { + .family = NFPROTO_IPV4, + .name = "TCPMSS", + .checkentry = tcpmss_tg4_check, + .target = tcpmss_tg4, + .targetsize = sizeof(struct xt_tcpmss_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .family = NFPROTO_IPV6, + .name = "TCPMSS", + .checkentry = tcpmss_tg6_check, + .target = tcpmss_tg6, + .targetsize = sizeof(struct xt_tcpmss_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tcpmss_tg_init(void) +{ + return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); +} + +static void __exit tcpmss_tg_exit(void) +{ + xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); +} + +module_init(tcpmss_tg_init); +module_exit(tcpmss_tg_exit); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c new file mode 100644 index 00000000..25fd1c4e --- /dev/null +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -0,0 +1,143 @@ +/* + * A module for stripping a specific TCP option from TCP packets. + * + * Copyright (C) 2007 Sven Schnelle <svens@bitebene.org> + * Copyright © CC Computer Consultants GmbH, 2007 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_TCPOPTSTRIP.h> + +static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) + return 1; + else + return opt[offset+1]; +} + +static unsigned int +tcpoptstrip_mangle_packet(struct sk_buff *skb, + const struct xt_tcpoptstrip_target_info *info, + unsigned int tcphoff, unsigned int minlen) +{ + unsigned int optl, i, j; + struct tcphdr *tcph; + u_int16_t n, o; + u_int8_t *opt; + + if (!skb_make_writable(skb, skb->len)) + return NF_DROP; + + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); + opt = (u_int8_t *)tcph; + + /* + * Walk through all TCP options - if we find some option to remove, + * set all octets to %TCPOPT_NOP and adjust checksum. 
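+ *
+ * Overwriting with NOPs rather than deleting keeps the header length,
+ * and hence every following offset, unchanged. The checksum helper
+ * works on 16-bit words, which is why the loop shifts the old and new
+ * octets into the high byte at even offsets. An illustrative rule:
+ *
+ *	iptables -t mangle -A POSTROUTING -p tcp \
+ *		-j TCPOPTSTRIP --strip-options timestamp,sack-permitted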
+ */ + for (i = sizeof(struct tcphdr); i < tcp_hdrlen(skb); i += optl) { + optl = optlen(opt, i); + + if (i + optl > tcp_hdrlen(skb)) + break; + + if (!tcpoptstrip_test_bit(info->strip_bmap, opt[i])) + continue; + + for (j = 0; j < optl; ++j) { + o = opt[i+j]; + n = TCPOPT_NOP; + if ((i + j) % 2 == 0) { + o <<= 8; + n <<= 8; + } + inet_proto_csum_replace2(&tcph->check, skb, htons(o), + htons(n), 0); + } + memset(opt + i, TCPOPT_NOP, optl); + } + + return XT_CONTINUE; +} + +static unsigned int +tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), + sizeof(struct iphdr) + sizeof(struct tcphdr)); +} + +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +static unsigned int +tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int tcphoff; + u_int8_t nexthdr; + __be16 frag_off; + + nexthdr = ipv6h->nexthdr; + tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); + if (tcphoff < 0) + return NF_DROP; + + return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, + sizeof(*ipv6h) + sizeof(struct tcphdr)); +} +#endif + +static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV4, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg4, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) + { + .name = "TCPOPTSTRIP", + .family = NFPROTO_IPV6, + .table = "mangle", + .proto = IPPROTO_TCP, + .target = tcpoptstrip_tg6, + .targetsize = sizeof(struct xt_tcpoptstrip_target_info), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tcpoptstrip_tg_init(void) +{ + return xt_register_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +static void __exit tcpoptstrip_tg_exit(void) +{ + xt_unregister_targets(tcpoptstrip_tg_reg, + ARRAY_SIZE(tcpoptstrip_tg_reg)); +} + +module_init(tcpoptstrip_tg_init); +module_exit(tcpoptstrip_tg_exit); +MODULE_AUTHOR("Sven Schnelle <svens@bitebene.org>, Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: TCP option stripping"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TCPOPTSTRIP"); +MODULE_ALIAS("ip6t_TCPOPTSTRIP"); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c new file mode 100644 index 00000000..4d505790 --- /dev/null +++ b/net/netfilter/xt_TEE.c @@ -0,0 +1,308 @@ +/* + * "TEE" target extension for Xtables + * Copyright © Sebastian Claßen, 2007 + * Jan Engelhardt, 2007-2010 + * + * based on ipt_ROUTE.c from Cédric de Launois + * <delaunois@info.ucl.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 or later, as published by the Free Software Foundation. 
+ */ +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/route.h> +#include <linux/skbuff.h> +#include <linux/notifier.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/route.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_TEE.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +# define WITH_CONNTRACK 1 +# include <net/netfilter/nf_conntrack.h> +#endif + +struct xt_tee_priv { + struct notifier_block notifier; + struct xt_tee_tginfo *tginfo; + int oif; +}; + +static const union nf_inet_addr tee_zero_address; +static DEFINE_PER_CPU(bool, tee_active); + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool +tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct iphdr *iph = ip_hdr(skb); + struct net *net = pick_net(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl4.flowi4_oif = info->priv->oif; + } + fl4.daddr = info->gw.ip; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + return true; +} + +static unsigned int +tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + struct iphdr *iph; + + if (percpu_read(tee_active)) + return XT_CONTINUE; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the TEE + * --gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential TEE loops + * between two hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. 
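+ *
+ * The per-cpu tee_active flag set around ip_local_out() below keeps
+ * the clone from being TEE'd again should it traverse the same rule,
+ * which, together with the TTL decrement, bounds mirroring loops.
+ * An illustrative rule copying inbound traffic to a monitor host:
+ *
+ *	iptables -t mangle -A PREROUTING -i eth0 \
+ *		-j TEE --gateway 192.0.2.2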
+ */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (tee_tg_route4(skb, info)) { + percpu_write(tee_active, true); + ip_local_out(skb); + percpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} + +#if IS_ENABLED(CONFIG_IPV6) +static bool +tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = pick_net(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + if (info->priv) { + if (info->priv->oif == -1) + return false; + fl6.flowi6_oif = info->priv->oif; + } + fl6.daddr = info->gw.in6; + fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + return false; + } + skb_dst_drop(skb); + skb_dst_set(skb, dst); + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + return true; +} + +static unsigned int +tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tee_tginfo *info = par->targinfo; + + if (percpu_read(tee_active)) + return XT_CONTINUE; + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return XT_CONTINUE; + +#ifdef WITH_CONNTRACK + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + if (par->hooknum == NF_INET_PRE_ROUTING || + par->hooknum == NF_INET_LOCAL_IN) { + struct ipv6hdr *iph = ipv6_hdr(skb); + --iph->hop_limit; + } + if (tee_tg_route6(skb, info)) { + percpu_write(tee_active, true); + ip6_local_out(skb); + percpu_write(tee_active, false); + } else { + kfree_skb(skb); + } + return XT_CONTINUE; +} +#endif + +static int tee_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct xt_tee_priv *priv; + + priv = container_of(this, struct xt_tee_priv, notifier); + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + } + + return NOTIFY_DONE; +} + +static int tee_tg_check(const struct xt_tgchk_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + struct xt_tee_priv *priv; + + /* 0.0.0.0 and :: not allowed */ + if (memcmp(&info->gw, &tee_zero_address, + sizeof(tee_zero_address)) == 0) + return -EINVAL; + + if (info->oif[0]) { + if (info->oif[sizeof(info->oif)-1] != '\0') + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + priv->tginfo = info; + priv->oif = -1; + priv->notifier.notifier_call = tee_netdev_event; + info->priv = priv; + + register_netdevice_notifier(&priv->notifier); + } else + info->priv = NULL; + + return 0; +} + +static void tee_tg_destroy(const struct xt_tgdtor_param *par) +{ + struct xt_tee_tginfo *info = par->targinfo; + + if (info->priv) { + unregister_netdevice_notifier(&info->priv->notifier); + kfree(info->priv); + } +} + +static struct xt_target tee_tg_reg[] __read_mostly = { + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV4, + .target = tee_tg4, + .targetsize = sizeof(struct 
xt_tee_tginfo), + .checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IPV6) + { + .name = "TEE", + .revision = 1, + .family = NFPROTO_IPV6, + .target = tee_tg6, + .targetsize = sizeof(struct xt_tee_tginfo), + .checkentry = tee_tg_check, + .destroy = tee_tg_destroy, + .me = THIS_MODULE, + }, +#endif +}; + +static int __init tee_tg_init(void) +{ + return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +static void __exit tee_tg_exit(void) +{ + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); +} + +module_init(tee_tg_init); +module_exit(tee_tg_exit); +MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: Reroute packet copy"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TEE"); +MODULE_ALIAS("ip6t_TEE"); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c new file mode 100644 index 00000000..35a959a0 --- /dev/null +++ b/net/netfilter/xt_TPROXY.c @@ -0,0 +1,432 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2010 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <net/inet_sock.h> +#include <linux/inetdevice.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#define XT_TPROXY_HAVE_IPV6 1 +#include <net/if_inet6.h> +#include <net/addrconf.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif + +#include <net/netfilter/nf_tproxy_core.h> +#include <linux/netfilter/xt_TPROXY.h> + +static bool tproxy_sk_is_transparent(struct sock *sk) +{ + if (sk->sk_state != TCP_TIME_WAIT) { + if (inet_sk(sk)->transparent) + return true; + sock_put(sk); + } else { + if (inet_twsk(sk)->tw_transparent) + return true; + inet_twsk_put(inet_twsk(sk)); + } + return false; +} + +static inline __be32 +tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) +{ + struct in_device *indev; + __be32 laddr; + + if (user_laddr) + return user_laddr; + + laddr = 0; + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + for_primary_ifa(indev) { + laddr = ifa->ifa_local; + break; + } endfor_ifa(indev); + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait4() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. 
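+ *
+ * As with the rest of this target, the redirect is only useful when
+ * marked packets are routed to the local machine; the classic setup
+ * (illustrative port and mark values) is:
+ *
+ *	iptables -t mangle -A PREROUTING -p tcp --dport 80 \
+ *		-j TPROXY --on-port 3128 --tproxy-mark 0x1/0x1
+ *	ip rule add fwmark 0x1 lookup 100
+ *	ip route add local 0.0.0.0/0 dev lo table 100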
+ */ +static struct sock * +tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, + struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +static unsigned int +tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, + u_int32_t mark_mask, u_int32_t mark_value) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp; + struct sock *sk; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) + return NF_DROP; + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, iph->daddr, + hp->source, hp->dest, + skb->dev, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr4(skb, laddr, iph->daddr); + if (!lport) + lport = hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); + else if (!sk) + /* no, there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, laddr, + hp->source, lport, + skb->dev, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~mark_mask) ^ mark_value; + + pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->daddr, ntohs(hp->dest), + &laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + return NF_DROP; +} + +static unsigned int +tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +static unsigned int +tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +#ifdef XT_TPROXY_HAVE_IPV6 + +static inline const struct in6_addr * +tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr) +{ + struct inet6_dev *indev; + struct inet6_ifaddr *ifa; + struct in6_addr *laddr; + + if (!ipv6_addr_any(user_laddr)) + return user_laddr; + 
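+ /*
+ * No explicit bind address was given: fall back to the first usable
+ * (neither tentative nor deprecated) address configured on the
+ * inbound interface, mirroring tproxy_laddr4() above.
+ */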
laddr = NULL; + + rcu_read_lock(); + indev = __in6_dev_get(skb->dev); + if (indev) + list_for_each_entry(ifa, &indev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + + laddr = &ifa->addr; + break; + } + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + const struct xt_action_param *par, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), + hp->source, + tgi->lport ? tgi->lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +static unsigned int +tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + struct udphdr _hdr, *hp; + struct sock *sk; + const struct in6_addr *laddr; + __be16 lport; + int thoff; + int tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n"); + return NF_DROP; + } + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, &iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); + lport = tgi->lport ? 
tgi->lport : hp->dest; + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); + else if (!sk) + /* no there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, laddr, + hp->source, lport, + par->in, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && tproxy_sk_is_transparent(sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; + + pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + laddr, ntohs(lport), skb->mark); + + nf_tproxy_assign_sock(skb, sk); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + + return NF_DROP; +} + +static int tproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_ip6 *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->flags & IP6T_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} +#endif + +static int tproxy_tg4_check(const struct xt_tgchk_param *par) +{ + const struct ipt_ip *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->invflags & IPT_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} + +static struct xt_target tproxy_tg_reg[] __read_mostly = { + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v0, + .revision = 0, + .targetsize = sizeof(struct xt_tproxy_target_info), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#ifdef XT_TPROXY_HAVE_IPV6 + { + .name = "TPROXY", + .family = NFPROTO_IPV6, + .table = "mangle", + .target = tproxy_tg6_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg6_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#endif + +}; + +static int __init tproxy_tg_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_TPROXY_HAVE_IPV6 + nf_defrag_ipv6_enable(); +#endif + + return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +static void __exit tproxy_tg_exit(void) +{ + xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); +} + +module_init(tproxy_tg_init); +module_exit(tproxy_tg_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); +MODULE_ALIAS("ipt_TPROXY"); +MODULE_ALIAS("ip6t_TPROXY"); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c new file mode 100644 index 00000000..df48967a --- /dev/null +++ b/net/netfilter/xt_TRACE.c @@ -0,0 +1,40 @@ +/* This is a module which is used to 
mark packets for tracing. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> + +MODULE_DESCRIPTION("Xtables: packet flow tracing"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_TRACE"); +MODULE_ALIAS("ip6t_TRACE"); + +static unsigned int +trace_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + skb->nf_trace = 1; + return XT_CONTINUE; +} + +static struct xt_target trace_tg_reg __read_mostly = { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .me = THIS_MODULE, +}; + +static int __init trace_tg_init(void) +{ + return xt_register_target(&trace_tg_reg); +} + +static void __exit trace_tg_exit(void) +{ + xt_unregister_target(&trace_tg_reg); +} + +module_init(trace_tg_init); +module_exit(trace_tg_exit); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c new file mode 100644 index 00000000..49c5ff7f --- /dev/null +++ b/net/netfilter/xt_addrtype.c @@ -0,0 +1,243 @@ +/* + * iptables module to match inet_addr_type() of an ip. + * + * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> + * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/ip.h> +#include <net/route.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/ip6_fib.h> +#endif + +#include <linux/netfilter/xt_addrtype.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: address type match"); +MODULE_ALIAS("ipt_addrtype"); +MODULE_ALIAS("ip6t_addrtype"); + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr) +{ + const struct nf_afinfo *afinfo; + struct flowi6 flow; + struct rt6_info *rt; + u32 ret; + int route_err; + + memset(&flow, 0, sizeof(flow)); + flow.daddr = *addr; + if (dev) + flow.flowi6_oif = dev->ifindex; + + rcu_read_lock(); + + afinfo = nf_get_afinfo(NFPROTO_IPV6); + if (afinfo != NULL) + route_err = afinfo->route(net, (struct dst_entry **)&rt, + flowi6_to_flowi(&flow), !!dev); + else + route_err = 1; + + rcu_read_unlock(); + + if (route_err) + return XT_ADDRTYPE_UNREACHABLE; + + if (rt->rt6i_flags & RTF_REJECT) + ret = XT_ADDRTYPE_UNREACHABLE; + else + ret = 0; + + if (rt->rt6i_flags & RTF_LOCAL) + ret |= XT_ADDRTYPE_LOCAL; + if (rt->rt6i_flags & RTF_ANYCAST) + ret |= XT_ADDRTYPE_ANYCAST; + + + dst_release(&rt->dst); + return ret; +} + +static bool match_type6(struct net *net, const struct net_device *dev, + const struct in6_addr *addr, u16 mask) +{ + int addr_type = ipv6_addr_type(addr); + + if ((mask & XT_ADDRTYPE_MULTICAST) && + !(addr_type & IPV6_ADDR_MULTICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) + return false; + if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) + return false; + + if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | + XT_ADDRTYPE_UNREACHABLE) & mask) + return !!(mask & match_lookup_rt6(net, dev, addr)); + return true; +} + +static bool +addrtype_mt6(struct net *net, const struct net_device *dev, + 
const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type6(net, dev, &iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} +#endif + +static inline bool match_type(struct net *net, const struct net_device *dev, + __be32 addr, u_int16_t mask) +{ + return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); +} + +static bool +addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_addrtype_info *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool ret = true; + + if (info->source) + ret &= match_type(net, NULL, iph->saddr, info->source) ^ + info->invert_source; + if (info->dest) + ret &= match_type(net, NULL, iph->daddr, info->dest) ^ + info->invert_dest; + + return ret; +} + +static bool +addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_addrtype_info_v1 *info = par->matchinfo; + const struct iphdr *iph; + const struct net_device *dev = NULL; + bool ret = true; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) + dev = par->in; + else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + dev = par->out; + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + if (par->family == NFPROTO_IPV6) + return addrtype_mt6(net, dev, skb, info); +#endif + iph = ip_hdr(skb); + if (info->source) + ret &= match_type(net, dev, iph->saddr, info->source) ^ + (info->flags & XT_ADDRTYPE_INVERT_SOURCE); + if (ret && info->dest) + ret &= match_type(net, dev, iph->daddr, info->dest) ^ + !!(info->flags & XT_ADDRTYPE_INVERT_DEST); + return ret; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("both incoming and outgoing " "interface limitation cannot be selected\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { + pr_info("output interface limitation " "not valid in PREROUTING and INPUT\n"); + return -EINVAL; + } + + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { + pr_info("input interface limitation " "not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; + } + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + if (par->family == NFPROTO_IPV6) { + if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { + pr_err("ipv6 BLACKHOLE matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { + pr_err("ipv6 PROHIBIT (THROW, NAT ..) 
matching not supported\n"); + return -EINVAL; + } + if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { + pr_err("ipv6 does not support BROADCAST matching\n"); + return -EINVAL; + } + } +#endif + return 0; +} + +static struct xt_match addrtype_mt_reg[] __read_mostly = { + { + .name = "addrtype", + .family = NFPROTO_IPV4, + .match = addrtype_mt_v0, + .matchsize = sizeof(struct xt_addrtype_info), + .me = THIS_MODULE + }, + { + .name = "addrtype", + .family = NFPROTO_UNSPEC, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + } +}; + +static int __init addrtype_mt_init(void) +{ + return xt_register_matches(addrtype_mt_reg, + ARRAY_SIZE(addrtype_mt_reg)); +} + +static void __exit addrtype_mt_exit(void) +{ + xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); +} + +module_init(addrtype_mt_init); +module_exit(addrtype_mt_exit); diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c new file mode 100644 index 00000000..f4af1bfa --- /dev/null +++ b/net/netfilter/xt_cluster.c @@ -0,0 +1,178 @@ +/* + * (C) 2008-2009 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/jhash.h> +#include <linux/ip.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/xt_cluster.h> + +static inline u32 nf_ct_orig_ipv4_src(const struct nf_conn *ct) +{ + return (__force u32)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; +} + +static inline const u32 *nf_ct_orig_ipv6_src(const struct nf_conn *ct) +{ + return (__force u32 *)ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip6; +} + +static inline u_int32_t +xt_cluster_hash_ipv4(u_int32_t ip, const struct xt_cluster_match_info *info) +{ + return jhash_1word(ip, info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash_ipv6(const void *ip, const struct xt_cluster_match_info *info) +{ + return jhash2(ip, NF_CT_TUPLE_L3SIZE / sizeof(__u32), info->hash_seed); +} + +static inline u_int32_t +xt_cluster_hash(const struct nf_conn *ct, + const struct xt_cluster_match_info *info) +{ + u_int32_t hash = 0; + + switch(nf_ct_l3num(ct)) { + case AF_INET: + hash = xt_cluster_hash_ipv4(nf_ct_orig_ipv4_src(ct), info); + break; + case AF_INET6: + hash = xt_cluster_hash_ipv6(nf_ct_orig_ipv6_src(ct), info); + break; + default: + WARN_ON(1); + break; + } + return (((u64)hash * info->total_nodes) >> 32); +} + +static inline bool +xt_cluster_ipv6_is_multicast(const struct in6_addr *addr) +{ + __be32 st = addr->s6_addr32[0]; + return ((st & htonl(0xFF000000)) == htonl(0xFF000000)); +} + +static inline bool +xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) +{ + bool is_multicast = false; + + switch(family) { + case NFPROTO_IPV4: + is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); + break; + case NFPROTO_IPV6: + is_multicast = + xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr); + break; + default: + WARN_ON(1); + break; + } + return is_multicast; +} + +static bool +xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct sk_buff *pskb = (struct sk_buff *)skb; + const struct xt_cluster_match_info *info = 
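+ /* Worked example (editor's addition, illustrative values only):
+ * xt_cluster_hash() above maps the 32-bit jhash onto
+ * [0, total_nodes) with a multiply-and-shift rather than a modulo;
+ * with total_nodes = 4 and a hash of 0xC0A80001 the node id is
+ * ((u64)0xC0A80001 * 4) >> 32 = 3.
+ */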
par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + unsigned long hash; + + /* This match assumes that all nodes see the same packets. This can be + * achieved if the switch that connects the cluster nodes supports some + * sort of 'port mirroring'. However, if your switch does not support + * this, your cluster nodes can reply to ARP requests using a multicast MAC + * address. Thus, your switch will flood the same packets to the + * cluster nodes with the same multicast MAC address. Using a multicast + * link address is an RFC 1812 (section 3.3.2) violation, but this works + * fine in practice. + * + * Unfortunately, if you use the multicast MAC address, the link layer + * sets skbuff's pkt_type to PACKET_MULTICAST, which is not accepted + * by TCP and others for packets coming to this node. For that reason, + * this match mangles skbuff's pkt_type if it detects a packet + * addressed to a unicast address but using PACKET_MULTICAST. Yes, I + * know, matches should not alter packets, but we are doing this here + * because we would need to add a PKTTYPE target for this sole purpose. + */ + if (!xt_cluster_is_multicast_addr(skb, par->family) && + skb->pkt_type == PACKET_MULTICAST) { + pskb->pkt_type = PACKET_HOST; + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + if (nf_ct_is_untracked(ct)) + return false; + + if (ct->master) + hash = xt_cluster_hash(ct->master, info); + else + hash = xt_cluster_hash(ct, info); + + return !!((1 << hash) & info->node_mask) ^ + !!(info->flags & XT_CLUSTER_F_INV); +} + +static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_cluster_match_info *info = par->matchinfo; + + if (info->total_nodes > XT_CLUSTER_NODES_MAX) { + pr_info("you have exceeded the maximum " + "number of cluster nodes (%u > %u)\n", + info->total_nodes, XT_CLUSTER_NODES_MAX); + return -EINVAL; + } + if (info->node_mask >= (1ULL << info->total_nodes)) { + pr_info("this node mask cannot be " + "higher than the total number of nodes\n"); + return -EDOM; + } + return 0; +} + +static struct xt_match xt_cluster_match __read_mostly = { + .name = "cluster", + .family = NFPROTO_UNSPEC, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_cluster_mt_init(void) +{ + return xt_register_match(&xt_cluster_match); +} + +static void __exit xt_cluster_mt_fini(void) +{ + xt_unregister_match(&xt_cluster_match); +} + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: hash-based cluster match"); +MODULE_ALIAS("ipt_cluster"); +MODULE_ALIAS("ip6t_cluster"); +module_init(xt_cluster_mt_init); +module_exit(xt_cluster_mt_fini); diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c new file mode 100644 index 00000000..5c861d2f --- --- /dev/null +++ b/net/netfilter/xt_comment.c @@ -0,0 +1,45 @@ +/* + * Implements a dummy match to allow attaching comments to rules + * + * 2003-05-13 Brad Fisher (brad@info-link.net) + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_comment.h> + +MODULE_AUTHOR("Brad Fisher <brad@info-link.net>"); +MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a comment"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_comment"); +MODULE_ALIAS("ip6t_comment"); + +static bool +comment_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + /* We 
always match */ + return true; +} + +static struct xt_match comment_mt_reg __read_mostly = { + .name = "comment", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = comment_mt, + .matchsize = sizeof(struct xt_comment_info), + .me = THIS_MODULE, +}; + +static int __init comment_mt_init(void) +{ + return xt_register_match(&comment_mt_reg); +} + +static void __exit comment_mt_exit(void) +{ + xt_unregister_match(&comment_mt_reg); +} + +module_init(comment_mt_init); +module_exit(comment_mt_exit); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c new file mode 100644 index 00000000..e595e07a --- /dev/null +++ b/net/netfilter/xt_connbytes.c @@ -0,0 +1,155 @@ +/* Kernel module to match connection tracking byte counter. + * GPL (C) 2002 Martin Devera (devik@cdi.cz). + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/skbuff.h> +#include <linux/math64.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connbytes.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_acct.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); +MODULE_ALIAS("ipt_connbytes"); +MODULE_ALIAS("ip6t_connbytes"); + +static bool +connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connbytes_info *sinfo = par->matchinfo; + const struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + u_int64_t what = 0; /* initialize to make gcc happy */ + u_int64_t bytes = 0; + u_int64_t pkts = 0; + const struct nf_conn_counter *counters; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + + counters = nf_conn_acct_find(ct); + if (!counters) + return false; + + switch (sinfo->what) { + case XT_CONNBYTES_PKTS: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); + break; + case XT_CONNBYTES_DIR_REPLY: + what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); + break; + case XT_CONNBYTES_DIR_BOTH: + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); + what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); + break; + } + break; + case XT_CONNBYTES_BYTES: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); + break; + case XT_CONNBYTES_DIR_REPLY: + what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + break; + case XT_CONNBYTES_DIR_BOTH: + what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); + what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + break; + } + break; + case XT_CONNBYTES_AVGPKT: + switch (sinfo->direction) { + case XT_CONNBYTES_DIR_ORIGINAL: + bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); + break; + case XT_CONNBYTES_DIR_REPLY: + bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); + break; + case XT_CONNBYTES_DIR_BOTH: + bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); + pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); + break; + } + if (pkts != 0) + what = div64_u64(bytes, pkts); + break; + } + + if (sinfo->count.to >= sinfo->count.from) + return what <= sinfo->count.to && what >= 
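+ /* Editor's gloss: with count.from <= count.to this is the inclusive
+ * window [from, to]; the "inverted" branch below fires when
+ * userspace supplies to < from, e.g. from=1000, to=100 matches
+ * counters below 100 or above 1000.
+ */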
sinfo->count.from; + else /* inverted */ + return what < sinfo->count.to || what > sinfo->count.from; +} + +static int connbytes_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_connbytes_info *sinfo = par->matchinfo; + int ret; + + if (sinfo->what != XT_CONNBYTES_PKTS && + sinfo->what != XT_CONNBYTES_BYTES && + sinfo->what != XT_CONNBYTES_AVGPKT) + return -EINVAL; + + if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != XT_CONNBYTES_DIR_REPLY && + sinfo->direction != XT_CONNBYTES_DIR_BOTH) + return -EINVAL; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + + /* + * This filter cannot function correctly unless connection tracking + * accounting is enabled, so complain in the hope that someone notices. + */ + if (!nf_ct_acct_enabled(par->net)) { + pr_warning("Forcing CT accounting to be enabled\n"); + nf_ct_set_acct(par->net, true); + } + + return ret; +} + +static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match connbytes_mt_reg __read_mostly = { + .name = "connbytes", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, + .matchsize = sizeof(struct xt_connbytes_info), + .me = THIS_MODULE, +}; + +static int __init connbytes_mt_init(void) +{ + return xt_register_match(&connbytes_mt_reg); +} + +static void __exit connbytes_mt_exit(void) +{ + xt_unregister_match(&connbytes_mt_reg); +} + +module_init(connbytes_mt_init); +module_exit(connbytes_mt_exit); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c new file mode 100644 index 00000000..c6d5a834 --- /dev/null +++ b/net/netfilter/xt_connlimit.c @@ -0,0 +1,317 @@ +/* + * netfilter module to limit the number of parallel tcp + * connections per IP address. + * (c) 2000 Gerd Knorr <kraxel@bytesex.org> + * Nov 2002: Martin Bene <martin.bene@icomedias.com>: + * only ignore TIME_WAIT or gone connections + * (C) CC Computer Consultants GmbH, 2007 + * + * based on ... + * + * Kernel module to match connection tracking information. + * GPL (C) 1999 Rusty Russell (rusty@rustcorp.com.au). 
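+ *
+ * Illustrative usage (editor's example, not from the original):
+ * iptables -A INPUT -p tcp --syn --dport 80 \
+ * -m connlimit --connlimit-above 16 -j REJECT
+ * caps each source address at 16 parallel HTTP connections.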
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_tcp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connlimit.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_zones.h> + +/* we will save the tuples of all connections we care about */ +struct xt_connlimit_conn { + struct hlist_node node; + struct nf_conntrack_tuple tuple; + union nf_inet_addr addr; +}; + +struct xt_connlimit_data { + struct hlist_head iphash[256]; + spinlock_t lock; +}; + +static u_int32_t connlimit_rnd __read_mostly; + +static inline unsigned int connlimit_iphash(__be32 addr) +{ + return jhash_1word((__force __u32)addr, connlimit_rnd) & 0xFF; +} + +static inline unsigned int +connlimit_iphash6(const union nf_inet_addr *addr, + const union nf_inet_addr *mask) +{ + union nf_inet_addr res; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) + res.ip6[i] = addr->ip6[i] & mask->ip6[i]; + + return jhash2((u32 *)res.ip6, ARRAY_SIZE(res.ip6), connlimit_rnd) & 0xFF; +} + +static inline bool already_closed(const struct nf_conn *conn) +{ + if (nf_ct_protonum(conn) == IPPROTO_TCP) + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; + else + return 0; +} + +static inline unsigned int +same_source_net(const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + const union nf_inet_addr *u3, u_int8_t family) +{ + if (family == NFPROTO_IPV4) { + return (addr->ip & mask->ip) == (u3->ip & mask->ip); + } else { + union nf_inet_addr lh, rh; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(addr->ip6); ++i) { + lh.ip6[i] = addr->ip6[i] & mask->ip6[i]; + rh.ip6[i] = u3->ip6[i] & mask->ip6[i]; + } + + return memcmp(&lh.ip6, &rh.ip6, sizeof(lh.ip6)) == 0; + } +} + +static int count_them(struct net *net, + struct xt_connlimit_data *data, + const struct nf_conntrack_tuple *tuple, + const union nf_inet_addr *addr, + const union nf_inet_addr *mask, + u_int8_t family) +{ + const struct nf_conntrack_tuple_hash *found; + struct xt_connlimit_conn *conn; + struct hlist_node *pos, *n; + struct nf_conn *found_ct; + struct hlist_head *hash; + bool addit = true; + int matches = 0; + + if (family == NFPROTO_IPV6) + hash = &data->iphash[connlimit_iphash6(addr, mask)]; + else + hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; + + rcu_read_lock(); + + /* check the saved connections */ + hlist_for_each_entry_safe(conn, pos, n, hash, node) { + found = nf_conntrack_find_get(net, NF_CT_DEFAULT_ZONE, + &conn->tuple); + found_ct = NULL; + + if (found != NULL) + found_ct = nf_ct_tuplehash_to_ctrack(found); + + if (found_ct != NULL && + nf_ct_tuple_equal(&conn->tuple, tuple) && + !already_closed(found_ct)) + /* + * Just to be sure we have it only once in the list. + * We should not see tuples twice unless someone hooks + * this into a table without "-p tcp --syn". 
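+ *
+ * (Editor's gloss, inferred from the code: without "-p tcp --syn"
+ * every packet of an established flow re-enters count_them() and
+ * finds its own tuple already on the list, hence this guard.)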
+ */ + addit = false; + + if (found == NULL) { + /* this one is gone */ + hlist_del(&conn->node); + kfree(conn); + continue; + } + + if (already_closed(found_ct)) { + /* + * we do not care about connections which are + * closed already -> ditch it + */ + nf_ct_put(found_ct); + hlist_del(&conn->node); + kfree(conn); + continue; + } + + if (same_source_net(addr, mask, &conn->addr, family)) + /* same source network -> be counted! */ + ++matches; + nf_ct_put(found_ct); + } + + rcu_read_unlock(); + + if (addit) { + /* save the new connection in our list */ + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (conn == NULL) + return -ENOMEM; + conn->tuple = *tuple; + conn->addr = *addr; + hlist_add_head(&conn->node, hash); + ++matches; + } + + return matches; +} + +static bool +connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + const struct xt_connlimit_info *info = par->matchinfo; + union nf_inet_addr addr; + struct nf_conntrack_tuple tuple; + const struct nf_conntrack_tuple *tuple_ptr = &tuple; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + int connections; + + ct = nf_ct_get(skb, &ctinfo); + if (ct != NULL) + tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), + par->family, &tuple)) + goto hotdrop; + + if (par->family == NFPROTO_IPV6) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? + &iph->daddr : &iph->saddr, sizeof(addr.ip6)); + } else { + const struct iphdr *iph = ip_hdr(skb); + addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? + iph->daddr : iph->saddr; + } + + spin_lock_bh(&info->data->lock); + connections = count_them(net, info->data, tuple_ptr, &addr, + &info->mask, par->family); + spin_unlock_bh(&info->data->lock); + + if (connections < 0) + /* kmalloc failed, drop it entirely */ + goto hotdrop; + + return (connections > info->limit) ^ + !!(info->flags & XT_CONNLIMIT_INVERT); + + hotdrop: + par->hotdrop = true; + return false; +} + +static int connlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_connlimit_info *info = par->matchinfo; + unsigned int i; + int ret; + + if (unlikely(!connlimit_rnd)) { + u_int32_t rand; + + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&connlimit_rnd, 0, rand); + } + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for " + "address family %u\n", par->family); + return ret; + } + + /* init private data */ + info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); + if (info->data == NULL) { + nf_ct_l3proto_module_put(par->family); + return -ENOMEM; + } + + spin_lock_init(&info->data->lock); + for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) + INIT_HLIST_HEAD(&info->data->iphash[i]); + + return 0; +} + +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_connlimit_info *info = par->matchinfo; + struct xt_connlimit_conn *conn; + struct hlist_node *pos, *n; + struct hlist_head *hash = info->data->iphash; + unsigned int i; + + nf_ct_l3proto_module_put(par->family); + + for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { + hlist_for_each_entry_safe(conn, pos, n, &hash[i], node) { + hlist_del(&conn->node); + kfree(conn); + } + } + + kfree(info->data); +} + +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = 
connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init connlimit_mt_init(void) +{ + return xt_register_matches(connlimit_mt_reg, + ARRAY_SIZE(connlimit_mt_reg)); +} + +static void __exit connlimit_mt_exit(void) +{ + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); +} + +module_init(connlimit_mt_init); +module_exit(connlimit_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: Number of connections matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_connlimit"); +MODULE_ALIAS("ip6t_connlimit"); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c new file mode 100644 index 00000000..7278145e --- /dev/null +++ b/net/netfilter/xt_connmark.c @@ -0,0 +1,167 @@ +/* + * xt_connmark - Netfilter module to operate on connection marks + * + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@medozas.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_ecache.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_connmark.h> + +MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>"); +MODULE_DESCRIPTION("Xtables: connection mark operations"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_CONNMARK"); +MODULE_ALIAS("ip6t_CONNMARK"); +MODULE_ALIAS("ipt_connmark"); +MODULE_ALIAS("ip6t_connmark"); + +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo1 *info = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u_int32_t newmark; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return XT_CONTINUE; + + switch (info->mode) { + case XT_CONNMARK_SET: + newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_SAVE: + newmark = (ct->mark & ~info->ctmask) ^ + (skb->mark & info->nfmask); + if (ct->mark != newmark) { + ct->mark = newmark; + nf_conntrack_event_cache(IPCT_MARK, ct); + } + break; + case XT_CONNMARK_RESTORE: + newmark = (skb->mark & ~info->nfmask) ^ + (ct->mark & info->ctmask); + skb->mark = newmark; + break; + } + + return XT_CONTINUE; +} + +static int connmark_tg_check(const struct xt_tgchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connmark_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static bool +connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_connmark_mtinfo1 *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct == NULL) + return false; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int connmark_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void connmark_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_target connmark_tg_reg __read_mostly = { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, +}; + +static struct xt_match connmark_mt_reg __read_mostly = { + .name = "connmark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_mt_check, + .match = connmark_mt, + .matchsize = sizeof(struct xt_connmark_mtinfo1), + .destroy = connmark_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init connmark_mt_init(void) +{ + int ret; + + ret = xt_register_target(&connmark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&connmark_mt_reg); + if (ret < 0) { + xt_unregister_target(&connmark_tg_reg); + return ret; + } + return 0; +} + +static void __exit 
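+/* Illustrative usage of the target/match pair registered above
+ * (editor's example, not from the original):
+ * iptables -t mangle -A PREROUTING -j CONNMARK --restore-mark
+ * iptables -t mangle -A POSTROUTING -j CONNMARK --save-mark
+ * --save-mark copies skb->mark into ct->mark (XT_CONNMARK_SAVE) and
+ * --restore-mark copies it back (XT_CONNMARK_RESTORE). */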
connmark_mt_exit(void) +{ + xt_unregister_match(&connmark_mt_reg); + xt_unregister_target(&connmark_tg_reg); +} + +module_init(connmark_mt_init); +module_exit(connmark_mt_exit); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c new file mode 100644 index 00000000..61805d7b --- /dev/null +++ b/net/netfilter/xt_conntrack.c @@ -0,0 +1,332 @@ +/* + * xt_conntrack - Netfilter module to match connection tracking + * information. (Superset of Rusty's minimalistic state match.) + * + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_conntrack.h> +#include <net/netfilter/nf_conntrack.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: connection tracking state match"); +MODULE_ALIAS("ipt_conntrack"); +MODULE_ALIAS("ip6t_conntrack"); + +static bool +conntrack_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; + else + return false; +} + +static inline bool +conntrack_mt_origsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, + &info->origsrc_addr, &info->origsrc_mask, family); +} + +static inline bool +conntrack_mt_origdst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, + &info->origdst_addr, &info->origdst_mask, family); +} + +static inline bool +conntrack_mt_replsrc(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, + &info->replsrc_addr, &info->replsrc_mask, family); +} + +static inline bool +conntrack_mt_repldst(const struct nf_conn *ct, + const struct xt_conntrack_mtinfo2 *info, + u_int8_t family) +{ + return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, + &info->repldst_addr, &info->repldst_mask, family); +} + +static inline bool +ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. 
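+ * (Editor's gloss: ->src.u is a union overlaying the per-protocol
+ * selector -- tcp.port, udp.port, icmp.id and friends -- so a single
+ * 16-bit compare against ->src.u.all covers any tracked protocol.)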
*/ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + (tuple->src.u.all == info->origsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + (tuple->dst.u.all == info->origdst_port) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + (tuple->src.u.all == info->replsrc_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + (tuple->dst.u.all == info->repldst_port) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. */ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + !port_match(info->origsrc_port, info->origsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + !port_match(info->origdst_port, info->origdst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + !port_match(info->replsrc_port, info->replsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + !port_match(info->repldst_port, info->repldst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + +static bool +conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, + u16 state_mask, u16 status_mask) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int statebit; + + ct = nf_ct_get(skb, &ctinfo); + + if (ct) { + if (nf_ct_is_untracked(ct)) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + } else + statebit = XT_CONNTRACK_STATE_INVALID; + + if (info->match_flags & XT_CONNTRACK_STATE) { + if (ct != NULL) { + if (test_bit(IPS_SRC_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_SNAT; + if (test_bit(IPS_DST_NAT_BIT, &ct->status)) + statebit |= XT_CONNTRACK_STATE_DNAT; + } + if (!!(state_mask & statebit) ^ + !(info->invert_flags & XT_CONNTRACK_STATE)) + return false; + } + + if (ct == NULL) + return info->match_flags & XT_CONNTRACK_STATE; + if ((info->match_flags & XT_CONNTRACK_DIRECTION) && + (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) ^ + !(info->invert_flags & XT_CONNTRACK_DIRECTION)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGSRC) + if (conntrack_mt_origsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_ORIGDST) + if 
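+ /* Editor's gloss on the recurring "matched ^ !(invert_flags & FLAG)"
+ * idiom used below: it evaluates true -- forcing a return of false --
+ * either when a required condition fails or when an inverted
+ * condition holds.
+ */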
(conntrack_mt_origdst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_ORIGDST)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLSRC) + if (conntrack_mt_replsrc(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLSRC)) + return false; + + if (info->match_flags & XT_CONNTRACK_REPLDST) + if (conntrack_mt_repldst(ct, info, par->family) ^ + !(info->invert_flags & XT_CONNTRACK_REPLDST)) + return false; + + if (par->match->revision != 3) { + if (!ct_proto_port_check(info, ct)) + return false; + } else { + if (!ct_proto_port_check_v3(par->matchinfo, ct)) + return false; + } + + if ((info->match_flags & XT_CONNTRACK_STATUS) && + (!!(status_mask & ct->status) ^ + !(info->invert_flags & XT_CONNTRACK_STATUS))) + return false; + + if (info->match_flags & XT_CONNTRACK_EXPIRES) { + unsigned long expires = 0; + + if (timer_pending(&ct->timeout)) + expires = (ct->timeout.expires - jiffies) / HZ; + if ((expires >= info->expires_min && + expires <= info->expires_max) ^ + !(info->invert_flags & XT_CONNTRACK_EXPIRES)) + return false; + } + return true; +} + +static bool +conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static bool +conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo2 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + +static int conntrack_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match conntrack_mt_reg[] __read_mostly = { + { + .name = "conntrack", + .revision = 1, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo1), + .match = conntrack_mt_v1, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 2, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo2), + .match = conntrack_mt_v2, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "conntrack", + .revision = 3, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo3), + .match = conntrack_mt_v3, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init conntrack_mt_init(void) +{ + return xt_register_matches(conntrack_mt_reg, + ARRAY_SIZE(conntrack_mt_reg)); +} + +static void __exit conntrack_mt_exit(void) +{ + xt_unregister_matches(conntrack_mt_reg, ARRAY_SIZE(conntrack_mt_reg)); +} + +module_init(conntrack_mt_init); +module_exit(conntrack_mt_exit); diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c new file mode 100644 index 00000000..c7a2e546 --- /dev/null +++ b/net/netfilter/xt_cpu.c @@ -0,0 +1,65 @@ +/* Kernel module to match running CPU */ + +/* + * Might be used to distribute connections on several daemons, if + * 
RPS (Receive Packet Steering) is enabled or NIC is multiqueue capable, + * each RX queue IRQ affined to one CPU (1:1 mapping) + * + */ + +/* (C) 2010 Eric Dumazet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_cpu.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); +MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu"); + +static int cpu_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + return 0; +} + +static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + return (info->cpu == smp_processor_id()) ^ info->invert; +} + +static struct xt_match cpu_mt_reg __read_mostly = { + .name = "cpu", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cpu_mt_check, + .match = cpu_mt, + .matchsize = sizeof(struct xt_cpu_info), + .me = THIS_MODULE, +}; + +static int __init cpu_mt_init(void) +{ + return xt_register_match(&cpu_mt_reg); +} + +static void __exit cpu_mt_exit(void) +{ + xt_unregister_match(&cpu_mt_reg); +} + +module_init(cpu_mt_init); +module_exit(cpu_mt_exit); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c new file mode 100644 index 00000000..b63d2a3d --- /dev/null +++ b/net/netfilter/xt_dccp.c @@ -0,0 +1,188 @@ +/* + * iptables module for DCCP protocol header matching + * + * (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <net/ip.h> +#include <linux/dccp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_dccp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: DCCP protocol packet match"); +MODULE_ALIAS("ipt_dccp"); +MODULE_ALIAS("ip6t_dccp"); + +#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static unsigned char *dccp_optbuf; +static DEFINE_SPINLOCK(dccp_buflock); + +static inline bool +dccp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int protoff, + const struct dccp_hdr *dh, + bool *hotdrop) +{ + /* dccph_doff is 8 bits, i.e. at most 255 * 4 bytes of header */ + const unsigned char *op; + unsigned int optoff = __dccp_hdr_len(dh); + unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); + unsigned int i; + + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) + goto invalid; + + if (!optlen) + return false; + + spin_lock_bh(&dccp_buflock); + op = skb_header_pointer(skb, protoff + optoff, optlen, dccp_optbuf); + if (op == NULL) { + /* If we don't have the whole header, drop packet. 
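+ *
+ * (Editor's gloss on the option walk below: op[] holds
+ * type-length-value options; types 0 and 1 are taken as single
+ * bytes, anything else advances by the length byte op[i+1], with a
+ * fallback of 1 so a zero length cannot loop forever.)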
*/ + goto partial; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) { + spin_unlock_bh(&dccp_buflock); + return true; + } + + if (op[i] < 2) + i++; + else + i += op[i+1]?:1; + } + + spin_unlock_bh(&dccp_buflock); + return false; + +partial: + spin_unlock_bh(&dccp_buflock); +invalid: + *hotdrop = true; + return false; +} + + +static inline bool +match_types(const struct dccp_hdr *dh, u_int16_t typemask) +{ + return typemask & (1 << dh->dccph_type); +} + +static inline bool +match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, + const struct dccp_hdr *dh, bool *hotdrop) +{ + return dccp_find_option(option, skb, protoff, dh, hotdrop); +} + +static bool +dccp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dccp_info *info = par->matchinfo; + const struct dccp_hdr *dh; + struct dccp_hdr _dh; + + if (par->fragoff != 0) + return false; + + dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); + if (dh == NULL) { + par->hotdrop = true; + return false; + } + + return DCCHECK(ntohs(dh->dccph_sport) >= info->spts[0] + && ntohs(dh->dccph_sport) <= info->spts[1], + XT_DCCP_SRC_PORTS, info->flags, info->invflags) + && DCCHECK(ntohs(dh->dccph_dport) >= info->dpts[0] + && ntohs(dh->dccph_dport) <= info->dpts[1], + XT_DCCP_DEST_PORTS, info->flags, info->invflags) + && DCCHECK(match_types(dh, info->typemask), + XT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, par->thoff, dh, + &par->hotdrop), + XT_DCCP_OPTION, info->flags, info->invflags); +} + +static int dccp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_dccp_info *info = par->matchinfo; + + if (info->flags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_DCCP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + return 0; +} + +static struct xt_match dccp_mt_reg[] __read_mostly = { + { + .name = "dccp", + .family = NFPROTO_IPV4, + .checkentry = dccp_mt_check, + .match = dccp_mt, + .matchsize = sizeof(struct xt_dccp_info), + .proto = IPPROTO_DCCP, + .me = THIS_MODULE, + }, + { + .name = "dccp", + .family = NFPROTO_IPV6, + .checkentry = dccp_mt_check, + .match = dccp_mt, + .matchsize = sizeof(struct xt_dccp_info), + .proto = IPPROTO_DCCP, + .me = THIS_MODULE, + }, +}; + +static int __init dccp_mt_init(void) +{ + int ret; + + /* doff is 8 bits, so the maximum option size is (4*256). Don't put + * this in BSS since DaveM is worried about locked TLB's for kernel + * BSS. */ + dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); + if (!dccp_optbuf) + return -ENOMEM; + ret = xt_register_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); + if (ret) + goto out_kfree; + return ret; + +out_kfree: + kfree(dccp_optbuf); + return ret; +} + +static void __exit dccp_mt_exit(void) +{ + xt_unregister_matches(dccp_mt_reg, ARRAY_SIZE(dccp_mt_reg)); + kfree(dccp_optbuf); +} + +module_init(dccp_mt_init); +module_exit(dccp_mt_exit); diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c new file mode 100644 index 00000000..d9202cdd --- /dev/null +++ b/net/netfilter/xt_devgroup.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
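+ *
+ * Illustrative usage (editor's example, not from the original):
+ * ip link set dev eth0 group 1
+ * iptables -A FORWARD -m devgroup --src-group 1 -j ACCEPT
+ * matches on the interface group id of the ingress/egress device.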
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> + +#include <linux/netfilter/xt_devgroup.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Device group match"); +MODULE_ALIAS("ipt_devgroup"); +MODULE_ALIAS("ip6t_devgroup"); + +static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + (((info->src_group ^ par->in->group) & info->src_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_SRC) ? 1 : 0))) + return false; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + (((info->dst_group ^ par->out->group) & info->dst_mask ? 1 : 0) ^ + ((info->flags & XT_DEVGROUP_INVERT_DST) ? 1 : 0))) + return false; + + return true; +} + +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_SRC && + par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD))) + return -EINVAL; + + if (info->flags & XT_DEVGROUP_MATCH_DST && + par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) + return -EINVAL; + + return 0; +} + +static struct xt_match devgroup_mt_reg __read_mostly = { + .name = "devgroup", + .match = devgroup_mt, + .checkentry = devgroup_mt_checkentry, + .matchsize = sizeof(struct xt_devgroup_info), + .family = NFPROTO_UNSPEC, + .me = THIS_MODULE +}; + +static int __init devgroup_mt_init(void) +{ + return xt_register_match(&devgroup_mt_reg); +} + +static void __exit devgroup_mt_exit(void) +{ + xt_unregister_match(&devgroup_mt_reg); +} + +module_init(devgroup_mt_init); +module_exit(devgroup_mt_exit); diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c new file mode 100644 index 00000000..64670fc5 --- /dev/null +++ b/net/netfilter/xt_dscp.c @@ -0,0 +1,115 @@ +/* IP tables module for matching the value of the IPv4/IPv6 DSCP field + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
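+ *
+ * Editor's note: the DSCP value occupies the upper six bits of the
+ * IPv4 TOS byte / IPv6 traffic class, hence the >> XT_DSCP_SHIFT
+ * below. Illustrative usage:
+ * iptables -A OUTPUT -m dscp --dscp 46 -j ACCEPT
+ * matches Expedited Forwarding traffic.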
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <net/dsfield.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_dscp.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: DSCP/TOS field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_dscp"); +MODULE_ALIAS("ip6t_dscp"); +MODULE_ALIAS("ipt_tos"); +MODULE_ALIAS("ip6t_tos"); + +static bool +dscp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + + return (dscp == info->dscp) ^ !!info->invert; +} + +static bool +dscp_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + + return (dscp == info->dscp) ^ !!info->invert; +} + +static int dscp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_dscp_info *info = par->matchinfo; + + if (info->dscp > XT_DSCP_MAX) { + pr_info("dscp %x out of range\n", info->dscp); + return -EDOM; + } + + return 0; +} + +static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_tos_match_info *info = par->matchinfo; + + if (par->family == NFPROTO_IPV4) + return ((ip_hdr(skb)->tos & info->tos_mask) == + info->tos_value) ^ !!info->invert; + else + return ((ipv6_get_dsfield(ipv6_hdr(skb)) & info->tos_mask) == + info->tos_value) ^ !!info->invert; +} + +static struct xt_match dscp_mt_reg[] __read_mostly = { + { + .name = "dscp", + .family = NFPROTO_IPV4, + .checkentry = dscp_mt_check, + .match = dscp_mt, + .matchsize = sizeof(struct xt_dscp_info), + .me = THIS_MODULE, + }, + { + .name = "dscp", + .family = NFPROTO_IPV6, + .checkentry = dscp_mt_check, + .match = dscp_mt6, + .matchsize = sizeof(struct xt_dscp_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV4, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, + { + .name = "tos", + .revision = 1, + .family = NFPROTO_IPV6, + .match = tos_mt, + .matchsize = sizeof(struct xt_tos_match_info), + .me = THIS_MODULE, + }, +}; + +static int __init dscp_mt_init(void) +{ + return xt_register_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); +} + +static void __exit dscp_mt_exit(void) +{ + xt_unregister_matches(dscp_mt_reg, ARRAY_SIZE(dscp_mt_reg)); +} + +module_init(dscp_mt_init); +module_exit(dscp_mt_exit); diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c new file mode 100644 index 00000000..3c831a8e --- /dev/null +++ b/net/netfilter/xt_ecn.c @@ -0,0 +1,179 @@ +/* + * Xtables module for matching the value of the IPv4/IPv6 and TCP ECN bits + * + * (C) 2002 by Harald Welte <laforge@gnumonks.org> + * (C) 2011 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
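+ *
+ * (Editor's gloss: the IP-level ECN codepoint is the low two bits of
+ * the TOS / traffic-class field -- 0 Not-ECT, 1 ECT(1), 2 ECT(0),
+ * 3 CE -- which is what match_ip()/match_ipv6() below compare
+ * against einfo->ip_ect.)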
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/in.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ecn.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ecn"); +MODULE_ALIAS("ip6t_ecn"); + +static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *einfo = par->matchinfo; + struct tcphdr _tcph; + const struct tcphdr *th; + + /* In practice, TCP match does this, so can't fail. But let's + * be good citizens. + */ + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) + return false; + + if (einfo->operation & XT_ECN_OP_MATCH_ECE) { + if (einfo->invert & XT_ECN_OP_MATCH_ECE) { + if (th->ece == 1) + return false; + } else { + if (th->ece == 0) + return false; + } + } + + if (einfo->operation & XT_ECN_OP_MATCH_CWR) { + if (einfo->invert & XT_ECN_OP_MATCH_CWR) { + if (th->cwr == 1) + return false; + } else { + if (th->cwr == 0) + return false; + } + } + + return true; +} + +static inline bool match_ip(const struct sk_buff *skb, + const struct xt_ecn_info *einfo) +{ + return ((ip_hdr(skb)->tos & XT_ECN_IP_MASK) == einfo->ip_ect) ^ + !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + + if (info->operation & XT_ECN_OP_MATCH_IP && !match_ip(skb, info)) + return false; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + !match_tcp(skb, par)) + return false; + + return true; +} + +static int ecn_mt_check4(const struct xt_mtchk_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + const struct ipt_ip *ip = par->entryinfo; + + if (info->operation & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->invert & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + (ip->proto != IPPROTO_TCP || ip->invflags & IPT_INV_PROTO)) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; + } + + return 0; +} + +static inline bool match_ipv6(const struct sk_buff *skb, + const struct xt_ecn_info *einfo) +{ + return (((ipv6_hdr(skb)->flow_lbl[0] >> 4) & XT_ECN_IP_MASK) == + einfo->ip_ect) ^ + !!(einfo->invert & XT_ECN_OP_MATCH_IP); +} + +static bool ecn_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + + if (info->operation & XT_ECN_OP_MATCH_IP && !match_ipv6(skb, info)) + return false; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + !match_tcp(skb, par)) + return false; + + return true; +} + +static int ecn_mt_check6(const struct xt_mtchk_param *par) +{ + const struct xt_ecn_info *info = par->matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + + if (info->operation & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->invert & XT_ECN_OP_MATCH_MASK) + return -EINVAL; + + if (info->operation & (XT_ECN_OP_MATCH_ECE | XT_ECN_OP_MATCH_CWR) && + (ip->proto != IPPROTO_TCP || ip->invflags & IP6T_INV_PROTO)) { + pr_info("cannot match TCP bits in rule for non-tcp packets\n"); + return -EINVAL; + } + + return 
0; +} + +static struct xt_match ecn_mt_reg[] __read_mostly = { + { + .name = "ecn", + .family = NFPROTO_IPV4, + .match = ecn_mt4, + .matchsize = sizeof(struct xt_ecn_info), + .checkentry = ecn_mt_check4, + .me = THIS_MODULE, + }, + { + .name = "ecn", + .family = NFPROTO_IPV6, + .match = ecn_mt6, + .matchsize = sizeof(struct xt_ecn_info), + .checkentry = ecn_mt_check6, + .me = THIS_MODULE, + }, +}; + +static int __init ecn_mt_init(void) +{ + return xt_register_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +static void __exit ecn_mt_exit(void) +{ + xt_unregister_matches(ecn_mt_reg, ARRAY_SIZE(ecn_mt_reg)); +} + +module_init(ecn_mt_init); +module_exit(ecn_mt_exit); diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c new file mode 100644 index 00000000..171ba82b --- /dev/null +++ b/net/netfilter/xt_esp.c @@ -0,0 +1,107 @@ +/* Kernel module to match ESP parameters. */ + +/* (C) 1999-2000 Yon Uriarte <yon@astaro.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> + +#include <linux/netfilter/xt_esp.h> +#include <linux/netfilter/x_tables.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); +MODULE_DESCRIPTION("Xtables: IPsec-ESP packet match"); +MODULE_ALIAS("ipt_esp"); +MODULE_ALIAS("ip6t_esp"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? "PASS" : "FAILED"); + return r; +} + +static bool esp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip_esp_hdr *eh; + struct ip_esp_hdr _esp; + const struct xt_esp *espinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); + if (eh == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. 
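+ *
+ * (Editor's note, illustrative: spi_match() above tests the SPI,
+ * converted to host order via ntohl(), against the inclusive window
+ * spis[0]..spis[1], optionally inverted -- the kernel side of
+ * iptables' "-p esp -m esp --espspi 256:512".)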
+ */ + pr_debug("Dropping evil ESP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return spi_match(espinfo->spis[0], espinfo->spis[1], ntohl(eh->spi), + !!(espinfo->invflags & XT_ESP_INV_SPI)); +} + +static int esp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_esp *espinfo = par->matchinfo; + + if (espinfo->invflags & ~XT_ESP_INV_MASK) { + pr_debug("unknown flags %X\n", espinfo->invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match esp_mt_reg[] __read_mostly = { + { + .name = "esp", + .family = NFPROTO_IPV4, + .checkentry = esp_mt_check, + .match = esp_mt, + .matchsize = sizeof(struct xt_esp), + .proto = IPPROTO_ESP, + .me = THIS_MODULE, + }, + { + .name = "esp", + .family = NFPROTO_IPV6, + .checkentry = esp_mt_check, + .match = esp_mt, + .matchsize = sizeof(struct xt_esp), + .proto = IPPROTO_ESP, + .me = THIS_MODULE, + }, +}; + +static int __init esp_mt_init(void) +{ + return xt_register_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); +} + +static void __exit esp_mt_exit(void) +{ + xt_unregister_matches(esp_mt_reg, ARRAY_SIZE(esp_mt_reg)); +} + +module_init(esp_mt_init); +module_exit(esp_mt_exit); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c new file mode 100644 index 00000000..d95f9c96 --- /dev/null +++ b/net/netfilter/xt_hashlimit.c @@ -0,0 +1,848 @@ +/* + * xt_hashlimit - Netfilter module to limit the number of packets per time + * separately for each hashbucket (sourceip/sourceport/dstip/dstport) + * + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * Development of this code was funded by Astaro AG, http://www.astaro.com/ + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/list.h> +#include <linux/skbuff.h> +#include <linux/mm.h> +#include <linux/in.h> +#include <linux/ip.h> +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#include <linux/ipv6.h> +#include <net/ipv6.h> +#endif + +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/xt_hashlimit.h> +#include <linux/mutex.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match"); +MODULE_ALIAS("ipt_hashlimit"); +MODULE_ALIAS("ip6t_hashlimit"); + +struct hashlimit_net { + struct hlist_head htables; + struct proc_dir_entry *ipt_hashlimit; + struct proc_dir_entry *ip6t_hashlimit; +}; + +static int hashlimit_net_id; +static inline struct hashlimit_net *hashlimit_pernet(struct net *net) +{ + return net_generic(net, hashlimit_net_id); +} + +/* need to declare this at the top */ +static const struct file_operations dl_file_ops; + +/* hash table crap */ +struct dsthash_dst { + union { + struct { + __be32 src; + __be32 dst; + } ip; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + struct { + __be32 src[4]; + __be32 dst[4]; + } ip6; +#endif + }; + __be16 src_port; + __be16 dst_port; +}; + +struct dsthash_ent { + /* static / read-only parts in the beginning */ + struct hlist_node node; + struct dsthash_dst dst; + + /* modified structure members in the end */ + spinlock_t 
lock; + unsigned long expires; /* precalculated expiry time */ + struct { + unsigned long prev; /* last modification */ + u_int32_t credit; + u_int32_t credit_cap, cost; + } rateinfo; + struct rcu_head rcu; +}; + +struct xt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ + int use; + u_int8_t family; + bool rnd_initialized; + + struct hashlimit_cfg1 cfg; /* config */ + + /* used internally */ + spinlock_t lock; /* lock for list_head */ + u_int32_t rnd; /* random seed for hash */ + unsigned int count; /* number entries in table */ + struct timer_list timer; /* timer for gc */ + + /* seq_file stuff */ + struct proc_dir_entry *pde; + struct net *net; + + struct hlist_head hash[0]; /* hashtable itself */ +}; + +static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ +static struct kmem_cache *hashlimit_cachep __read_mostly; + +static inline bool dst_cmp(const struct dsthash_ent *ent, + const struct dsthash_dst *b) +{ + return !memcmp(&ent->dst, b, sizeof(ent->dst)); +} + +static u_int32_t +hash_dst(const struct xt_hashlimit_htable *ht, const struct dsthash_dst *dst) +{ + u_int32_t hash = jhash2((const u32 *)dst, + sizeof(*dst)/sizeof(u32), + ht->rnd); + /* + * Instead of returning hash % ht->cfg.size (implying a divide) + * we return the high 32 bits of the (hash * ht->cfg.size) that will + * give results between [0 and cfg.size-1] and same hash distribution, + * but using a multiply, less expensive than a divide + */ + return ((u64)hash * ht->cfg.size) >> 32; +} + +static struct dsthash_ent * +dsthash_find(const struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + struct hlist_node *pos; + u_int32_t hash = hash_dst(ht, dst); + + if (!hlist_empty(&ht->hash[hash])) { + hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node) + if (dst_cmp(ent, dst)) { + spin_lock(&ent->lock); + return ent; + } + } + return NULL; +} + +/* allocate dsthash_ent, initialize dst, put in htable and lock it */ +static struct dsthash_ent * +dsthash_alloc_init(struct xt_hashlimit_htable *ht, + const struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + + spin_lock(&ht->lock); + /* initialize hash with random val at the time we allocate + * the first hashtable entry */ + if (unlikely(!ht->rnd_initialized)) { + get_random_bytes(&ht->rnd, sizeof(ht->rnd)); + ht->rnd_initialized = true; + } + + if (ht->cfg.max && ht->count >= ht->cfg.max) { + /* FIXME: do something. question is what.. 
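+	 * For now the failure mode is: log a ratelimited error and return + * NULL, which makes the caller (hashlimit_mt) hotdrop the packet.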
*/ + if (net_ratelimit()) + pr_err("max count of %u reached\n", ht->cfg.max); + ent = NULL; + } else + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (ent) { + memcpy(&ent->dst, dst, sizeof(ent->dst)); + spin_lock_init(&ent->lock); + + spin_lock(&ent->lock); + hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]); + ht->count++; + } + spin_unlock(&ht->lock); + return ent; +} + +static void dsthash_free_rcu(struct rcu_head *head) +{ + struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu); + + kmem_cache_free(hashlimit_cachep, ent); +} + +static inline void +dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) +{ + hlist_del_rcu(&ent->node); + call_rcu_bh(&ent->rcu, dsthash_free_rcu); + ht->count--; +} +static void htable_gc(unsigned long htlong); + +static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, + u_int8_t family) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + struct xt_hashlimit_htable *hinfo; + unsigned int size; + unsigned int i; + + if (minfo->cfg.size) { + size = minfo->cfg.size; + } else { + size = (totalram_pages << PAGE_SHIFT) / 16384 / + sizeof(struct list_head); + if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + size = 8192; + if (size < 16) + size = 16; + } + /* FIXME: don't use vmalloc() here or anywhere else -HW */ + hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + + sizeof(struct list_head) * size); + if (hinfo == NULL) + return -ENOMEM; + minfo->hinfo = hinfo; + + /* copy match config into hashtable config */ + memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + hinfo->cfg.size = size; + if (hinfo->cfg.max == 0) + hinfo->cfg.max = 8 * hinfo->cfg.size; + else if (hinfo->cfg.max < hinfo->cfg.size) + hinfo->cfg.max = hinfo->cfg.size; + + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + + hinfo->use = 1; + hinfo->count = 0; + hinfo->family = family; + hinfo->rnd_initialized = false; + spin_lock_init(&hinfo->lock); + + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? 
+ hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, + &dl_file_ops, hinfo); + if (hinfo->pde == NULL) { + vfree(hinfo); + return -ENOMEM; + } + hinfo->net = net; + + setup_timer(&hinfo->timer, htable_gc, (unsigned long)hinfo); + hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); + add_timer(&hinfo->timer); + + hlist_add_head(&hinfo->node, &hashlimit_net->htables); + + return 0; +} + +static bool select_all(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) +{ + return 1; +} + +static bool select_gc(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he) +{ + return time_after_eq(jiffies, he->expires); +} + +static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, + bool (*select)(const struct xt_hashlimit_htable *ht, + const struct dsthash_ent *he)) +{ + unsigned int i; + + /* lock hash table and iterate over it */ + spin_lock_bh(&ht->lock); + for (i = 0; i < ht->cfg.size; i++) { + struct dsthash_ent *dh; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + if ((*select)(ht, dh)) + dsthash_free(ht, dh); + } + } + spin_unlock_bh(&ht->lock); +} + +/* hash table garbage collector, run by timer */ +static void htable_gc(unsigned long htlong) +{ + struct xt_hashlimit_htable *ht = (struct xt_hashlimit_htable *)htlong; + + htable_selective_cleanup(ht, select_gc); + + /* re-add the timer accordingly */ + ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); + add_timer(&ht->timer); +} + +static void htable_destroy(struct xt_hashlimit_htable *hinfo) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(hinfo->net); + struct proc_dir_entry *parent; + + del_timer_sync(&hinfo->timer); + + if (hinfo->family == NFPROTO_IPV4) + parent = hashlimit_net->ipt_hashlimit; + else + parent = hashlimit_net->ip6t_hashlimit; + remove_proc_entry(hinfo->pde->name, parent); + htable_selective_cleanup(hinfo, select_all); + vfree(hinfo); +} + +static struct xt_hashlimit_htable *htable_find_get(struct net *net, + const char *name, + u_int8_t family) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + struct xt_hashlimit_htable *hinfo; + struct hlist_node *pos; + + hlist_for_each_entry(hinfo, pos, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->pde->name) && + hinfo->family == family) { + hinfo->use++; + return hinfo; + } + } + return NULL; +} + +static void htable_put(struct xt_hashlimit_htable *hinfo) +{ + mutex_lock(&hashlimit_mutex); + if (--hinfo->use == 0) { + hlist_del(&hinfo->node); + htable_destroy(hinfo); + } + mutex_unlock(&hashlimit_mutex); +} + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). 
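+   (Concretely, assuming HZ=1000: MAX_CPJ below works out to +   0xFFFFFFFF / (1000*60*60*24) = 49, and POW2_BELOW32 rounds that down +   to 32 credits per jiffy.)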
We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. +*/ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +/* Precision saver. */ +static inline u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; +} + +static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +{ + dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) + dh->rateinfo.credit = dh->rateinfo.credit_cap; + dh->rateinfo.prev = now; +} + +static inline __be32 maskl(__be32 a, unsigned int l) +{ + return l ? htonl(ntohl(a) & ~0 << (32 - l)) : 0; +} + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +static void hashlimit_ipv6_mask(__be32 *i, unsigned int p) +{ + switch (p) { + case 0 ... 31: + i[0] = maskl(i[0], p); + i[1] = i[2] = i[3] = 0; + break; + case 32 ... 63: + i[1] = maskl(i[1], p - 32); + i[2] = i[3] = 0; + break; + case 64 ... 95: + i[2] = maskl(i[2], p - 64); + i[3] = 0; + break; + case 96 ... 
127: + i[3] = maskl(i[3], p - 96); + break; + case 128: + break; + } +} +#endif + +static int +hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, + struct dsthash_dst *dst, + const struct sk_buff *skb, unsigned int protoff) +{ + __be16 _ports[2], *ports; + u8 nexthdr; + int poff; + + memset(dst, 0, sizeof(*dst)); + + switch (hinfo->family) { + case NFPROTO_IPV4: + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) + dst->ip.dst = maskl(ip_hdr(skb)->daddr, + hinfo->cfg.dstmask); + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) + dst->ip.src = maskl(ip_hdr(skb)->saddr, + hinfo->cfg.srcmask); + + if (!(hinfo->cfg.mode & + (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) + return 0; + nexthdr = ip_hdr(skb)->protocol; + break; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + case NFPROTO_IPV6: + { + __be16 frag_off; + + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { + memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, + sizeof(dst->ip6.dst)); + hashlimit_ipv6_mask(dst->ip6.dst, hinfo->cfg.dstmask); + } + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SIP) { + memcpy(&dst->ip6.src, &ipv6_hdr(skb)->saddr, + sizeof(dst->ip6.src)); + hashlimit_ipv6_mask(dst->ip6.src, hinfo->cfg.srcmask); + } + + if (!(hinfo->cfg.mode & + (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) + return 0; + nexthdr = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); + if ((int)protoff < 0) + return -1; + break; + } +#endif + default: + BUG(); + return 0; + } + + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), + &_ports); + } else { + _ports[0] = _ports[1] = 0; + ports = _ports; + } + if (!ports) + return -1; + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_SPT) + dst->src_port = ports[0]; + if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DPT) + dst->dst_port = ports[1]; + return 0; +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + unsigned long now = jiffies; + struct dsthash_ent *dh; + struct dsthash_dst dst; + + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) + goto hotdrop; + + rcu_read_lock_bh(); + dh = dsthash_find(hinfo, &dst); + if (dh == NULL) { + dh = dsthash_alloc_init(hinfo, &dst); + if (dh == NULL) { + rcu_read_unlock_bh(); + goto hotdrop; + } + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + } else { + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + rateinfo_recalc(dh, now); + } + + if (dh->rateinfo.credit >= dh->rateinfo.cost) { + /* below the limit */ + dh->rateinfo.credit -= dh->rateinfo.cost; + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + return !(info->cfg.mode & XT_HASHLIMIT_INVERT); + } + + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + /* default match is underlimit - so over the limit, we need to invert */ + return info->cfg.mode & XT_HASHLIMIT_INVERT; + + hotdrop: + par->hotdrop = true; + return false; +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct net *net = par->net; + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + int ret; + + /* Check for overflow. 
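+	 * (user2credits() uses u32 arithmetic: if avg * burst wraps, the + * credit cap it yields falls below the cost of a single packet, and + * the test below rejects the rule with -ERANGE.)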
*/ + if (info->cfg.burst == 0 || + user2credits(info->cfg.avg * info->cfg.burst) < + user2credits(info->cfg.avg)) { + pr_info("overflow, try lower: %u/%u\n", + info->cfg.avg, info->cfg.burst); + return -ERANGE; + } + if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) + return -EINVAL; + if (info->name[sizeof(info->name)-1] != '\0') + return -EINVAL; + if (par->family == NFPROTO_IPV4) { + if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + return -EINVAL; + } else { + if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + return -EINVAL; + } + + mutex_lock(&hashlimit_mutex); + info->hinfo = htable_find_get(net, info->name, par->family); + if (info->hinfo == NULL) { + ret = htable_create(net, info, par->family); + if (ret < 0) { + mutex_unlock(&hashlimit_mutex); + return ret; + } + } + mutex_unlock(&hashlimit_mutex); + return 0; +} + +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + + htable_put(info->hinfo); +} + +static struct xt_match hashlimit_mt_reg[] __read_mostly = { + { + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "hashlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check, + .destroy = hashlimit_mt_destroy, + .me = THIS_MODULE, + }, +#endif +}; + +/* PROC stuff */ +static void *dl_seq_start(struct seq_file *s, loff_t *pos) + __acquires(htable->lock) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket; + + spin_lock_bh(&htable->lock); + if (*pos >= htable->cfg.size) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); + if (!bucket) + return ERR_PTR(-ENOMEM); + + *bucket = *pos; + return bucket; +} + +static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= htable->cfg.size) { + kfree(v); + return NULL; + } + return bucket; +} + +static void dl_seq_stop(struct seq_file *s, void *v) + __releases(htable->lock) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + + if (!IS_ERR(bucket)) + kfree(bucket); + spin_unlock_bh(&htable->lock); +} + +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + int res; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies); + + switch (family) { + case NFPROTO_IPV4: + res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + &ent->dst.ip.src, + ntohs(ent->dst.src_port), + &ent->dst.ip.dst, + ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); + break; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + case NFPROTO_IPV6: + res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + &ent->dst.ip6.src, + ntohs(ent->dst.src_port), + &ent->dst.ip6.dst, + ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); + break; +#endif + default: + BUG(); + res = 0; + } + spin_unlock(&ent->lock); + return res; +} + +static int dl_seq_show(struct 
seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + struct hlist_node *pos; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) + if (dl_seq_real_show(ent, htable->family, s)) + return -1; + } + return 0; +} + +static const struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode)->data; + } + return ret; +} + +static const struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int __net_init hashlimit_proc_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net); + if (!hashlimit_net->ipt_hashlimit) + return -ENOMEM; +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); + if (!hashlimit_net->ip6t_hashlimit) { + proc_net_remove(net, "ipt_hashlimit"); + return -ENOMEM; + } +#endif + return 0; +} + +static void __net_exit hashlimit_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "ipt_hashlimit"); +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + proc_net_remove(net, "ip6t_hashlimit"); +#endif +} + +static int __net_init hashlimit_net_init(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + INIT_HLIST_HEAD(&hashlimit_net->htables); + return hashlimit_proc_net_init(net); +} + +static void __net_exit hashlimit_net_exit(struct net *net) +{ + struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); + + BUG_ON(!hlist_empty(&hashlimit_net->htables)); + hashlimit_proc_net_exit(net); +} + +static struct pernet_operations hashlimit_net_ops = { + .init = hashlimit_net_init, + .exit = hashlimit_net_exit, + .id = &hashlimit_net_id, + .size = sizeof(struct hashlimit_net), +}; + +static int __init hashlimit_mt_init(void) +{ + int err; + + err = register_pernet_subsys(&hashlimit_net_ops); + if (err < 0) + return err; + err = xt_register_matches(hashlimit_mt_reg, + ARRAY_SIZE(hashlimit_mt_reg)); + if (err < 0) + goto err1; + + err = -ENOMEM; + hashlimit_cachep = kmem_cache_create("xt_hashlimit", + sizeof(struct dsthash_ent), 0, 0, + NULL); + if (!hashlimit_cachep) { + pr_warning("unable to create slab cache\n"); + goto err2; + } + return 0; + +err2: + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); +err1: + unregister_pernet_subsys(&hashlimit_net_ops); + return err; + +} + +static void __exit hashlimit_mt_exit(void) +{ + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); + unregister_pernet_subsys(&hashlimit_net_ops); + + rcu_barrier_bh(); + kmem_cache_destroy(hashlimit_cachep); +} + +module_init(hashlimit_mt_init); +module_exit(hashlimit_mt_exit); diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c new file mode 100644 index 00000000..9f4ab00c --- /dev/null +++ b/net/netfilter/xt_helper.c @@ -0,0 +1,99 @@ +/* iptables module to match on related connections */ +/* + * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se> + * + * This program is free software; you can redistribute it and/or modify + * it 
under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_helper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: Related connection matching"); +MODULE_ALIAS("ipt_helper"); +MODULE_ALIAS("ip6t_helper"); + + +static bool +helper_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_helper_info *info = par->matchinfo; + const struct nf_conn *ct; + const struct nf_conn_help *master_help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + bool ret = info->invert; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || !ct->master) + return ret; + + master_help = nfct_help(ct->master); + if (!master_help) + return ret; + + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(master_help->helper); + if (!helper) + return ret; + + if (info->name[0] == '\0') + ret = !ret; + else + ret ^= !strncmp(helper->name, info->name, + strlen(helper->name)); + return ret; +} + +static int helper_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_helper_info *info = par->matchinfo; + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) { + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; + } + info->name[29] = '\0'; + return 0; +} + +static void helper_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match helper_mt_reg __read_mostly = { + .name = "helper", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, + .matchsize = sizeof(struct xt_helper_info), + .me = THIS_MODULE, +}; + +static int __init helper_mt_init(void) +{ + return xt_register_match(&helper_mt_reg); +} + +static void __exit helper_mt_exit(void) +{ + xt_unregister_match(&helper_mt_reg); +} + +module_init(helper_mt_init); +module_exit(helper_mt_exit); diff --git a/net/netfilter/xt_hl.c b/net/netfilter/xt_hl.c new file mode 100644 index 00000000..00395114 --- /dev/null +++ b/net/netfilter/xt_hl.c @@ -0,0 +1,96 @@ +/* + * IP tables module for matching the value of the TTL + * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> + * + * Hop Limit matching module + * (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ipt_ttl.h> +#include <linux/netfilter_ipv6/ip6t_hl.h> + +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("Xtables: Hoplimit/TTL field match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ttl"); +MODULE_ALIAS("ip6t_hl"); + +static bool ttl_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ipt_ttl_info *info = par->matchinfo; + const u8 ttl = ip_hdr(skb)->ttl; + + switch (info->mode) { + case IPT_TTL_EQ: + return ttl == info->ttl; + case IPT_TTL_NE: + return ttl != info->ttl; + case IPT_TTL_LT: + return ttl < info->ttl; + case IPT_TTL_GT: + return ttl > info->ttl; + } + + return false; +} + +static bool hl_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_hl_info *info = par->matchinfo; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + + switch (info->mode) { + case IP6T_HL_EQ: + return ip6h->hop_limit == info->hop_limit; + case IP6T_HL_NE: + return ip6h->hop_limit != info->hop_limit; + case IP6T_HL_LT: + return ip6h->hop_limit < info->hop_limit; + case IP6T_HL_GT: + return ip6h->hop_limit > info->hop_limit; + } + + return false; +} + +static struct xt_match hl_mt_reg[] __read_mostly = { + { + .name = "ttl", + .revision = 0, + .family = NFPROTO_IPV4, + .match = ttl_mt, + .matchsize = sizeof(struct ipt_ttl_info), + .me = THIS_MODULE, + }, + { + .name = "hl", + .revision = 0, + .family = NFPROTO_IPV6, + .match = hl_mt6, + .matchsize = sizeof(struct ip6t_hl_info), + .me = THIS_MODULE, + }, +}; + +static int __init hl_mt_init(void) +{ + return xt_register_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +static void __exit hl_mt_exit(void) +{ + xt_unregister_matches(hl_mt_reg, ARRAY_SIZE(hl_mt_reg)); +} + +module_init(hl_mt_init); +module_exit(hl_mt_exit); diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c new file mode 100644 index 00000000..b46626cd --- /dev/null +++ b/net/netfilter/xt_iprange.c @@ -0,0 +1,140 @@ +/* + * xt_iprange - Netfilter module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * (C) CC Computer Consultants GmbH, 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_iprange.h> + +static bool +iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct iphdr *iph = ip_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = ntohl(iph->saddr) < ntohl(info->src_min.ip); + m |= ntohl(iph->saddr) > ntohl(info->src_max.ip); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? 
"(INV) " : "", + &info->src_min.ip, + &info->src_max.ip); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = ntohl(iph->daddr) < ntohl(info->dst_min.ip); + m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", + &info->dst_min.ip, + &info->dst_max.ip); + return false; + } + } + return true; +} + +static inline int +iprange_ipv6_lt(const struct in6_addr *a, const struct in6_addr *b) +{ + unsigned int i; + + for (i = 0; i < 4; ++i) { + if (a->s6_addr32[i] != b->s6_addr32[i]) + return ntohl(a->s6_addr32[i]) < ntohl(b->s6_addr32[i]); + } + + return 0; +} + +static bool +iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_iprange_mtinfo *info = par->matchinfo; + const struct ipv6hdr *iph = ipv6_hdr(skb); + bool m; + + if (info->flags & IPRANGE_SRC) { + m = iprange_ipv6_lt(&iph->saddr, &info->src_min.in6); + m |= iprange_ipv6_lt(&info->src_max.in6, &iph->saddr); + m ^= !!(info->flags & IPRANGE_SRC_INV); + if (m) { + pr_debug("src IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->saddr, + (info->flags & IPRANGE_SRC_INV) ? "(INV) " : "", + &info->src_min.in6, + &info->src_max.in6); + return false; + } + } + if (info->flags & IPRANGE_DST) { + m = iprange_ipv6_lt(&iph->daddr, &info->dst_min.in6); + m |= iprange_ipv6_lt(&info->dst_max.in6, &iph->daddr); + m ^= !!(info->flags & IPRANGE_DST_INV); + if (m) { + pr_debug("dst IP %pI6 NOT in range %s%pI6-%pI6\n", + &iph->daddr, + (info->flags & IPRANGE_DST_INV) ? "(INV) " : "", + &info->dst_min.in6, + &info->dst_max.in6); + return false; + } + } + return true; +} + +static struct xt_match iprange_mt_reg[] __read_mostly = { + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV4, + .match = iprange_mt4, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, + { + .name = "iprange", + .revision = 1, + .family = NFPROTO_IPV6, + .match = iprange_mt6, + .matchsize = sizeof(struct xt_iprange_mtinfo), + .me = THIS_MODULE, + }, +}; + +static int __init iprange_mt_init(void) +{ + return xt_register_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +static void __exit iprange_mt_exit(void) +{ + xt_unregister_matches(iprange_mt_reg, ARRAY_SIZE(iprange_mt_reg)); +} + +module_init(iprange_mt_init); +module_exit(iprange_mt_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: arbitrary IPv4 range matching"); +MODULE_ALIAS("ipt_iprange"); +MODULE_ALIAS("ip6t_iprange"); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c new file mode 100644 index 00000000..bb10b071 --- /dev/null +++ b/net/netfilter/xt_ipvs.c @@ -0,0 +1,188 @@ +/* + * xt_ipvs - kernel module to match IPVS connection properties + * + * Author: Hannes Eder <heder@google.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#endif +#include <linux/ip_vs.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ipvs.h> +#include <net/netfilter/nf_conntrack.h> + +#include <net/ip_vs.h> + +MODULE_AUTHOR("Hannes Eder <heder@google.com>"); +MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); 
+MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ipvs"); +MODULE_ALIAS("ip6t_ipvs"); + +/* borrowed from xt_conntrack */ +static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, + unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; +#ifdef CONFIG_IP_VS_IPV6 + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; +#endif + else + return false; +} + +static bool +ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ipvs_mtinfo *data = par->matchinfo; + /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ + const u_int8_t family = par->family; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + bool match = true; + + if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { + match = skb->ipvs_property ^ + !!(data->invert & XT_IPVS_IPVS_PROPERTY); + goto out; + } + + /* other flags than XT_IPVS_IPVS_PROPERTY are set */ + if (!skb->ipvs_property) { + match = false; + goto out; + } + + ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); + + if (data->bitmask & XT_IPVS_PROTO) + if ((iph.protocol == data->l4proto) ^ + !(data->invert & XT_IPVS_PROTO)) { + match = false; + goto out; + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) { + match = false; + goto out; + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */); + if (unlikely(cp == NULL)) { + match = false; + goto out; + } + + /* + * We found a connection, i.e. ct != 0, make sure to call + * __ip_vs_conn_put before returning. In our case jump to out_put_con. + */ + + if (data->bitmask & XT_IPVS_VPORT) + if ((cp->vport == data->vport) ^ + !(data->invert & XT_IPVS_VPORT)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VPORTCTL) + if ((cp->control != NULL && + cp->control->vport == data->vportctl) ^ + !(data->invert & XT_IPVS_VPORTCTL)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_DIR) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct == NULL || nf_ct_is_untracked(ct)) { + match = false; + goto out_put_cp; + } + + if ((ctinfo >= IP_CT_IS_REPLY) ^ + !!(data->invert & XT_IPVS_DIR)) { + match = false; + goto out_put_cp; + } + } + + if (data->bitmask & XT_IPVS_METHOD) + if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ + !(data->invert & XT_IPVS_METHOD)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VADDR) { + if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, + &data->vmask, family) ^ + !(data->invert & XT_IPVS_VADDR)) { + match = false; + goto out_put_cp; + } + } + +out_put_cp: + __ip_vs_conn_put(cp); +out: + pr_debug("match=%d\n", match); + return match; +} + +static int ipvs_mt_check(const struct xt_mtchk_param *par) +{ + if (par->family != NFPROTO_IPV4 +#ifdef CONFIG_IP_VS_IPV6 + && par->family != NFPROTO_IPV6 +#endif + ) { + pr_info("protocol family %u not supported\n", par->family); + return -EINVAL; + } + + return 0; +} + +static struct xt_match xt_ipvs_mt_reg __read_mostly = { + .name = "ipvs", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = ipvs_mt, + .checkentry = ipvs_mt_check, + .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), + .me = THIS_MODULE, +}; + +static int __init ipvs_mt_init(void) +{ + return xt_register_match(&xt_ipvs_mt_reg); +} + 
+static void __exit ipvs_mt_exit(void) +{ + xt_unregister_match(&xt_ipvs_mt_reg); +} + +module_init(ipvs_mt_init); +module_exit(ipvs_mt_exit); diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c new file mode 100644 index 00000000..176e5570 --- /dev/null +++ b/net/netfilter/xt_length.c @@ -0,0 +1,70 @@ +/* Kernel module to match packet length. */ +/* (C) 1999-2001 James Morris <jmorris@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <net/ip.h> + +#include <linux/netfilter/xt_length.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Xtables: Packet length (Layer3,4,5) match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_length"); +MODULE_ALIAS("ip6t_length"); + +static bool +length_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_length_info *info = par->matchinfo; + u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static bool +length_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_length_info *info = par->matchinfo; + const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + + sizeof(struct ipv6hdr); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static struct xt_match length_mt_reg[] __read_mostly = { + { + .name = "length", + .family = NFPROTO_IPV4, + .match = length_mt, + .matchsize = sizeof(struct xt_length_info), + .me = THIS_MODULE, + }, + { + .name = "length", + .family = NFPROTO_IPV6, + .match = length_mt6, + .matchsize = sizeof(struct xt_length_info), + .me = THIS_MODULE, + }, +}; + +static int __init length_mt_init(void) +{ + return xt_register_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); +} + +static void __exit length_mt_exit(void) +{ + xt_unregister_matches(length_mt_reg, ARRAY_SIZE(length_mt_reg)); +} + +module_init(length_mt_init); +module_exit(length_mt_exit); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c new file mode 100644 index 00000000..32b7a579 --- /dev/null +++ b/net/netfilter/xt_limit.c @@ -0,0 +1,210 @@ +/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_limit.h> + +struct xt_limit_priv { + unsigned long prev; + uint32_t credit; +}; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); +MODULE_DESCRIPTION("Xtables: rate-limit match"); +MODULE_ALIAS("ipt_limit"); +MODULE_ALIAS("ip6t_limit"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. 
The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static bool +limit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv = r->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; + if (priv->credit > r->credit_cap) + priv->credit = r->credit_cap; + + if (priv->credit >= r->cost) { + /* We're not limited. */ + priv->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return true; + } + + spin_unlock_bh(&limit_lock); + return false; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; +} + +static int limit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_rateinfo *r = par->matchinfo; + struct xt_limit_priv *priv; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + pr_info("Overflow, try lower: %u/%u\n", + r->avg, r->burst); + return -ERANGE; + } + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (priv == NULL) + return -ENOMEM; + + /* For SMP, we only want to use one set of state. */ + r->master = priv; + if (r->cost == 0) { + /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * + 128. */ + priv->prev = jiffies; + priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + } + return 0; +} + +static void limit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_rateinfo *info = par->matchinfo; + + kfree(info->master); +} + +#ifdef CONFIG_COMPAT +struct compat_xt_rateinfo { + u_int32_t avg; + u_int32_t burst; + + compat_ulong_t prev; + u_int32_t credit; + u_int32_t credit_cap, cost; + + u_int32_t master; +}; + +/* To keep the full "prev" timestamp, the upper 32 bits are stored in the + * master pointer, which does not need to be preserved. 
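+ * (compat_ulong_t is only 32 bits wide: limit_mt_compat_from_user() + * rebuilds the 64-bit value as cm->prev | (unsigned long)cm->master << 32, + * and limit_mt_compat_to_user() splits it apart again via m->prev >> 32.)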
*/ +static void limit_mt_compat_from_user(void *dst, const void *src) +{ + const struct compat_xt_rateinfo *cm = src; + struct xt_rateinfo m = { + .avg = cm->avg, + .burst = cm->burst, + .prev = cm->prev | (unsigned long)cm->master << 32, + .credit = cm->credit, + .credit_cap = cm->credit_cap, + .cost = cm->cost, + }; + memcpy(dst, &m, sizeof(m)); +} + +static int limit_mt_compat_to_user(void __user *dst, const void *src) +{ + const struct xt_rateinfo *m = src; + struct compat_xt_rateinfo cm = { + .avg = m->avg, + .burst = m->burst, + .prev = m->prev, + .credit = m->credit, + .credit_cap = m->credit_cap, + .cost = m->cost, + .master = m->prev >> 32, + }; + return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; +} +#endif /* CONFIG_COMPAT */ + +static struct xt_match limit_mt_reg __read_mostly = { + .name = "limit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = limit_mt, + .checkentry = limit_mt_check, + .destroy = limit_mt_destroy, + .matchsize = sizeof(struct xt_rateinfo), +#ifdef CONFIG_COMPAT + .compatsize = sizeof(struct compat_xt_rateinfo), + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, +#endif + .me = THIS_MODULE, +}; + +static int __init limit_mt_init(void) +{ + return xt_register_match(&limit_mt_reg); +} + +static void __exit limit_mt_exit(void) +{ + xt_unregister_match(&limit_mt_reg); +} + +module_init(limit_mt_init); +module_exit(limit_mt_exit); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c new file mode 100644 index 00000000..8160f6b1 --- /dev/null +++ b/net/netfilter/xt_mac.c @@ -0,0 +1,66 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/if_ether.h> +#include <linux/etherdevice.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/xt_mac.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: MAC address match"); +MODULE_ALIAS("ipt_mac"); +MODULE_ALIAS("ip6t_mac"); + +static bool mac_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_mac_info *info = par->matchinfo; + bool ret; + + if (skb->dev == NULL || skb->dev->type != ARPHRD_ETHER) + return false; + if (skb_mac_header(skb) < skb->head) + return false; + if (skb_mac_header(skb) + ETH_HLEN > skb->data) + return false; + ret = compare_ether_addr(eth_hdr(skb)->h_source, info->srcaddr) == 0; + ret ^= info->invert; + return ret; +} + +static struct xt_match mac_mt_reg __read_mostly = { + .name = "mac", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, +}; + +static int __init mac_mt_init(void) +{ + return xt_register_match(&mac_mt_reg); +} + +static void __exit mac_mt_exit(void) +{ + xt_unregister_match(&mac_mt_reg); +} + +module_init(mac_mt_init); +module_exit(mac_mt_exit); diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c new file mode 100644 index 00000000..23345238 --- /dev/null +++ b/net/netfilter/xt_mark.c @@ -0,0 +1,84 @@ +/* + * xt_mark - Netfilter module to match NFMARK value + * + * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * Jan Engelhardt <jengelh@medozas.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/xt_mark.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("Xtables: packet mark operations"); +MODULE_ALIAS("ipt_mark"); +MODULE_ALIAS("ip6t_mark"); +MODULE_ALIAS("ipt_MARK"); +MODULE_ALIAS("ip6t_MARK"); + +static unsigned int +mark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_mark_tginfo2 *info = par->targinfo; + + skb->mark = (skb->mark & ~info->mask) ^ info->mark; + return XT_CONTINUE; +} + +static bool +mark_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_mark_mtinfo1 *info = par->matchinfo; + + return ((skb->mark & info->mask) == info->mark) ^ info->invert; +} + +static struct xt_target mark_tg_reg __read_mostly = { + .name = "MARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, +}; + +static struct xt_match mark_mt_reg __read_mostly = { + .name = "mark", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = mark_mt, + .matchsize = sizeof(struct xt_mark_mtinfo1), + .me = THIS_MODULE, +}; + +static int __init mark_mt_init(void) +{ + int ret; + + ret = xt_register_target(&mark_tg_reg); + if (ret < 0) + return ret; + ret = xt_register_match(&mark_mt_reg); + if (ret < 0) { + xt_unregister_target(&mark_tg_reg); + return ret; + } + return 0; +} + +static void __exit mark_mt_exit(void) +{ + xt_unregister_match(&mark_mt_reg); + xt_unregister_target(&mark_tg_reg); +} + +module_init(mark_mt_init); +module_exit(mark_mt_exit); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c new file mode 100644 index 00000000..ac1d3c3d --- /dev/null +++ b/net/netfilter/xt_multiport.c @@ -0,0 +1,165 @@ +/* Kernel module to match one of a list of TCP/UDP(-Lite)/SCTP/DCCP ports: + ports are in the same place so we can treat them as equal. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/types.h> +#include <linux/udp.h> +#include <linux/skbuff.h> +#include <linux/in.h> + +#include <linux/netfilter/xt_multiport.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: multiple port matching for TCP, UDP, UDP-Lite, SCTP and DCCP"); +MODULE_ALIAS("ipt_multiport"); +MODULE_ALIAS("ip6t_multiport"); + +/* Returns 1 if the port is matched by the test, 0 otherwise. 
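+ * In revision 1, pflags[i] marks ports[i] as the start of a range that + * ends at ports[i+1]; entries without the flag match a single port + * exactly.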
*/ +static inline bool +ports_match_v1(const struct xt_multiport_v1 *minfo, + u_int16_t src, u_int16_t dst) +{ + unsigned int i; + u_int16_t s, e; + + for (i = 0; i < minfo->count; i++) { + s = minfo->ports[i]; + + if (minfo->pflags[i]) { + /* range port matching */ + e = minfo->ports[++i]; + pr_debug("src or dst matches with %d-%d?\n", s, e); + + if (minfo->flags == XT_MULTIPORT_SOURCE + && src >= s && src <= e) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_DESTINATION + && dst >= s && dst <= e) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_EITHER + && ((dst >= s && dst <= e) + || (src >= s && src <= e))) + return true ^ minfo->invert; + } else { + /* exact port matching */ + pr_debug("src or dst matches with %d?\n", s); + + if (minfo->flags == XT_MULTIPORT_SOURCE + && src == s) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_DESTINATION + && dst == s) + return true ^ minfo->invert; + if (minfo->flags == XT_MULTIPORT_EITHER + && (src == s || dst == s)) + return true ^ minfo->invert; + } + } + + return minfo->invert; +} + +static bool +multiport_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const __be16 *pptr; + __be16 _ports[2]; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + if (par->fragoff != 0) + return false; + + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + pr_debug("Dropping evil offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + + return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); +} + +static inline bool +check(u_int16_t proto, + u_int8_t ip_invflags, + u_int8_t match_flags, + u_int8_t count) +{ + /* Must specify supported protocol, no unknown flags or bad count */ + return (proto == IPPROTO_TCP || proto == IPPROTO_UDP + || proto == IPPROTO_UDPLITE + || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) + && !(ip_invflags & XT_INV_PROTO) + && (match_flags == XT_MULTIPORT_SOURCE + || match_flags == XT_MULTIPORT_DESTINATION + || match_flags == XT_MULTIPORT_EITHER) + && count <= XT_MULTI_PORTS; +} + +static int multiport_mt_check(const struct xt_mtchk_param *par) +{ + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + return check(ip->proto, ip->invflags, multiinfo->flags, + multiinfo->count) ? 0 : -EINVAL; +} + +static int multiport_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; + + return check(ip->proto, ip->invflags, multiinfo->flags, + multiinfo->count) ? 
0 : -EINVAL; +} + +static struct xt_match multiport_mt_reg[] __read_mostly = { + { + .name = "multiport", + .family = NFPROTO_IPV4, + .revision = 1, + .checkentry = multiport_mt_check, + .match = multiport_mt, + .matchsize = sizeof(struct xt_multiport_v1), + .me = THIS_MODULE, + }, + { + .name = "multiport", + .family = NFPROTO_IPV6, + .revision = 1, + .checkentry = multiport_mt6_check, + .match = multiport_mt, + .matchsize = sizeof(struct xt_multiport_v1), + .me = THIS_MODULE, + }, +}; + +static int __init multiport_mt_init(void) +{ + return xt_register_matches(multiport_mt_reg, + ARRAY_SIZE(multiport_mt_reg)); +} + +static void __exit multiport_mt_exit(void) +{ + xt_unregister_matches(multiport_mt_reg, ARRAY_SIZE(multiport_mt_reg)); +} + +module_init(multiport_mt_init); +module_exit(multiport_mt_exit); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c new file mode 100644 index 00000000..b3be0ef2 --- /dev/null +++ b/net/netfilter/xt_nfacct.c @@ -0,0 +1,76 @@ +/* + * (C) 2011 Pablo Neira Ayuso <pablo@netfilter.org> + * (C) 2011 Intra2net AG <http://www.intra2net.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 (or any + * later at your option) as published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/nfnetlink_acct.h> +#include <linux/netfilter/xt_nfacct.h> + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_DESCRIPTION("Xtables: match for the extended accounting infrastructure"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_nfacct"); +MODULE_ALIAS("ip6t_nfacct"); + +static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_nfacct_match_info *info = par->matchinfo; + + nfnl_acct_update(skb, info->nfacct); + + return true; +} + +static int +nfacct_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_nfacct_match_info *info = par->matchinfo; + struct nf_acct *nfacct; + + nfacct = nfnl_acct_find_get(info->name); + if (nfacct == NULL) { + pr_info("xt_nfacct: accounting object with name `%s' " + "does not exist\n", info->name); + return -ENOENT; + } + info->nfacct = nfacct; + return 0; +} + +static void +nfacct_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_nfacct_match_info *info = par->matchinfo; + + nfnl_acct_put(info->nfacct); +} + +static struct xt_match nfacct_mt_reg __read_mostly = { + .name = "nfacct", + .family = NFPROTO_UNSPEC, + .checkentry = nfacct_mt_checkentry, + .match = nfacct_mt, + .destroy = nfacct_mt_destroy, + .matchsize = sizeof(struct xt_nfacct_match_info), + .me = THIS_MODULE, +}; + +static int __init nfacct_mt_init(void) +{ + return xt_register_match(&nfacct_mt_reg); +} + +static void __exit nfacct_mt_exit(void) +{ + xt_unregister_match(&nfacct_mt_reg); +} + +module_init(nfacct_mt_init); +module_exit(nfacct_mt_exit); diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c new file mode 100644 index 00000000..846f895c --- /dev/null +++ b/net/netfilter/xt_osf.c @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/kernel.h> + +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/tcp.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netfilter/xt_osf.h> + +struct xt_osf_finger { + struct rcu_head rcu_head; + struct list_head finger_entry; + struct xt_osf_user_finger finger; +}; + +enum osf_fmatch_states { + /* Packet does not match the fingerprint */ + FMATCH_WRONG = 0, + /* Packet matches the fingerprint */ + FMATCH_OK, + /* Options do not match the fingerprint, but header does */ + FMATCH_OPT_WRONG, +}; + +/* + * Indexed by dont-fragment bit. + * It is the only constant value in the fingerprint. + */ +static struct list_head xt_osf_fingers[2]; + +static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { + [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, +}; + +static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *kf = NULL, *sf; + int err = 0; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); + if (!kf) + return -ENOMEM; + + memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + kfree(kf); + kf = NULL; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + err = -EEXIST; + break; + } + + /* + * We are protected by nfnl mutex. + */ + if (kf) + list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); + + return err; +} + +static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const osf_attrs[]) +{ + struct xt_osf_user_finger *f; + struct xt_osf_finger *sf; + int err = -ENOENT; + + if (!osf_attrs[OSF_ATTR_FINGER]) + return -EINVAL; + + f = nla_data(osf_attrs[OSF_ATTR_FINGER]); + + list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { + if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) + continue; + + /* + * We are protected by nfnl mutex. 
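+	 * Writers are serialized by that mutex; readers walk the list under + * RCU, hence list_del_rcu() plus kfree_rcu() below rather than an + * immediate kfree().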
+ */ + list_del_rcu(&sf->finger_entry); + kfree_rcu(sf, rcu_head); + + err = 0; + break; + } + + return err; +} + +static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { + [OSF_MSG_ADD] = { + .call = xt_osf_add_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, + [OSF_MSG_REMOVE] = { + .call = xt_osf_remove_callback, + .attr_count = OSF_ATTR_MAX, + .policy = xt_osf_policy, + }, +}; + +static const struct nfnetlink_subsystem xt_osf_nfnetlink = { + .name = "osf", + .subsys_id = NFNL_SUBSYS_OSF, + .cb_count = OSF_MSG_MAX, + .cb = xt_osf_nfnetlink_callbacks, +}; + +static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & XT_OSF_TTL) { + if (info->ttl == XT_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == XT_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +static bool +xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) +{ + const struct xt_osf_info *info = p->matchinfo; + const struct iphdr *ip = ip_hdr(skb); + const struct tcphdr *tcp; + struct tcphdr _tcph; + int fmatch = FMATCH_WRONG, fcount = 0; + unsigned int optsize = 0, check_WSS = 0; + u16 window, totlen, mss = 0; + bool df; + const unsigned char *optp = NULL, *_optp = NULL; + unsigned char opts[MAX_IPOPTLEN]; + const struct xt_osf_finger *kf; + const struct xt_osf_user_finger *f; + + if (!info) + return false; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + rcu_read_lock(); + list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { + f = &kf->finger; + + if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen == f->ss && xt_osf_ttl(skb, info, f->ttl)) { + int foptsize, optnum; + + /* + * Should not happen if userspace parser was written correctly. 
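+			 * (The fingerprint list is populated from userspace
+			 * over nfnetlink, e.g. with "nfnl_osf -f pf.os"; a
+			 * malformed database entry could still carry an
+			 * out-of-range window-scale class, hence the extra
+			 * bounds check on f->wss.wc here.)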
+ */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + int loop_cont = 0; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs(mss); + break; + case OSFOPT_TS: + loop_cont = 1; + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. + */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & XT_OSF_LOG) + nf_log_packet(p->family, p->hooknum, skb, + p->in, p->out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & XT_OSF_LOG) && + info->loglevel == XT_OSF_LOGLEVEL_FIRST) + break; + } + } + rcu_read_unlock(); + + if (!fcount && (info->flags & XT_OSF_LOG)) + nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} + +static struct xt_match xt_osf_match = { + .name = "osf", + .revision = 0, + .family = NFPROTO_IPV4, + .proto = IPPROTO_TCP, + .hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD), + .match = xt_osf_match_packet, + .matchsize = sizeof(struct xt_osf_info), + .me = THIS_MODULE, +}; + +static int __init xt_osf_init(void) +{ + int err = -EINVAL; + int i; + + for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) + INIT_LIST_HEAD(&xt_osf_fingers[i]); + + err = nfnetlink_subsys_register(&xt_osf_nfnetlink); + if (err < 0) { + pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); + goto err_out_exit; + } + + err = xt_register_match(&xt_osf_match); + if (err) { + pr_err("Failed to register OS fingerprint " + "matching module (%d)\n", err); + goto err_out_remove; + } + + return 0; + +err_out_remove: + nfnetlink_subsys_unregister(&xt_osf_nfnetlink); +err_out_exit: + return err; +} + +static void __exit xt_osf_fini(void) +{ + struct xt_osf_finger *f; + int i; + + nfnetlink_subsys_unregister(&xt_osf_nfnetlink); + xt_unregister_match(&xt_osf_match); + + rcu_read_lock(); + for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) { + + 
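+		/*
+		 * Unload path: the match and the nfnetlink subsystem are
+		 * already unregistered above, so the finger lists can be
+		 * drained here; the rcu_barrier() below then waits for any
+		 * pending kfree_rcu() callbacks before the module goes away.
+		 */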
list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) { + list_del_rcu(&f->finger_entry); + kfree_rcu(f, rcu_head); + } + } + rcu_read_unlock(); + + rcu_barrier(); +} + +module_init(xt_osf_init); +module_exit(xt_osf_fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); +MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c new file mode 100644 index 00000000..772d7389 --- /dev/null +++ b/net/netfilter/xt_owner.c @@ -0,0 +1,82 @@ +/* + * Kernel module to match various things tied to sockets associated with + * locally generated outgoing packets. + * + * (C) 2000 Marc Boucher <marc@mbsi.ca> + * + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_owner.h> + +static bool +owner_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_owner_match_info *info = par->matchinfo; + const struct file *filp; + + if (skb->sk == NULL || skb->sk->sk_socket == NULL) + return (info->match ^ info->invert) == 0; + else if (info->match & info->invert & XT_OWNER_SOCKET) + /* + * Socket exists but user wanted ! --socket-exists. + * (Single ampersands intended.) + */ + return false; + + filp = skb->sk->sk_socket->file; + if (filp == NULL) + return ((info->match ^ info->invert) & + (XT_OWNER_UID | XT_OWNER_GID)) == 0; + + if (info->match & XT_OWNER_UID) + if ((filp->f_cred->fsuid >= info->uid_min && + filp->f_cred->fsuid <= info->uid_max) ^ + !(info->invert & XT_OWNER_UID)) + return false; + + if (info->match & XT_OWNER_GID) + if ((filp->f_cred->fsgid >= info->gid_min && + filp->f_cred->fsgid <= info->gid_max) ^ + !(info->invert & XT_OWNER_GID)) + return false; + + return true; +} + +static struct xt_match owner_mt_reg __read_mostly = { + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = owner_mt, + .matchsize = sizeof(struct xt_owner_match_info), + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING), + .me = THIS_MODULE, +}; + +static int __init owner_mt_init(void) +{ + return xt_register_match(&owner_mt_reg); +} + +static void __exit owner_mt_exit(void) +{ + xt_unregister_match(&owner_mt_reg); +} + +module_init(owner_mt_init); +module_exit(owner_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: socket owner matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c new file mode 100644 index 00000000..d7ca16b8 --- /dev/null +++ b/net/netfilter/xt_physdev.c @@ -0,0 +1,128 @@ +/* Kernel module to match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter/xt_physdev.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("Xtables: Bridge physical device match"); +MODULE_ALIAS("ipt_physdev"); +MODULE_ALIAS("ip6t_physdev"); + + +static bool +physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + const struct xt_physdev_info *info = par->matchinfo; + unsigned long ret; + const char *indev, *outdev; + const struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. */ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && + !(info->invert & XT_PHYSDEV_OP_BRIDGED)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_ISIN) && + !(info->invert & XT_PHYSDEV_OP_ISIN)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_ISOUT) && + !(info->invert & XT_PHYSDEV_OP_ISOUT)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_IN) && + !(info->invert & XT_PHYSDEV_OP_IN)) + return false; + if ((info->bitmask & XT_PHYSDEV_OP_OUT) && + !(info->invert & XT_PHYSDEV_OP_OUT)) + return false; + return true; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & XT_PHYSDEV_OP_BRIDGED))) + return false; + + if ((info->bitmask & XT_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & XT_PHYSDEV_OP_ISIN))) || + (info->bitmask & XT_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & XT_PHYSDEV_OP_ISOUT)))) + return false; + + if (!(info->bitmask & XT_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; + ret = ifname_compare_aligned(indev, info->physindev, info->in_mask); + + if (!ret ^ !(info->invert & XT_PHYSDEV_OP_IN)) + return false; + +match_outdev: + if (!(info->bitmask & XT_PHYSDEV_OP_OUT)) + return true; + outdev = nf_bridge->physoutdev ? 
+ nf_bridge->physoutdev->name : nulldevname; + ret = ifname_compare_aligned(outdev, info->physoutdev, info->out_mask); + + return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + if (info->bitmask & XT_PHYSDEV_OP_OUT && + (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || + info->invert & XT_PHYSDEV_OP_BRIDGED) && + par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { + pr_info("using --physdev-out in the OUTPUT, FORWARD and " + "POSTROUTING chains for non-bridged traffic is not " + "supported anymore.\n"); + if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) + return -EINVAL; + } + return 0; +} + +static struct xt_match physdev_mt_reg __read_mostly = { + .name = "physdev", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, +}; + +static int __init physdev_mt_init(void) +{ + return xt_register_match(&physdev_mt_reg); +} + +static void __exit physdev_mt_exit(void) +{ + xt_unregister_match(&physdev_mt_reg); +} + +module_init(physdev_mt_init); +module_exit(physdev_mt_exit); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c new file mode 100644 index 00000000..5b645cb5 --- /dev/null +++ b/net/netfilter/xt_pkttype.c @@ -0,0 +1,65 @@ +/* (C) 1999-2001 Michal Ludvig <michal@logix.cz> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include <linux/netfilter/xt_pkttype.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>"); +MODULE_DESCRIPTION("Xtables: link layer packet type match"); +MODULE_ALIAS("ipt_pkttype"); +MODULE_ALIAS("ip6t_pkttype"); + +static bool +pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_pkttype_info *info = par->matchinfo; + u_int8_t type; + + if (skb->pkt_type != PACKET_LOOPBACK) + type = skb->pkt_type; + else if (par->family == NFPROTO_IPV4 && + ipv4_is_multicast(ip_hdr(skb)->daddr)) + type = PACKET_MULTICAST; + else if (par->family == NFPROTO_IPV6 && + ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + type = PACKET_MULTICAST; + else + type = PACKET_BROADCAST; + + return (type == info->pkttype) ^ info->invert; +} + +static struct xt_match pkttype_mt_reg __read_mostly = { + .name = "pkttype", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = pkttype_mt, + .matchsize = sizeof(struct xt_pkttype_info), + .me = THIS_MODULE, +}; + +static int __init pkttype_mt_init(void) +{ + return xt_register_match(&pkttype_mt_reg); +} + +static void __exit pkttype_mt_exit(void) +{ + xt_unregister_match(&pkttype_mt_reg); +} + +module_init(pkttype_mt_init); +module_exit(pkttype_mt_exit); diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c new file mode 100644 index 00000000..f23e97bb --- /dev/null +++ b/net/netfilter/xt_policy.c @@ -0,0 +1,188 @@ +/* IP tables module for matching IPsec policy + * + * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <net/xfrm.h> + +#include <linux/netfilter.h> +#include <linux/netfilter/xt_policy.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: IPsec policy match"); +MODULE_LICENSE("GPL"); + +static inline bool +xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, + const union nf_inet_addr *a2, unsigned short family) +{ + switch (family) { + case NFPROTO_IPV4: + return ((a1->ip ^ a2->ip) & m->ip) == 0; + case NFPROTO_IPV6: + return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; + } + return false; +} + +static bool +match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, + unsigned short family) +{ +#define MATCH_ADDR(x,y,z) (!e->match.x || \ + (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ + ^ e->invert.x)) +#define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) + + return MATCH_ADDR(saddr, smask, &x->props.saddr) && + MATCH_ADDR(daddr, dmask, &x->id.daddr) && + MATCH(proto, x->id.proto) && + MATCH(mode, x->props.mode) && + MATCH(spi, x->id.spi) && + MATCH(reqid, x->props.reqid); +} + +static int +match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, + unsigned short family) +{ + const struct xt_policy_elem *e; + const struct sec_path *sp = skb->sp; + int strict = info->flags & XT_POLICY_MATCH_STRICT; + int i, pos; + + if (sp == NULL) + return -1; + if (strict && info->len != sp->len) + return 0; + + for (i = sp->len - 1; i >= 0; i--) { + pos = strict ? i - sp->len + 1 : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(sp->xvec[i], e, family)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? 1 : 0; +} + +static int +match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, + unsigned short family) +{ + const struct xt_policy_elem *e; + const struct dst_entry *dst = skb_dst(skb); + int strict = info->flags & XT_POLICY_MATCH_STRICT; + int i, pos; + + if (dst->xfrm == NULL) + return -1; + + for (i = 0; dst && dst->xfrm; dst = dst->child, i++) { + pos = strict ? i : 0; + if (pos >= info->len) + return 0; + e = &info->pol[pos]; + + if (match_xfrm_state(dst->xfrm, e, family)) { + if (!strict) + return 1; + } else if (strict) + return 0; + } + + return strict ? i == info->len : 0; +} + +static bool +policy_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + int ret; + + if (info->flags & XT_POLICY_MATCH_IN) + ret = match_policy_in(skb, info, par->family); + else + ret = match_policy_out(skb, info, par->family); + + if (ret < 0) + ret = info->flags & XT_POLICY_MATCH_NONE ? 
true : false; + else if (info->flags & XT_POLICY_MATCH_NONE) + ret = false; + + return ret; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { + pr_info("neither incoming nor outgoing policy selected\n"); + return -EINVAL; + } + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { + pr_info("output policy not valid in PREROUTING and INPUT\n"); + return -EINVAL; + } + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { + pr_info("input policy not valid in POSTROUTING and OUTPUT\n"); + return -EINVAL; + } + if (info->len > XT_POLICY_MAX_ELEM) { + pr_info("too many policy elements\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_match policy_mt_reg[] __read_mostly = { + { + .name = "policy", + .family = NFPROTO_IPV4, + .checkentry = policy_mt_check, + .match = policy_mt, + .matchsize = sizeof(struct xt_policy_info), + .me = THIS_MODULE, + }, + { + .name = "policy", + .family = NFPROTO_IPV6, + .checkentry = policy_mt_check, + .match = policy_mt, + .matchsize = sizeof(struct xt_policy_info), + .me = THIS_MODULE, + }, +}; + +static int __init policy_mt_init(void) +{ + return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); +} + +static void __exit policy_mt_exit(void) +{ + xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); +} + +module_init(policy_mt_init); +module_exit(policy_mt_exit); +MODULE_ALIAS("ipt_policy"); +MODULE_ALIAS("ip6t_policy"); diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c new file mode 100644 index 00000000..d61762be --- /dev/null +++ b/net/netfilter/xt_qtaguid.c @@ -0,0 +1,2971 @@ +/* + * Kernel iptables module to track stats for packets based on user tags. + * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * There are run-time debug flags enabled via the debug_mask module param, or + * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h. + */ +#define DEBUG + +#include <linux/file.h> +#include <linux/inetdevice.h> +#include <linux/module.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_qtaguid.h> +#include <linux/skbuff.h> +#include <linux/workqueue.h> +#include <net/addrconf.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/udp.h> + +#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +#include <linux/netfilter_ipv6/ip6_tables.h> +#endif + +#include <linux/netfilter/xt_socket.h> +#include "xt_qtaguid_internal.h" +#include "xt_qtaguid_print.h" + +/* + * We only use the xt_socket funcs within a similar context to avoid unexpected + * return values. 
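+ * Those helpers are only well defined early on the receive path, so the
+ * mask below limits them to the hooks xt_socket itself handles:
+ * PRE_ROUTING and LOCAL_IN.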
+ */ +#define XT_SOCKET_SUPPORTED_HOOKS \ + ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) + + +static const char *module_procdirname = "xt_qtaguid"; +static struct proc_dir_entry *xt_qtaguid_procdir; + +static unsigned int proc_iface_perms = S_IRUGO; +module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR); + +static struct proc_dir_entry *xt_qtaguid_stats_file; +static unsigned int proc_stats_perms = S_IRUGO; +module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR); + +static struct proc_dir_entry *xt_qtaguid_ctrl_file; +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO; +#else +static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUSR; +#endif +module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR); + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include <linux/android_aid.h> +static gid_t proc_stats_readall_gid = AID_NET_BW_STATS; +static gid_t proc_ctrl_write_gid = AID_NET_BW_ACCT; +#else +/* 0 means, don't limit anybody */ +static gid_t proc_stats_readall_gid; +static gid_t proc_ctrl_write_gid; +#endif +module_param_named(stats_readall_gid, proc_stats_readall_gid, uint, + S_IRUGO | S_IWUSR); +module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint, + S_IRUGO | S_IWUSR); + +/* + * Limit the number of active tags (via socket tags) for a given UID. + * Multiple processes could share the UID. + */ +static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS; +module_param(max_sock_tags, int, S_IRUGO | S_IWUSR); + +/* + * After the kernel has initiallized this module, it is still possible + * to make it passive. + * Setting passive to Y: + * - the iface stats handling will not act on notifications. + * - iptables matches will never match. + * - ctrl commands silently succeed. + * - stats are always empty. + * This is mostly usefull when a bug is suspected. + */ +static bool module_passive; +module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR); + +/* + * Control how qtaguid data is tracked per proc/uid. + * Setting tag_tracking_passive to Y: + * - don't create proc specific structs to track tags + * - don't check that active tag stats exceed some limits. + * - don't clean up socket tags on process exits. + * This is mostly usefull when a bug is suspected. + */ +static bool qtu_proc_handling_passive; +module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool, + S_IRUGO | S_IWUSR); + +#define QTU_DEV_NAME "xt_qtaguid" + +uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK; +module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR); + +/*---------------------------------------------------------------------------*/ +static const char *iface_stat_procdirname = "iface_stat"; +static struct proc_dir_entry *iface_stat_procdir; +/* + * The iface_stat_all* will go away once userspace gets use to the new fields + * that have a format line. + */ +static const char *iface_stat_all_procfilename = "iface_stat_all"; +static struct proc_dir_entry *iface_stat_all_procfile; +static const char *iface_stat_fmt_procfilename = "iface_stat_fmt"; +static struct proc_dir_entry *iface_stat_fmt_procfile; + + +/* + * Ordering of locks: + * outer locks: + * iface_stat_list_lock + * sock_tag_list_lock + * inner locks: + * uid_tag_data_tree_lock + * tag_counter_set_list_lock + * Notice how sock_tag_list_lock is held sometimes when uid_tag_data_tree_lock + * is acquired. 
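+ * In short: never take an outer lock while already holding an inner
+ * one, and keep hold times brief, since these are all BH-disabling
+ * spinlocks shared with the packet path.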
+ * + * Call tree with all lock holders as of 2012-04-27: + * + * iface_stat_fmt_proc_read() + * iface_stat_list_lock + * (struct iface_stat) + * + * qtaguid_ctrl_proc_read() + * sock_tag_list_lock + * (sock_tag_tree) + * (struct proc_qtu_data->sock_tag_list) + * prdebug_full_state() + * sock_tag_list_lock + * (sock_tag_tree) + * uid_tag_data_tree_lock + * (uid_tag_data_tree) + * (proc_qtu_data_tree) + * iface_stat_list_lock + * + * qtaguid_stats_proc_read() + * iface_stat_list_lock + * struct iface_stat->tag_stat_list_lock + * + * qtudev_open() + * uid_tag_data_tree_lock + * + * qtudev_release() + * sock_tag_data_list_lock + * uid_tag_data_tree_lock + * prdebug_full_state() + * sock_tag_list_lock + * uid_tag_data_tree_lock + * iface_stat_list_lock + * + * iface_netdev_event_handler() + * iface_stat_create() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * iface_inetaddr_event_handler() + * iface_stat_create() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * iface_inet6addr_event_handler() + * iface_stat_create_ipv6() + * iface_stat_list_lock + * iface_stat_update() + * iface_stat_list_lock + * + * qtaguid_mt() + * account_for_uid() + * if_tag_stat_update() + * get_sock_stat() + * sock_tag_list_lock + * struct iface_stat->tag_stat_list_lock + * tag_stat_update() + * get_active_counter_set() + * tag_counter_set_list_lock + * tag_stat_update() + * get_active_counter_set() + * tag_counter_set_list_lock + * + * + * qtaguid_ctrl_parse() + * ctrl_cmd_delete() + * sock_tag_list_lock + * tag_counter_set_list_lock + * iface_stat_list_lock + * struct iface_stat->tag_stat_list_lock + * uid_tag_data_tree_lock + * ctrl_cmd_counter_set() + * tag_counter_set_list_lock + * ctrl_cmd_tag() + * sock_tag_list_lock + * (sock_tag_tree) + * get_tag_ref() + * uid_tag_data_tree_lock + * (uid_tag_data_tree) + * uid_tag_data_tree_lock + * (proc_qtu_data_tree) + * ctrl_cmd_untag() + * sock_tag_list_lock + * uid_tag_data_tree_lock + * + */ +static LIST_HEAD(iface_stat_list); +static DEFINE_SPINLOCK(iface_stat_list_lock); + +static struct rb_root sock_tag_tree = RB_ROOT; +static DEFINE_SPINLOCK(sock_tag_list_lock); + +static struct rb_root tag_counter_set_tree = RB_ROOT; +static DEFINE_SPINLOCK(tag_counter_set_list_lock); + +static struct rb_root uid_tag_data_tree = RB_ROOT; +static DEFINE_SPINLOCK(uid_tag_data_tree_lock); + +static struct rb_root proc_qtu_data_tree = RB_ROOT; +/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */ + +static struct qtaguid_event_counts qtu_events; +/*----------------------------------------------*/ +static bool can_manipulate_uids(void) +{ + /* root pwnd */ + return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid) + || in_egroup_p(proc_ctrl_write_gid); +} + +static bool can_impersonate_uid(uid_t uid) +{ + return uid == current_fsuid() || can_manipulate_uids(); +} + +static bool can_read_other_uid_stats(uid_t uid) +{ + /* root pwnd */ + return unlikely(!current_fsuid()) || uid == current_fsuid() + || unlikely(!proc_stats_readall_gid) + || in_egroup_p(proc_stats_readall_gid); +} + +static inline void dc_add_byte_packets(struct data_counters *counters, int set, + enum ifs_tx_rx direction, + enum ifs_proto ifs_proto, + int bytes, + int packets) +{ + counters->bpc[set][direction][ifs_proto].bytes += bytes; + counters->bpc[set][direction][ifs_proto].packets += packets; +} + +static inline uint64_t dc_sum_bytes(struct data_counters *counters, + int set, + enum ifs_tx_rx direction) +{ + return 
counters->bpc[set][direction][IFS_TCP].bytes + + counters->bpc[set][direction][IFS_UDP].bytes + + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes; +} + +static inline uint64_t dc_sum_packets(struct data_counters *counters, + int set, + enum ifs_tx_rx direction) +{ + return counters->bpc[set][direction][IFS_TCP].packets + + counters->bpc[set][direction][IFS_UDP].packets + + counters->bpc[set][direction][IFS_PROTO_OTHER].packets; +} + +static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct tag_node *data = rb_entry(node, struct tag_node, node); + int result; + RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " + " node=%p data=%p\n", tag, node, data); + result = tag_compare(tag, data->tag); + RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): " + " data.tag=0x%llx (uid=%u) res=%d\n", + tag, data->tag, get_uid_from_tag(data->tag), result); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct tag_node *this = rb_entry(*new, struct tag_node, + node); + int result = tag_compare(data->tag, this->tag); + RB_DEBUG("qtaguid: %s(): tag=0x%llx" + " (uid=%u)\n", __func__, + this->tag, + get_uid_from_tag(this->tag)); + parent = *new; + if (result < 0) + new = &((*new)->rb_left); + else if (result > 0) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_stat, tn.node); +} + +static void tag_counter_set_tree_insert(struct tag_counter_set *data, + struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root, + tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_counter_set, tn.node); + +} + +static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root) +{ + tag_node_tree_insert(&data->tn, root); +} + +static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag) +{ + struct tag_node *node = tag_node_tree_search(root, tag); + if (!node) + return NULL; + return rb_entry(&node->node, struct tag_ref, tn.node); +} + +static struct sock_tag *sock_tag_tree_search(struct rb_root *root, + const struct sock *sk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct sock_tag *data = rb_entry(node, struct sock_tag, + sock_node); + if (sk < data->sk) + node = node->rb_left; + else if (sk > data->sk) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct sock_tag *this = rb_entry(*new, struct sock_tag, + sock_node); + parent = *new; + if 
(data->sk < this->sk) + new = &((*new)->rb_left); + else if (data->sk > this->sk) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->sock_node, parent, new); + rb_insert_color(&data->sock_node, root); +} + +static void sock_tag_tree_erase(struct rb_root *st_to_free_tree) +{ + struct rb_node *node; + struct sock_tag *st_entry; + + node = rb_first(st_to_free_tree); + while (node) { + st_entry = rb_entry(node, struct sock_tag, sock_node); + node = rb_next(node); + CT_DEBUG("qtaguid: %s(): " + "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__, + st_entry->sk, + st_entry->tag, + get_uid_from_tag(st_entry->tag)); + rb_erase(&st_entry->sock_node, st_to_free_tree); + sockfd_put(st_entry->socket); + kfree(st_entry); + } +} + +static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root, + const pid_t pid) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct proc_qtu_data *data = rb_entry(node, + struct proc_qtu_data, + node); + if (pid < data->pid) + node = node->rb_left; + else if (pid > data->pid) + node = node->rb_right; + else + return data; + } + return NULL; +} + +static void proc_qtu_data_tree_insert(struct proc_qtu_data *data, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_qtu_data *this = rb_entry(*new, + struct proc_qtu_data, + node); + parent = *new; + if (data->pid < this->pid) + new = &((*new)->rb_left); + else if (data->pid > this->pid) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void uid_tag_data_tree_insert(struct uid_tag_data *data, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct uid_tag_data *this = rb_entry(*new, + struct uid_tag_data, + node); + parent = *new; + if (data->uid < this->uid) + new = &((*new)->rb_left); + else if (data->uid > this->uid) + new = &((*new)->rb_right); + else + BUG(); + } + + /* Add new node and rebalance tree. */ + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root, + uid_t uid) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct uid_tag_data *data = rb_entry(node, + struct uid_tag_data, + node); + if (uid < data->uid) + node = node->rb_left; + else if (uid > data->uid) + node = node->rb_right; + else + return data; + } + return NULL; +} + +/* + * Allocates a new uid_tag_data struct if needed. + * Returns a pointer to the found or allocated uid_tag_data. + * Returns a PTR_ERR on failures, and lock is not held. + * If found is not NULL: + * sets *found to true if not allocated. + * sets *found to false if allocated. 
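+ * The tree is protected by uid_tag_data_tree_lock, which callers hold
+ * as a BH-disabling spinlock; that is why the allocation below must
+ * use GFP_ATOMIC.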
+ */ +struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res) +{ + struct uid_tag_data *utd_entry; + + /* Look for top level uid_tag_data for the UID */ + utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid); + DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry); + + if (found_res) + *found_res = utd_entry; + if (utd_entry) + return utd_entry; + + utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC); + if (!utd_entry) { + pr_err("qtaguid: get_uid_data(%u): " + "tag data alloc failed\n", uid); + return ERR_PTR(-ENOMEM); + } + + utd_entry->uid = uid; + utd_entry->tag_ref_tree = RB_ROOT; + uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree); + DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry); + return utd_entry; +} + +/* Never returns NULL. Either PTR_ERR or a valid ptr. */ +static struct tag_ref *new_tag_ref(tag_t new_tag, + struct uid_tag_data *utd_entry) +{ + struct tag_ref *tr_entry; + int res; + + if (utd_entry->num_active_tags + 1 > max_sock_tags) { + pr_info("qtaguid: new_tag_ref(0x%llx): " + "tag ref alloc quota exceeded. max=%d\n", + new_tag, max_sock_tags); + res = -EMFILE; + goto err_res; + + } + + tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC); + if (!tr_entry) { + pr_err("qtaguid: new_tag_ref(0x%llx): " + "tag ref alloc failed\n", + new_tag); + res = -ENOMEM; + goto err_res; + } + tr_entry->tn.tag = new_tag; + /* tr_entry->num_sock_tags handled by caller */ + utd_entry->num_active_tags++; + tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree); + DR_DEBUG("qtaguid: new_tag_ref(0x%llx): " + " inserted new tag ref %p\n", + new_tag, tr_entry); + return tr_entry; + +err_res: + return ERR_PTR(res); +} + +static struct tag_ref *lookup_tag_ref(tag_t full_tag, + struct uid_tag_data **utd_res) +{ + struct uid_tag_data *utd_entry; + struct tag_ref *tr_entry; + bool found_utd; + uid_t uid = get_uid_from_tag(full_tag); + + DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n", + full_tag, uid); + + utd_entry = get_uid_data(uid, &found_utd); + if (IS_ERR_OR_NULL(utd_entry)) { + if (utd_res) + *utd_res = utd_entry; + return NULL; + } + + tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag); + if (utd_res) + *utd_res = utd_entry; + DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n", + full_tag, utd_entry, tr_entry); + return tr_entry; +} + +/* Never returns NULL. Either PTR_ERR or a valid ptr. */ +static struct tag_ref *get_tag_ref(tag_t full_tag, + struct uid_tag_data **utd_res) +{ + struct uid_tag_data *utd_entry; + struct tag_ref *tr_entry; + + DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n", + full_tag); + spin_lock_bh(&uid_tag_data_tree_lock); + tr_entry = lookup_tag_ref(full_tag, &utd_entry); + BUG_ON(IS_ERR_OR_NULL(utd_entry)); + if (!tr_entry) + tr_entry = new_tag_ref(full_tag, utd_entry); + + spin_unlock_bh(&uid_tag_data_tree_lock); + if (utd_res) + *utd_res = utd_entry; + DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n", + full_tag, utd_entry, tr_entry); + return tr_entry; +} + +/* Checks and maybe frees the UID Tag Data entry */ +static void put_utd_entry(struct uid_tag_data *utd_entry) +{ + /* Are we done with the UID tag data entry? 
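+	 * It may only be freed once both reference sources are gone:
+	 * its tag_ref tree is empty and no proc_qtu_data (num_pqd)
+	 * still points at it.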
*/ + if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) && + !utd_entry->num_pqd) { + DR_DEBUG("qtaguid: %s(): " + "erase utd_entry=%p uid=%u " + "by pid=%u tgid=%u uid=%u\n", __func__, + utd_entry, utd_entry->uid, + current->pid, current->tgid, current_fsuid()); + BUG_ON(utd_entry->num_active_tags); + rb_erase(&utd_entry->node, &uid_tag_data_tree); + kfree(utd_entry); + } else { + DR_DEBUG("qtaguid: %s(): " + "utd_entry=%p still has %d tags %d proc_qtu_data\n", + __func__, utd_entry, utd_entry->num_active_tags, + utd_entry->num_pqd); + BUG_ON(!(utd_entry->num_active_tags || + utd_entry->num_pqd)); + } +} + +/* + * If no sock_tags are using this tag_ref, + * decrements refcount of utd_entry, removes tr_entry + * from utd_entry->tag_ref_tree and frees. + */ +static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry, + struct uid_tag_data *utd_entry) +{ + DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__, + tr_entry, tr_entry->tn.tag, + get_uid_from_tag(tr_entry->tn.tag)); + if (!tr_entry->num_sock_tags) { + BUG_ON(!utd_entry->num_active_tags); + utd_entry->num_active_tags--; + rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree); + DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry); + kfree(tr_entry); + } +} + +static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry) +{ + struct rb_node *node; + struct tag_ref *tr_entry; + tag_t acct_tag; + + DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__, + full_tag, get_uid_from_tag(full_tag)); + acct_tag = get_atag_from_tag(full_tag); + node = rb_first(&utd_entry->tag_ref_tree); + while (node) { + tr_entry = rb_entry(node, struct tag_ref, tn.node); + node = rb_next(node); + if (!acct_tag || tr_entry->tn.tag == full_tag) + free_tag_ref_from_utd_entry(tr_entry, utd_entry); + } +} + +static int read_proc_u64(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + uint64_t value; + char *p = page; + uint64_t *iface_entry = data; + + if (!data) + return 0; + + value = *iface_entry; + p += sprintf(p, "%llu\n", value); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} + +static int read_proc_bool(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + bool value; + char *p = page; + bool *bool_entry = data; + + if (!data) + return 0; + + value = *bool_entry; + p += sprintf(p, "%u\n", value); + len = (p - page) - off; + *eof = (len <= count) ? 1 : 0; + *start = page + off; + return len; +} + +static int get_active_counter_set(tag_t tag) +{ + int active_set = 0; + struct tag_counter_set *tcs; + + MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)" + " (uid=%u)\n", + tag, get_uid_from_tag(tag)); + /* For now we only handle UID tags for active sets */ + tag = get_utag_from_tag(tag); + spin_lock_bh(&tag_counter_set_list_lock); + tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (tcs) + active_set = tcs->active_set; + spin_unlock_bh(&tag_counter_set_list_lock); + return active_set; +} + +/* + * Find the entry for tracking the specified interface. 
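+ * Returns NULL for interfaces that are not tracked, i.e. that never
+ * had an iface_stat entry created for them.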
+ * Caller must hold iface_stat_list_lock + */ +static struct iface_stat *get_iface_entry(const char *ifname) +{ + struct iface_stat *iface_entry; + + /* Find the entry for tracking the specified tag within the interface */ + if (ifname == NULL) { + pr_info("qtaguid: iface_stat: get() NULL device name\n"); + return NULL; + } + + /* Iterate over interfaces */ + list_for_each_entry(iface_entry, &iface_stat_list, list) { + if (!strcmp(ifname, iface_entry->ifname)) + goto done; + } + iface_entry = NULL; +done: + return iface_entry; +} + +static int iface_stat_fmt_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, + int *eof, void *data) +{ + char *outp = page; + int item_index = 0; + int len; + int fmt = (int)data; /* The data is just 1 (old) or 2 (uses fmt) */ + struct iface_stat *iface_entry; + struct rtnl_link_stats64 dev_stats, *stats; + struct rtnl_link_stats64 no_dev_stats = {0}; + + if (unlikely(module_passive)) { + *eof = 1; + return 0; + } + + CT_DEBUG("qtaguid:proc iface_stat_fmt " + "pid=%u tgid=%u uid=%u " + "page=%p *num_items_returned=%p off=%ld " + "char_count=%d *eof=%d\n", + current->pid, current->tgid, current_fsuid(), + page, *num_items_returned, + items_to_skip, char_count, *eof); + + if (*eof) + return 0; + + if (fmt == 2 && item_index++ >= items_to_skip) { + len = snprintf(outp, char_count, + "ifname " + "total_skb_rx_bytes total_skb_rx_packets " + "total_skb_tx_bytes total_skb_tx_packets\n" + ); + if (len >= char_count) { + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + + /* + * This lock will prevent iface_stat_update() from changing active, + * and in turn prevent an interface from unregistering itself. + */ + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(iface_entry, &iface_stat_list, list) { + if (item_index++ < items_to_skip) + continue; + + if (iface_entry->active) { + stats = dev_get_stats(iface_entry->net_dev, + &dev_stats); + } else { + stats = &no_dev_stats; + } + /* + * If the meaning of the data changes, then update the fmtX + * string. + */ + if (fmt == 1) { + len = snprintf( + outp, char_count, + "%s %d " + "%llu %llu %llu %llu " + "%llu %llu %llu %llu\n", + iface_entry->ifname, + iface_entry->active, + iface_entry->totals_via_dev[IFS_RX].bytes, + iface_entry->totals_via_dev[IFS_RX].packets, + iface_entry->totals_via_dev[IFS_TX].bytes, + iface_entry->totals_via_dev[IFS_TX].packets, + stats->rx_bytes, stats->rx_packets, + stats->tx_bytes, stats->tx_packets + ); + } else { + len = snprintf( + outp, char_count, + "%s " + "%llu %llu %llu %llu\n", + iface_entry->ifname, + iface_entry->totals_via_skb[IFS_RX].bytes, + iface_entry->totals_via_skb[IFS_RX].packets, + iface_entry->totals_via_skb[IFS_TX].bytes, + iface_entry->totals_via_skb[IFS_TX].packets + ); + } + if (len >= char_count) { + spin_unlock_bh(&iface_stat_list_lock); + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + spin_unlock_bh(&iface_stat_list_lock); + + *eof = 1; + return outp - page; +} + +static void iface_create_proc_worker(struct work_struct *work) +{ + struct proc_dir_entry *proc_entry; + struct iface_stat_work *isw = container_of(work, struct iface_stat_work, + iface_work); + struct iface_stat *new_iface = isw->iface_entry; + + /* iface_entries are not deleted, so safe to manipulate. 
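+	 * This runs from a workqueue because the ipv6 address notifier
+	 * fires in atomic context, where proc entries cannot be created;
+	 * iface_alloc() schedules this work instead.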
*/ + proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir); + if (IS_ERR_OR_NULL(proc_entry)) { + pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n"); + kfree(isw); + return; + } + + new_iface->proc_ptr = proc_entry; + + create_proc_read_entry("tx_bytes", proc_iface_perms, proc_entry, + read_proc_u64, + &new_iface->totals_via_dev[IFS_TX].bytes); + create_proc_read_entry("rx_bytes", proc_iface_perms, proc_entry, + read_proc_u64, + &new_iface->totals_via_dev[IFS_RX].bytes); + create_proc_read_entry("tx_packets", proc_iface_perms, proc_entry, + read_proc_u64, + &new_iface->totals_via_dev[IFS_TX].packets); + create_proc_read_entry("rx_packets", proc_iface_perms, proc_entry, + read_proc_u64, + &new_iface->totals_via_dev[IFS_RX].packets); + create_proc_read_entry("active", proc_iface_perms, proc_entry, + read_proc_bool, &new_iface->active); + + IF_DEBUG("qtaguid: iface_stat: create_proc(): done " + "entry=%p dev=%s\n", new_iface, new_iface->ifname); + kfree(isw); +} + +/* + * Will set the entry's active state, and + * update the net_dev accordingly also. + */ +static void _iface_stat_set_active(struct iface_stat *entry, + struct net_device *net_dev, + bool activate) +{ + if (activate) { + entry->net_dev = net_dev; + entry->active = true; + IF_DEBUG("qtaguid: %s(%s): " + "enable tracking. rfcnt=%d\n", __func__, + entry->ifname, + percpu_read(*net_dev->pcpu_refcnt)); + } else { + entry->active = false; + entry->net_dev = NULL; + IF_DEBUG("qtaguid: %s(%s): " + "disable tracking. rfcnt=%d\n", __func__, + entry->ifname, + percpu_read(*net_dev->pcpu_refcnt)); + + } +} + +/* Caller must hold iface_stat_list_lock */ +static struct iface_stat *iface_alloc(struct net_device *net_dev) +{ + struct iface_stat *new_iface; + struct iface_stat_work *isw; + + new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC); + if (new_iface == NULL) { + pr_err("qtaguid: iface_stat: create(%s): " + "iface_stat alloc failed\n", net_dev->name); + return NULL; + } + new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC); + if (new_iface->ifname == NULL) { + pr_err("qtaguid: iface_stat: create(%s): " + "ifname alloc failed\n", net_dev->name); + kfree(new_iface); + return NULL; + } + spin_lock_init(&new_iface->tag_stat_list_lock); + new_iface->tag_stat_tree = RB_ROOT; + _iface_stat_set_active(new_iface, net_dev, true); + + /* + * ipv6 notifier chains are atomic :( + * No create_proc_read_entry() for you! + */ + isw = kmalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) { + pr_err("qtaguid: iface_stat: create(%s): " + "work alloc failed\n", new_iface->ifname); + _iface_stat_set_active(new_iface, net_dev, false); + kfree(new_iface->ifname); + kfree(new_iface); + return NULL; + } + isw->iface_entry = new_iface; + INIT_WORK(&isw->iface_work, iface_create_proc_worker); + schedule_work(&isw->iface_work); + list_add(&new_iface->list, &iface_stat_list); + return new_iface; +} + +static void iface_check_stats_reset_and_adjust(struct net_device *net_dev, + struct iface_stat *iface) +{ + struct rtnl_link_stats64 dev_stats, *stats; + bool stats_rewound; + + stats = dev_get_stats(net_dev, &dev_stats); + /* No empty packets */ + stats_rewound = + (stats->rx_bytes < iface->last_known[IFS_RX].bytes) + || (stats->tx_bytes < iface->last_known[IFS_TX].bytes); + + IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p " + "bytes rx/tx=%llu/%llu " + "active=%d last_known=%d " + "stats_rewound=%d\n", __func__, + net_dev ? 
net_dev->name : "?", + iface, net_dev, + stats->rx_bytes, stats->tx_bytes, + iface->active, iface->last_known_valid, stats_rewound); + + if (iface->active && iface->last_known_valid && stats_rewound) { + pr_warn_once("qtaguid: iface_stat: %s(%s): " + "iface reset its stats unexpectedly\n", __func__, + net_dev->name); + + iface->totals_via_dev[IFS_TX].bytes += + iface->last_known[IFS_TX].bytes; + iface->totals_via_dev[IFS_TX].packets += + iface->last_known[IFS_TX].packets; + iface->totals_via_dev[IFS_RX].bytes += + iface->last_known[IFS_RX].bytes; + iface->totals_via_dev[IFS_RX].packets += + iface->last_known[IFS_RX].packets; + iface->last_known_valid = false; + IF_DEBUG("qtaguid: %s(%s): iface=%p " + "used last known bytes rx/tx=%llu/%llu\n", __func__, + iface->ifname, iface, iface->last_known[IFS_RX].bytes, + iface->last_known[IFS_TX].bytes); + } +} + +/* + * Create a new entry for tracking the specified interface. + * Do nothing if the entry already exists. + * Called when an interface is configured with a valid IP address. + */ +static void iface_stat_create(struct net_device *net_dev, + struct in_ifaddr *ifa) +{ + struct in_device *in_dev = NULL; + const char *ifname; + struct iface_stat *entry; + __be32 ipaddr = 0; + struct iface_stat *new_iface; + + IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n", + net_dev ? net_dev->name : "?", + ifa, net_dev); + if (!net_dev) { + pr_err("qtaguid: iface_stat: create(): no net dev\n"); + return; + } + + ifname = net_dev->name; + if (!ifa) { + in_dev = in_dev_get(net_dev); + if (!in_dev) { + pr_err("qtaguid: iface_stat: create(%s): no inet dev\n", + ifname); + return; + } + IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n", + ifname, in_dev); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + IF_DEBUG("qtaguid: iface_stat: create(%s): " + "ifa=%p ifa_label=%s\n", + ifname, ifa, + ifa->ifa_label ? ifa->ifa_label : "(null)"); + if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label)) + break; + } + } + + if (!ifa) { + IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n", + ifname); + goto done_put; + } + ipaddr = ifa->ifa_local; + + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(ifname); + if (entry != NULL) { + IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n", + ifname, entry); + iface_check_stats_reset_and_adjust(net_dev, entry); + _iface_stat_set_active(entry, net_dev, true); + IF_DEBUG("qtaguid: %s(%s): " + "tracking now %d on ip=%pI4\n", __func__, + entry->ifname, true, &ipaddr); + goto done_unlock_put; + } + + new_iface = iface_alloc(net_dev); + IF_DEBUG("qtaguid: iface_stat: create(%s): done " + "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr); +done_unlock_put: + spin_unlock_bh(&iface_stat_list_lock); +done_put: + if (in_dev) + in_dev_put(in_dev); +} + +static void iface_stat_create_ipv6(struct net_device *net_dev, + struct inet6_ifaddr *ifa) +{ + struct in_device *in_dev; + const char *ifname; + struct iface_stat *entry; + struct iface_stat *new_iface; + int addr_type; + + IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n", + ifa, net_dev, net_dev ? 
net_dev->name : ""); + if (!net_dev) { + pr_err("qtaguid: iface_stat: create6(): no net dev!\n"); + return; + } + ifname = net_dev->name; + + in_dev = in_dev_get(net_dev); + if (!in_dev) { + pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n", + ifname); + return; + } + + IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n", + ifname, in_dev); + + if (!ifa) { + IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n", + ifname); + goto done_put; + } + addr_type = ipv6_addr_type(&ifa->addr); + + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(ifname); + if (entry != NULL) { + IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + ifname, entry); + iface_check_stats_reset_and_adjust(net_dev, entry); + _iface_stat_set_active(entry, net_dev, true); + IF_DEBUG("qtaguid: %s(%s): " + "tracking now %d on ip=%pI6c\n", __func__, + entry->ifname, true, &ifa->addr); + goto done_unlock_put; + } + + new_iface = iface_alloc(net_dev); + IF_DEBUG("qtaguid: iface_stat: create6(%s): done " + "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr); + +done_unlock_put: + spin_unlock_bh(&iface_stat_list_lock); +done_put: + in_dev_put(in_dev); +} + +static struct sock_tag *get_sock_stat_nl(const struct sock *sk) +{ + MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk); + return sock_tag_tree_search(&sock_tag_tree, sk); +} + +static struct sock_tag *get_sock_stat(const struct sock *sk) +{ + struct sock_tag *sock_tag_entry; + MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk); + if (!sk) + return NULL; + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(sk); + spin_unlock_bh(&sock_tag_list_lock); + return sock_tag_entry; +} + +static int ipx_proto(const struct sk_buff *skb, + struct xt_action_param *par) +{ + int thoff, tproto; + + switch (par->family) { + case NFPROTO_IPV6: + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) + MT_DEBUG("%s(): transport header not found in ipv6" + " skb=%p\n", __func__, skb); + break; + case NFPROTO_IPV4: + tproto = ip_hdr(skb)->protocol; + break; + default: + tproto = IPPROTO_RAW; + } + return tproto; +} + +static void +data_counters_update(struct data_counters *dc, int set, + enum ifs_tx_rx direction, int proto, int bytes) +{ + switch (proto) { + case IPPROTO_TCP: + dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1); + break; + case IPPROTO_UDP: + dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1); + break; + case IPPROTO_IP: + default: + dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes, + 1); + break; + } +} + +/* + * Update stats for the specified interface. Do nothing if the entry + * does not exist (when a device was never configured with an IP address). + * Called when an device is being unregistered. 
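+ * With stash_only set (NETDEV_DOWN) the current device counters are
+ * only snapshotted into last_known[]; on a real NETDEV_UNREGISTER they
+ * are folded into totals_via_dev[] and tracking is switched off.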
+ */ +static void iface_stat_update(struct net_device *net_dev, bool stash_only) +{ + struct rtnl_link_stats64 dev_stats, *stats; + struct iface_stat *entry; + + stats = dev_get_stats(net_dev, &dev_stats); + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(net_dev->name); + if (entry == NULL) { + IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n", + net_dev->name); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + + IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + net_dev->name, entry); + if (!entry->active) { + IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__, + net_dev->name); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + + if (stash_only) { + entry->last_known[IFS_TX].bytes = stats->tx_bytes; + entry->last_known[IFS_TX].packets = stats->tx_packets; + entry->last_known[IFS_RX].bytes = stats->rx_bytes; + entry->last_known[IFS_RX].packets = stats->rx_packets; + entry->last_known_valid = true; + IF_DEBUG("qtaguid: %s(%s): " + "dev stats stashed rx/tx=%llu/%llu\n", __func__, + net_dev->name, stats->rx_bytes, stats->tx_bytes); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes; + entry->totals_via_dev[IFS_TX].packets += stats->tx_packets; + entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes; + entry->totals_via_dev[IFS_RX].packets += stats->rx_packets; + /* We don't need the last_known[] anymore */ + entry->last_known_valid = false; + _iface_stat_set_active(entry, net_dev, false); + IF_DEBUG("qtaguid: %s(%s): " + "disable tracking. rx/tx=%llu/%llu\n", __func__, + net_dev->name, stats->rx_bytes, stats->tx_bytes); + spin_unlock_bh(&iface_stat_list_lock); +} + +/* + * Update stats for the specified interface from the skb. + * Do nothing if the entry + * does not exist (when a device was never configured with an IP address). + * Called on each sk. + */ +static void iface_stat_update_from_skb(const struct sk_buff *skb, + struct xt_action_param *par) +{ + struct iface_stat *entry; + const struct net_device *el_dev; + enum ifs_tx_rx direction = par->in ? IFS_RX : IFS_TX; + int bytes = skb->len; + + if (!skb->dev) { + MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); + el_dev = par->in ? : par->out; + } else { + const struct net_device *other_dev; + el_dev = skb->dev; + other_dev = par->in ? 
: par->out; + if (el_dev != other_dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " + "par->(in/out)=%p %s\n", + par->hooknum, el_dev, el_dev->name, other_dev, + other_dev->name); + } + } + + if (unlikely(!el_dev)) { + pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", + par->hooknum, __func__); + BUG(); + } else if (unlikely(!el_dev->name)) { + pr_err("qtaguid[%d]: %s(): no dev->name?!!\n", + par->hooknum, __func__); + BUG(); + } else { + int proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", + par->hooknum, el_dev->name, el_dev->type, + par->family, proto); + } + + spin_lock_bh(&iface_stat_list_lock); + entry = get_iface_entry(el_dev->name); + if (entry == NULL) { + IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n", + __func__, el_dev->name); + spin_unlock_bh(&iface_stat_list_lock); + return; + } + + IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + el_dev->name, entry); + + entry->totals_via_skb[direction].bytes += bytes; + entry->totals_via_skb[direction].packets++; + spin_unlock_bh(&iface_stat_list_lock); +} + +static void tag_stat_update(struct tag_stat *tag_entry, + enum ifs_tx_rx direction, int proto, int bytes) +{ + int active_set; + active_set = get_active_counter_set(tag_entry->tn.tag); + MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d " + "dir=%d proto=%d bytes=%d)\n", + tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag), + active_set, direction, proto, bytes); + data_counters_update(&tag_entry->counters, active_set, direction, + proto, bytes); + if (tag_entry->parent_counters) + data_counters_update(tag_entry->parent_counters, active_set, + direction, proto, bytes); +} + +/* + * Create a new entry for tracking the specified {acct_tag,uid_tag} within + * the interface. + * iface_entry->tag_stat_list_lock should be held. + */ +static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry, + tag_t tag) +{ + struct tag_stat *new_tag_stat_entry = NULL; + IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx" + " (uid=%u)\n", __func__, + iface_entry, tag, get_uid_from_tag(tag)); + new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC); + if (!new_tag_stat_entry) { + pr_err("qtaguid: iface_stat: tag stat alloc failed\n"); + goto done; + } + new_tag_stat_entry->tn.tag = tag; + tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree); +done: + return new_tag_stat_entry; +} + +static void if_tag_stat_update(const char *ifname, uid_t uid, + const struct sock *sk, enum ifs_tx_rx direction, + int proto, int bytes) +{ + struct tag_stat *tag_stat_entry; + tag_t tag, acct_tag; + tag_t uid_tag; + struct data_counters *uid_tag_counters; + struct sock_tag *sock_tag_entry; + struct iface_stat *iface_entry; + struct tag_stat *new_tag_stat = NULL; + MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s " + "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n", + ifname, uid, sk, direction, proto, bytes); + + + iface_entry = get_iface_entry(ifname); + if (!iface_entry) { + pr_err("qtaguid: iface_stat: stat_update() %s not found\n", + ifname); + return; + } + /* It is ok to process data when an iface_entry is inactive */ + + MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n", + ifname, iface_entry); + + /* + * Look for a tagged sock. + * It will have an acct_uid. 
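+	 * An untagged sock falls back to the plain {0, uid_tag} pair,
+	 * so per-UID totals keep accumulating even when no acct tag
+	 * was ever set on the socket.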
+ */ + sock_tag_entry = get_sock_stat(sk); + if (sock_tag_entry) { + tag = sock_tag_entry->tag; + acct_tag = get_atag_from_tag(tag); + uid_tag = get_utag_from_tag(tag); + } else { + acct_tag = make_atag_from_value(0); + tag = combine_atag_with_uid(acct_tag, uid); + uid_tag = make_tag_from_uid(uid); + } + MT_DEBUG("qtaguid: iface_stat: stat_update(): " + " looking for tag=0x%llx (uid=%u) in ife=%p\n", + tag, get_uid_from_tag(tag), iface_entry); + /* Loop over tag list under this interface for {acct_tag,uid_tag} */ + spin_lock_bh(&iface_entry->tag_stat_list_lock); + + tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, + tag); + if (tag_stat_entry) { + /* + * Updating the {acct_tag, uid_tag} entry handles both stats: + * {0, uid_tag} will also get updated. + */ + tag_stat_update(tag_stat_entry, direction, proto, bytes); + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + return; + } + + /* Loop over tag list under this interface for {0,uid_tag} */ + tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree, + uid_tag); + if (!tag_stat_entry) { + /* Here: the base uid_tag did not exist */ + /* + * No parent counters. So + * - No {0, uid_tag} stats and no {acct_tag, uid_tag} stats. + */ + new_tag_stat = create_if_tag_stat(iface_entry, uid_tag); + if (!new_tag_stat) + goto unlock; + uid_tag_counters = &new_tag_stat->counters; + } else { + uid_tag_counters = &tag_stat_entry->counters; + } + + if (acct_tag) { + /* Create the child {acct_tag, uid_tag} and hook up parent. */ + new_tag_stat = create_if_tag_stat(iface_entry, tag); + if (!new_tag_stat) + goto unlock; + new_tag_stat->parent_counters = uid_tag_counters; + } else { + /* + * For new_tag_stat to be still NULL here would require: + * {0, uid_tag} exists + * and {acct_tag, uid_tag} doesn't exist + * AND acct_tag == 0. + * Impossible. This reassures us that new_tag_stat + * below will always be assigned. + */ + BUG_ON(!new_tag_stat); + } + tag_stat_update(new_tag_stat, direction, proto, bytes); +unlock: + spin_unlock_bh(&iface_entry->tag_stat_list_lock); +} + +static int iface_netdev_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) { + struct net_device *dev = ptr; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: netdev_event(): " + "ev=0x%lx/%s netdev=%p->name=%s\n", + event, netdev_evt_str(event), dev, dev ? 
dev->name : ""); + + switch (event) { + case NETDEV_UP: + iface_stat_create(dev, NULL); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static int iface_inet6addr_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = ptr; + struct net_device *dev; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): " + "ev=0x%lx/%s ifa=%p\n", + event, netdev_evt_str(event), ifa); + + switch (event) { + case NETDEV_UP: + BUG_ON(!ifa || !ifa->idev); + dev = (struct net_device *)ifa->idev->dev; + iface_stat_create_ipv6(dev, ifa); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + BUG_ON(!ifa || !ifa->idev); + dev = (struct net_device *)ifa->idev->dev; + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static int iface_inetaddr_event_handler(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = ptr; + struct net_device *dev; + + if (unlikely(module_passive)) + return NOTIFY_DONE; + + IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): " + "ev=0x%lx/%s ifa=%p\n", + event, netdev_evt_str(event), ifa); + + switch (event) { + case NETDEV_UP: + BUG_ON(!ifa || !ifa->ifa_dev); + dev = ifa->ifa_dev->dev; + iface_stat_create(dev, ifa); + atomic64_inc(&qtu_events.iface_events); + break; + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + BUG_ON(!ifa || !ifa->ifa_dev); + dev = ifa->ifa_dev->dev; + iface_stat_update(dev, event == NETDEV_DOWN); + atomic64_inc(&qtu_events.iface_events); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block iface_netdev_notifier_blk = { + .notifier_call = iface_netdev_event_handler, +}; + +static struct notifier_block iface_inetaddr_notifier_blk = { + .notifier_call = iface_inetaddr_event_handler, +}; + +static struct notifier_block iface_inet6addr_notifier_blk = { + .notifier_call = iface_inet6addr_event_handler, +}; + +static int __init iface_stat_init(struct proc_dir_entry *parent_procdir) +{ + int err; + + iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir); + if (!iface_stat_procdir) { + pr_err("qtaguid: iface_stat: init failed to create proc entry\n"); + err = -1; + goto err; + } + + iface_stat_all_procfile = create_proc_entry(iface_stat_all_procfilename, + proc_iface_perms, + parent_procdir); + if (!iface_stat_all_procfile) { + pr_err("qtaguid: iface_stat: init " + " failed to create stat_old proc entry\n"); + err = -1; + goto err_zap_entry; + } + iface_stat_all_procfile->read_proc = iface_stat_fmt_proc_read; + iface_stat_all_procfile->data = (void *)1; /* fmt1 */ + + iface_stat_fmt_procfile = create_proc_entry(iface_stat_fmt_procfilename, + proc_iface_perms, + parent_procdir); + if (!iface_stat_fmt_procfile) { + pr_err("qtaguid: iface_stat: init " + " failed to create stat_all proc entry\n"); + err = -1; + goto err_zap_all_stats_entry; + } + iface_stat_fmt_procfile->read_proc = iface_stat_fmt_proc_read; + iface_stat_fmt_procfile->data = (void *)2; /* fmt2 */ + + + err = register_netdevice_notifier(&iface_netdev_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register dev event handler\n"); + goto err_zap_all_stats_entries; + } + err = 
register_inetaddr_notifier(&iface_inetaddr_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register ipv4 dev event handler\n"); + goto err_unreg_nd; + } + + err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk); + if (err) { + pr_err("qtaguid: iface_stat: init " + "failed to register ipv6 dev event handler\n"); + goto err_unreg_ip4_addr; + } + return 0; + +err_unreg_ip4_addr: + unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk); +err_unreg_nd: + unregister_netdevice_notifier(&iface_netdev_notifier_blk); +err_zap_all_stats_entries: + remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir); +err_zap_all_stats_entry: + remove_proc_entry(iface_stat_all_procfilename, parent_procdir); +err_zap_entry: + remove_proc_entry(iface_stat_procdirname, parent_procdir); +err: + return err; +} + +static struct sock *qtaguid_find_sk(const struct sk_buff *skb, + struct xt_action_param *par) +{ + struct sock *sk; + unsigned int hook_mask = (1 << par->hooknum); + + MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb, + par->hooknum, par->family); + + /* + * Let's not abuse the xt_socket_get*_sk(), or else it will + * return garbage SKs. + */ + if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS)) + return NULL; + + switch (par->family) { + case NFPROTO_IPV6: + sk = xt_socket_get6_sk(skb, par); + break; + case NFPROTO_IPV4: + sk = xt_socket_get4_sk(skb, par); + break; + default: + return NULL; + } + + /* + * There seem to be issues with the file ptr for TCP_TIME_WAIT SKs. + * http://kerneltrap.org/mailarchive/linux-netdev/2010/10/21/6287959 + * Not fixed in 3.0-r3 :( + */ + if (sk) { + MT_DEBUG("qtaguid: %p->sk_proto=%u " + "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state); + if (sk->sk_state == TCP_TIME_WAIT) { + xt_socket_put_sk(sk); + sk = NULL; + } + } + return sk; +} + +static void account_for_uid(const struct sk_buff *skb, + const struct sock *alternate_sk, uid_t uid, + struct xt_action_param *par) +{ + const struct net_device *el_dev; + + if (!skb->dev) { + MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); + el_dev = par->in ? : par->out; + } else { + const struct net_device *other_dev; + el_dev = skb->dev; + other_dev = par->in ? : par->out; + if (el_dev != other_dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " + "par->(in/out)=%p %s\n", + par->hooknum, el_dev, el_dev->name, other_dev, + other_dev->name); + } + } + + if (unlikely(!el_dev)) { + pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum); + } else if (unlikely(!el_dev->name)) { + pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum); + } else { + int proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", + par->hooknum, el_dev->name, el_dev->type, + par->family, proto); + + if_tag_stat_update(el_dev->name, uid, + skb->sk ? skb->sk : alternate_sk, + par->in ? 
IFS_RX : IFS_TX, + proto, skb->len); + } +} + +static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_qtaguid_match_info *info = par->matchinfo; + const struct file *filp; + bool got_sock = false; + struct sock *sk; + uid_t sock_uid; + bool res; + + if (unlikely(module_passive)) + return (info->match ^ info->invert) == 0; + + MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n", + par->hooknum, skb, par->in, par->out, par->family); + + atomic64_inc(&qtu_events.match_calls); + if (skb == NULL) { + res = (info->match ^ info->invert) == 0; + goto ret_res; + } + + switch (par->hooknum) { + case NF_INET_PRE_ROUTING: + case NF_INET_POST_ROUTING: + atomic64_inc(&qtu_events.match_calls_prepost); + iface_stat_update_from_skb(skb, par); + /* + * We are done in pre/post. The skb will get processed + * further later. + */ + res = (info->match ^ info->invert); + goto ret_res; + break; + /* default: Fall through and do UID related work */ + } + + sk = skb->sk; + if (sk == NULL) { + /* + * A missing sk->sk_socket happens when packets are in-flight + * and the matching socket is already closed and gone. + */ + sk = qtaguid_find_sk(skb, par); + /* + * If we got the socket from the find_sk(), we will need to put + * it back, as nf_tproxy_get_sock_v4() got it. + */ + got_sock = sk; + if (sk) + atomic64_inc(&qtu_events.match_found_sk_in_ct); + else + atomic64_inc(&qtu_events.match_found_no_sk_in_ct); + } else { + atomic64_inc(&qtu_events.match_found_sk); + } + MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", + par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); + if (sk != NULL) { + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? filp->f_cred->fsuid : -1); + } + + if (sk == NULL || sk->sk_socket == NULL) { + /* + * Here, the qtaguid_find_sk() using connection tracking + * couldn't find the owner, so for now we just count them + * against the system. + */ + /* + * TODO: unhack how to force just accounting. + * For now we only do iface stats when the uid-owner is not + * requested. + */ + if (!(info->match & XT_QTAGUID_UID)) + account_for_uid(skb, sk, 0, par); + MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", + par->hooknum, + sk ? sk->sk_socket : NULL); + res = (info->match ^ info->invert) == 0; + atomic64_inc(&qtu_events.match_no_sk); + goto put_sock_ret_res; + } else if (info->match & info->invert & XT_QTAGUID_SOCKET) { + res = false; + goto put_sock_ret_res; + } + filp = sk->sk_socket->file; + if (filp == NULL) { + MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); + account_for_uid(skb, sk, 0, par); + res = ((info->match ^ info->invert) & + (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; + atomic64_inc(&qtu_events.match_no_sk_file); + goto put_sock_ret_res; + } + sock_uid = filp->f_cred->fsuid; + /* + * TODO: unhack how to force just accounting. 
+ * For now we only do iface stats when the uid-owner is not requested + */ + if (!(info->match & XT_QTAGUID_UID)) + account_for_uid(skb, sk, sock_uid, par); + + /* + * The following two tests fail the match when: + * id not in range AND no inverted condition requested + * or id in range AND inverted condition requested + * Thus (!a && b) || (a && !b) == a ^ b + */ + if (info->match & XT_QTAGUID_UID) + if ((filp->f_cred->fsuid >= info->uid_min && + filp->f_cred->fsuid <= info->uid_max) ^ + !(info->invert & XT_QTAGUID_UID)) { + MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", + par->hooknum); + res = false; + goto put_sock_ret_res; + } + if (info->match & XT_QTAGUID_GID) + if ((filp->f_cred->fsgid >= info->gid_min && + filp->f_cred->fsgid <= info->gid_max) ^ + !(info->invert & XT_QTAGUID_GID)) { + MT_DEBUG("qtaguid[%d]: leaving gid not matching\n", + par->hooknum); + res = false; + goto put_sock_ret_res; + } + + MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum); + res = true; + +put_sock_ret_res: + if (got_sock) + xt_socket_put_sk(sk); +ret_res: + MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res); + return res; +} + +#ifdef DDEBUG +/* This function is not in xt_qtaguid_print.c because of locks visibility */ +static void prdebug_full_state(int indent_level, const char *fmt, ...) +{ + va_list args; + char *fmt_buff; + char *buff; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + fmt_buff = kasprintf(GFP_ATOMIC, + "qtaguid: %s(): %s {\n", __func__, fmt); + BUG_ON(!fmt_buff); + va_start(args, fmt); + buff = kvasprintf(GFP_ATOMIC, + fmt_buff, args); + BUG_ON(!buff); + pr_debug("%s", buff); + kfree(fmt_buff); + kfree(buff); + va_end(args); + + spin_lock_bh(&sock_tag_list_lock); + prdebug_sock_tag_tree(indent_level, &sock_tag_tree); + spin_unlock_bh(&sock_tag_list_lock); + + spin_lock_bh(&sock_tag_list_lock); + spin_lock_bh(&uid_tag_data_tree_lock); + prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree); + prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree); + spin_unlock_bh(&uid_tag_data_tree_lock); + spin_unlock_bh(&sock_tag_list_lock); + + spin_lock_bh(&iface_stat_list_lock); + prdebug_iface_stat_list(indent_level, &iface_stat_list); + spin_unlock_bh(&iface_stat_list_lock); + + pr_debug("qtaguid: %s(): }\n", __func__); +} +#else +static void prdebug_full_state(int indent_level, const char *fmt, ...) 
{} +#endif + +/* + * Procfs reader to get all active socket tags using style "1)" as described in + * fs/proc/generic.c + */ +static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, int *eof, + void *data) +{ + char *outp = page; + int len; + uid_t uid; + struct rb_node *node; + struct sock_tag *sock_tag_entry; + int item_index = 0; + int indent_level = 0; + long f_count; + + if (unlikely(module_passive)) { + *eof = 1; + return 0; + } + + if (*eof) + return 0; + + CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u " + "page=%p off=%ld char_count=%d *eof=%d\n", + current->pid, current->tgid, current_fsuid(), + page, items_to_skip, char_count, *eof); + + spin_lock_bh(&sock_tag_list_lock); + for (node = rb_first(&sock_tag_tree); + node; + node = rb_next(node)) { + if (item_index++ < items_to_skip) + continue; + sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); + uid = get_uid_from_tag(sock_tag_entry->tag); + CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) " + "pid=%u\n", + sock_tag_entry->sk, + sock_tag_entry->tag, + uid, + sock_tag_entry->pid + ); + f_count = atomic_long_read( + &sock_tag_entry->socket->file->f_count); + len = snprintf(outp, char_count, + "sock=%p tag=0x%llx (uid=%u) pid=%u " + "f_count=%lu\n", + sock_tag_entry->sk, + sock_tag_entry->tag, uid, + sock_tag_entry->pid, f_count); + if (len >= char_count) { + spin_unlock_bh(&sock_tag_list_lock); + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + spin_unlock_bh(&sock_tag_list_lock); + + if (item_index++ >= items_to_skip) { + len = snprintf(outp, char_count, + "events: sockets_tagged=%llu " + "sockets_untagged=%llu " + "counter_set_changes=%llu " + "delete_cmds=%llu " + "iface_events=%llu " + "match_calls=%llu " + "match_calls_prepost=%llu " + "match_found_sk=%llu " + "match_found_sk_in_ct=%llu " + "match_found_no_sk_in_ct=%llu " + "match_no_sk=%llu " + "match_no_sk_file=%llu\n", + atomic64_read(&qtu_events.sockets_tagged), + atomic64_read(&qtu_events.sockets_untagged), + atomic64_read(&qtu_events.counter_set_changes), + atomic64_read(&qtu_events.delete_cmds), + atomic64_read(&qtu_events.iface_events), + atomic64_read(&qtu_events.match_calls), + atomic64_read(&qtu_events.match_calls_prepost), + atomic64_read(&qtu_events.match_found_sk), + atomic64_read(&qtu_events.match_found_sk_in_ct), + atomic64_read( + &qtu_events.match_found_no_sk_in_ct), + atomic64_read(&qtu_events.match_no_sk), + atomic64_read(&qtu_events.match_no_sk_file)); + if (len >= char_count) { + *outp = '\0'; + return outp - page; + } + outp += len; + char_count -= len; + (*num_items_returned)++; + } + + /* Count the following as part of the last item_index */ + if (item_index > items_to_skip) { + prdebug_full_state(indent_level, "proc ctrl"); + } + + *eof = 1; + return outp - page; +} + +/* + * Delete socket tags, and stat tags associated with a given + * accounting tag and uid. 
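 + * Per the sscanf() below, the ctrl command format is "d <acct_tag> <uid>", + * where the uid is optional and defaults to current_fsuid(). As an + * illustrative example, writing "d 0 10003" drops every tag and stat entry + * kept for uid 10003, since an acct_tag of 0 selects all of them.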
+ */ +static int ctrl_cmd_delete(const char *input) +{ + char cmd; + uid_t uid; + uid_t entry_uid; + tag_t acct_tag; + tag_t tag; + int res, argc; + struct iface_stat *iface_entry; + struct rb_node *node; + struct sock_tag *st_entry; + struct rb_root st_to_free_tree = RB_ROOT; + struct tag_stat *ts_entry; + struct tag_counter_set *tcs_entry; + struct tag_ref *tr_entry; + struct uid_tag_data *utd_entry; + + argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid); + CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c " + "user_tag=0x%llx uid=%u\n", input, argc, cmd, + acct_tag, uid); + if (argc < 2) { + res = -EINVAL; + goto err; + } + if (!valid_atag(acct_tag)) { + pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input); + res = -EINVAL; + goto err; + } + if (argc < 3) { + uid = current_fsuid(); + } else if (!can_impersonate_uid(uid)) { + pr_info("qtaguid: ctrl_delete(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err; + } + + tag = combine_atag_with_uid(acct_tag, uid); + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "looking for tag=0x%llx (uid=%u)\n", + input, tag, uid); + + /* Delete socket tags */ + spin_lock_bh(&sock_tag_list_lock); + node = rb_first(&sock_tag_tree); + while (node) { + st_entry = rb_entry(node, struct sock_tag, sock_node); + entry_uid = get_uid_from_tag(st_entry->tag); + node = rb_next(node); + if (entry_uid != uid) + continue; + + CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n", + input, st_entry->tag, entry_uid); + + if (!acct_tag || st_entry->tag == tag) { + rb_erase(&st_entry->sock_node, &sock_tag_tree); + /* Can't sockfd_put() within spinlock, do it later. */ + sock_tag_tree_insert(st_entry, &st_to_free_tree); + tr_entry = lookup_tag_ref(st_entry->tag, NULL); + BUG_ON(tr_entry->num_sock_tags <= 0); + tr_entry->num_sock_tags--; + /* + * TODO: remove if, and start failing. + * This is a hack to work around the fact that in some + * places we have "if (IS_ERR_OR_NULL(pqd_entry))" + * and are trying to work around apps + * that didn't open the /dev/xt_qtaguid. + */ + if (st_entry->list.next && st_entry->list.prev) + list_del(&st_entry->list); + } + } + spin_unlock_bh(&sock_tag_list_lock); + + sock_tag_tree_erase(&st_to_free_tree); + + /* Delete tag counter-sets */ + spin_lock_bh(&tag_counter_set_list_lock); + /* Counter sets are only on the uid tag, not full tag */ + tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (tcs_entry) { + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "erase tcs: tag=0x%llx (uid=%u) set=%d\n", + input, + tcs_entry->tn.tag, + get_uid_from_tag(tcs_entry->tn.tag), + tcs_entry->active_set); + rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree); + kfree(tcs_entry); + } + spin_unlock_bh(&tag_counter_set_list_lock); + + /* + * If acct_tag is 0, then all entries belonging to uid are + * erased. 
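 + * A non-zero acct_tag instead narrows the erase to that single + * {acct_tag, uid_tag} pair, per the tn.tag == tag checks below.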
+ */ + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(iface_entry, &iface_stat_list, list) { + spin_lock_bh(&iface_entry->tag_stat_list_lock); + node = rb_first(&iface_entry->tag_stat_tree); + while (node) { + ts_entry = rb_entry(node, struct tag_stat, tn.node); + entry_uid = get_uid_from_tag(ts_entry->tn.tag); + node = rb_next(node); + + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "ts tag=0x%llx (uid=%u)\n", + input, ts_entry->tn.tag, entry_uid); + + if (entry_uid != uid) + continue; + if (!acct_tag || ts_entry->tn.tag == tag) { + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "erase ts: %s 0x%llx %u\n", + input, iface_entry->ifname, + get_atag_from_tag(ts_entry->tn.tag), + entry_uid); + rb_erase(&ts_entry->tn.node, + &iface_entry->tag_stat_tree); + kfree(ts_entry); + } + } + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + } + spin_unlock_bh(&iface_stat_list_lock); + + /* Cleanup the uid_tag_data */ + spin_lock_bh(&uid_tag_data_tree_lock); + node = rb_first(&uid_tag_data_tree); + while (node) { + utd_entry = rb_entry(node, struct uid_tag_data, node); + entry_uid = utd_entry->uid; + node = rb_next(node); + + CT_DEBUG("qtaguid: ctrl_delete(%s): " + "utd uid=%u\n", + input, entry_uid); + + if (entry_uid != uid) + continue; + /* + * Go over the tag_refs, and those that don't have + * sock_tags using them are freed. + */ + put_tag_ref_tree(tag, utd_entry); + put_utd_entry(utd_entry); + } + spin_unlock_bh(&uid_tag_data_tree_lock); + + atomic64_inc(&qtu_events.delete_cmds); + res = 0; + +err: + return res; +} + +static int ctrl_cmd_counter_set(const char *input) +{ + char cmd; + uid_t uid = 0; + tag_t tag; + int res, argc; + struct tag_counter_set *tcs; + int counter_set; + + argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid); + CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c " + "set=%d uid=%u\n", input, argc, cmd, + counter_set, uid); + if (argc != 3) { + res = -EINVAL; + goto err; + } + if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) { + pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n", + input); + res = -EINVAL; + goto err; + } + if (!can_manipulate_uids()) { + pr_info("qtaguid: ctrl_counterset(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err; + } + + tag = make_tag_from_uid(uid); + spin_lock_bh(&tag_counter_set_list_lock); + tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag); + if (!tcs) { + tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC); + if (!tcs) { + spin_unlock_bh(&tag_counter_set_list_lock); + pr_err("qtaguid: ctrl_counterset(%s): " + "failed to alloc counter set\n", + input); + res = -ENOMEM; + goto err; + } + tcs->tn.tag = tag; + tag_counter_set_tree_insert(tcs, &tag_counter_set_tree); + CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx " + "(uid=%u) set=%d\n", + input, tag, get_uid_from_tag(tag), counter_set); + } + tcs->active_set = counter_set; + spin_unlock_bh(&tag_counter_set_list_lock); + atomic64_inc(&qtu_events.counter_set_changes); + res = 0; + +err: + return res; +} + +static int ctrl_cmd_tag(const char *input) +{ + char cmd; + int sock_fd = 0; + uid_t uid = 0; + tag_t acct_tag = make_atag_from_value(0); + tag_t full_tag; + struct socket *el_socket; + int res, argc; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *uid_tag_data_entry; + struct proc_qtu_data *pqd_entry; + + /* Unassigned args will get defaulted later. 
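 + * Per the sscanf() below the format is "t <sock_fd> <acct_tag> <uid>", and + * the acct_tag must arrive already shifted into the upper 32 bits or + * valid_atag() rejects it: an illustrative "t 4 8589934592 10003" tags the + * socket on fd 4 with acct_tag value 2 (2 << 32 == 8589934592) on behalf + * of uid 10003.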
*/ + argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid); + CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d " + "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd, + acct_tag, uid); + if (argc < 2) { + res = -EINVAL; + goto err; + } + el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ + if (!el_socket) { + pr_info("qtaguid: ctrl_tag(%s): failed to lookup" + " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", + input, sock_fd, res, current->pid, current->tgid, + current_fsuid()); + goto err; + } + CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n", + input, atomic_long_read(&el_socket->file->f_count), + el_socket->sk); + if (argc < 3) { + acct_tag = make_atag_from_value(0); + } else if (!valid_atag(acct_tag)) { + pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input); + res = -EINVAL; + goto err_put; + } + CT_DEBUG("qtaguid: ctrl_tag(%s): " + "pid=%u tgid=%u uid=%u euid=%u fsuid=%u " + "in_group=%d in_egroup=%d\n", + input, current->pid, current->tgid, current_uid(), + current_euid(), current_fsuid(), + in_group_p(proc_ctrl_write_gid), + in_egroup_p(proc_ctrl_write_gid)); + if (argc < 4) { + uid = current_fsuid(); + } else if (!can_impersonate_uid(uid)) { + pr_info("qtaguid: ctrl_tag(%s): " + "insufficient priv from pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + res = -EPERM; + goto err_put; + } + full_tag = combine_atag_with_uid(acct_tag, uid); + + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(el_socket->sk); + tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry); + if (IS_ERR(tag_ref_entry)) { + res = PTR_ERR(tag_ref_entry); + spin_unlock_bh(&sock_tag_list_lock); + goto err_put; + } + tag_ref_entry->num_sock_tags++; + if (sock_tag_entry) { + struct tag_ref *prev_tag_ref_entry; + + CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p " + "st@%p ...->f_count=%ld\n", + input, el_socket->sk, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count)); + /* + * This is a re-tagging, so release the sock_fd that was + * locked at the time of the 1st tagging. + * There is still the ref from this call's sockfd_lookup() so + * it can be done within the spinlock. + */ + sockfd_put(sock_tag_entry->socket); + prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, + &uid_tag_data_entry); + BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry)); + BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0); + prev_tag_ref_entry->num_sock_tags--; + sock_tag_entry->tag = full_tag; + } else { + CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n", + input, el_socket->sk); + sock_tag_entry = kzalloc(sizeof(*sock_tag_entry), + GFP_ATOMIC); + if (!sock_tag_entry) { + pr_err("qtaguid: ctrl_tag(%s): " + "socket tag alloc failed\n", + input); + spin_unlock_bh(&sock_tag_list_lock); + res = -ENOMEM; + goto err_tag_unref_put; + } + sock_tag_entry->sk = el_socket->sk; + sock_tag_entry->socket = el_socket; + sock_tag_entry->pid = current->tgid; + sock_tag_entry->tag = combine_atag_with_uid(acct_tag, + uid); + spin_lock_bh(&uid_tag_data_tree_lock); + pqd_entry = proc_qtu_data_tree_search( + &proc_qtu_data_tree, current->tgid); + /* + * TODO: remove if, and start failing. + * At first, we want to catch user-space code that is not + * opening the /dev/xt_qtaguid. + */ + if (IS_ERR_OR_NULL(pqd_entry)) + pr_warn_once( + "qtaguid: %s(): " + "User space forgot to open /dev/xt_qtaguid? 
" + "pid=%u tgid=%u uid=%u\n", __func__, + current->pid, current->tgid, + current_fsuid()); + else + list_add(&sock_tag_entry->list, + &pqd_entry->sock_tag_list); + spin_unlock_bh(&uid_tag_data_tree_lock); + + sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree); + atomic64_inc(&qtu_events.sockets_tagged); + } + spin_unlock_bh(&sock_tag_list_lock); + /* We keep the ref to the socket (file) until it is untagged */ + CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n", + input, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count)); + return 0; + +err_tag_unref_put: + BUG_ON(tag_ref_entry->num_sock_tags <= 0); + tag_ref_entry->num_sock_tags--; + free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry); +err_put: + CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n", + input, atomic_long_read(&el_socket->file->f_count) - 1); + /* Release the sock_fd that was grabbed by sockfd_lookup(). */ + sockfd_put(el_socket); + return res; + +err: + CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input); + return res; +} + +static int ctrl_cmd_untag(const char *input) +{ + char cmd; + int sock_fd = 0; + struct socket *el_socket; + int res, argc; + struct sock_tag *sock_tag_entry; + struct tag_ref *tag_ref_entry; + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + + argc = sscanf(input, "%c %d", &cmd, &sock_fd); + CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n", + input, argc, cmd, sock_fd); + if (argc < 2) { + res = -EINVAL; + goto err; + } + el_socket = sockfd_lookup(sock_fd, &res); /* This locks the file */ + if (!el_socket) { + pr_info("qtaguid: ctrl_untag(%s): failed to lookup" + " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n", + input, sock_fd, res, current->pid, current->tgid, + current_fsuid()); + goto err; + } + CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n", + input, atomic_long_read(&el_socket->file->f_count), + el_socket->sk); + spin_lock_bh(&sock_tag_list_lock); + sock_tag_entry = get_sock_stat_nl(el_socket->sk); + if (!sock_tag_entry) { + spin_unlock_bh(&sock_tag_list_lock); + res = -EINVAL; + goto err_put; + } + /* + * The socket already belongs to the current process + * so it can do whatever it wants to it. + */ + rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree); + + tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry); + BUG_ON(!tag_ref_entry); + BUG_ON(tag_ref_entry->num_sock_tags <= 0); + spin_lock_bh(&uid_tag_data_tree_lock); + pqd_entry = proc_qtu_data_tree_search( + &proc_qtu_data_tree, current->tgid); + /* + * TODO: remove if, and start failing. + * At first, we want to catch user-space code that is not + * opening the /dev/xt_qtaguid. + */ + if (IS_ERR_OR_NULL(pqd_entry)) + pr_warn_once("qtaguid: %s(): " + "User space forgot to open /dev/xt_qtaguid? " + "pid=%u tgid=%u uid=%u\n", __func__, + current->pid, current->tgid, current_fsuid()); + else + list_del(&sock_tag_entry->list); + spin_unlock_bh(&uid_tag_data_tree_lock); + /* + * We don't free tag_ref from the utd_entry here, + * only during a cmd_delete(). + */ + tag_ref_entry->num_sock_tags--; + spin_unlock_bh(&sock_tag_list_lock); + /* + * Release the sock_fd that was grabbed at tag time, + * and once more for the sockfd_lookup() here. + */ + sockfd_put(sock_tag_entry->socket); + CT_DEBUG("qtaguid: ctrl_untag(%s): done. 
st@%p ...->f_count=%ld\n", + input, sock_tag_entry, + atomic_long_read(&el_socket->file->f_count) - 1); + sockfd_put(el_socket); + + kfree(sock_tag_entry); + atomic64_inc(&qtu_events.sockets_untagged); + + return 0; + +err_put: + CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n", + input, atomic_long_read(&el_socket->file->f_count) - 1); + /* Release the sock_fd that was grabbed by sockfd_lookup(). */ + sockfd_put(el_socket); + return res; + +err: + CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input); + return res; +} + +static int qtaguid_ctrl_parse(const char *input, int count) +{ + char cmd; + int res; + + CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n", + input, current->pid, current->tgid, current_fsuid()); + + cmd = input[0]; + /* Collect params for commands */ + switch (cmd) { + case 'd': + res = ctrl_cmd_delete(input); + break; + + case 's': + res = ctrl_cmd_counter_set(input); + break; + + case 't': + res = ctrl_cmd_tag(input); + break; + + case 'u': + res = ctrl_cmd_untag(input); + break; + + default: + res = -EINVAL; + goto err; + } + if (!res) + res = count; +err: + CT_DEBUG("qtaguid: ctrl(%s): res=%d\n", input, res); + return res; +} + +#define MAX_QTAGUID_CTRL_INPUT_LEN 255 +static int qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN]; + + if (unlikely(module_passive)) + return count; + + if (count >= MAX_QTAGUID_CTRL_INPUT_LEN) + return -EINVAL; + + if (copy_from_user(input_buf, buffer, count)) + return -EFAULT; + + input_buf[count] = '\0'; + return qtaguid_ctrl_parse(input_buf, count); +} + +struct proc_print_info { + char *outp; + char **num_items_returned; + struct iface_stat *iface_entry; + struct tag_stat *ts_entry; + int item_index; + int items_to_skip; + int char_count; +}; + +static int pp_stats_line(struct proc_print_info *ppi, int cnt_set) +{ + int len; + struct data_counters *cnts; + + if (!ppi->item_index) { + if (ppi->item_index++ < ppi->items_to_skip) + return 0; + len = snprintf(ppi->outp, ppi->char_count, + "idx iface acct_tag_hex uid_tag_int cnt_set " + "rx_bytes rx_packets " + "tx_bytes tx_packets " + "rx_tcp_bytes rx_tcp_packets " + "rx_udp_bytes rx_udp_packets " + "rx_other_bytes rx_other_packets " + "tx_tcp_bytes tx_tcp_packets " + "tx_udp_bytes tx_udp_packets " + "tx_other_bytes tx_other_packets\n"); + } else { + tag_t tag = ppi->ts_entry->tn.tag; + uid_t stat_uid = get_uid_from_tag(tag); + /* Detailed tags are not available to everybody */ + if (get_atag_from_tag(tag) + && !can_read_other_uid_stats(stat_uid)) { + CT_DEBUG("qtaguid: stats line: " + "%s 0x%llx %u: insufficient priv " + "from pid=%u tgid=%u uid=%u\n", + ppi->iface_entry->ifname, + get_atag_from_tag(tag), stat_uid, + current->pid, current->tgid, current_fsuid()); + return 0; + } + if (ppi->item_index++ < ppi->items_to_skip) + return 0; + cnts = &ppi->ts_entry->counters; + len = snprintf( + ppi->outp, ppi->char_count, + "%d %s 0x%llx %u %u " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu " + "%llu %llu\n", + ppi->item_index, + ppi->iface_entry->ifname, + get_atag_from_tag(tag), + stat_uid, + cnt_set, + dc_sum_bytes(cnts, cnt_set, IFS_RX), + dc_sum_packets(cnts, cnt_set, IFS_RX), + dc_sum_bytes(cnts, cnt_set, IFS_TX), + dc_sum_packets(cnts, cnt_set, IFS_TX), + cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes, + cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets, + cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes, + 
cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets, + cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes, + cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets, + cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes, + cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets); + } + return len; +} + +static bool pp_sets(struct proc_print_info *ppi) +{ + int len; + int counter_set; + for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS; + counter_set++) { + len = pp_stats_line(ppi, counter_set); + if (len >= ppi->char_count) { + *ppi->outp = '\0'; + return false; + } + if (len) { + ppi->outp += len; + ppi->char_count -= len; + (*ppi->num_items_returned)++; + } + } + return true; +} + +/* + * Procfs reader to get all tag stats using style "1)" as described in + * fs/proc/generic.c + * Groups all protocols tx/rx bytes. + */ +static int qtaguid_stats_proc_read(char *page, char **num_items_returned, + off_t items_to_skip, int char_count, int *eof, + void *data) +{ + struct proc_print_info ppi; + int len; + + ppi.outp = page; + ppi.item_index = 0; + ppi.char_count = char_count; + ppi.num_items_returned = num_items_returned; + ppi.items_to_skip = items_to_skip; + + if (unlikely(module_passive)) { + len = pp_stats_line(&ppi, 0); + /* The header should always be shorter than the buffer. */ + BUG_ON(len >= ppi.char_count); + (*num_items_returned)++; + *eof = 1; + return len; + } + + CT_DEBUG("qtaguid:proc stats pid=%u tgid=%u uid=%u " + "page=%p *num_items_returned=%p off=%ld " + "char_count=%d *eof=%d\n", + current->pid, current->tgid, current_fsuid(), + page, *num_items_returned, + items_to_skip, char_count, *eof); + + if (*eof) + return 0; + + /* The idx is there to help debug when things go belly up. */ + len = pp_stats_line(&ppi, 0); + /* Don't advance the outp unless the whole line was printed */ + if (len >= ppi.char_count) { + *ppi.outp = '\0'; + return ppi.outp - page; + } + if (len) { + ppi.outp += len; + ppi.char_count -= len; + (*num_items_returned)++; + } + + spin_lock_bh(&iface_stat_list_lock); + list_for_each_entry(ppi.iface_entry, &iface_stat_list, list) { + struct rb_node *node; + spin_lock_bh(&ppi.iface_entry->tag_stat_list_lock); + for (node = rb_first(&ppi.iface_entry->tag_stat_tree); + node; + node = rb_next(node)) { + ppi.ts_entry = rb_entry(node, struct tag_stat, tn.node); + if (!pp_sets(&ppi)) { + spin_unlock_bh( + &ppi.iface_entry->tag_stat_list_lock); + spin_unlock_bh(&iface_stat_list_lock); + return ppi.outp - page; + } + } + spin_unlock_bh(&ppi.iface_entry->tag_stat_list_lock); + } + spin_unlock_bh(&iface_stat_list_lock); + + *eof = 1; + return ppi.outp - page; +} + +/*------------------------------------------*/ +static int qtudev_open(struct inode *inode, struct file *file) +{ + struct uid_tag_data *utd_entry; + struct proc_qtu_data *pqd_entry; + struct proc_qtu_data *new_pqd_entry; + int res; + bool utd_entry_found; + + if (unlikely(qtu_proc_handling_passive)) + return 0; + + DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n", + current->pid, current->tgid, current_fsuid()); + + spin_lock_bh(&uid_tag_data_tree_lock); + + /* Look for existing uid data, or alloc one. 
*/ + utd_entry = get_uid_data(current_fsuid(), &utd_entry_found); + if (IS_ERR_OR_NULL(utd_entry)) { + res = PTR_ERR(utd_entry); + goto err_unlock; + } + + /* Look for existing PID based proc_data */ + pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree, + current->tgid); + if (pqd_entry) { + pr_err("qtaguid: qtudev_open(): %u/%u %u " + "%s already opened\n", + current->pid, current->tgid, current_fsuid(), + QTU_DEV_NAME); + res = -EBUSY; + goto err_unlock_free_utd; + } + + new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC); + if (!new_pqd_entry) { + pr_err("qtaguid: qtudev_open(): %u/%u %u: " + "proc data alloc failed\n", + current->pid, current->tgid, current_fsuid()); + res = -ENOMEM; + goto err_unlock_free_utd; + } + new_pqd_entry->pid = current->tgid; + INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list); + new_pqd_entry->parent_tag_data = utd_entry; + utd_entry->num_pqd++; + + proc_qtu_data_tree_insert(new_pqd_entry, + &proc_qtu_data_tree); + + spin_unlock_bh(&uid_tag_data_tree_lock); + DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n", + current_fsuid(), new_pqd_entry); + file->private_data = new_pqd_entry; + return 0; + +err_unlock_free_utd: + if (!utd_entry_found) { + rb_erase(&utd_entry->node, &uid_tag_data_tree); + kfree(utd_entry); + } +err_unlock: + spin_unlock_bh(&uid_tag_data_tree_lock); + return res; +} + +static int qtudev_release(struct inode *inode, struct file *file) +{ + struct proc_qtu_data *pqd_entry = file->private_data; + struct uid_tag_data *utd_entry = pqd_entry->parent_tag_data; + struct sock_tag *st_entry; + struct rb_root st_to_free_tree = RB_ROOT; + struct list_head *entry, *next; + struct tag_ref *tr; + + if (unlikely(qtu_proc_handling_passive)) + return 0; + + /* + * Do not trust the current->pid, it might just be a kworker cleaning + * up after a dead proc. + */ + DR_DEBUG("qtaguid: qtudev_release(): " + "pid=%u tgid=%u uid=%u " + "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n", + current->pid, current->tgid, pqd_entry->parent_tag_data->uid, + pqd_entry, pqd_entry->pid, utd_entry, + utd_entry->num_active_tags); + + spin_lock_bh(&sock_tag_list_lock); + spin_lock_bh(&uid_tag_data_tree_lock); + + list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) { + st_entry = list_entry(entry, struct sock_tag, list); + DR_DEBUG("qtaguid: %s(): " + "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n", + __func__, + st_entry, st_entry->sk, + current->pid, current->tgid, + pqd_entry->parent_tag_data->uid); + + utd_entry = uid_tag_data_tree_search( + &uid_tag_data_tree, + get_uid_from_tag(st_entry->tag)); + BUG_ON(IS_ERR_OR_NULL(utd_entry)); + DR_DEBUG("qtaguid: %s(): " + "looking for tag=0x%llx in utd_entry=%p\n", __func__, + st_entry->tag, utd_entry); + tr = tag_ref_tree_search(&utd_entry->tag_ref_tree, + st_entry->tag); + BUG_ON(!tr); + BUG_ON(tr->num_sock_tags <= 0); + tr->num_sock_tags--; + free_tag_ref_from_utd_entry(tr, utd_entry); + + rb_erase(&st_entry->sock_node, &sock_tag_tree); + list_del(&st_entry->list); + /* Can't sockfd_put() within spinlock, do it later. */ + sock_tag_tree_insert(st_entry, &st_to_free_tree); + + /* + * Try to free the utd_entry if no other proc_qtu_data is + * using it (num_pqd is 0) and it doesn't have active tags + * (num_active_tags is 0). 
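 + * (put_utd_entry() below is presumably what enforces this, only freeing + * the entry once both counts have dropped to zero.)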
+ */ + put_utd_entry(utd_entry); + } + + rb_erase(&pqd_entry->node, &proc_qtu_data_tree); + BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1); + pqd_entry->parent_tag_data->num_pqd--; + put_utd_entry(pqd_entry->parent_tag_data); + kfree(pqd_entry); + file->private_data = NULL; + + spin_unlock_bh(&uid_tag_data_tree_lock); + spin_unlock_bh(&sock_tag_list_lock); + + + sock_tag_tree_erase(&st_to_free_tree); + + prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__, + current->pid, current->tgid); + return 0; +} + +/*------------------------------------------*/ +static const struct file_operations qtudev_fops = { + .owner = THIS_MODULE, + .open = qtudev_open, + .release = qtudev_release, +}; + +static struct miscdevice qtu_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = QTU_DEV_NAME, + .fops = &qtudev_fops, + /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */ +}; + +/*------------------------------------------*/ +static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir) +{ + int ret; + *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net); + if (!*res_procdir) { + pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n"); + ret = -ENOMEM; + goto no_dir; + } + + xt_qtaguid_ctrl_file = create_proc_entry("ctrl", proc_ctrl_perms, + *res_procdir); + if (!xt_qtaguid_ctrl_file) { + pr_err("qtaguid: failed to create xt_qtaguid/ctrl " + " file\n"); + ret = -ENOMEM; + goto no_ctrl_entry; + } + xt_qtaguid_ctrl_file->read_proc = qtaguid_ctrl_proc_read; + xt_qtaguid_ctrl_file->write_proc = qtaguid_ctrl_proc_write; + + xt_qtaguid_stats_file = create_proc_entry("stats", proc_stats_perms, + *res_procdir); + if (!xt_qtaguid_stats_file) { + pr_err("qtaguid: failed to create xt_qtaguid/stats " + "file\n"); + ret = -ENOMEM; + goto no_stats_entry; + } + xt_qtaguid_stats_file->read_proc = qtaguid_stats_proc_read; + /* + * TODO: add support for counter hacking + * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write; + */ + return 0; + +no_stats_entry: + remove_proc_entry("ctrl", *res_procdir); +no_ctrl_entry: + remove_proc_entry("xt_qtaguid", NULL); +no_dir: + return ret; +} + +static struct xt_match qtaguid_mt_reg __read_mostly = { + /* + * This module masquerades as the "owner" module so that iptables + * tools can deal with it. + */ + .name = "owner", + .revision = 1, + .family = NFPROTO_UNSPEC, + .match = qtaguid_mt, + .matchsize = sizeof(struct xt_qtaguid_match_info), + .me = THIS_MODULE, +}; + +static int __init qtaguid_mt_init(void) +{ + if (qtaguid_proc_register(&xt_qtaguid_procdir) + || iface_stat_init(xt_qtaguid_procdir) + || xt_register_match(&qtaguid_mt_reg) + || misc_register(&qtu_device)) + return -1; + return 0; +} + +/* + * TODO: allow unloading of the module. + * For now stats are permanent. + * Kconfig forces 'y/n' and never an 'm'. + */ + +module_init(qtaguid_mt_init); +MODULE_AUTHOR("jpa <jpa@google.com>"); +MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_owner"); +MODULE_ALIAS("ip6t_owner"); +MODULE_ALIAS("ipt_qtaguid"); +MODULE_ALIAS("ip6t_qtaguid"); diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h new file mode 100644 index 00000000..d79f8383 --- /dev/null +++ b/net/netfilter/xt_qtaguid_internal.h @@ -0,0 +1,333 @@ +/* + * Kernel iptables module to track stats for packets based on user tags. 
+ * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __XT_QTAGUID_INTERNAL_H__ +#define __XT_QTAGUID_INTERNAL_H__ + +#include <linux/types.h> +#include <linux/rbtree.h> +#include <linux/spinlock_types.h> +#include <linux/workqueue.h> + +/* Iface handling */ +#define IDEBUG_MASK (1<<0) +/* Iptable Matching. Per packet. */ +#define MDEBUG_MASK (1<<1) +/* Red-black tree handling. Per packet. */ +#define RDEBUG_MASK (1<<2) +/* procfs ctrl/stats handling */ +#define CDEBUG_MASK (1<<3) +/* dev and resource tracking */ +#define DDEBUG_MASK (1<<4) + +/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */ +#define DEFAULT_DEBUG_MASK 0 + +/* + * (Un)Define these *DEBUG to compile out/in the pr_debug calls. + * All undef: text size ~ 0x3030; all def: ~ 0x4404. + */ +#define IDEBUG +#define MDEBUG +#define RDEBUG +#define CDEBUG +#define DDEBUG + +#define MSK_DEBUG(mask, ...) do { \ + if (unlikely(qtaguid_debug_mask & (mask))) \ + pr_debug(__VA_ARGS__); \ + } while (0) +#ifdef IDEBUG +#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__) +#else +#define IF_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef MDEBUG +#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__) +#else +#define MT_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef RDEBUG +#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__) +#else +#define RB_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef CDEBUG +#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__) +#else +#define CT_DEBUG(...) no_printk(__VA_ARGS__) +#endif +#ifdef DDEBUG +#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__) +#else +#define DR_DEBUG(...) no_printk(__VA_ARGS__) +#endif + +extern uint qtaguid_debug_mask; + +/*---------------------------------------------------------------------------*/ +/* + * Tags: + * + * They represent what the data usage counters will be tracked against. + * By default a tag is just based on the UID. + * The UID is used as the base for policing, and can not be ignored. + * So a tag will always at least represent a UID (uid_tag). + * + * A tag can be augmented with an "accounting tag" which is associated + * with a UID. + * User space can set the acct_tag portion of the tag which is then used + * with sockets: all data belonging to that socket will be counted against the + * tag. The policing is then based on the tag's uid_tag portion, + * and stats are collected for the acct_tag portion separately. + * + * There could be + * a: {acct_tag=1, uid_tag=10003} + * b: {acct_tag=2, uid_tag=10003} + * c: {acct_tag=3, uid_tag=10003} + * d: {acct_tag=0, uid_tag=10003} + * a, b, and c represent tags associated with specific sockets. + * d is for the totals for that uid, including all untagged traffic. + * Typically d is used with policing/quota rules. + * + * We want tag_t big enough to distinguish uid_t and acct_tag. + * It might become a struct if needed. + * Nothing should be using it as an int. + */ +typedef uint64_t tag_t; /* Only used via accessors */ + +#define TAG_UID_MASK 0xFFFFFFFFULL +#define TAG_ACCT_MASK (~0xFFFFFFFFULL) + +static inline int tag_compare(tag_t t1, tag_t t2) +{ + return t1 < t2 ? -1 : t1 == t2 ? 
0 : 1; +} + +static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid) +{ + return acct_tag | uid; +} +static inline tag_t make_tag_from_uid(uid_t uid) +{ + return uid; +} +static inline uid_t get_uid_from_tag(tag_t tag) +{ + return tag & TAG_UID_MASK; +} +static inline tag_t get_utag_from_tag(tag_t tag) +{ + return tag & TAG_UID_MASK; +} +static inline tag_t get_atag_from_tag(tag_t tag) +{ + return tag & TAG_ACCT_MASK; +} + +static inline bool valid_atag(tag_t tag) +{ + return !(tag & TAG_UID_MASK); +} +static inline tag_t make_atag_from_value(uint32_t value) +{ + return (uint64_t)value << 32; +} +/*---------------------------------------------------------------------------*/ + +/* + * Maximum number of socket tags that a UID is allowed to have active. + * Multiple processes belonging to the same UID contribute towards this limit. + * Special UIDs that can impersonate a UID also contribute (e.g. download + * manager, ...) + */ +#define DEFAULT_MAX_SOCK_TAGS 1024 + +/* + * For now we only track 2 sets of counters. + * The default set is 0. + * Userspace can activate another set for a given uid being tracked. + */ +#define IFS_MAX_COUNTER_SETS 2 + +enum ifs_tx_rx { + IFS_TX, + IFS_RX, + IFS_MAX_DIRECTIONS +}; + +/* For now, TCP, UDP, the rest */ +enum ifs_proto { + IFS_TCP, + IFS_UDP, + IFS_PROTO_OTHER, + IFS_MAX_PROTOS +}; + +struct byte_packet_counters { + uint64_t bytes; + uint64_t packets; +}; + +struct data_counters { + struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS]; +}; + +/* Generic X based nodes used as a base for rb_tree ops */ +struct tag_node { + struct rb_node node; + tag_t tag; +}; + +struct tag_stat { + struct tag_node tn; + struct data_counters counters; + /* + * If this tag is acct_tag based, we need to count against the + * matching parent uid_tag. + */ + struct data_counters *parent_counters; +}; + +struct iface_stat { + struct list_head list; /* in iface_stat_list */ + char *ifname; + bool active; + /* net_dev is only valid for active iface_stat */ + struct net_device *net_dev; + + struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS]; + struct byte_packet_counters totals_via_skb[IFS_MAX_DIRECTIONS]; + /* + * We keep the last_known, because some devices reset their counters + * just before NETDEV_UP, while some will reset just before + * NETDEV_REGISTER (which is more normal). + * So now, if the device didn't do a NETDEV_UNREGISTER and we see + * its current dev stats smaller than what was previously known, we + * assume an UNREGISTER and just use the last_known. + */ + struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS]; + /* last_known is usable when last_known_valid is true */ + bool last_known_valid; + + struct proc_dir_entry *proc_ptr; + + struct rb_root tag_stat_tree; + spinlock_t tag_stat_list_lock; +}; + +/* This is needed to create proc_dir_entries from atomic context. */ +struct iface_stat_work { + struct work_struct iface_work; + struct iface_stat *iface_entry; +}; + +/* + * Track the tag that this socket is transferring data for, and not necessarily + * the uid that owns the socket. + * This is the tag against which tag_stat.counters will be billed. + * These structs need to be looked up by sock and pid. 
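 + * (The sock lookup goes through the rb-tree via sock_node, while the pid + * association comes from linking each entry into its + * proc_qtu_data.sock_tag_list.)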
+ */ +struct sock_tag { + struct rb_node sock_node; + struct sock *sk; /* Only used as a number, never dereferenced */ + /* The socket is needed for sockfd_put() */ + struct socket *socket; + /* Used to associate with a given pid */ + struct list_head list; /* in proc_qtu_data.sock_tag_list */ + pid_t pid; + + tag_t tag; +}; + +struct qtaguid_event_counts { + /* Various successful events */ + atomic64_t sockets_tagged; + atomic64_t sockets_untagged; + atomic64_t counter_set_changes; + atomic64_t delete_cmds; + atomic64_t iface_events; /* Number of NETDEV_* events handled */ + + atomic64_t match_calls; /* Number of times iptables called mt */ + /* Number of times iptables called mt from pre or post routing hooks */ + atomic64_t match_calls_prepost; + /* + * match_found_sk_*: numbers related to the netfilter matching + * function finding a sock for the sk_buff. + * Total skbs processed is sum(match_found*). + */ + atomic64_t match_found_sk; /* An sk was already in the sk_buff. */ + /* The connection tracker had or didn't have the sk. */ + atomic64_t match_found_sk_in_ct; + atomic64_t match_found_no_sk_in_ct; + /* + * No sk could be found. No apparent owner. Could happen with + * unsolicited traffic. + */ + atomic64_t match_no_sk; + /* + * The file ptr in the sk_socket wasn't there. + * This might happen for traffic while the socket is being closed. + */ + atomic64_t match_no_sk_file; +}; + +/* Track the active_set for the given tag. */ +struct tag_counter_set { + struct tag_node tn; + int active_set; +}; + +/*----------------------------------------------*/ +/* + * The qtu uid data is used to track resources that are created directly or + * indirectly by processes (uid tracked). + * It is shared by the processes with the same uid. + * Some of the resources will be counted to prevent further rogue allocations, + * some will need freeing once the owner process (uid) exits. + */ +struct uid_tag_data { + struct rb_node node; + uid_t uid; + + /* + * For the uid, how many accounting tags have been set. + */ + int num_active_tags; + /* Track the number of proc_qtu_data that reference it */ + int num_pqd; + struct rb_root tag_ref_tree; + /* No tag_node_tree_lock; use uid_tag_data_tree_lock */ +}; + +struct tag_ref { + struct tag_node tn; + + /* + * This tracks the number of active sockets that have a tag on them + * which matches this tag_ref.tn.tag. + * A tag ref can live on after the sockets are untagged. + * A tag ref can only be removed during a tag delete command. + */ + int num_sock_tags; +}; + +struct proc_qtu_data { + struct rb_node node; + pid_t pid; + + struct uid_tag_data *parent_tag_data; + + /* Tracks the sock_tags that need freeing upon this proc's death */ + struct list_head sock_tag_list; + /* No spinlock_t sock_tag_list_lock; use the global one. */ +}; + +/*----------------------------------------------*/ +#endif /* ifndef __XT_QTAGUID_INTERNAL_H__ */ diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c new file mode 100644 index 00000000..8cbd8e42 --- /dev/null +++ b/net/netfilter/xt_qtaguid_print.c @@ -0,0 +1,564 @@ +/* + * Pretty printing support for iptables xt_qtaguid module. + * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Most of the functions in this file just waste time if DEBUG is not defined. 
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed + * debug flags are not defined. + * Those funcs that fail to allocate memory will panic as there is no need to + * hobble along just pretending to do the requested work. + */ + +#define DEBUG + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/net.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/spinlock_types.h> + + +#include "xt_qtaguid_internal.h" +#include "xt_qtaguid_print.h" + +#ifdef DDEBUG + +static void _bug_on_err_or_null(void *ptr) +{ + if (IS_ERR_OR_NULL(ptr)) { + pr_err("qtaguid: kmalloc failed\n"); + BUG(); + } +} + +char *pp_tag_t(tag_t *tag) +{ + char *res; + + if (!tag) + res = kasprintf(GFP_ATOMIC, "tag_t@null{}"); + else + res = kasprintf(GFP_ATOMIC, + "tag_t@%p{tag=0x%llx, uid=%u}", + tag, *tag, get_uid_from_tag(*tag)); + _bug_on_err_or_null(res); + return res; +} + +char *pp_data_counters(struct data_counters *dc, bool showValues) +{ + char *res; + + if (!dc) + res = kasprintf(GFP_ATOMIC, "data_counters@null{}"); + else if (showValues) + res = kasprintf( + GFP_ATOMIC, "data_counters@%p{" + "set0{" + "rx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}, " + "tx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}}, " + "set1{" + "rx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}, " + "tx{" + "tcp{b=%llu, p=%llu}, " + "udp{b=%llu, p=%llu}," + "other{b=%llu, p=%llu}}}}", + dc, + dc->bpc[0][IFS_RX][IFS_TCP].bytes, + dc->bpc[0][IFS_RX][IFS_TCP].packets, + dc->bpc[0][IFS_RX][IFS_UDP].bytes, + dc->bpc[0][IFS_RX][IFS_UDP].packets, + dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes, + dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets, + dc->bpc[0][IFS_TX][IFS_TCP].bytes, + dc->bpc[0][IFS_TX][IFS_TCP].packets, + dc->bpc[0][IFS_TX][IFS_UDP].bytes, + dc->bpc[0][IFS_TX][IFS_UDP].packets, + dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes, + dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets, + dc->bpc[1][IFS_RX][IFS_TCP].bytes, + dc->bpc[1][IFS_RX][IFS_TCP].packets, + dc->bpc[1][IFS_RX][IFS_UDP].bytes, + dc->bpc[1][IFS_RX][IFS_UDP].packets, + dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes, + dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets, + dc->bpc[1][IFS_TX][IFS_TCP].bytes, + dc->bpc[1][IFS_TX][IFS_TCP].packets, + dc->bpc[1][IFS_TX][IFS_UDP].bytes, + dc->bpc[1][IFS_TX][IFS_UDP].packets, + dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes, + dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets); + else + res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc); + _bug_on_err_or_null(res); + return res; +} + +char *pp_tag_node(struct tag_node *tn) +{ + char *tag_str; + char *res; + + if (!tn) { + res = kasprintf(GFP_ATOMIC, "tag_node@null{}"); + _bug_on_err_or_null(res); + return res; + } + tag_str = pp_tag_t(&tn->tag); + res = kasprintf(GFP_ATOMIC, + "tag_node@%p{tag=%s}", + tn, tag_str); + _bug_on_err_or_null(res); + kfree(tag_str); + return res; +} + +char *pp_tag_ref(struct tag_ref *tr) +{ + char *tn_str; + char *res; + + if (!tr) { + res = kasprintf(GFP_ATOMIC, "tag_ref@null{}"); + _bug_on_err_or_null(res); + return res; + } + tn_str = pp_tag_node(&tr->tn); + res = kasprintf(GFP_ATOMIC, + "tag_ref@%p{%s, num_sock_tags=%d}", + tr, tn_str, tr->num_sock_tags); + _bug_on_err_or_null(res); + kfree(tn_str); + return res; +} + +char *pp_tag_stat(struct tag_stat *ts) +{ + char *tn_str; + char *counters_str; + char *parent_counters_str; + char *res; + + if (!ts) { + res = kasprintf(GFP_ATOMIC, "tag_stat@null{}"); + 
_bug_on_err_or_null(res); + return res; + } + tn_str = pp_tag_node(&ts->tn); + counters_str = pp_data_counters(&ts->counters, true); + parent_counters_str = pp_data_counters(ts->parent_counters, false); + res = kasprintf(GFP_ATOMIC, + "tag_stat@%p{%s, counters=%s, parent_counters=%s}", + ts, tn_str, counters_str, parent_counters_str); + _bug_on_err_or_null(res); + kfree(tn_str); + kfree(counters_str); + kfree(parent_counters_str); + return res; +} + +char *pp_iface_stat(struct iface_stat *is) +{ + char *res; + if (!is) + res = kasprintf(GFP_ATOMIC, "iface_stat@null{}"); + else + res = kasprintf(GFP_ATOMIC, "iface_stat@%p{" + "list=list_head{...}, " + "ifname=%s, " + "total_dev={rx={bytes=%llu, " + "packets=%llu}, " + "tx={bytes=%llu, " + "packets=%llu}}, " + "total_skb={rx={bytes=%llu, " + "packets=%llu}, " + "tx={bytes=%llu, " + "packets=%llu}}, " + "last_known_valid=%d, " + "last_known={rx={bytes=%llu, " + "packets=%llu}, " + "tx={bytes=%llu, " + "packets=%llu}}, " + "active=%d, " + "net_dev=%p, " + "proc_ptr=%p, " + "tag_stat_tree=rb_root{...}}", + is, + is->ifname, + is->totals_via_dev[IFS_RX].bytes, + is->totals_via_dev[IFS_RX].packets, + is->totals_via_dev[IFS_TX].bytes, + is->totals_via_dev[IFS_TX].packets, + is->totals_via_skb[IFS_RX].bytes, + is->totals_via_skb[IFS_RX].packets, + is->totals_via_skb[IFS_TX].bytes, + is->totals_via_skb[IFS_TX].packets, + is->last_known_valid, + is->last_known[IFS_RX].bytes, + is->last_known[IFS_RX].packets, + is->last_known[IFS_TX].bytes, + is->last_known[IFS_TX].packets, + is->active, + is->net_dev, + is->proc_ptr); + _bug_on_err_or_null(res); + return res; +} + +char *pp_sock_tag(struct sock_tag *st) +{ + char *tag_str; + char *res; + + if (!st) { + res = kasprintf(GFP_ATOMIC, "sock_tag@null{}"); + _bug_on_err_or_null(res); + return res; + } + tag_str = pp_tag_t(&st->tag); + res = kasprintf(GFP_ATOMIC, "sock_tag@%p{" + "sock_node=rb_node{...}, " + "sk=%p socket=%p (f_count=%lu), list=list_head{...}, " + "pid=%u, tag=%s}", + st, st->sk, st->socket, atomic_long_read( + &st->socket->file->f_count), + st->pid, tag_str); + _bug_on_err_or_null(res); + kfree(tag_str); + return res; +} + +char *pp_uid_tag_data(struct uid_tag_data *utd) +{ + char *res; + + if (!utd) + res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}"); + else + res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{" + "uid=%u, num_active_acct_tags=%d, " + "num_pqd=%d, " + "tag_node_tree=rb_root{...}, " + "proc_qtu_data_tree=rb_root{...}}", + utd, utd->uid, + utd->num_active_tags, utd->num_pqd); + _bug_on_err_or_null(res); + return res; +} + +char *pp_proc_qtu_data(struct proc_qtu_data *pqd) +{ + char *parent_tag_data_str; + char *res; + + if (!pqd) { + res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}"); + _bug_on_err_or_null(res); + return res; + } + parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data); + res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{" + "node=rb_node{...}, pid=%u, " + "parent_tag_data=%s, " + "sock_tag_list=list_head{...}}", + pqd, pqd->pid, parent_tag_data_str + ); + _bug_on_err_or_null(res); + kfree(parent_tag_data_str); + return res; +} + +/*------------------------------------------*/ +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree) +{ + struct rb_node *node; + struct sock_tag *sock_tag_entry; + char *str; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(sock_tag_tree)) { + str = "sock_tag_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = 
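+	/*
+	 * Editorial note, not in the original patch: the "%*d" used by the
+	 * pr_debug() calls in these dump helpers consumes two arguments,
+	 * the field width and the value, so
+	 *
+	 *	pr_debug("%*d: %s\n", indent_level * 2, indent_level, str);
+	 *
+	 * prints the level number right-aligned in a 2*indent_level wide
+	 * field, which is what produces the indented, tree-shaped output.
+	 */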
"sock_tag_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(sock_tag_tree); + node; + node = rb_next(node)) { + sock_tag_entry = rb_entry(node, struct sock_tag, sock_node); + str = pp_sock_tag(sock_tag_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_sock_tag_list(int indent_level, + struct list_head *sock_tag_list) +{ + struct sock_tag *sock_tag_entry; + char *str; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (list_empty(sock_tag_list)) { + str = "sock_tag_list=list_head{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "sock_tag_list=list_head{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + list_for_each_entry(sock_tag_entry, sock_tag_list, list) { + str = pp_sock_tag(sock_tag_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree) +{ + char *str; + struct rb_node *node; + struct proc_qtu_data *proc_qtu_data_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(proc_qtu_data_tree)) { + str = "proc_qtu_data_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "proc_qtu_data_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(proc_qtu_data_tree); + node; + node = rb_next(node)) { + proc_qtu_data_entry = rb_entry(node, + struct proc_qtu_data, + node); + str = pp_proc_qtu_data(proc_qtu_data_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, + str); + kfree(str); + indent_level++; + prdebug_sock_tag_list(indent_level, + &proc_qtu_data_entry->sock_tag_list); + indent_level--; + + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) +{ + char *str; + struct rb_node *node; + struct tag_ref *tag_ref_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(tag_ref_tree)) { + str = "tag_ref_tree{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "tag_ref_tree{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(tag_ref_tree); + node; + node = rb_next(node)) { + tag_ref_entry = rb_entry(node, + struct tag_ref, + tn.node); + str = pp_tag_ref(tag_ref_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, + str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree) +{ + char *str; + struct rb_node *node; + struct uid_tag_data *uid_tag_data_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(uid_tag_data_tree)) { + str = "uid_tag_data_tree=rb_root{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "uid_tag_data_tree=rb_root{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(uid_tag_data_tree); + node; + 
node = rb_next(node)) { + uid_tag_data_entry = rb_entry(node, struct uid_tag_data, + node); + str = pp_uid_tag_data(uid_tag_data_entry); + pr_debug("%*d: %s,\n", indent_level*2, indent_level, str); + kfree(str); + if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) { + indent_level++; + prdebug_tag_ref_tree(indent_level, + &uid_tag_data_entry->tag_ref_tree); + indent_level--; + } + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree) +{ + char *str; + struct rb_node *node; + struct tag_stat *ts_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (RB_EMPTY_ROOT(tag_stat_tree)) { + str = "tag_stat_tree{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "tag_stat_tree{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + for (node = rb_first(tag_stat_tree); + node; + node = rb_next(node)) { + ts_entry = rb_entry(node, struct tag_stat, tn.node); + str = pp_tag_stat(ts_entry); + pr_debug("%*d: %s\n", indent_level*2, indent_level, + str); + kfree(str); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list) +{ + char *str; + struct iface_stat *iface_entry; + + if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK)) + return; + + if (list_empty(iface_stat_list)) { + str = "iface_stat_list=list_head{}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + return; + } + + str = "iface_stat_list=list_head{"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + indent_level++; + list_for_each_entry(iface_entry, iface_stat_list, list) { + str = pp_iface_stat(iface_entry); + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); + kfree(str); + + spin_lock_bh(&iface_entry->tag_stat_list_lock); + if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) { + indent_level++; + prdebug_tag_stat_tree(indent_level, + &iface_entry->tag_stat_tree); + indent_level--; + } + spin_unlock_bh(&iface_entry->tag_stat_list_lock); + } + indent_level--; + str = "}"; + pr_debug("%*d: %s\n", indent_level*2, indent_level, str); +} + +#endif /* ifdef DDEBUG */ +/*------------------------------------------*/ +static const char * const netdev_event_strings[] = { + "netdev_unknown", + "NETDEV_UP", + "NETDEV_DOWN", + "NETDEV_REBOOT", + "NETDEV_CHANGE", + "NETDEV_REGISTER", + "NETDEV_UNREGISTER", + "NETDEV_CHANGEMTU", + "NETDEV_CHANGEADDR", + "NETDEV_GOING_DOWN", + "NETDEV_CHANGENAME", + "NETDEV_FEAT_CHANGE", + "NETDEV_BONDING_FAILOVER", + "NETDEV_PRE_UP", + "NETDEV_PRE_TYPE_CHANGE", + "NETDEV_POST_TYPE_CHANGE", + "NETDEV_POST_INIT", + "NETDEV_UNREGISTER_BATCH", + "NETDEV_RELEASE", + "NETDEV_NOTIFY_PEERS", + "NETDEV_JOIN", +}; + +const char *netdev_evt_str(int netdev_event) +{ + if (netdev_event < 0 + || netdev_event >= ARRAY_SIZE(netdev_event_strings)) + return "bad event num"; + return netdev_event_strings[netdev_event]; +} diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h new file mode 100644 index 00000000..b63871a0 --- /dev/null +++ b/net/netfilter/xt_qtaguid_print.h @@ -0,0 +1,120 @@ +/* + * Pretty printing Support for iptables xt_qtaguid module. 
+ * + * (C) 2011 Google, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __XT_QTAGUID_PRINT_H__ +#define __XT_QTAGUID_PRINT_H__ + +#include "xt_qtaguid_internal.h" + +#ifdef DDEBUG + +char *pp_tag_t(tag_t *tag); +char *pp_data_counters(struct data_counters *dc, bool showValues); +char *pp_tag_node(struct tag_node *tn); +char *pp_tag_ref(struct tag_ref *tr); +char *pp_tag_stat(struct tag_stat *ts); +char *pp_iface_stat(struct iface_stat *is); +char *pp_sock_tag(struct sock_tag *st); +char *pp_uid_tag_data(struct uid_tag_data *qtd); +char *pp_proc_qtu_data(struct proc_qtu_data *pqd); + +/*------------------------------------------*/ +void prdebug_sock_tag_list(int indent_level, + struct list_head *sock_tag_list); +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree); +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree); +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree); +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree); +void prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree); +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list); + +#else + +/*------------------------------------------*/ +static inline char *pp_tag_t(tag_t *tag) +{ + return NULL; +} +static inline char *pp_data_counters(struct data_counters *dc, bool showValues) +{ + return NULL; +} +static inline char *pp_tag_node(struct tag_node *tn) +{ + return NULL; +} +static inline char *pp_tag_ref(struct tag_ref *tr) +{ + return NULL; +} +static inline char *pp_tag_stat(struct tag_stat *ts) +{ + return NULL; +} +static inline char *pp_iface_stat(struct iface_stat *is) +{ + return NULL; +} +static inline char *pp_sock_tag(struct sock_tag *st) +{ + return NULL; +} +static inline char *pp_uid_tag_data(struct uid_tag_data *qtd) +{ + return NULL; +} +static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd) +{ + return NULL; +} + +/*------------------------------------------*/ +static inline +void prdebug_sock_tag_list(int indent_level, + struct list_head *sock_tag_list) +{ +} +static inline +void prdebug_sock_tag_tree(int indent_level, + struct rb_root *sock_tag_tree) +{ +} +static inline +void prdebug_proc_qtu_data_tree(int indent_level, + struct rb_root *proc_qtu_data_tree) +{ +} +static inline +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree) +{ +} +static inline +void prdebug_uid_tag_data_tree(int indent_level, + struct rb_root *uid_tag_data_tree) +{ +} +static inline +void prdebug_tag_stat_tree(int indent_level, + struct rb_root *tag_stat_tree) +{ +} +static inline +void prdebug_iface_stat_list(int indent_level, + struct list_head *iface_stat_list) +{ +} +#endif +/*------------------------------------------*/ +const char *netdev_evt_str(int netdev_event); +#endif /* ifndef __XT_QTAGUID_PRINT_H__ */ diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c new file mode 100644 index 00000000..44c8eb4c --- /dev/null +++ b/net/netfilter/xt_quota.c @@ -0,0 +1,90 @@ +/* + * netfilter module to enforce network quotas + * + * Sam Johnston <samj@samj.net> + */ +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_quota.h> +#include <linux/module.h> + +struct xt_quota_priv { 
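+	/*
+	 * Editorial note, not part of the original patch: the live countdown
+	 * state sits in this separately kmalloc'ed object (reached through
+	 * q->master) rather than in struct xt_quota_info itself, because
+	 * iptables re-copies the per-rule matchinfo blob on every table
+	 * update; only state kept behind a pointer survives those copies.
+	 */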
+	spinlock_t lock;
+	uint64_t quota;
+};

+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_DESCRIPTION("Xtables: countdown quota match");
+MODULE_ALIAS("ipt_quota");
+MODULE_ALIAS("ip6t_quota");

+static bool
+quota_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_info *q = (void *)par->matchinfo;
+	struct xt_quota_priv *priv = q->master;
+	bool ret = q->flags & XT_QUOTA_INVERT;
+
+	spin_lock_bh(&priv->lock);
+	if (priv->quota >= skb->len) {
+		priv->quota -= skb->len;
+		ret = !ret;
+	} else {
+		/* we do not allow even small packets from now on */
+		priv->quota = 0;
+	}
+	spin_unlock_bh(&priv->lock);
+
+	return ret;
+}

+static int quota_mt_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_info *q = par->matchinfo;
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return -EINVAL;
+
+	q->master = kmalloc(sizeof(*q->master), GFP_KERNEL);
+	if (q->master == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&q->master->lock);
+	q->master->quota = q->quota;
+	return 0;
+}

+static void quota_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_quota_info *q = par->matchinfo;
+
+	kfree(q->master);
+}

+static struct xt_match quota_mt_reg __read_mostly = {
+	.name       = "quota",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = quota_mt,
+	.checkentry = quota_mt_check,
+	.destroy    = quota_mt_destroy,
+	.matchsize  = sizeof(struct xt_quota_info),
+	.me         = THIS_MODULE,
+};

+static int __init quota_mt_init(void)
+{
+	return xt_register_match(&quota_mt_reg);
+}

+static void __exit quota_mt_exit(void)
+{
+	xt_unregister_match(&quota_mt_reg);
+}

+module_init(quota_mt_init);
+module_exit(quota_mt_exit);
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 00000000..fb2ef46b
--- /dev/null
+++ b/net/netfilter/xt_quota2.c
@@ -0,0 +1,382 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * netfilter module to enforce network quotas
+ * Sam Johnston <samj@samj.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License; either
+ * version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>

+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#endif

+/**
+ * @lock: lock to protect quota writers from each other
+ */
+struct xt_quota_counter {
+	u_int64_t quota;
+	spinlock_t lock;
+	struct list_head list;
+	atomic_t ref;
+	char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+	struct proc_dir_entry *procfs_entry;
+};

+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+static unsigned int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+		 "Event number for NETLINK_NFLOG message. 0 disables log. "
+ "111 is what ipt_ULOG uses."); +static struct sock *nflognl; +#endif + +static LIST_HEAD(counter_list); +static DEFINE_SPINLOCK(counter_list_lock); + +static struct proc_dir_entry *proc_xt_quota; +static unsigned int quota_list_perms = S_IRUGO | S_IWUSR; +static unsigned int quota_list_uid = 0; +static unsigned int quota_list_gid = 0; +module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR); +module_param_named(uid, quota_list_uid, uint, S_IRUGO | S_IWUSR); +module_param_named(gid, quota_list_gid, uint, S_IRUGO | S_IWUSR); + + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + ulog_packet_msg_t *pm; + struct sk_buff *log_skb; + size_t size; + struct nlmsghdr *nlh; + + if (!qlog_nl_event) + return; + + size = NLMSG_SPACE(sizeof(*pm)); + size = max(size, (size_t)NLMSG_GOODSIZE); + log_skb = alloc_skb(size, GFP_ATOMIC); + if (!log_skb) { + pr_err("xt_quota2: cannot alloc skb for logging\n"); + return; + } + + /* NLMSG_PUT() uses "goto nlmsg_failure" */ + nlh = NLMSG_PUT(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event, + sizeof(*pm)); + pm = NLMSG_DATA(nlh); + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + pm->data_len = 0; + pm->hook = hooknum; + if (prefix != NULL) + strlcpy(pm->prefix, prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; + if (in) + strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; + + if (out) + strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; + + NETLINK_CB(log_skb).dst_group = 1; + pr_debug("throwing 1 packets to netlink group 1\n"); + netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC); + +nlmsg_failure: /* Used within NLMSG_PUT() */ + pr_debug("xt_quota2: error during NLMSG_PUT\n"); +} +#else +static void quota2_log(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ +} +#endif /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */ + +static int quota_proc_read(char *page, char **start, off_t offset, + int count, int *eof, void *data) +{ + struct xt_quota_counter *e = data; + int ret; + + spin_lock_bh(&e->lock); + ret = snprintf(page, PAGE_SIZE, "%llu\n", e->quota); + spin_unlock_bh(&e->lock); + return ret; +} + +static int quota_proc_write(struct file *file, const char __user *input, + unsigned long size, void *data) +{ + struct xt_quota_counter *e = data; + char buf[sizeof("18446744073709551616")]; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + buf[sizeof(buf)-1] = '\0'; + + spin_lock_bh(&e->lock); + e->quota = simple_strtoull(buf, NULL, 0); + spin_unlock_bh(&e->lock); + return size; +} + +static struct xt_quota_counter * +q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon) +{ + struct xt_quota_counter *e; + unsigned int size; + + /* Do not need all the procfs things for anonymous counters. */ + size = anon ? 
offsetof(typeof(*e), list) : sizeof(*e);
+	e = kmalloc(size, GFP_KERNEL);
+	if (e == NULL)
+		return NULL;
+
+	e->quota = q->quota;
+	spin_lock_init(&e->lock);
+	if (!anon) {
+		INIT_LIST_HEAD(&e->list);
+		atomic_set(&e->ref, 1);
+		strlcpy(e->name, q->name, sizeof(e->name));
+	}
+	return e;
+}

+/**
+ * q2_get_counter - get ref to counter or create new
+ * @q:	quota match info holding the counter name
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+	struct proc_dir_entry *p;
+	struct xt_quota_counter *e = NULL;
+	struct xt_quota_counter *new_e;
+
+	if (*q->name == '\0')
+		return q2_new_counter(q, true);
+
+	/* No need to hold a lock while getting a new counter */
+	new_e = q2_new_counter(q, false);
+	if (new_e == NULL)
+		goto out;
+
+	spin_lock_bh(&counter_list_lock);
+	list_for_each_entry(e, &counter_list, list)
+		if (strcmp(e->name, q->name) == 0) {
+			atomic_inc(&e->ref);
+			spin_unlock_bh(&counter_list_lock);
+			kfree(new_e);
+			pr_debug("xt_quota2: old counter name=%s", e->name);
+			return e;
+		}
+	e = new_e;
+	pr_debug("xt_quota2: new_counter name=%s", e->name);
+	list_add_tail(&e->list, &counter_list);
+	/* The entry having a refcount of 1 is not directly destructible.
+	 * This function has not yet returned the new entry, so iptables
+	 * holds no reference through which it could destroy the entry.
+	 * For another rule to destroy it, this function would first have
+	 * to be re-invoked and acquire a new ref to the same named quota.
+	 * Nobody will access the e->procfs_entry either.
+	 * So release the lock. */
+	spin_unlock_bh(&counter_list_lock);

+	/* create_proc_entry() is not spin_lock happy */
+	p = e->procfs_entry = create_proc_entry(e->name, quota_list_perms,
+						proc_xt_quota);

+	if (IS_ERR_OR_NULL(p)) {
+		spin_lock_bh(&counter_list_lock);
+		list_del(&e->list);
+		spin_unlock_bh(&counter_list_lock);
+		goto out;
+	}
+	p->data = e;
+	p->read_proc = quota_proc_read;
+	p->write_proc = quota_proc_write;
+	p->uid = quota_list_uid;
+	p->gid = quota_list_gid;
+	return e;

+ out:
+	kfree(e);
+	return NULL;
+}

+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;

+	pr_debug("xt_quota2: check() flags=0x%04x", q->flags);

+	if (q->flags & ~XT_QUOTA_MASK)
+		return -EINVAL;

+	q->name[sizeof(q->name)-1] = '\0';
+	if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+		printk(KERN_ERR "xt_quota.3: illegal name\n");
+		return -EINVAL;
+	}

+	q->master = q2_get_counter(q);
+	if (q->master == NULL) {
+		printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+		return -ENOMEM;
+	}

+	return 0;
+}

+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+	struct xt_quota_counter *e = q->master;

+	if (*q->name == '\0') {
+		kfree(e);
+		return;
+	}

+	spin_lock_bh(&counter_list_lock);
+	if (!atomic_dec_and_test(&e->ref)) {
+		spin_unlock_bh(&counter_list_lock);
+		return;
+	}

+	list_del(&e->list);
+	remove_proc_entry(e->name, proc_xt_quota);
+	spin_unlock_bh(&counter_list_lock);
+	kfree(e);
+}

+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+	bool ret = q->flags & XT_QUOTA_INVERT;

+	spin_lock_bh(&e->lock);
+	if (q->flags & XT_QUOTA_GROW) {
+		/*
+		 * While no_change is pointless in "grow" mode, we will
+		 * implement it here simply to have a consistent behavior.
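+		 *
+		 * Editorial sketch, not part of the original patch: in GROW
+		 * mode the rule always matches and the counter only counts
+		 * upward, i.e. it degenerates into a pure accounting hook.
+		 * A hypothetical rule such as
+		 *
+		 *	iptables -A INPUT -m quota2 --name rx_bytes --grow
+		 *
+		 * would tally every INPUT byte into the counter readable at
+		 * /proc/net/xt_quota/rx_bytes.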
+ */ + if (!(q->flags & XT_QUOTA_NO_CHANGE)) { + e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + } + ret = true; + } else { + if (e->quota >= skb->len) { + if (!(q->flags & XT_QUOTA_NO_CHANGE)) + e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len; + ret = !ret; + } else { + /* We are transitioning, log that fact. */ + if (e->quota) { + quota2_log(par->hooknum, + skb, + par->in, + par->out, + q->name); + } + /* we do not allow even small packets from now on */ + e->quota = 0; + } + } + spin_unlock_bh(&e->lock); + return ret; +} + +static struct xt_match quota_mt2_reg[] __read_mostly = { + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV4, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .me = THIS_MODULE, + }, + { + .name = "quota2", + .revision = 3, + .family = NFPROTO_IPV6, + .checkentry = quota_mt2_check, + .match = quota_mt2, + .destroy = quota_mt2_destroy, + .matchsize = sizeof(struct xt_quota_mtinfo2), + .me = THIS_MODULE, + }, +}; + +static int __init quota_mt2_init(void) +{ + int ret; + pr_debug("xt_quota2: init()"); + +#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, 1, NULL, + NULL, THIS_MODULE); + if (!nflognl) + return -ENOMEM; +#endif + + proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net); + if (proc_xt_quota == NULL) + return -EACCES; + + ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + if (ret < 0) + remove_proc_entry("xt_quota", init_net.proc_net); + pr_debug("xt_quota2: init() %d", ret); + return ret; +} + +static void __exit quota_mt2_exit(void) +{ + xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg)); + remove_proc_entry("xt_quota", init_net.proc_net); +} + +module_init(quota_mt2_init); +module_exit(quota_mt2_exit); +MODULE_DESCRIPTION("Xtables: countdown quota match; up counter"); +MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_quota2"); +MODULE_ALIAS("ip6t_quota2"); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c new file mode 100644 index 00000000..ed0db15a --- /dev/null +++ b/net/netfilter/xt_rateest.c @@ -0,0 +1,157 @@ +/* + * (C) 2007 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/gen_stats.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_rateest.h> +#include <net/netfilter/xt_rateest.h> + + +static bool +xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rateest_match_info *info = par->matchinfo; + struct gnet_stats_rate_est *r; + u_int32_t bps1, bps2, pps1, pps2; + bool ret = true; + + spin_lock_bh(&info->est1->lock); + r = &info->est1->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0; + pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0; + } else { + bps1 = r->bps; + pps1 = r->pps; + } + spin_unlock_bh(&info->est1->lock); + + if (info->flags & XT_RATEEST_MATCH_ABS) { + bps2 = info->bps2; + pps2 = info->pps2; + } else { + spin_lock_bh(&info->est2->lock); + r = &info->est2->rstats; + if (info->flags & XT_RATEEST_MATCH_DELTA) { + bps2 = info->bps2 >= r->bps ? 
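+			/*
+			 * Editorial note, not in the original patch: DELTA
+			 * mode compares against the configured rate minus
+			 * the estimator's current rate, saturating at 0
+			 * instead of wrapping; est2 mirrors the est1
+			 * computation above, and the switch below then
+			 * operates on the remaining headroom rather than
+			 * the absolute rate.
+			 */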
info->bps2 - r->bps : 0; + pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0; + } else { + bps2 = r->bps; + pps2 = r->pps; + } + spin_unlock_bh(&info->est2->lock); + } + + switch (info->mode) { + case XT_RATEEST_MATCH_LT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 < bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 < pps2; + break; + case XT_RATEEST_MATCH_GT: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 > bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 > pps2; + break; + case XT_RATEEST_MATCH_EQ: + if (info->flags & XT_RATEEST_MATCH_BPS) + ret &= bps1 == bps2; + if (info->flags & XT_RATEEST_MATCH_PPS) + ret &= pps1 == pps2; + break; + } + + ret ^= info->flags & XT_RATEEST_MATCH_INVERT ? true : false; + return ret; +} + +static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + struct xt_rateest *est1, *est2; + int ret = -EINVAL; + + if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | + XT_RATEEST_MATCH_REL)) != 1) + goto err1; + + if (!(info->flags & (XT_RATEEST_MATCH_BPS | XT_RATEEST_MATCH_PPS))) + goto err1; + + switch (info->mode) { + case XT_RATEEST_MATCH_EQ: + case XT_RATEEST_MATCH_LT: + case XT_RATEEST_MATCH_GT: + break; + default: + goto err1; + } + + ret = -ENOENT; + est1 = xt_rateest_lookup(info->name1); + if (!est1) + goto err1; + + est2 = NULL; + if (info->flags & XT_RATEEST_MATCH_REL) { + est2 = xt_rateest_lookup(info->name2); + if (!est2) + goto err2; + } + + info->est1 = est1; + info->est2 = est2; + return 0; + +err2: + xt_rateest_put(est1); +err1: + return ret; +} + +static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_rateest_match_info *info = par->matchinfo; + + xt_rateest_put(info->est1); + if (info->est2) + xt_rateest_put(info->est2); +} + +static struct xt_match xt_rateest_mt_reg __read_mostly = { + .name = "rateest", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, +}; + +static int __init xt_rateest_mt_init(void) +{ + return xt_register_match(&xt_rateest_mt_reg); +} + +static void __exit xt_rateest_mt_fini(void) +{ + xt_unregister_match(&xt_rateest_mt_reg); +} + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("xtables rate estimator match"); +MODULE_ALIAS("ipt_rateest"); +MODULE_ALIAS("ip6t_rateest"); +module_init(xt_rateest_mt_init); +module_exit(xt_rateest_mt_fini); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c new file mode 100644 index 00000000..459a7b25 --- /dev/null +++ b/net/netfilter/xt_realm.c @@ -0,0 +1,54 @@ +/* IP tables module for matching the routing realm + * + * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <net/route.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter/xt_realm.h> +#include <linux/netfilter/x_tables.h> + +MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: Routing realm match"); +MODULE_ALIAS("ipt_realm"); + +static bool +realm_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_realm_info *info = par->matchinfo; + const struct dst_entry *dst = skb_dst(skb); + + return (info->id == (dst->tclassid & info->mask)) ^ info->invert; +} + +static struct xt_match realm_mt_reg __read_mostly = { + .name = "realm", + .match = realm_mt, + .matchsize = sizeof(struct xt_realm_info), + .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), + .family = NFPROTO_UNSPEC, + .me = THIS_MODULE +}; + +static int __init realm_mt_init(void) +{ + return xt_register_match(&realm_mt_reg); +} + +static void __exit realm_mt_exit(void) +{ + xt_unregister_match(&realm_mt_reg); +} + +module_init(realm_mt_init); +module_exit(realm_mt_exit); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c new file mode 100644 index 00000000..d2ff15a2 --- /dev/null +++ b/net/netfilter/xt_recent.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost <sfrost@snowman.net> + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/bitops.h> +#include <linux/skbuff.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_recent.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_recent"); +MODULE_ALIAS("ip6t_recent"); + +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; +static unsigned int ip_list_uid = 0; +static unsigned int ip_list_gid = 0; +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); +module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR); +module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP address to remember (max. 
255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid, "default owner of /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid, "default owning group of /proc/net/xt_recent/* files"); + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + union nf_inet_addr addr; + u_int16_t family; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; +}; + +struct recent_table { + struct list_head list; + char name[XT_RECENT_NAME_LEN]; + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; +}; + +struct recent_net { + struct list_head tables; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *xt_recent; +#endif +}; + +static int recent_net_id; +static inline struct recent_net *recent_pernet(struct net *net) +{ + return net_generic(net, recent_net_id); +} + +static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); + +#ifdef CONFIG_PROC_FS +static const struct file_operations recent_old_fops, recent_mt_fops; +#endif + +static u_int32_t hash_rnd __read_mostly; +static bool hash_rnd_inited __read_mostly; + +static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) +{ + return jhash_1word((__force u32)addr->ip, hash_rnd) & + (ip_list_hash_size - 1); +} + +static inline unsigned int recent_entry_hash6(const union nf_inet_addr *addr) +{ + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & + (ip_list_hash_size - 1); +} + +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, + const union nf_inet_addr *addrp, u_int16_t family, + u_int8_t ttl) +{ + struct recent_entry *e; + unsigned int h; + + if (family == NFPROTO_IPV4) + h = recent_entry_hash4(addrp); + else + h = recent_entry_hash6(addrp); + + list_for_each_entry(e, &table->iphash[h], list) + if (e->family == family && + memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && + (ttl == e->ttl || ttl == 0 || e->ttl == 0)) + return e; + return NULL; +} + +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} + +/* + * Drop entries with timestamps older then 'time'. + */ +static void recent_entry_reap(struct recent_table *t, unsigned long time) +{ + struct recent_entry *e; + + /* + * The head of the LRU list is always the oldest entry. + */ + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + + /* + * The last time stamp is the most recent. 
+ */ + if (time_after(time, e->stamps[e->index-1])) + recent_entry_remove(t, e); +} + +static struct recent_entry * +recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, + u_int16_t family, u_int8_t ttl) +{ + struct recent_entry *e; + + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); + } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + memcpy(&e->addr, addr, sizeof(e->addr)); + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + e->family = family; + if (family == NFPROTO_IPV4) + list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); + else + list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} + +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->index %= ip_pkt_list_tot; + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + list_move_tail(&e->lru_list, &t->lru_list); +} + +static struct recent_table *recent_table_lookup(struct recent_net *recent_net, + const char *name) +{ + struct recent_table *t; + + list_for_each_entry(t, &recent_net->tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} + +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; + + for (i = 0; i < ip_list_hash_size; i++) + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); +} + +static bool +recent_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct net *net = dev_net(par->in ? par->in : par->out); + struct recent_net *recent_net = recent_pernet(net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; + struct recent_entry *e; + union nf_inet_addr addr = {}; + u_int8_t ttl; + bool ret = info->invert; + + if (par->family == NFPROTO_IPV4) { + const struct iphdr *iph = ip_hdr(skb); + + if (info->side == XT_RECENT_DEST) + addr.ip = iph->daddr; + else + addr.ip = iph->saddr; + + ttl = iph->ttl; + } else { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (info->side == XT_RECENT_DEST) + memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); + else + memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); + + ttl = iph->hop_limit; + } + + /* use TTL as seen before forwarding */ + if (par->out != NULL && skb->sk == NULL) + ttl++; + + spin_lock_bh(&recent_lock); + t = recent_table_lookup(recent_net, info->name); + e = recent_entry_lookup(t, &addr, par->family, + (info->check_set & XT_RECENT_TTL) ? 
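+			/*
+			 * Editorial note, not in the original patch: passing
+			 * 0 here is the wildcard case; recent_entry_lookup()
+			 * treats a TTL of 0 (on either side) as "match any
+			 * stored TTL", so lookups are only TTL-strict when
+			 * the rule sets XT_RECENT_TTL.
+			 */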
ttl : 0); + if (e == NULL) { + if (!(info->check_set & XT_RECENT_SET)) + goto out; + e = recent_entry_init(t, &addr, par->family, ttl); + if (e == NULL) + par->hotdrop = true; + ret = !ret; + goto out; + } + + if (info->check_set & XT_RECENT_SET) + ret = !ret; + else if (info->check_set & XT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret = !ret; + } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { + unsigned long time = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(time, e->stamps[i])) + continue; + if (!info->hit_count || ++hits >= info->hit_count) { + ret = !ret; + break; + } + } + + /* info->seconds must be non-zero */ + if (info->check_set & XT_RECENT_REAP) + recent_entry_reap(t, time); + } + + if (info->check_set & XT_RECENT_SET || + (info->check_set & XT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; +} + +static int recent_mt_check(const struct xt_mtchk_param *par) +{ + struct recent_net *recent_net = recent_pernet(par->net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; +#endif + unsigned i; + int ret = -EINVAL; + + if (unlikely(!hash_rnd_inited)) { + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_inited = true; + } + if (info->check_set & ~XT_RECENT_VALID_FLAGS) { + pr_info("Unsupported user space flags (%08x)\n", + info->check_set); + return -EINVAL; + } + if (hweight8(info->check_set & + (XT_RECENT_SET | XT_RECENT_REMOVE | + XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) + return -EINVAL; + if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && + (info->seconds || info->hit_count || + (info->check_set & XT_RECENT_MODIFIERS))) + return -EINVAL; + if ((info->check_set & XT_RECENT_REAP) && !info->seconds) + return -EINVAL; + if (info->hit_count > ip_pkt_list_tot) { + pr_info("hitcount (%u) is larger than " + "packets to be remembered (%u)\n", + info->hit_count, ip_pkt_list_tot); + return -EINVAL; + } + if (info->name[0] == '\0' || + strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) + return -EINVAL; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (t != NULL) { + t->refcnt++; + ret = 0; + goto out; + } + + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_KERNEL); + if (t == NULL) { + ret = -ENOMEM; + goto out; + } + t->refcnt = 1; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent, + &recent_mt_fops, t); + if (pde == NULL) { + kfree(t); + ret = -ENOMEM; + goto out; + } + pde->uid = ip_list_uid; + pde->gid = ip_list_gid; +#endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &recent_net->tables); + spin_unlock_bh(&recent_lock); + ret = 0; +out: + mutex_unlock(&recent_mutex); + return ret; +} + +static void recent_mt_destroy(const struct xt_mtdtor_param *par) +{ + struct recent_net *recent_net = recent_pernet(par->net); + const struct xt_recent_mtinfo *info = par->matchinfo; + struct recent_table *t; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(recent_net, info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, 
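+			/*
+			 * Editorial note, not part of the original patch:
+			 * recent_mutex serializes checkentry/destroy, while
+			 * the recent_lock spinlock guards the entry lists
+			 * against the packet path and proc writers; that is
+			 * why the list_del() above ran under the spinlock
+			 * but the proc entry can be removed under the mutex
+			 * alone.
+			 */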
recent_net->xt_recent); +#endif + recent_table_flush(t); + kfree(t); + } + mutex_unlock(&recent_mutex); +} + +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + const struct recent_table *table; + unsigned int bucket; +}; + +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(recent_lock) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; + + spin_lock_bh(&recent_lock); + + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) + list_for_each_entry(e, &t->iphash[st->bucket], list) + if (p-- == 0) + return e; + return NULL; +} + +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + const struct recent_entry *e = v; + const struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); +} + +static void recent_seq_stop(struct seq_file *s, void *v) + __releases(recent_lock) +{ + spin_unlock_bh(&recent_lock); +} + +static int recent_seq_show(struct seq_file *seq, void *v) +{ + const struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + if (e->family == NFPROTO_IPV4) + seq_printf(seq, "src=%pI4 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.ip, e->ttl, e->stamps[i], e->index); + else + seq_printf(seq, "src=%pI6 ttl: %u last_seen: %lu oldest_pkt: %u", + &e->addr.in6, e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} + +static const struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; + +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct recent_iter_state *st; + + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); + if (st == NULL) + return -ENOMEM; + + st->table = pde->data; + return 0; +} + +static ssize_t +recent_mt_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + const char *c = buf; + union nf_inet_addr addr = {}; + u_int16_t family; + bool add, succ; + + if (size == 0) + return 0; + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + + /* Strict protocol! 
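+	 *
+	 * Editorial sketch, not part of the original patch, of the write
+	 * protocol this function implements (MYLIST stands for a table
+	 * created via a rule's --name option):
+	 *
+	 *	echo +10.0.0.1 > /proc/net/xt_recent/MYLIST	(add/update)
+	 *	echo -10.0.0.1 > /proc/net/xt_recent/MYLIST	(remove)
+	 *	echo / > /proc/net/xt_recent/MYLIST		(flush)
+	 *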
*/ + if (*loff != 0) + return -ESPIPE; + switch (*c) { + case '/': /* flush table */ + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return size; + case '-': /* remove address */ + add = false; + break; + case '+': /* add address */ + add = true; + break; + default: + pr_info("Need \"+ip\", \"-ip\" or \"/\"\n"); + return -EINVAL; + } + + ++c; + --size; + if (strnchr(c, size, ':') != NULL) { + family = NFPROTO_IPV6; + succ = in6_pton(c, size, (void *)&addr, '\n', NULL); + } else { + family = NFPROTO_IPV4; + succ = in4_pton(c, size, (void *)&addr, '\n', NULL); + } + + if (!succ) { + pr_info("illegal address written to procfs\n"); + return -EINVAL; + } + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, &addr, family, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, &addr, family, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + /* Note we removed one above */ + *loff += size + 1; + return size + 1; +} + +static const struct file_operations recent_mt_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_mt_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, + .llseek = seq_lseek, +}; + +static int __net_init recent_proc_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net); + if (!recent_net->xt_recent) + return -ENOMEM; + return 0; +} + +static void __net_exit recent_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "xt_recent"); +} +#else +static inline int recent_proc_net_init(struct net *net) +{ + return 0; +} + +static inline void recent_proc_net_exit(struct net *net) +{ +} +#endif /* CONFIG_PROC_FS */ + +static int __net_init recent_net_init(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + INIT_LIST_HEAD(&recent_net->tables); + return recent_proc_net_init(net); +} + +static void __net_exit recent_net_exit(struct net *net) +{ + struct recent_net *recent_net = recent_pernet(net); + + BUG_ON(!list_empty(&recent_net->tables)); + recent_proc_net_exit(net); +} + +static struct pernet_operations recent_net_ops = { + .init = recent_net_init, + .exit = recent_net_exit, + .id = &recent_net_id, + .size = sizeof(struct recent_net), +}; + +static struct xt_match recent_mt_reg[] __read_mostly = { + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV4, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 0, + .family = NFPROTO_IPV6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, +}; + +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = register_pernet_subsys(&recent_net_ops); + if (err) + return err; + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + if (err) + unregister_pernet_subsys(&recent_net_ops); + return err; +} + +static void __exit recent_mt_exit(void) +{ + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + unregister_pernet_subsys(&recent_net_ops); +} + +module_init(recent_mt_init); +module_exit(recent_mt_exit); diff --git 
a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h new file mode 100644 index 00000000..6efe4e5a --- /dev/null +++ b/net/netfilter/xt_repldata.h @@ -0,0 +1,35 @@ +/* + * Today's hack: quantum tunneling in structs + * + * 'entries' and 'term' are never anywhere referenced by word in code. In fact, + * they serve as the hanging-off data accessed through repl.data[]. + */ + +#define xt_alloc_initial_table(type, typ2) ({ \ + unsigned int hook_mask = info->valid_hooks; \ + unsigned int nhooks = hweight32(hook_mask); \ + unsigned int bytes = 0, hooknum = 0, i = 0; \ + struct { \ + struct type##_replace repl; \ + struct type##_standard entries[nhooks]; \ + struct type##_error term; \ + } *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); \ + if (tbl == NULL) \ + return NULL; \ + strncpy(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + tbl->term = (struct type##_error)typ2##_ERROR_INIT; \ + tbl->repl.valid_hooks = hook_mask; \ + tbl->repl.num_entries = nhooks + 1; \ + tbl->repl.size = nhooks * sizeof(struct type##_standard) + \ + sizeof(struct type##_error); \ + for (; hook_mask != 0; hook_mask >>= 1, ++hooknum) { \ + if (!(hook_mask & 1)) \ + continue; \ + tbl->repl.hook_entry[hooknum] = bytes; \ + tbl->repl.underflow[hooknum] = bytes; \ + tbl->entries[i++] = (struct type##_standard) \ + typ2##_STANDARD_INIT(NF_ACCEPT); \ + bytes += sizeof(struct type##_standard); \ + } \ + tbl; \ +}) diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c new file mode 100644 index 00000000..ef36a56a --- /dev/null +++ b/net/netfilter/xt_sctp.c @@ -0,0 +1,198 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/sctp/sctp.h> +#include <linux/sctp.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_sctp.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Xtables: SCTP protocol packet match"); +MODULE_ALIAS("ipt_sctp"); +MODULE_ALIAS("ip6t_sctp"); + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static bool +match_flags(const struct xt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) + if (flag_info[i].chunktype == chunktype) + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + + return true; +} + +static inline bool +match_packet(const struct sk_buff *skb, + unsigned int offset, + const struct xt_sctp_info *info, + bool *hotdrop) +{ + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + const sctp_chunkhdr_t *sch; + sctp_chunkhdr_t _sch; + int chunk_match_type = info->chunk_match_type; + const struct xt_sctp_flag_info *flag_info = info->flag_info; + int flag_count = info->flag_count; + +#ifdef DEBUG + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) + SCTP_CHUNKMAP_COPY(chunkmapcopy, info->chunkmap); + + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); + if (sch == NULL || sch->length == 0) { + pr_debug("Dropping invalid SCTP packet.\n"); + *hotdrop = true; + return false; + } +#ifdef DEBUG + pr_debug("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d" + "\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), + sch->flags); +#endif + offset += WORD_ROUND(ntohs(sch->length)); + + pr_debug("skb->len: %d\toffset: %d\n", skb->len, 
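+		/*
+		 * Editorial note, not in the original patch, on the three
+		 * chunk match types handled below: ANY succeeds on the first
+		 * chunk whose type and flags match; ALL clears every matched
+		 * type from chunkmapcopy and succeeds only if the copy is
+		 * empty once the walk finishes; ONLY fails as soon as any
+		 * chunk outside the configured chunkmap shows up.
+		 */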
offset); + + if (SCTP_CHUNKMAP_IS_SET(info->chunkmap, sch->type)) { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return true; + } + break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch->type, sch->flags)) + return false; + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return false; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: + return SCTP_CHUNKMAP_IS_CLEAR(chunkmapcopy); + case SCTP_CHUNK_MATCH_ANY: + return false; + case SCTP_CHUNK_MATCH_ONLY: + return true; + } + + /* This will never be reached, but required to stop compiler whine */ + return false; +} + +static bool +sctp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_sctp_info *info = par->matchinfo; + const sctp_sctphdr_t *sh; + sctp_sctphdr_t _sh; + + if (par->fragoff != 0) { + pr_debug("Dropping non-first fragment.. FIXME\n"); + return false; + } + + sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); + if (sh == NULL) { + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + pr_debug("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + + return SCCHECK(ntohs(sh->source) >= info->spts[0] + && ntohs(sh->source) <= info->spts[1], + XT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(ntohs(sh->dest) >= info->dpts[0] + && ntohs(sh->dest) <= info->dpts[1], + XT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t), + info, &par->hotdrop), + XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int sctp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_sctp_info *info = par->matchinfo; + + if (info->flags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~XT_SCTP_VALID_FLAGS) + return -EINVAL; + if (info->invflags & ~info->flags) + return -EINVAL; + if (!(info->flags & XT_SCTP_CHUNK_TYPES)) + return 0; + if (info->chunk_match_type & (SCTP_CHUNK_MATCH_ALL | + SCTP_CHUNK_MATCH_ANY | SCTP_CHUNK_MATCH_ONLY)) + return 0; + return -EINVAL; +} + +static struct xt_match sctp_mt_reg[] __read_mostly = { + { + .name = "sctp", + .family = NFPROTO_IPV4, + .checkentry = sctp_mt_check, + .match = sctp_mt, + .matchsize = sizeof(struct xt_sctp_info), + .proto = IPPROTO_SCTP, + .me = THIS_MODULE + }, + { + .name = "sctp", + .family = NFPROTO_IPV6, + .checkentry = sctp_mt_check, + .match = sctp_mt, + .matchsize = sizeof(struct xt_sctp_info), + .proto = IPPROTO_SCTP, + .me = THIS_MODULE + }, +}; + +static int __init sctp_mt_init(void) +{ + return xt_register_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); +} + +static void __exit sctp_mt_exit(void) +{ + xt_unregister_matches(sctp_mt_reg, ARRAY_SIZE(sctp_mt_reg)); +} + +module_init(sctp_mt_init); +module_exit(sctp_mt_exit); diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c new file mode 100644 index 00000000..0ec8138a --- /dev/null +++ b/net/netfilter/xt_set.c @@ -0,0 +1,422 @@ +/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> + * Patrick Schaaf <bof@bof.de> + * Martin Josefsson <gandalf@wlug.westbo.se> + * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can 
redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Kernel module which implements the set match and SET target + * for netfilter/iptables. */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_set.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("Xtables: IP set match and target module"); +MODULE_ALIAS("xt_SET"); +MODULE_ALIAS("ipt_set"); +MODULE_ALIAS("ip6t_set"); +MODULE_ALIAS("ipt_SET"); +MODULE_ALIAS("ip6t_SET"); + +static inline int +match_set(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, + const struct ip_set_adt_opt *opt, int inv) +{ + if (ip_set_test(index, skb, par, opt)) + inv = !inv; + return inv; +} + +#define ADT_OPT(n, f, d, fs, cfs, t) \ +const struct ip_set_adt_opt n = { \ + .family = f, \ + .dim = d, \ + .flags = fs, \ + .cmdflags = cfs, \ + .timeout = t, \ +} + +/* Revision 0 interface: backward compatible with netfilter/iptables */ + +static bool +set_match_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v0 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.u.compat.dim, + info->match_set.u.compat.flags, 0, UINT_MAX); + + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.u.compat.flags & IPSET_INV_MATCH); +} + +static void +compat_flags(struct xt_set_info_v0 *info) +{ + u_int8_t i; + + /* Fill out compatibility data according to enum ip_set_kopt */ + info->u.compat.dim = IPSET_DIM_ZERO; + if (info->u.flags[0] & IPSET_MATCH_INV) + info->u.compat.flags |= IPSET_INV_MATCH; + for (i = 0; i < IPSET_DIM_MAX-1 && info->u.flags[i]; i++) { + info->u.compat.dim++; + if (info->u.flags[i] & IPSET_SRC) + info->u.compat.flags |= (1<<info->u.compat.dim); + } +} + +static int +set_match_v0_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set indentified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(info->match_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->match_set); + + return 0; +} + +static void +set_match_v0_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v0 *info = par->matchinfo; + + ip_set_nfnl_put(info->match_set.index); +} + +static unsigned int +set_target_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.u.compat.dim, + info->add_set.u.compat.flags, 0, UINT_MAX); + ADT_OPT(del_opt, par->family, info->del_set.u.compat.dim, + info->del_set.u.compat.flags, 0, UINT_MAX); + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +static int +set_target_v0_checkentry(const struct xt_tgchk_param *par) +{ + struct xt_set_info_target_v0 *info = par->targinfo; + ip_set_id_t index; + + if 
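+	/*
+	 * Editorial note, not part of the original patch: each configured
+	 * set index is pinned via ip_set_nfnl_get_byindex(), and the
+	 * reference must be dropped with ip_set_nfnl_put() on every error
+	 * path below as well as in the ->destroy() hook; the revision 1
+	 * handlers later in the file follow the same pattern.
+	 */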
(info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.u.flags[IPSET_DIM_MAX-1] != 0 || + info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) { + pr_warning("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); + return -ERANGE; + } + + /* Fill out compatibility data */ + compat_flags(&info->add_set); + compat_flags(&info->del_set); + + return 0; +} + +static void +set_target_v0_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v0 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); +} + +/* Revision 1 match and target */ + +static bool +set_match_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v1 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, 0, UINT_MAX); + + return match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); +} + +static int +set_match_v1_checkentry(const struct xt_mtchk_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + ip_set_id_t index; + + index = ip_set_nfnl_get_byindex(info->match_set.index); + + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find set identified by id %u to match\n", + info->match_set.index); + return -ENOENT; + } + if (info->match_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: set match dimension " + "is over the limit!\n"); + ip_set_nfnl_put(info->match_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_match_v1_destroy(const struct xt_mtdtor_param *par) +{ + struct xt_set_info_match_v1 *info = par->matchinfo; + + ip_set_nfnl_put(info->match_set.index); +} + +static unsigned int +set_target_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, 0, UINT_MAX); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +static int +set_target_v1_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index =
ip_set_nfnl_get_byindex(info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warning("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + return -ENOENT; + } + } + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX) { + pr_warning("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v1_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v1 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(info->del_set.index); +} + +/* Revision 2 target */ + +static unsigned int +set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v2 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + + return XT_CONTINUE; +} + +#define set_target_v2_checkentry set_target_v1_checkentry +#define set_target_v2_destroy set_target_v1_destroy + +static struct xt_match set_matches[] __read_mostly = { + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 0, + .match = set_match_v0, + .matchsize = sizeof(struct xt_set_info_match_v0), + .checkentry = set_match_v0_checkentry, + .destroy = set_match_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 1, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 1, + .match = set_match_v1, + .matchsize = sizeof(struct xt_set_info_match_v1), + .checkentry = set_match_v1_checkentry, + .destroy = set_match_v1_destroy, + .me = THIS_MODULE + }, +}; + +static struct xt_target set_targets[] __read_mostly = { + { + .name = "SET", + .revision = 0, + .family = NFPROTO_IPV4, + .target = set_target_v0, + .targetsize = sizeof(struct xt_set_info_target_v0), + .checkentry = set_target_v0_checkentry, + .destroy = set_target_v0_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV4, + .target = set_target_v1, + .targetsize = sizeof(struct xt_set_info_target_v1), + .checkentry = set_target_v1_checkentry, + .destroy = set_target_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 1, + .family = NFPROTO_IPV6, + .target = set_target_v1, + .targetsize = sizeof(struct xt_set_info_target_v1), + .checkentry = set_target_v1_checkentry, + .destroy = set_target_v1_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 2, + .family = NFPROTO_IPV4, + .target = set_target_v2, + .targetsize = sizeof(struct xt_set_info_target_v2), + .checkentry = set_target_v2_checkentry, + .destroy = set_target_v2_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + 
.revision = 2, + .family = NFPROTO_IPV6, + .target = set_target_v2, + .targetsize = sizeof(struct xt_set_info_target_v2), + .checkentry = set_target_v2_checkentry, + .destroy = set_target_v2_destroy, + .me = THIS_MODULE + }, +}; + +static int __init xt_set_init(void) +{ + int ret = xt_register_matches(set_matches, ARRAY_SIZE(set_matches)); + + if (!ret) { + ret = xt_register_targets(set_targets, + ARRAY_SIZE(set_targets)); + if (ret) + xt_unregister_matches(set_matches, + ARRAY_SIZE(set_matches)); + } + return ret; +} + +static void __exit xt_set_fini(void) +{ + xt_unregister_matches(set_matches, ARRAY_SIZE(set_matches)); + xt_unregister_targets(set_targets, ARRAY_SIZE(set_targets)); +} + +module_init(xt_set_init); +module_exit(xt_set_fini); diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c new file mode 100644 index 00000000..1e48fcf2 --- /dev/null +++ b/net/netfilter/xt_socket.c @@ -0,0 +1,406 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/sock.h> +#include <net/inet_sock.h> +#include <net/netfilter/nf_tproxy_core.h> +#include <net/netfilter/ipv4/nf_defrag_ipv4.h> + +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) +#define XT_SOCKET_HAVE_IPV6 1 +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#endif + +#include <linux/netfilter/xt_socket.h> + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#define XT_SOCKET_HAVE_CONNTRACK 1 +#include <net/netfilter/nf_conntrack.h> +#endif + +void +xt_socket_put_sk(struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else + sock_put(sk); +} +EXPORT_SYMBOL(xt_socket_put_sk); + +static int +extract_icmp4_fields(const struct sk_buff *skb, + u8 *protocol, + __be32 *raddr, + __be32 *laddr, + __be16 *rport, + __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +struct sock* +xt_socket_get4_sk(const struct sk_buff *skb, struct xt_action_param *par) +{ + 
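/* Resolve the packet's protocol and address/port 4-tuple (from the TCP/UDP header, or from the headers quoted inside an ICMP error), then look up the owning local socket via tproxy. */ +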
const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + __be32 daddr, saddr; + __be16 dport, sport; + u8 protocol; +#ifdef XT_SOCKET_HAVE_CONNTRACK + struct nf_conn const *ct; + enum ip_conntrack_info ctinfo; +#endif + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + +#ifdef XT_SOCKET_HAVE_CONNTRACK + /* Do the lookup with the original socket address in case this is a + * reply packet of an established SNAT-ted connection. */ + + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); + + pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n", + protocol, &saddr, ntohs(sport), + &daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + + return sk; +} +EXPORT_SYMBOL(xt_socket_get4_sk); + +static bool +socket_match(const struct sk_buff *skb, struct xt_action_param *par, + const struct xt_socket_mtinfo1 *info) +{ + struct sock *sk; + + sk = xt_socket_get4_sk(skb, par); + if (sk != NULL) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->inet_rcv_saddr == 0); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + xt_socket_put_sk(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + return (sk != NULL); +} + +static bool +socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) +{ + return socket_match(skb, par, NULL); +} + +static bool +socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + return socket_match(skb, par, par->matchinfo); +} + +#ifdef XT_SOCKET_HAVE_IPV6 + +static int +extract_icmp6_fields(const struct sk_buff *skb, + unsigned int outside_hdrlen, + int *protocol, + struct in6_addr **raddr, + struct in6_addr **laddr, + __be16 *rport, + __be16 *lport) +{ + struct ipv6hdr *inside_iph, _inside_iph; + struct icmp6hdr *icmph, _icmph; + __be16 *ports, _ports[2]; + u8 inside_nexthdr; + __be16 inside_fragoff; + int inside_hdrlen; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) + return 1; + + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + inside_nexthdr = inside_iph->nexthdr; + + inside_hdrlen = 
ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), + &inside_nexthdr, &inside_fragoff); + if (inside_hdrlen < 0) + return 1; /* hjm: Packet has no/incomplete transport layer headers. */ + + if (inside_nexthdr != IPPROTO_TCP && + inside_nexthdr != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, inside_hdrlen, + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_nexthdr; + *laddr = &inside_iph->saddr; + *lport = ports[0]; + *raddr = &inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +struct sock* +xt_socket_get6_sk(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + struct in6_addr *daddr, *saddr; + __be16 dport, sport; + int thoff, tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet\n"); + return NULL; + } + + if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { + hp = skb_header_pointer(skb, thoff, + sizeof(_hdr), &_hdr); + if (hp == NULL) + return NULL; + + saddr = &iph->saddr; + sport = hp->source; + daddr = &iph->daddr; + dport = hp->dest; + + } else if (tproto == IPPROTO_ICMPV6) { + if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, + &sport, &dport)) + return NULL; + } else { + return NULL; + } + + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); + pr_debug("proto %hhd %pI6:%hu -> %pI6:%hu " + "(orig %pI6:%hu) sock %p\n", + tproto, saddr, ntohs(sport), + daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + return sk; +} +EXPORT_SYMBOL(xt_socket_get6_sk); + +static bool +socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct sock *sk; + const struct xt_socket_mtinfo1 *info; + + info = (struct xt_socket_mtinfo1 *) par->matchinfo; + sk = xt_socket_get6_sk(skb, par); + if (sk != NULL) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + xt_socket_put_sk(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + return (sk != NULL); +} +#endif + +static struct xt_match socket_mt_reg[] __read_mostly = { + { + .name = "socket", + .revision = 0, + .family = NFPROTO_IPV4, + .match = socket_mt4_v0, + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV4, + .match = socket_mt4_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#ifdef XT_SOCKET_HAVE_IPV6 + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init socket_mt_init(void) +{ + nf_defrag_ipv4_enable(); +#ifdef XT_SOCKET_HAVE_IPV6 +
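/* as with IPv4 above, reassemble IPv6 fragments so the socket lookup sees the full transport header */ +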
nf_defrag_ipv6_enable(); +#endif + + return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); +} + +static void __exit socket_mt_exit(void) +{ + xt_unregister_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); +} + +module_init(socket_mt_init); +module_exit(socket_mt_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("x_tables socket match module"); +MODULE_ALIAS("ipt_socket"); +MODULE_ALIAS("ip6t_socket"); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c new file mode 100644 index 00000000..a507922d --- /dev/null +++ b/net/netfilter/xt_state.c @@ -0,0 +1,79 @@ +/* Kernel module to match connection tracking information. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_state.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ip[6]_tables connection tracking state match module"); +MODULE_ALIAS("ipt_state"); +MODULE_ALIAS("ip6t_state"); + +static bool +state_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_state_info *sinfo = par->matchinfo; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (!ct) + statebit = XT_STATE_INVALID; + else { + if (nf_ct_is_untracked(ct)) + statebit = XT_STATE_UNTRACKED; + else + statebit = XT_STATE_BIT(ctinfo); + } + return (sinfo->statemask & statebit); +} + +static int state_mt_check(const struct xt_mtchk_param *par) +{ + int ret; + + ret = nf_ct_l3proto_try_module_get(par->family); + if (ret < 0) + pr_info("cannot load conntrack support for proto=%u\n", + par->family); + return ret; +} + +static void state_mt_destroy(const struct xt_mtdtor_param *par) +{ + nf_ct_l3proto_module_put(par->family); +} + +static struct xt_match state_mt_reg __read_mostly = { + .name = "state", + .family = NFPROTO_UNSPEC, + .checkentry = state_mt_check, + .match = state_mt, + .destroy = state_mt_destroy, + .matchsize = sizeof(struct xt_state_info), + .me = THIS_MODULE, +}; + +static int __init state_mt_init(void) +{ + return xt_register_match(&state_mt_reg); +} + +static void __exit state_mt_exit(void) +{ + xt_unregister_match(&state_mt_reg); +} + +module_init(state_mt_init); +module_exit(state_mt_exit); diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c new file mode 100644 index 00000000..4fe4fb42 --- /dev/null +++ b/net/netfilter/xt_statistic.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>. 
+ */ + +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/slab.h> + +#include <linux/netfilter/xt_statistic.h> +#include <linux/netfilter/x_tables.h> +#include <linux/module.h> + +struct xt_statistic_priv { + atomic_t count; +} ____cacheline_aligned_in_smp; + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)"); +MODULE_ALIAS("ipt_statistic"); +MODULE_ALIAS("ip6t_statistic"); + +static bool +statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + bool ret = info->flags & XT_STATISTIC_INVERT; + int nval, oval; + + switch (info->mode) { + case XT_STATISTIC_MODE_RANDOM: + if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + ret = !ret; + break; + case XT_STATISTIC_MODE_NTH: + do { + oval = atomic_read(&info->master->count); + nval = (oval == info->u.nth.every) ? 0 : oval + 1; + } while (atomic_cmpxchg(&info->master->count, oval, nval) != oval); + if (nval == 0) + ret = !ret; + break; + } + + return ret; +} + +static int statistic_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_statistic_info *info = par->matchinfo; + + if (info->mode > XT_STATISTIC_MODE_MAX || + info->flags & ~XT_STATISTIC_MASK) + return -EINVAL; + + info->master = kzalloc(sizeof(*info->master), GFP_KERNEL); + if (info->master == NULL) + return -ENOMEM; + atomic_set(&info->master->count, info->u.nth.count); + + return 0; +} + +static void statistic_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_statistic_info *info = par->matchinfo; + + kfree(info->master); +} + +static struct xt_match xt_statistic_mt_reg __read_mostly = { + .name = "statistic", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = statistic_mt, + .checkentry = statistic_mt_check, + .destroy = statistic_mt_destroy, + .matchsize = sizeof(struct xt_statistic_info), + .me = THIS_MODULE, +}; + +static int __init statistic_mt_init(void) +{ + return xt_register_match(&xt_statistic_mt_reg); +} + +static void __exit statistic_mt_exit(void) +{ + xt_unregister_match(&xt_statistic_mt_reg); +} + +module_init(statistic_mt_init); +module_exit(statistic_mt_exit); diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c new file mode 100644 index 00000000..d3c48b14 --- /dev/null +++ b/net/netfilter/xt_string.c @@ -0,0 +1,96 @@ +/* String matching match for iptables + * + * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_string.h> +#include <linux/textsearch.h> + +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); +MODULE_DESCRIPTION("Xtables: string-based matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_string"); +MODULE_ALIAS("ip6t_string"); + +static bool +string_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_string_info *conf = par->matchinfo; + struct ts_state state; + bool invert; + + memset(&state, 0, sizeof(struct ts_state)); + invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; + + return (skb_find_text((struct sk_buff *)skb, conf->from_offset, + conf->to_offset, conf->config, &state) + != UINT_MAX) ^ invert; +} + +#define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) + +static int string_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_string_info *conf = par->matchinfo; + struct ts_config *ts_conf; + int flags = TS_AUTOLOAD; + + /* Damn, can't handle this case properly with iptables... */ + if (conf->from_offset > conf->to_offset) + return -EINVAL; + if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') + return -EINVAL; + if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) + return -EINVAL; + if (conf->u.v1.flags & + ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) + return -EINVAL; + if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) + flags |= TS_IGNORECASE; + ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, + GFP_KERNEL, flags); + if (IS_ERR(ts_conf)) + return PTR_ERR(ts_conf); + + conf->config = ts_conf; + return 0; +} + +static void string_mt_destroy(const struct xt_mtdtor_param *par) +{ + textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); +} + +static struct xt_match xt_string_mt_reg __read_mostly = { + .name = "string", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = string_mt_check, + .match = string_mt, + .destroy = string_mt_destroy, + .matchsize = sizeof(struct xt_string_info), + .me = THIS_MODULE, +}; + +static int __init string_mt_init(void) +{ + return xt_register_match(&xt_string_mt_reg); +} + +static void __exit string_mt_exit(void) +{ + xt_unregister_match(&xt_string_mt_reg); +} + +module_init(string_mt_init); +module_exit(string_mt_exit); diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c new file mode 100644 index 00000000..c53d4d18 --- /dev/null +++ b/net/netfilter/xt_tcpmss.c @@ -0,0 +1,110 @@ +/* Kernel module to match TCP MSS values. */ + +/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> + * Portions (C) 2005 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/tcp.h> + +#include <linux/netfilter/xt_tcpmss.h> +#include <linux/netfilter/x_tables.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("Xtables: TCP MSS match"); +MODULE_ALIAS("ipt_tcpmss"); +MODULE_ALIAS("ip6t_tcpmss"); + +static bool +tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_tcpmss_match_info *info = par->matchinfo; + const struct tcphdr *th; + struct tcphdr _tcph; + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + const u_int8_t *op; + u8 _opt[15 * 4 - sizeof(_tcph)]; + unsigned int i, optlen; + + /* If we don't have the whole header, drop packet. */ + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) + goto dropit; + + /* Malformed. */ + if (th->doff*4 < sizeof(*th)) + goto dropit; + + optlen = th->doff*4 - sizeof(*th); + if (!optlen) + goto out; + + /* Truncated options. */ + op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt); + if (op == NULL) + goto dropit; + + for (i = 0; i < optlen; ) { + if (op[i] == TCPOPT_MSS + && (optlen - i) >= TCPOLEN_MSS + && op[i+1] == TCPOLEN_MSS) { + u_int16_t mssval; + + mssval = (op[i+2] << 8) | op[i+3]; + + return (mssval >= info->mss_min && + mssval <= info->mss_max) ^ info->invert; + } + if (op[i] < 2) + i++; + else + i += op[i+1] ? : 1; + } +out: + return info->invert; + +dropit: + par->hotdrop = true; + return false; +} + +static struct xt_match tcpmss_mt_reg[] __read_mostly = { + { + .name = "tcpmss", + .family = NFPROTO_IPV4, + .match = tcpmss_mt, + .matchsize = sizeof(struct xt_tcpmss_match_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "tcpmss", + .family = NFPROTO_IPV6, + .match = tcpmss_mt, + .matchsize = sizeof(struct xt_tcpmss_match_info), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, +}; + +static int __init tcpmss_mt_init(void) +{ + return xt_register_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); +} + +static void __exit tcpmss_mt_exit(void) +{ + xt_unregister_matches(tcpmss_mt_reg, ARRAY_SIZE(tcpmss_mt_reg)); +} + +module_init(tcpmss_mt_init); +module_exit(tcpmss_mt_exit); diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c new file mode 100644 index 00000000..c14d4645 --- /dev/null +++ b/net/netfilter/xt_tcpudp.c @@ -0,0 +1,234 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/types.h> +#include <linux/module.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_tcpudp.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("xt_tcp"); +MODULE_ALIAS("xt_udp"); +MODULE_ALIAS("ipt_udp"); +MODULE_ALIAS("ipt_tcp"); +MODULE_ALIAS("ip6t_udp"); +MODULE_ALIAS("ip6t_tcp"); + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline bool +port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static bool +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int protoff, + unsigned int optlen, + bool invert, + bool *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. 
max 15 * 4 bytes */ + const u_int8_t *op; + u_int8_t _opt[60 - sizeof(struct tcphdr)]; + unsigned int i; + + pr_debug("finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = true; + return false; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct tcphdr *th; + struct tcphdr _tcph; + const struct xt_tcp *tcpinfo = par->matchinfo; + + if (par->fragoff != 0) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (par->fragoff == 1) { + pr_debug("Dropping evil TCP offset=1 frag.\n"); + par->hotdrop = true; + } + /* Must not be a fragment. */ + return false; + } + +#define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) + + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + pr_debug("Dropping evil TCP offset=0 tinygram.\n"); + par->hotdrop = true; + return false; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) + return false; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) + return false; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + XT_TCP_INV_FLAGS)) + return false; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + par->hotdrop = true; + return false; + } + if (!tcp_find_option(tcpinfo->option, skb, par->thoff, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & XT_TCP_INV_OPTION, + &par->hotdrop)) + return false; + } + return true; +} + +static int tcp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_tcp *tcpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; +} + +static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct udphdr *uh; + struct udphdr _udph; + const struct xt_udp *udpinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + pr_debug("Dropping evil UDP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); +} + +static int udp_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_udp *udpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? 
-EINVAL : 0; +} + +static struct xt_match tcpudp_mt_reg[] __read_mostly = { + { + .name = "tcp", + .family = NFPROTO_IPV4, + .checkentry = tcp_mt_check, + .match = tcp_mt, + .matchsize = sizeof(struct xt_tcp), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "tcp", + .family = NFPROTO_IPV6, + .checkentry = tcp_mt_check, + .match = tcp_mt, + .matchsize = sizeof(struct xt_tcp), + .proto = IPPROTO_TCP, + .me = THIS_MODULE, + }, + { + .name = "udp", + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDP, + .me = THIS_MODULE, + }, + { + .name = "udp", + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDP, + .me = THIS_MODULE, + }, + { + .name = "udplite", + .family = NFPROTO_IPV4, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDPLITE, + .me = THIS_MODULE, + }, + { + .name = "udplite", + .family = NFPROTO_IPV6, + .checkentry = udp_mt_check, + .match = udp_mt, + .matchsize = sizeof(struct xt_udp), + .proto = IPPROTO_UDPLITE, + .me = THIS_MODULE, + }, +}; + +static int __init tcpudp_mt_init(void) +{ + return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); +} + +static void __exit tcpudp_mt_exit(void) +{ + xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); +} + +module_init(tcpudp_mt_init); +module_exit(tcpudp_mt_exit); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c new file mode 100644 index 00000000..c48975ff --- /dev/null +++ b/net/netfilter/xt_time.c @@ -0,0 +1,269 @@ +/* + * xt_time + * Copyright © CC Computer Consultants GmbH, 2007 + * + * based on ipt_time by Fabrice MARIE <fabrice@netfilter.org> + * This is a module which is used for time matching + * It is using some modified code from dietlibc (localtime() function) + * that you can find at http://www.fefe.de/dietlibc/ + * This file is distributed under the terms of the GNU General Public + * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. + */ +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_time.h> + +struct xtm { + u_int8_t month; /* (1-12) */ + u_int8_t monthday; /* (1-31) */ + u_int8_t weekday; /* (1-7) */ + u_int8_t hour; /* (0-23) */ + u_int8_t minute; /* (0-59) */ + u_int8_t second; /* (0-59) */ + unsigned int dse; +}; + +extern struct timezone sys_tz; /* ouch */ + +static const u_int16_t days_since_year[] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, +}; + +static const u_int16_t days_since_leapyear[] = { + 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, +}; + +/* + * Since time progresses forward, it is best to organize this array in reverse, + * to minimize lookup time. 
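+ * A 2015 timestamp, for example, ends the scan in localtime_3() after 25 comparisons from the 2039 end, versus 46 had the array started at 1970.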
+ */ +enum { + DSE_FIRST = 2039, +}; +static const u_int16_t days_since_epoch[] = { + /* 2039 - 2030 */ + 25202, 24837, 24472, 24106, 23741, 23376, 23011, 22645, 22280, 21915, + /* 2029 - 2020 */ + 21550, 21184, 20819, 20454, 20089, 19723, 19358, 18993, 18628, 18262, + /* 2019 - 2010 */ + 17897, 17532, 17167, 16801, 16436, 16071, 15706, 15340, 14975, 14610, + /* 2009 - 2000 */ + 14245, 13879, 13514, 13149, 12784, 12418, 12053, 11688, 11323, 10957, + /* 1999 - 1990 */ + 10592, 10227, 9862, 9496, 9131, 8766, 8401, 8035, 7670, 7305, + /* 1989 - 1980 */ + 6940, 6574, 6209, 5844, 5479, 5113, 4748, 4383, 4018, 3652, + /* 1979 - 1970 */ + 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, +}; + +static inline bool is_leap(unsigned int y) +{ + return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); +} + +/* + * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. + * Since we match against days and daytime, the SSTE value needs to be + * computed back into human-readable dates. + * + * This is done in three separate functions so that the most expensive + * calculations are done last, in case a "simple match" can be found earlier. + */ +static inline unsigned int localtime_1(struct xtm *r, time_t time) +{ + unsigned int v, w; + + /* Each day has 86400s, so finding the hour/minute is actually easy. */ + v = time % 86400; + r->second = v % 60; + w = v / 60; + r->minute = w % 60; + r->hour = w / 60; + return v; +} + +static inline void localtime_2(struct xtm *r, time_t time) +{ + /* + * Here comes the rest (weekday, monthday). First, divide the SSTE + * by seconds-per-day to get the number of _days_ since the epoch. + */ + r->dse = time / 86400; + + /* + * 1970-01-01 (w=0) was a Thursday (4). + * -1 and +1 map Sunday properly onto 7. + */ + r->weekday = (4 + r->dse - 1) % 7 + 1; +} + +static void localtime_3(struct xtm *r, time_t time) +{ + unsigned int year, i, w = r->dse; + + /* + * In each year, a certain number of days-since-the-epoch have passed. + * Find the year that is closest to said days. + * + * Consider, for example, w=21612 (2029-03-04). Loop will abort on + * dse[i] <= w, which happens when dse[i] == 21550. This implies + * year == 2009. w will then be 62. + */ + for (i = 0, year = DSE_FIRST; days_since_epoch[i] > w; + ++i, --year) + /* just loop */; + + w -= days_since_epoch[i]; + + /* + * By now we have the current year, and the day of the year. + * r->yearday = w; + * + * On to finding the month (like above). In each month, a certain + * number of days-since-New Year have passed, and find the closest + * one. + * + * Consider w=62 (in a non-leap year). Loop will abort on + * dsy[i] < w, which happens when dsy[i] == 31+28 (i == 2). + * Concludes i == 2, i.e. 3rd month => March. + * + * (A different approach to use would be to subtract a monthlength + * from w repeatedly while counting.) 
+ */ + if (is_leap(year)) { + /* use days_since_leapyear[] in a leap year */ + for (i = ARRAY_SIZE(days_since_leapyear) - 1; + i > 0 && days_since_leapyear[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_leapyear[i] + 1; + } else { + for (i = ARRAY_SIZE(days_since_year) - 1; + i > 0 && days_since_year[i] > w; --i) + /* just loop */; + r->monthday = w - days_since_year[i] + 1; + } + + r->month = i + 1; +} + +static bool +time_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + unsigned int packet_time; + struct xtm current_time; + s64 stamp; + + /* + * We cannot use get_seconds() instead of __net_timestamp() here. + * Suppose you have two rules: + * 1. match before 13:00 + * 2. match after 13:00 + * If you match against processing time (get_seconds) it + * may happen that the same packet matches both rules if + * it arrived at the right moment before 13:00. + */ + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + + stamp = ktime_to_ns(skb->tstamp); + stamp = div_s64(stamp, NSEC_PER_SEC); + + if (info->flags & XT_TIME_LOCAL_TZ) + /* Adjust for local timezone */ + stamp -= 60 * sys_tz.tz_minuteswest; + + /* + * xt_time will match when _all_ of the following hold: + * - 'now' is in the global time range date_start..date_end + * - 'now' is in the monthday mask + * - 'now' is in the weekday mask + * - 'now' is in the daytime range time_start..time_end + * (and by default, libxt_time will set these so as to match) + */ + + if (stamp < info->date_start || stamp > info->date_stop) + return false; + + packet_time = localtime_1(&current_time, stamp); + + if (info->daytime_start < info->daytime_stop) { + if (packet_time < info->daytime_start || + packet_time > info->daytime_stop) + return false; + } else { + if (packet_time < info->daytime_start && + packet_time > info->daytime_stop) + return false; + } + + localtime_2(&current_time, stamp); + + if (!(info->weekdays_match & (1 << current_time.weekday))) + return false; + + /* Do not spend time computing monthday if all days match anyway */ + if (info->monthdays_match != XT_TIME_ALL_MONTHDAYS) { + localtime_3(&current_time, stamp); + if (!(info->monthdays_match & (1 << current_time.monthday))) + return false; + } + + return true; +} + +static int time_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_time_info *info = par->matchinfo; + + if (info->daytime_start > XT_TIME_MAX_DAYTIME || + info->daytime_stop > XT_TIME_MAX_DAYTIME) { + pr_info("invalid argument - start or " + "stop time greater than 23:59:59\n"); + return -EDOM; + } + + return 0; +} + +static struct xt_match xt_time_mt_reg __read_mostly = { + .name = "time", + .family = NFPROTO_UNSPEC, + .match = time_mt, + .checkentry = time_mt_check, + .matchsize = sizeof(struct xt_time_info), + .me = THIS_MODULE, +}; + +static int __init time_mt_init(void) +{ + int minutes = sys_tz.tz_minuteswest; + + if (minutes < 0) /* east of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is +%02d%02d\n", + -minutes / 60, -minutes % 60); + else /* west of Greenwich */ + printk(KERN_INFO KBUILD_MODNAME + ": kernel timezone is -%02d%02d\n", + minutes / 60, minutes % 60); + + return xt_register_match(&xt_time_mt_reg); +} + +static void __exit time_mt_exit(void) +{ + xt_unregister_match(&xt_time_mt_reg); +} + +module_init(time_mt_init); +module_exit(time_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: time-based matching"); +MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_time"); +MODULE_ALIAS("ip6t_time"); diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c new file mode 100644 index 00000000..a95b5034 --- /dev/null +++ b/net/netfilter/xt_u32.c @@ -0,0 +1,123 @@ +/* + * xt_u32 - kernel module to match u32 packet content + * + * Original author: Don Cohen <don@isis.cs3-inc.com> + * (C) CC Computer Consultants GmbH, 2007 + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_u32.h> + +static bool u32_match_it(const struct xt_u32 *data, + const struct sk_buff *skb) +{ + const struct xt_u32_test *ct; + unsigned int testind; + unsigned int nnums; + unsigned int nvals; + unsigned int i; + __be32 n; + u_int32_t pos; + u_int32_t val; + u_int32_t at; + + /* + * Small example: "0 >> 28 == 4 && 8 & 0xFF0000 >> 16 = 6, 17" + * (=IPv4 and (TCP or UDP)). Outer loop runs over the "&&" operands. + */ + for (testind = 0; testind < data->ntests; ++testind) { + ct = &data->tests[testind]; + at = 0; + pos = ct->location[0].number; + + if (skb->len < 4 || pos > skb->len - 4) + return false; + + if (skb_copy_bits(skb, pos, &n, sizeof(n)) < 0) + BUG(); + val = ntohl(n); + nnums = ct->nnums; + + /* Inner loop runs over "&", "<<", ">>" and "@" operands */ + for (i = 1; i < nnums; ++i) { + u_int32_t number = ct->location[i].number; + switch (ct->location[i].nextop) { + case XT_U32_AND: + val &= number; + break; + case XT_U32_LEFTSH: + val <<= number; + break; + case XT_U32_RIGHTSH: + val >>= number; + break; + case XT_U32_AT: + if (at + val < at) + return false; + at += val; + pos = number; + if (at + 4 < at || skb->len < at + 4 || + pos > skb->len - at - 4) + return false; + + if (skb_copy_bits(skb, at + pos, &n, + sizeof(n)) < 0) + BUG(); + val = ntohl(n); + break; + } + } + + /* Run over the "," and ":" operands */ + nvals = ct->nvalues; + for (i = 0; i < nvals; ++i) + if (ct->value[i].min <= val && val <= ct->value[i].max) + break; + + if (i >= ct->nvalues) + return false; + } + + return true; +} + +static bool u32_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_u32 *data = par->matchinfo; + bool ret; + + ret = u32_match_it(data, skb); + return ret ^ data->invert; +} + +static struct xt_match xt_u32_mt_reg __read_mostly = { + .name = "u32", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = u32_mt, + .matchsize = sizeof(struct xt_u32), + .me = THIS_MODULE, +}; + +static int __init u32_mt_init(void) +{ + return xt_register_match(&xt_u32_mt_reg); +} + +static void __exit u32_mt_exit(void) +{ + xt_unregister_match(&xt_u32_mt_reg); +} + +module_init(u32_mt_init); +module_exit(u32_mt_exit); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("Xtables: arbitrary byte matching"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_u32"); +MODULE_ALIAS("ip6t_u32"); |