diff options
author | Srikant Patnaik | 2015-01-11 12:28:04 +0530 |
---|---|---|
committer | Srikant Patnaik | 2015-01-11 12:28:04 +0530 |
commit | 871480933a1c28f8a9fed4c4d34d06c439a7a422 (patch) | |
tree | 8718f573808810c2a1e8cb8fb6ac469093ca2784 /net/ipv6 | |
parent | 9d40ac5867b9aefe0722bc1f110b965ff294d30d (diff) | |
download | FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.gz FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.tar.bz2 FOSSEE-netbook-kernel-source-871480933a1c28f8a9fed4c4d34d06c439a7a422.zip |
Moved, renamed, and deleted files
The original directory structure was scattered and unorganized.
Changes are basically to make it look like kernel structure.
Diffstat (limited to 'net/ipv6')
72 files changed, 48192 insertions, 0 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig new file mode 100644 index 00000000..36d7437a --- /dev/null +++ b/net/ipv6/Kconfig @@ -0,0 +1,253 @@ +# +# IPv6 configuration +# + +# IPv6 as module will cause a CRASH if you try to unload it +menuconfig IPV6 + tristate "The IPv6 protocol" + default m + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. + + For general information about IPv6, see + <http://playground.sun.com/pub/ipng/html/ipng-main.html>. + For Linux IPv6 development information, see <http://www.linux-ipv6.org>. + For specific information about IPv6 under Linux, read the HOWTO at + <http://www.bieringer.de/linux/IPv6/>. + + To compile this protocol support as a module, choose M here: the + module will be called ipv6. + +if IPV6 + +config IPV6_PRIVACY + bool "IPv6: Privacy Extensions (RFC 3041) support" + ---help--- + Privacy Extensions for Stateless Address Autoconfiguration in IPv6 + support. With this option, additional periodically-altered + pseudo-random global-scope unicast address(es) will be assigned to + your interface(s). + + We use our standard pseudo-random algorithm to generate the + randomized interface identifier, instead of one described in RFC 3041. + + By default the kernel does not generate temporary addresses. + To use temporary addresses, do + + echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr + + See <file:Documentation/networking/ip-sysctl.txt> for details. + +config IPV6_ROUTER_PREF + bool "IPv6: Router Preference (RFC 4191) support" + ---help--- + Router Preference is an optional extension to the Router + Advertisement message which improves the ability of hosts + to pick an appropriate router, especially when the hosts + are placed in a multi-homed network. + + If unsure, say N. 
+ +config IPV6_ROUTE_INFO + bool "IPv6: Route Information (RFC 4191) support (EXPERIMENTAL)" + depends on IPV6_ROUTER_PREF && EXPERIMENTAL + ---help--- + This is experimental support of Route Information. + + If unsure, say N. + +config IPV6_OPTIMISTIC_DAD + bool "IPv6: Enable RFC 4429 Optimistic DAD (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + This is experimental support for optimistic Duplicate + Address Detection. It allows for autoconfigured addresses + to be used more quickly. + + If unsure, say N. + +config INET6_AH + tristate "IPv6: AH transformation" + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET6_ESP + tristate "IPv6: ESP transformation" + select XFRM + select CRYPTO + select CRYPTO_AUTHENC + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_CBC + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET6_IPCOMP + tristate "IPv6: IPComp transformation" + select INET6_XFRM_TUNNEL + select XFRM_IPCOMP + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config IPV6_MIP6 + tristate "IPv6: Mobility (EXPERIMENTAL)" + depends on EXPERIMENTAL + select XFRM + ---help--- + Support for IPv6 Mobility described in RFC 3775. + + If unsure, say N. + +config INET6_XFRM_TUNNEL + tristate + select INET6_TUNNEL + default n + +config INET6_TUNNEL + tristate + default n + +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. 
+ +config INET6_XFRM_MODE_BEET + tristate "IPv6: IPsec BEET mode" + default IPV6 + select XFRM + ---help--- + Support for IPsec BEET mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_ROUTEOPTIMIZATION + tristate "IPv6: MIPv6 route optimization mode (EXPERIMENTAL)" + depends on EXPERIMENTAL + select XFRM + ---help--- + Support for MIPv6 route optimization mode. + +config IPV6_SIT + tristate "IPv6: IPv6-in-IPv4 tunnel (SIT driver)" + select INET_TUNNEL + select IPV6_NDISC_NODETYPE + default y + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This driver implements encapsulation of IPv6 + into IPv4 packets. This is useful if you want to connect two IPv6 + networks over an IPv4-only path. + + Saying M here will produce a module called sit. If unsure, say Y. + +config IPV6_SIT_6RD + bool "IPv6: IPv6 Rapid Deployment (6RD) (EXPERIMENTAL)" + depends on IPV6_SIT && EXPERIMENTAL + default n + ---help--- + IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon + mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly + deploy IPv6 unicast service to IPv4 sites to which it provides + customer premises equipment. Like 6to4, it utilizes stateless IPv6 in + IPv4 encapsulation in order to transit IPv4-only network + infrastructure. Unlike 6to4, a 6rd service provider uses an IPv6 + prefix of its own in place of the fixed 6to4 prefix. + + With this option enabled, the SIT driver offers 6rd functionality by + providing an additional ioctl API to configure the IPv6 prefix used + instead of the static 2002::/16 of 6to4. + + If unsure, say N. + +config IPV6_NDISC_NODETYPE + bool + +config IPV6_TUNNEL + tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)" + select INET6_TUNNEL + ---help--- + Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in + RFC 2473. + + If unsure, say N. 
+ +config IPV6_MULTIPLE_TABLES + bool "IPv6: Multiple Routing Tables" + depends on EXPERIMENTAL + select FIB_RULES + ---help--- + Support multiple routing tables. + +config IPV6_SUBTREES + bool "IPv6: source address based routing" + depends on IPV6_MULTIPLE_TABLES + ---help--- + Enable routing by source address or prefix. + + The destination address is still the primary routing key, so mixing + normal and source prefix specific routes in the same routing table + may sometimes lead to unintended routing behavior. This can be + avoided by defining different routing tables for the normal and + source prefix specific routes. + + If unsure, say N. + +config IPV6_MROUTE + bool "IPv6: multicast routing (EXPERIMENTAL)" + depends on IPV6 && EXPERIMENTAL + ---help--- + Experimental support for IPv6 multicast forwarding. + If unsure, say N. + +config IPV6_MROUTE_MULTIPLE_TABLES + bool "IPv6: multicast policy routing" + depends on IPV6_MROUTE + select FIB_RULES + help + Normally, a multicast router runs a userspace daemon and decides + what to do with a multicast packet based on the source and + destination addresses. If you say Y here, the multicast router + will also be able to take interfaces and packet marks into + account and run multiple instances of userspace daemons + simultaneously, each one handling a single table. + + If unsure, say N. + +config IPV6_PIMSM_V2 + bool "IPv6: PIM-SM version 2 support (EXPERIMENTAL)" + depends on IPV6_MROUTE + ---help--- + Support for IPv6 PIM multicast routing protocol PIM-SMv2. + If unsure, say N. + +endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile new file mode 100644 index 00000000..753be5dd --- /dev/null +++ b/net/ipv6/Makefile @@ -0,0 +1,42 @@ +# +# Makefile for the Linux TCP/IP (INET6) layer. 
+# + +obj-$(CONFIG_IPV6) += ipv6.o + +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ + addrlabel.o \ + route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ + raw.o protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ + exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o + +ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o +ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o + +ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ + xfrm6_output.o +ipv6-$(CONFIG_NETFILTER) += netfilter.o +ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o +ipv6-$(CONFIG_PROC_FS) += proc.o +ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o + +ipv6-objs += $(ipv6-y) + +obj-$(CONFIG_INET6_AH) += ah6.o +obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o +obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o +obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o +obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o +obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o +obj-$(CONFIG_IPV6_MIP6) += mip6.o +obj-$(CONFIG_NETFILTER) += netfilter/ + +obj-$(CONFIG_IPV6_SIT) += sit.o +obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o + +obj-y += addrconf_core.o exthdrs_core.o + +obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c new file mode 100644 index 00000000..7d5cb975 --- /dev/null +++ b/net/ipv6/addrconf.c @@ -0,0 +1,4884 @@ +/* + * IPv6 Address [auto]configuration + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * Changes: + * + * Janos Farkas : delete timer on ifdown + * <chexum@bankinf.banki.hu> + * Andi Kleen : kill double kfree on module + * unload. + * Maciej W. Rozycki : FDDI support + * sekiya@USAGI : Don't send too many RS + * packets. + * yoshfuji@USAGI : Fixed interval between DAD + * packets. + * YOSHIFUJI Hideaki @USAGI : improved accuracy of + * address validation timer. + * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) + * support. + * Yuji SEKIYA @USAGI : Don't assign a same IPv6 + * address on a same interface. + * YOSHIFUJI Hideaki @USAGI : ARCnet support + * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to + * seq_file. + * YOSHIFUJI Hideaki @USAGI : improved source address + * selection; consider scope, + * status etc. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_addr.h> +#include <linux/if_arp.h> +#include <linux/if_arcnet.h> +#include <linux/if_infiniband.h> +#include <linux/route.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#include <linux/slab.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/capability.h> +#include <linux/delay.h> +#include <linux/notifier.h> +#include <linux/string.h> + +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/tcp.h> +#include <net/ip.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <linux/if_tunnel.h> +#include <linux/rtnetlink.h> + +#ifdef CONFIG_IPV6_PRIVACY +#include <linux/random.h> +#endif + +#include <linux/uaccess.h> +#include <asm/unaligned.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/export.h> + +/* Set to 3 to get tracing... 
*/ +#define ACONF_DEBUG 2 + +#if ACONF_DEBUG >= 3 +#define ADBG(x) printk x +#else +#define ADBG(x) +#endif + +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +static inline u32 cstamp_delta(unsigned long cstamp) +{ + return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; +} + +#define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ/50 : 1) +#define ADDRCONF_TIMER_FUZZ (HZ / 4) +#define ADDRCONF_TIMER_FUZZ_MAX (HZ) + +#ifdef CONFIG_SYSCTL +static void addrconf_sysctl_register(struct inet6_dev *idev); +static void addrconf_sysctl_unregister(struct inet6_dev *idev); +#else +static inline void addrconf_sysctl_register(struct inet6_dev *idev) +{ +} + +static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ +} +#endif + +#ifdef CONFIG_IPV6_PRIVACY +static int __ipv6_regen_rndid(struct inet6_dev *idev); +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_regen_rndid(unsigned long data); +#endif + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); +static int ipv6_count_addresses(struct inet6_dev *idev); + +/* + * Configured unicast address hash table + */ +static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); + +static void addrconf_verify(unsigned long); + +static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0); +static DEFINE_SPINLOCK(addrconf_verify_lock); + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp); +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); + +static void addrconf_type_change(struct net_device *dev, + unsigned long event); +static int addrconf_ifdown(struct net_device *dev, int how); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); +static void addrconf_dad_timer(unsigned long data); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_dad_run(struct inet6_dev *idev); +static void addrconf_rs_timer(unsigned long data); +static void __ipv6_ifa_notify(int event, 
struct inet6_ifaddr *ifa); +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo); +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev); + +static ATOMIC_NOTIFIER_HEAD(inet6addr_chain); + +static struct ipv6_devconf ipv6_devconf __read_mostly = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .force_mld_version = 0, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, + .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif +#endif + .proxy_ndp = 0, + .accept_source_route = 0, /* we do not accept RH0 by default. 
*/ + .disable_ipv6 = 0, + .accept_dad = 1, +}; + +static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, + .accept_ra_defrtr = 1, + .accept_ra_pinfo = 1, +#ifdef CONFIG_IPV6_ROUTER_PREF + .accept_ra_rtr_pref = 1, + .rtr_probe_interval = 60 * HZ, +#ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_max_plen = 0, +#endif +#endif + .proxy_ndp = 0, + .accept_source_route = 0, /* we do not accept RH0 by default. */ + .disable_ipv6 = 0, + .accept_dad = 1, +}; + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; +const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; +const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; + +/* Check if a valid qdisc is available */ +static inline bool addrconf_qdisc_ok(const struct net_device *dev) +{ + return !qdisc_tx_is_noop(dev); +} + +/* Check if a route is valid prefix route */ +static inline int addrconf_is_prefix_route(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0; +} + +static void addrconf_del_timer(struct inet6_ifaddr *ifp) +{ + if (del_timer(&ifp->timer)) + __in6_ifa_put(ifp); +} + +enum addrconf_timer_t { + AC_NONE, + AC_DAD, + AC_RS, +}; + +static void addrconf_mod_timer(struct inet6_ifaddr *ifp, + enum 
addrconf_timer_t what, + unsigned long when) +{ + if (!del_timer(&ifp->timer)) + in6_ifa_hold(ifp); + + switch (what) { + case AC_DAD: + ifp->timer.function = addrconf_dad_timer; + break; + case AC_RS: + ifp->timer.function = addrconf_rs_timer; + break; + default: + break; + } + ifp->timer.expires = jiffies + when; + add_timer(&ifp->timer); +} + +static int snmp6_alloc_dev(struct inet6_dev *idev) +{ + if (snmp_mib_init((void __percpu **)idev->stats.ipv6, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip; + idev->stats.icmpv6dev = kzalloc(sizeof(struct icmpv6_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6dev) + goto err_icmp; + idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), + GFP_KERNEL); + if (!idev->stats.icmpv6msgdev) + goto err_icmpmsg; + + return 0; + +err_icmpmsg: + kfree(idev->stats.icmpv6dev); +err_icmp: + snmp_mib_free((void __percpu **)idev->stats.ipv6); +err_ip: + return -ENOMEM; +} + +static void snmp6_free_dev(struct inet6_dev *idev) +{ + kfree(idev->stats.icmpv6msgdev); + kfree(idev->stats.icmpv6dev); + snmp_mib_free((void __percpu **)idev->stats.ipv6); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + WARN_ON(idev->mc_list != NULL); + +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? 
dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + pr_warning("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree_rcu(idev, rcu); +} + +EXPORT_SYMBOL(in6_dev_finish_destroy); + +static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +{ + struct inet6_dev *ndev; + + ASSERT_RTNL(); + + if (dev->mtu < IPV6_MIN_MTU) + return NULL; + + ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); + + if (ndev == NULL) + return NULL; + + rwlock_init(&ndev->lock); + ndev->dev = dev; + INIT_LIST_HEAD(&ndev->addr_list); + + memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); + ndev->cnf.mtu6 = dev->mtu; + ndev->cnf.sysctl = NULL; + ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); + if (ndev->nd_parms == NULL) { + kfree(ndev); + return NULL; + } + if (ndev->cnf.forwarding) + dev_disable_lro(dev); + /* We refer to the device */ + dev_hold(dev); + + if (snmp6_alloc_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot allocate memory for statistics; dev=%s.\n", + __func__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + dev_put(dev); + kfree(ndev); + return NULL; + } + + if (snmp6_register_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + /* One reference from device. We must do this before + * we invoke __ipv6_regen_rndid(). 
+ */ + in6_dev_hold(ndev); + + if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) + ndev->cnf.accept_dad = -1; + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) { + printk(KERN_INFO + "%s: Disabled Multicast RS\n", + dev->name); + ndev->cnf.rtr_solicits = 0; + } +#endif + +#ifdef CONFIG_IPV6_PRIVACY + INIT_LIST_HEAD(&ndev->tempaddr_list); + setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev); + if ((dev->flags&IFF_LOOPBACK) || + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6 || + dev->type == ARPHRD_SIT || + dev->type == ARPHRD_NONE) { + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); + ipv6_regen_rndid((unsigned long) ndev); + } +#endif + + if (netif_running(dev) && addrconf_qdisc_ok(dev)) + ndev->if_flags |= IF_READY; + + ipv6_mc_init_dev(ndev); + ndev->tstamp = jiffies; + addrconf_sysctl_register(ndev); + /* protected by rtnl_lock */ + rcu_assign_pointer(dev->ip6_ptr, ndev); + + /* Join all-node multicast group */ + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + + return ndev; +} + +static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = __in6_dev_get(dev); + if (!idev) { + idev = ipv6_add_dev(dev); + if (!idev) + return NULL; + } + + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); + return idev; +} + +#ifdef CONFIG_SYSCTL +static void dev_forward_change(struct inet6_dev *idev) +{ + struct net_device *dev; + struct inet6_ifaddr *ifa; + + if (!idev) + return; + dev = idev->dev; + if (idev->cnf.forwarding) + dev_disable_lro(dev); + if (dev && (dev->flags & IFF_MULTICAST)) { + if (idev->cnf.forwarding) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + else + ipv6_dev_mc_dec(dev, 
&in6addr_linklocal_allrouters); + } + + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (ifa->flags&IFA_F_TENTATIVE) + continue; + if (idev->cnf.forwarding) + addrconf_join_anycast(ifa); + else + addrconf_leave_anycast(ifa); + } +} + + +static void addrconf_forward_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.forwarding) ^ (!newf); + idev->cnf.forwarding = newf; + if (changed) + dev_forward_change(idev); + } + } + rcu_read_unlock(); +} + +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->forwarding) { + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->forwarding) { + net->ipv6.devconf_dflt->forwarding = newf; + addrconf_forward_change(net, newf); + } else if ((!newf) ^ (!old)) + dev_forward_change((struct inet6_dev *)table->extra1); + rtnl_unlock(); + + if (newf) + rt6_purge_dflt_routers(net); + return 1; +} +#endif + +/* Nobody refers to this ifaddr, destroy it */ +void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) +{ + WARN_ON(!hlist_unhashed(&ifp->addr_lst)); + +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); +#endif + + in6_dev_put(ifp->idev); + + if (del_timer(&ifp->timer)) + pr_notice("Timer is still running, when freeing ifa=%p\n", ifp); + + if (ifp->state != INET6_IFADDR_STATE_DEAD) { + pr_warning("Freeing alive inet6 address %p\n", ifp); + return; + } + dst_release(&ifp->rt->dst); + + kfree_rcu(ifp, rcu); +} + +static void +ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) +{ + struct list_head *p; + int ifp_scope = ipv6_addr_src_scope(&ifp->addr); + + /* + * Each device address list is sorted 
in order of scope - + * global before linklocal. + */ + list_for_each(p, &idev->addr_list) { + struct inet6_ifaddr *ifa + = list_entry(p, struct inet6_ifaddr, if_list); + if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + break; + } + + list_add_tail(&ifp->if_list, p); +} + +static u32 ipv6_addr_hash(const struct in6_addr *addr) +{ + /* + * We perform the hash function over the last 64 bits of the address + * This will include the IEEE address token on links that support it. + */ + return jhash_2words((__force u32)addr->s6_addr32[2], + (__force u32)addr->s6_addr32[3], 0) + & (IN6_ADDR_HSIZE - 1); +} + +/* On success it returns ifp with increased reference count */ + +static struct inet6_ifaddr * +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, + int scope, u32 flags) +{ + struct inet6_ifaddr *ifa = NULL; + struct rt6_info *rt; + unsigned int hash; + int err = 0; + int addr_type = ipv6_addr_type(addr); + + if (addr_type == IPV6_ADDR_ANY || + addr_type & IPV6_ADDR_MULTICAST || + (!(idev->dev->flags & IFF_LOOPBACK) && + addr_type & IPV6_ADDR_LOOPBACK)) + return ERR_PTR(-EADDRNOTAVAIL); + + rcu_read_lock_bh(); + if (idev->dead) { + err = -ENODEV; /*XXX*/ + goto out2; + } + + if (idev->cnf.disable_ipv6) { + err = -EACCES; + goto out2; + } + + spin_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { + ADBG(("ipv6_add_addr: already assigned\n")); + err = -EEXIST; + goto out; + } + + ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + + if (ifa == NULL) { + ADBG(("ipv6_add_addr: malloc failed\n")); + err = -ENOBUFS; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, false); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; + } + + ifa->addr = *addr; + + spin_lock_init(&ifa->lock); + spin_lock_init(&ifa->state_lock); + init_timer(&ifa->timer); + INIT_HLIST_NODE(&ifa->addr_lst); + ifa->timer.data = (unsigned long) ifa; + ifa->scope 
= scope; + ifa->prefix_len = pfxlen; + ifa->flags = flags | IFA_F_TENTATIVE; + ifa->cstamp = ifa->tstamp = jiffies; + + ifa->rt = rt; + + ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ + in6_ifa_hold(ifa); + + /* Add to big hash table */ + hash = ipv6_addr_hash(addr); + + hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + spin_unlock(&addrconf_hash_lock); + + write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. */ + ipv6_link_dev_addr(idev, ifa); + +#ifdef CONFIG_IPV6_PRIVACY + if (ifa->flags&IFA_F_TEMPORARY) { + list_add(&ifa->tmp_list, &idev->tempaddr_list); + in6_ifa_hold(ifa); + } +#endif + + in6_ifa_hold(ifa); + write_unlock(&idev->lock); +out2: + rcu_read_unlock_bh(); + + if (likely(err == 0)) + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + else { + kfree(ifa); + ifa = ERR_PTR(err); + } + + return ifa; +out: + spin_unlock(&addrconf_hash_lock); + goto out2; +} + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifa, *ifn; + struct inet6_dev *idev = ifp->idev; + int state; + int deleted = 0, onlink = 0; + unsigned long expires = jiffies; + + spin_lock_bh(&ifp->state_lock); + state = ifp->state; + ifp->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifp->state_lock); + + if (state == INET6_IFADDR_STATE_DEAD) + goto out; + + spin_lock_bh(&addrconf_hash_lock); + hlist_del_init_rcu(&ifp->addr_lst); + spin_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (ifp->flags&IFA_F_TEMPORARY) { + list_del(&ifp->tmp_list); + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + __in6_ifa_put(ifp); + } +#endif + + list_for_each_entry_safe(ifa, ifn, &idev->addr_list, if_list) { + if (ifa == ifp) { + list_del_init(&ifp->if_list); + __in6_ifa_put(ifp); + + if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) + break; + deleted = 1; + continue; + } else if 
(ifp->flags & IFA_F_PERMANENT) { + if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) { + if (ifa->flags & IFA_F_PERMANENT) { + onlink = 1; + if (deleted) + break; + } else { + unsigned long lifetime; + + if (!onlink) + onlink = -1; + + spin_lock(&ifa->lock); + + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ + if (time_before(expires, + ifa->tstamp + lifetime * HZ)) + expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); + } + } + } + } + write_unlock_bh(&idev->lock); + + addrconf_del_timer(ifp); + + ipv6_ifa_notify(RTM_DELADDR, ifp); + + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifp); + + /* + * Purge or update corresponding prefix + * + * 1) we don't purge prefix here if address was not permanent. + * prefix is managed by its own lifetime. + * 2) if there're no addresses, delete prefix. + * 3) if there're still other permanent address(es), + * corresponding prefix is still permanent. + * 4) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. 
+ * + * --yoshfuji + */ + if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { + struct in6_addr prefix; + struct rt6_info *rt; + struct net *net = dev_net(ifp->idev->dev); + ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); + rt = rt6_lookup(net, &prefix, NULL, ifp->idev->dev->ifindex, 1); + + if (rt && addrconf_is_prefix_route(rt)) { + if (onlink == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { + rt6_set_expires(rt, expires); + } + } + dst_release(&rt->dst); + } + + /* clean up prefsrc entries */ + rt6_remove_prefsrc(ifp); +out: + in6_ifa_put(ifp); +} + +#ifdef CONFIG_IPV6_PRIVACY +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +{ + struct inet6_dev *idev = ifp->idev; + struct in6_addr addr, *tmpaddr; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_tstamp, age; + unsigned long regen_advance; + int tmp_plen; + int ret = 0; + int max_addresses; + u32 addr_flags; + unsigned long now = jiffies; + + write_lock(&idev->lock); + if (ift) { + spin_lock_bh(&ift->lock); + memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); + spin_unlock_bh(&ift->lock); + tmpaddr = &addr; + } else { + tmpaddr = NULL; + } +retry: + in6_dev_hold(idev); + if (idev->cnf.use_tempaddr <= 0) { + write_unlock(&idev->lock); + printk(KERN_INFO + "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + spin_lock_bh(&ifp->lock); + if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { + idev->cnf.use_tempaddr = -1; /*XXX*/ + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration time exceeded. 
disabled temporary address support.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + in6_ifa_hold(ifp); + memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); + if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + memcpy(&addr.s6_addr[8], idev->rndid, 8); + age = (now - ifp->tstamp) / HZ; + tmp_valid_lft = min_t(__u32, + ifp->valid_lft, + idev->cnf.temp_valid_lft + age); + tmp_prefered_lft = min_t(__u32, + ifp->prefered_lft, + idev->cnf.temp_prefered_lft + age - + idev->cnf.max_desync_factor); + tmp_plen = ifp->prefix_len; + max_addresses = idev->cnf.max_addresses; + tmp_tstamp = ifp->tstamp; + spin_unlock_bh(&ifp->lock); + + regen_advance = idev->cnf.regen_max_retry * + idev->cnf.dad_transmits * + idev->nd_parms->retrans_time / HZ; + write_unlock(&idev->lock); + + /* A temporary address is created only if this calculated Preferred + * Lifetime is greater than REGEN_ADVANCE time units. In particular, + * an implementation must not create a temporary address with a zero + * Preferred Lifetime. + */ + if (tmp_prefered_lft <= regen_advance) { + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + + addr_flags = IFA_F_TEMPORARY; + /* set in addrconf_prefix_rcv() */ + if (ifp->flags & IFA_F_OPTIMISTIC) + addr_flags |= IFA_F_OPTIMISTIC; + + ift = !max_addresses || + ipv6_count_addresses(idev) < max_addresses ? 
+ ipv6_add_addr(idev, &addr, tmp_plen, + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, + addr_flags) : NULL; + if (!ift || IS_ERR(ift)) { + in6_ifa_put(ifp); + in6_dev_put(idev); + printk(KERN_INFO + "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + tmpaddr = &addr; + write_lock(&idev->lock); + goto retry; + } + + spin_lock_bh(&ift->lock); + ift->ifpub = ifp; + ift->valid_lft = tmp_valid_lft; + ift->prefered_lft = tmp_prefered_lft; + ift->cstamp = now; + ift->tstamp = tmp_tstamp; + spin_unlock_bh(&ift->lock); + + addrconf_dad_start(ift, 0); + in6_ifa_put(ift); + in6_dev_put(idev); +out: + return ret; +} +#endif + +/* + * Choose an appropriate source address (RFC3484) + */ +enum { + IPV6_SADDR_RULE_INIT = 0, + IPV6_SADDR_RULE_LOCAL, + IPV6_SADDR_RULE_SCOPE, + IPV6_SADDR_RULE_PREFERRED, +#ifdef CONFIG_IPV6_MIP6 + IPV6_SADDR_RULE_HOA, +#endif + IPV6_SADDR_RULE_OIF, + IPV6_SADDR_RULE_LABEL, +#ifdef CONFIG_IPV6_PRIVACY + IPV6_SADDR_RULE_PRIVACY, +#endif + IPV6_SADDR_RULE_ORCHID, + IPV6_SADDR_RULE_PREFIX, + IPV6_SADDR_RULE_MAX +}; + +struct ipv6_saddr_score { + int rule; + int addr_type; + struct inet6_ifaddr *ifa; + DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX); + int scopedist; + int matchlen; +}; + +struct ipv6_saddr_dst { + const struct in6_addr *addr; + int ifindex; + int scope; + int label; + unsigned int prefs; +}; + +static inline int ipv6_saddr_preferred(int type) +{ + if (type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK)) + return 1; + return 0; +} + +static int ipv6_get_saddr_eval(struct net *net, + struct ipv6_saddr_score *score, + struct ipv6_saddr_dst *dst, + int i) +{ + int ret; + + if (i <= score->rule) { + switch (i) { + case IPV6_SADDR_RULE_SCOPE: + ret = score->scopedist; + break; + case IPV6_SADDR_RULE_PREFIX: + ret = score->matchlen; + break; + default: + ret = !!test_bit(i, score->scorebits); + } + goto out; + } + + switch (i) { + case IPV6_SADDR_RULE_INIT: + /* Rule 0: remember if hiscore is not ready yet */ + ret = 
!!score->ifa; + break; + case IPV6_SADDR_RULE_LOCAL: + /* Rule 1: Prefer same address */ + ret = ipv6_addr_equal(&score->ifa->addr, dst->addr); + break; + case IPV6_SADDR_RULE_SCOPE: + /* Rule 2: Prefer appropriate scope + * + * ret + * ^ + * -1 | d 15 + * ---+--+-+---> scope + * | + * | d is scope of the destination. + * B-d | \ + * | \ <- smaller scope is better if + * B-15 | \ if scope is enough for destinaion. + * | ret = B - scope (-1 <= scope >= d <= 15). + * d-C-1 | / + * |/ <- greater is better + * -C / if scope is not enough for destination. + * /| ret = scope - C (-1 <= d < scope <= 15). + * + * d - C - 1 < B -15 (for all -1 <= d <= 15). + * C > d + 14 - B >= 15 + 14 - B = 29 - B. + * Assume B = 0 and we get C > 29. + */ + ret = __ipv6_addr_src_scope(score->addr_type); + if (ret >= dst->scope) + ret = -ret; + else + ret -= 128; /* 30 is enough */ + score->scopedist = ret; + break; + case IPV6_SADDR_RULE_PREFERRED: + /* Rule 3: Avoid deprecated and optimistic addresses */ + ret = ipv6_saddr_preferred(score->addr_type) || + !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + break; +#ifdef CONFIG_IPV6_MIP6 + case IPV6_SADDR_RULE_HOA: + { + /* Rule 4: Prefer home address */ + int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA); + ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome; + break; + } +#endif + case IPV6_SADDR_RULE_OIF: + /* Rule 5: Prefer outgoing interface */ + ret = (!dst->ifindex || + dst->ifindex == score->ifa->idev->dev->ifindex); + break; + case IPV6_SADDR_RULE_LABEL: + /* Rule 6: Prefer matching label */ + ret = ipv6_addr_label(net, + &score->ifa->addr, score->addr_type, + score->ifa->idev->dev->ifindex) == dst->label; + break; +#ifdef CONFIG_IPV6_PRIVACY + case IPV6_SADDR_RULE_PRIVACY: + { + /* Rule 7: Prefer public address + * Note: prefer temporary address if use_tempaddr >= 2 + */ + int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ? 
+ !!(dst->prefs & IPV6_PREFER_SRC_TMP) : + score->ifa->idev->cnf.use_tempaddr >= 2; + ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp; + break; + } +#endif + case IPV6_SADDR_RULE_ORCHID: + /* Rule 8-: Prefer ORCHID vs ORCHID or + * non-ORCHID vs non-ORCHID + */ + ret = !(ipv6_addr_orchid(&score->ifa->addr) ^ + ipv6_addr_orchid(dst->addr)); + break; + case IPV6_SADDR_RULE_PREFIX: + /* Rule 8: Use longest matching prefix */ + score->matchlen = ret = ipv6_addr_diff(&score->ifa->addr, + dst->addr); + break; + default: + ret = 0; + } + + if (ret) + __set_bit(i, score->scorebits); + score->rule = i; +out: + return ret; +} + +int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev, + const struct in6_addr *daddr, unsigned int prefs, + struct in6_addr *saddr) +{ + struct ipv6_saddr_score scores[2], + *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_dst dst; + struct net_device *dev; + int dst_type; + + dst_type = __ipv6_addr_type(daddr); + dst.addr = daddr; + dst.ifindex = dst_dev ? dst_dev->ifindex : 0; + dst.scope = __ipv6_addr_src_scope(dst_type); + dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); + dst.prefs = prefs; + + hiscore->rule = -1; + hiscore->ifa = NULL; + + rcu_read_lock(); + + for_each_netdev_rcu(net, dev) { + struct inet6_dev *idev; + + /* Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) 
+ */ + if (((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && + dst.ifindex && dev->ifindex != dst.ifindex) + continue; + + idev = __in6_dev_get(dev); + if (!idev) + continue; + + read_lock_bh(&idev->lock); + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; + + /* + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense, unless it is also flagged as optimistic. + * - Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) + continue; + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ADDRCONF: unspecified / multicast address " + "assigned as unicast address on %s", + dev->name); + continue; + } + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); + miniscore = ipv6_get_saddr_eval(net, score, &dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. 
+ */ + goto try_nextdev; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); + + in6_ifa_hold(score->ifa); + + swap(hiscore, score); + + /* restore our iterator */ + score->ifa = hiscore->ifa; + + break; + } + } + } +try_nextdev: + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + + if (!hiscore->ifa) + return -EADDRNOTAVAIL; + + *saddr = hiscore->ifa->addr; + in6_ifa_put(hiscore->ifa); + return 0; +} +EXPORT_SYMBOL(ipv6_dev_get_saddr); + +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int ipv6_count_addresses(struct inet6_dev *idev) +{ + int cnt = 0; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) + cnt++; + read_unlock_bh(&idev->lock); + return cnt; +} + +int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, + struct net_device *dev, int strict) +{ + struct inet6_ifaddr *ifp; + struct hlist_node *node; + unsigned int hash = ipv6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr) && + !(ifp->flags&IFA_F_TENTATIVE) && + (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) { + rcu_read_unlock_bh(); + return 1; + } + } + + rcu_read_unlock_bh(); + return 0; +} +EXPORT_SYMBOL(ipv6_chk_addr); + +static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, + 
struct net_device *dev) +{ + unsigned int hash = ipv6_addr_hash(addr); + struct inet6_ifaddr *ifp; + struct hlist_node *node; + + hlist_for_each_entry(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev) + return true; + } + } + return false; +} + +int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int onlink; + + onlink = 0; + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + onlink = ipv6_prefix_equal(addr, &ifa->addr, + ifa->prefix_len); + if (onlink) + break; + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return onlink; +} + +EXPORT_SYMBOL(ipv6_chk_prefix); + +struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, + struct net_device *dev, int strict) +{ + struct inet6_ifaddr *ifp, *result = NULL; + unsigned int hash = ipv6_addr_hash(addr); + struct hlist_node *node; + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, node, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + result = ifp; + in6_ifa_hold(ifp); + break; + } + } + } + rcu_read_unlock_bh(); + + return result; +} + +/* Gets referenced address, destroys ifaddr */ + +static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) +{ + if (ifp->flags&IFA_F_PERMANENT) { + spin_lock_bh(&ifp->lock); + addrconf_del_timer(ifp); + ifp->flags |= IFA_F_TENTATIVE; + if (dad_failed) + ifp->flags |= IFA_F_DADFAILED; + spin_unlock_bh(&ifp->lock); + if (dad_failed) + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); +#ifdef CONFIG_IPV6_PRIVACY + } else if 
(ifp->flags&IFA_F_TEMPORARY) { + struct inet6_ifaddr *ifpub; + spin_lock_bh(&ifp->lock); + ifpub = ifp->ifpub; + if (ifpub) { + in6_ifa_hold(ifpub); + spin_unlock_bh(&ifp->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + } else { + spin_unlock_bh(&ifp->lock); + } + ipv6_del_addr(ifp); +#endif + } else + ipv6_del_addr(ifp); +} + +static int addrconf_dad_end(struct inet6_ifaddr *ifp) +{ + int err = -ENOENT; + + spin_lock(&ifp->state_lock); + if (ifp->state == INET6_IFADDR_STATE_DAD) { + ifp->state = INET6_IFADDR_STATE_POSTDAD; + err = 0; + } + spin_unlock(&ifp->state_lock); + + return err; +} + +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + struct inet6_dev *idev = ifp->idev; + + if (addrconf_dad_end(ifp)) { + in6_ifa_put(ifp); + return; + } + + if (net_ratelimit()) + printk(KERN_INFO "%s: IPv6 duplicate address %pI6c detected!\n", + ifp->idev->dev->name, &ifp->addr); + + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { + struct in6_addr addr; + + addr.s6_addr32[0] = htonl(0xfe800000); + addr.s6_addr32[1] = 0; + + if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && + ipv6_addr_equal(&ifp->addr, &addr)) { + /* DAD failed for link-local based on MAC address */ + idev->cnf.disable_ipv6 = 1; + + printk(KERN_INFO "%s: IPv6 being disabled!\n", + ifp->idev->dev->name); + } + } + + addrconf_dad_stop(ifp, 1); +} + +/* Join to solicited addr multicast group. 
*/ + +void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + __ipv6_dev_mc_dec(idev, &maddr); +} + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + if (ifp->prefix_len == 127) /* RFC 6164 */ + return; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + ipv6_dev_ac_inc(ifp->idev->dev, &addr); +} + +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + if (ifp->prefix_len == 127) /* RFC 6164 */ + return; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + __ipv6_dev_ac_dec(ifp->idev, &addr); +} + +static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + + /* + * The zSeries OSA network cards can be shared among various + * OS instances, but the OSA cards have only one MAC address. + * This leads to duplicate address conflicts in conjunction + * with IPv6 if more than one instance uses the same card. + * + * The driver for these cards can deliver a unique 16-bit + * identifier for each instance sharing the same card. It is + * placed instead of 0xFFFE in the interface identifier. The + * "u" bit of the interface identifier is not inverted in this + * case. Hence the resulting interface identifier has local + * scope according to RFC2373. 
+ */ + if (dev->dev_id) { + eui[3] = (dev->dev_id >> 8) & 0xFF; + eui[4] = dev->dev_id & 0xFF; + } else { + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + } + return 0; +} + +static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev) +{ + /* XXX: inherit EUI-64 from other interface -- yoshfuji */ + if (dev->addr_len != ARCNET_ALEN) + return -1; + memset(eui, 0, 7); + eui[7] = *(u8*)dev->dev_addr; + return 0; +} + +static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) +{ + if (dev->addr_len != INFINIBAND_ALEN) + return -1; + memcpy(eui, dev->dev_addr + 12, 8); + eui[0] |= 2; + return 0; +} + +static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) +{ + if (addr == 0) + return -1; + eui[0] = (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) || + ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) || + ipv4_is_private_172(addr) || ipv4_is_test_192(addr) || + ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) || + ipv4_is_test_198(addr) || ipv4_is_multicast(addr) || + ipv4_is_lbcast(addr)) ? 
0x00 : 0x02; + eui[1] = 0; + eui[2] = 0x5E; + eui[3] = 0xFE; + memcpy(eui + 4, &addr, 4); + return 0; +} + +static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) +{ + if (dev->priv_flags & IFF_ISATAP) + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); + return -1; +} + +static int addrconf_ifid_gre(u8 *eui, struct net_device *dev) +{ + return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr); +} + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802_TR: + return addrconf_ifid_eui48(eui, dev); + case ARPHRD_ARCNET: + return addrconf_ifid_arcnet(eui, dev); + case ARPHRD_INFINIBAND: + return addrconf_ifid_infiniband(eui, dev); + case ARPHRD_SIT: + return addrconf_ifid_sit(eui, dev); + case ARPHRD_IPGRE: + return addrconf_ifid_gre(eui, dev); + } + return -1; +} + +static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) +{ + int err = -1; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + memcpy(eui, ifp->addr.s6_addr+8, 8); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + return err; +} + +#ifdef CONFIG_IPV6_PRIVACY +/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ +static int __ipv6_regen_rndid(struct inet6_dev *idev) +{ +regen: + get_random_bytes(idev->rndid, sizeof(idev->rndid)); + idev->rndid[0] &= ~0x02; + + /* + * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>: + * check if generated address is not inappropriate + * + * - Reserved subnet anycast (RFC 2526) + * 11111101 11....11 1xxxxxxx + * - ISATAP (RFC4214) 6.1 + * 00-00-5E-FE-xx-xx-xx-xx + * - value 0 + * - XXX: already assigned to an address on the device + */ + if (idev->rndid[0] == 0xfd && + (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && + (idev->rndid[7]&0x80)) + 
goto regen; + if ((idev->rndid[0]|idev->rndid[1]) == 0) { + if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) + goto regen; + if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) + goto regen; + } + + return 0; +} + +static void ipv6_regen_rndid(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *) data; + unsigned long expires; + + rcu_read_lock_bh(); + write_lock_bh(&idev->lock); + + if (idev->dead) + goto out; + + if (__ipv6_regen_rndid(idev) < 0) + goto out; + + expires = jiffies + + idev->cnf.temp_prefered_lft * HZ - + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - + idev->cnf.max_desync_factor * HZ; + if (time_before(expires, jiffies)) { + printk(KERN_WARNING + "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", + idev->dev->name); + goto out; + } + + if (!mod_timer(&idev->regen_timer, expires)) + in6_dev_hold(idev); + +out: + write_unlock_bh(&idev->lock); + rcu_read_unlock_bh(); + in6_dev_put(idev); +} + +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { + int ret = 0; + + if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) + ret = __ipv6_regen_rndid(idev); + return ret; +} +#endif + +/* + * Add prefix route. + */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, + unsigned long expires, u32 flags) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_PREFIX, + .fc_metric = IP6_RT_PRIO_ADDRCONF, + .fc_ifindex = dev->ifindex, + .fc_expires = expires, + .fc_dst_len = plen, + .fc_flags = RTF_UP | flags, + .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, + }; + + cfg.fc_dst = *pfx; + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. 
+	 */
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+	if (dev->type == ARPHRD_SIT && (dev->flags & IFF_POINTOPOINT))
+		cfg.fc_flags |= RTF_NONEXTHOP;
+#endif
+
+	ip6_route_add(&cfg);
+}
+
+
+/*
+ *	Find an existing prefix route for pfx/plen on the given device.
+ *
+ *	The RT6_TABLE_PREFIX table of the device's namespace is searched for
+ *	the node matching pfx/plen, and the routes hanging off that node are
+ *	walked in order.  A route is accepted when
+ *	  - it is bound to the same interface index as dev,
+ *	  - every bit of 'flags' is set in its rt6i_flags, and
+ *	  - no bit of 'noflags' is set in its rt6i_flags.
+ *
+ *	Returns the first accepted route with a reference taken through
+ *	dst_hold() (the caller has to drop it), or NULL when the table or
+ *	node does not exist or no route qualifies.
+ */
+static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
+						  int plen,
+						  const struct net_device *dev,
+						  u32 flags, u32 noflags)
+{
+	struct fib6_node *fn;
+	struct rt6_info *rt = NULL;
+	struct fib6_table *table;
+
+	table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+	if (table == NULL)
+		return NULL;
+
+	write_lock_bh(&table->tb6_lock);
+	fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
+	if (!fn)
+		goto out;
+	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+		if (rt->dst.dev->ifindex != dev->ifindex)
+			continue;
+		/* all required flags must be present */
+		if ((rt->rt6i_flags & flags) != flags)
+			continue;
+		/*
+		 * none of the excluded flags may be present; this has to be
+		 * checked against 'noflags' - checking 'flags' here would
+		 * reject every route that passed the test above.
+		 */
+		if ((rt->rt6i_flags & noflags) != 0)
+			continue;
+		dst_hold(&rt->dst);
+		break;
+	}
+out:
+	write_unlock_bh(&table->tb6_lock);
+	return rt;
+}
+
+
+/* Create "default" multicast route to the interface */
+
+static void addrconf_add_mroute(struct net_device *dev)
+{
+	struct fib6_config cfg = {
+		.fc_table = RT6_TABLE_LOCAL,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_dst_len = 8,
+		.fc_flags = RTF_UP,
+		.fc_nlinfo.nl_net = dev_net(dev),
+	};
+
+	ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
+
+	ip6_route_add(&cfg);
+}
+
+#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE)
+static void sit_route_add(struct net_device *dev)
+{
+	struct fib6_config cfg = {
+		.fc_table = RT6_TABLE_MAIN,
+		.fc_metric = IP6_RT_PRIO_ADDRCONF,
+		.fc_ifindex = dev->ifindex,
+		.fc_dst_len = 96,
+		.fc_flags = RTF_UP | RTF_NONEXTHOP,
+		.fc_nlinfo.nl_net = dev_net(dev),
+	};
+
+	/* prefix length - 96 bits "::d.d.d.d" */
+	ip6_route_add(&cfg);
+}
+#endif
+
+static void addrconf_add_lroute(struct net_device *dev)
+{
+	struct in6_addr addr;
+
+	ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
+	addrconf_prefix_route(&addr, 64,
dev, 0, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = ipv6_find_idev(dev); + if (!idev) + return ERR_PTR(-ENOBUFS); + + if (idev->cnf.disable_ipv6) + return ERR_PTR(-EACCES); + + /* Add default multicast route */ + if (!(dev->flags & IFF_LOOPBACK)) + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + return idev; +} + +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) +{ + struct prefix_info *pinfo; + __u32 valid_lft; + __u32 prefered_lft; + int addr_type; + struct inet6_dev *in6_dev; + struct net *net = dev_net(dev); + + pinfo = (struct prefix_info *) opt; + + if (len < sizeof(struct prefix_info)) { + ADBG(("addrconf: prefix option too short\n")); + return; + } + + /* + * Validation checks ([ADDRCONF], page 19) + */ + + addr_type = ipv6_addr_type(&pinfo->prefix); + + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) + return; + + valid_lft = ntohl(pinfo->valid); + prefered_lft = ntohl(pinfo->prefered); + + if (prefered_lft > valid_lft) { + if (net_ratelimit()) + printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + return; + } + + in6_dev = in6_dev_get(dev); + + if (in6_dev == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + + /* + * Two things going on here: + * 1) Add routes for on-link prefixes + * 2) Configure prefixes with the auto flag set + */ + + if (pinfo->onlink) { + struct rt6_info *rt; + unsigned long rt_expires; + + /* Avoid arithmetic overflow. Really, we could + * save rt_expires in seconds, likely valid_lft, + * but it would require division in fib gc, that it + * not good. 
+ */ + if (HZ > USER_HZ) + rt_expires = addrconf_timeout_fixup(valid_lft, HZ); + else + rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ); + + if (addrconf_finite_timeout(rt_expires)) + rt_expires *= HZ; + + rt = addrconf_get_prefix_route(&pinfo->prefix, + pinfo->prefix_len, + dev, + RTF_ADDRCONF | RTF_PREFIX_RT, + RTF_GATEWAY | RTF_DEFAULT); + + if (rt) { + /* Autoconf prefix route */ + if (valid_lft == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + rt6_set_expires(rt, jiffies + rt_expires); + } else { + rt6_clean_expires(rt); + } + } else if (valid_lft) { + clock_t expires = 0; + int flags = RTF_ADDRCONF | RTF_PREFIX_RT; + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + flags |= RTF_EXPIRES; + expires = jiffies_to_clock_t(rt_expires); + } + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, expires, flags); + } + if (rt) + dst_release(&rt->dst); + } + + /* Try to figure out our local address for this prefix */ + + if (pinfo->autoconf && in6_dev->cnf.autoconf) { + struct inet6_ifaddr * ifp; + struct in6_addr addr; + int create = 0, update_lft = 0; + + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + in6_dev_put(in6_dev); + return; + } + goto ok; + } + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); + in6_dev_put(in6_dev); + return; + +ok: + + ifp = ipv6_get_ifaddr(net, &addr, dev, 1); + + if (ifp == NULL && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + u32 addr_flags = 0; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (in6_dev->cnf.optimistic_dad && + !net->ipv6.devconf_all->forwarding && sllao) + addr_flags = IFA_F_OPTIMISTIC; +#endif + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. 
+ */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, + addr_flags); + + if (!ifp || IS_ERR(ifp)) { + in6_dev_put(in6_dev); + return; + } + + update_lft = create = 1; + ifp->cstamp = jiffies; + addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + } + + if (ifp) { + int flags; + unsigned long now; +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *ift; +#endif + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && stored_lft) { + if (valid_lft > MIN_VALID_LIFETIME || + valid_lft > stored_lft) + update_lft = 1; + else if (stored_lft <= MIN_VALID_LIFETIME) { + /* valid_lft <= stored_lft is always true */ + /* + * RFC 4862 Section 5.5.3e: + * "Note that the preferred lifetime of + * the corresponding address is always + * reset to the Preferred Lifetime in + * the received Prefix Information + * option, regardless of whether the + * valid lifetime is also reset or + * ignored." + * + * So if the preferred lifetime in + * this advertisement is different + * than what we have stored, but the + * valid lifetime is invalid, just + * reset prefered_lft. + * + * We must set the valid lifetime + * to the stored lifetime since we'll + * be updating the timestamp below, + * else we'll set it back to the + * minimum. 
+ */ + if (prefered_lft != ifp->prefered_lft) { + valid_lft = stored_lft; + update_lft = 1; + } + } else { + valid_lft = MIN_VALID_LIFETIME; + if (valid_lft < prefered_lft) + prefered_lft = valid_lft; + update_lft = 1; + } + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock(&ifp->lock); + +#ifdef CONFIG_IPV6_PRIVACY + read_lock_bh(&in6_dev->lock); + /* update all temporary addresses in the list */ + list_for_each_entry(ift, &in6_dev->tempaddr_list, + tmp_list) { + int age, max_valid, max_prefered; + + if (ifp != ift->ifpub) + continue; + + /* + * RFC 4941 section 3.3: + * If a received option will extend the lifetime + * of a public address, the lifetimes of + * temporary addresses should be extended, + * subject to the overall constraint that no + * temporary addresses should ever remain + * "valid" or "preferred" for a time longer than + * (TEMP_VALID_LIFETIME) or + * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), + * respectively. 
+ */ + age = (now - ift->cstamp) / HZ; + max_valid = in6_dev->cnf.temp_valid_lft - age; + if (max_valid < 0) + max_valid = 0; + + max_prefered = in6_dev->cnf.temp_prefered_lft - + in6_dev->cnf.max_desync_factor - + age; + if (max_prefered < 0) + max_prefered = 0; + + if (valid_lft > max_valid) + valid_lft = max_valid; + + if (prefered_lft > max_prefered) + prefered_lft = max_prefered; + + spin_lock(&ift->lock); + flags = ift->flags; + ift->valid_lft = valid_lft; + ift->prefered_lft = prefered_lft; + ift->tstamp = now; + if (prefered_lft > 0) + ift->flags &= ~IFA_F_DEPRECATED; + + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } + + if ((create || list_empty(&in6_dev->tempaddr_list)) && in6_dev->cnf.use_tempaddr > 0) { + /* + * When a new public address is created as + * described in [ADDRCONF], also create a new + * temporary address. Also create a temporary + * address if it's enabled but no temporary + * address currently exists. + */ + read_unlock_bh(&in6_dev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&in6_dev->lock); + } +#endif + in6_ifa_put(ifp); + addrconf_verify(0); + } + } + inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); + in6_dev_put(in6_dev); +} + +/* + * Set destination address. + * Special case for SIT interfaces where we create a new "virtual" + * device. 
+ */ +int addrconf_set_dstaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + struct net_device *dev; + int err = -EINVAL; + + rtnl_lock(); + + err = -EFAULT; + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + goto err_exit; + + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + + err = -ENODEV; + if (dev == NULL) + goto err_exit; + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + if (dev->type == ARPHRD_SIT) { + const struct net_device_ops *ops = dev->netdev_ops; + struct ifreq ifr; + struct ip_tunnel_parm p; + + err = -EADDRNOTAVAIL; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) + goto err_exit; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + if (ops->ndo_do_ioctl) { + mm_segment_t oldfs = get_fs(); + + set_fs(KERNEL_DS); + err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + } else + err = -EOPNOTSUPP; + + if (err == 0) { + err = -ENOBUFS; + dev = __dev_get_by_name(net, p.name); + if (!dev) + goto err_exit; + err = dev_open(dev); + } + } +#endif + +err_exit: + rtnl_unlock(); + return err; +} + +/* + * Manual configuration of address on an interface + */ +static int inet6_addr_add(struct net *net, int ifindex, const struct in6_addr *pfx, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, + __u32 valid_lft) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + int scope; + u32 flags; + clock_t expires; + unsigned long timeout; + + ASSERT_RTNL(); + + if (plen > 128) + return -EINVAL; + + /* check the lifetime */ + if (!valid_lft || prefered_lft > valid_lft) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + scope = ipv6_addr_scope(pfx); + 
+ timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; + } + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } + + ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); + + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = jiffies; + spin_unlock_bh(&ifp->lock); + + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, + expires, flags); + /* + * Note that section 3.1 of RFC 4429 indicates + * that the Optimistic flag should not be set for + * manually configured addresses + */ + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + addrconf_verify(0); + return 0; + } + + return PTR_ERR(ifp); +} + +static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *pfx, + unsigned int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + + if (plen > 128) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + if ((idev = __in6_dev_get(dev)) == NULL) + return -ENXIO; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->prefix_len == plen && + ipv6_addr_equal(pfx, &ifp->addr)) { + in6_ifa_hold(ifp); + read_unlock_bh(&idev->lock); + + ipv6_del_addr(ifp); + + /* If the last address is deleted administratively, + disable IPv6 on this interface. 
+ */ + if (list_empty(&idev->addr_list)) + addrconf_ifdown(idev->dev, 1); + return 0; + } + } + read_unlock_bh(&idev->lock); + return -EADDRNOTAVAIL; +} + + +int addrconf_add_ifaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_add(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + ireq.ifr6_prefixlen, IFA_F_PERMANENT, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); + rtnl_unlock(); + return err; +} + +int addrconf_del_ifaddr(struct net *net, void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_del(net, ireq.ifr6_ifindex, &ireq.ifr6_addr, + ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, + int plen, int scope) +{ + struct inet6_ifaddr *ifp; + + ifp = ipv6_add_addr(idev, addr, plen, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void sit_add_v4_addrs(struct inet6_dev *idev) +{ + struct in6_addr addr; + struct net_device *dev; + struct net *net = dev_net(idev->dev); + int scope; + + ASSERT_RTNL(); + + memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + + if (idev->dev->flags&IFF_POINTOPOINT) { + addr.s6_addr32[0] = htonl(0xfe800000); + scope = IFA_LINK; + } else { + scope = IPV6_ADDR_COMPATv4; + } + + if (addr.s6_addr32[3]) { + add_addr(idev, &addr, 128, scope); + return; + } + + for_each_netdev(net, dev) { + struct in_device * in_dev = __in_dev_get_rtnl(dev); + if 
(in_dev && (dev->flags & IFF_UP)) { + struct in_ifaddr * ifa; + + int flag = scope; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + int plen; + + addr.s6_addr32[3] = ifa->ifa_local; + + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if (idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } + if (idev->dev->flags&IFF_POINTOPOINT) + plen = 64; + else + plen = 96; + + add_addr(idev, &addr, plen, flag); + } + } + } +} +#endif + +static void init_loopback(struct net_device *dev) +{ + struct inet6_dev *idev; + + /* ::1 */ + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init loopback: add_dev failed\n"); + return; + } + + add_addr(idev, &in6addr_loopback, 128, IFA_HOST); +} + +static void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + u32 addr_flags = IFA_F_PERMANENT; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (idev->cnf.optimistic_dad && + !dev_net(idev->dev)->ipv6.devconf_all->forwarding) + addr_flags |= IFA_F_OPTIMISTIC; +#endif + + + ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, addr_flags); + if (!IS_ERR(ifp)) { + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + } +} + +static void addrconf_dev_config(struct net_device *dev) +{ + struct in6_addr addr; + struct inet6_dev * idev; + + ASSERT_RTNL(); + + if ((dev->type != ARPHRD_ETHER) && + (dev->type != ARPHRD_FDDI) && + (dev->type != ARPHRD_IEEE802_TR) && + (dev->type != ARPHRD_ARCNET) && + (dev->type != ARPHRD_INFINIBAND)) { + /* Alas, we support only Ethernet autoconfiguration. 
*/ + return; + } + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) + return; + + memset(&addr, 0, sizeof(struct in6_addr)); + addr.s6_addr32[0] = htonl(0xFE800000); + + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) + addrconf_add_linklocal(idev, &addr); +} + +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) +static void addrconf_sit_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init sit: add_dev failed\n"); + return; + } + + if (dev->priv_flags & IFF_ISATAP) { + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); + return; + } + + sit_add_v4_addrs(idev); + + if (dev->flags&IFF_POINTOPOINT) { + addrconf_add_mroute(dev); + addrconf_add_lroute(dev); + } else + sit_route_add(dev); +} +#endif + +#if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE) +static void addrconf_gre_config(struct net_device *dev) +{ + struct inet6_dev *idev; + struct in6_addr addr; + + pr_info("ipv6: addrconf_gre_config(%s)\n", dev->name); + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init gre: add_dev failed\n"); + return; + } + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); + + if (!ipv6_generate_eui64(addr.s6_addr + 8, dev)) + addrconf_add_linklocal(idev, &addr); +} +#endif + +static inline int +ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) +{ + struct in6_addr lladdr; + + if (!ipv6_get_lladdr(link_dev, &lladdr, IFA_F_TENTATIVE)) { + addrconf_add_linklocal(idev, &lladdr); + return 0; + } + return -1; +} + +static void 
ip6_tnl_add_linklocal(struct inet6_dev *idev) +{ + struct net_device *link_dev; + struct net *net = dev_net(idev->dev); + + /* first try to inherit the link-local address from the link device */ + if (idev->dev->iflink && + (link_dev = __dev_get_by_index(net, idev->dev->iflink))) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + /* then try to inherit it from any device */ + for_each_netdev(net, link_dev) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); +} + +/* + * Autoconfigure tunnel with a link-local address so routing protocols, + * DHCPv6, MLD etc. can be run over the virtual link + */ + +static void addrconf_ip6_tnl_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) { + printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); + return; + } + ip6_tnl_add_linklocal(idev); +} + +static int addrconf_notify(struct notifier_block *this, unsigned long event, + void * data) +{ + struct net_device *dev = (struct net_device *) data; + struct inet6_dev *idev = __in6_dev_get(dev); + int run_pending = 0; + int err; + + switch (event) { + case NETDEV_REGISTER: + if (!idev && dev->mtu >= IPV6_MIN_MTU) { + idev = ipv6_add_dev(dev); + if (!idev) + return notifier_from_errno(-ENOMEM); + } + break; + + case NETDEV_UP: + case NETDEV_CHANGE: + if (dev->flags & IFF_SLAVE) + break; + + if (event == NETDEV_UP) { + if (!addrconf_qdisc_ok(dev)) { + /* device is not ready yet. */ + printk(KERN_INFO + "ADDRCONF(NETDEV_UP): %s: " + "link is not ready\n", + dev->name); + break; + } + + if (!idev && dev->mtu >= IPV6_MIN_MTU) + idev = ipv6_add_dev(dev); + + if (idev) { + idev->if_flags |= IF_READY; + run_pending = 1; + } + } else { + if (!addrconf_qdisc_ok(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) { + if (idev->if_flags & IF_READY) + /* device is already configured. 
*/ + break; + idev->if_flags |= IF_READY; + } + + printk(KERN_INFO + "ADDRCONF(NETDEV_CHANGE): %s: " + "link becomes ready\n", + dev->name); + + run_pending = 1; + } + + switch (dev->type) { +#if defined(CONFIG_IPV6_SIT) || defined(CONFIG_IPV6_SIT_MODULE) + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; +#endif +#if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE) + case ARPHRD_IPGRE: + addrconf_gre_config(dev); + break; +#endif + case ARPHRD_TUNNEL6: + addrconf_ip6_tnl_config(dev); + break; + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + } + + if (idev) { + if (run_pending) + addrconf_dad_run(idev); + + /* + * If the MTU changed during the interface down, + * when the interface up, the changed MTU must be + * reflected in the idev as well as routers. + */ + if (idev->cnf.mtu6 != dev->mtu && + dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + } + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + + /* + * If the changed mtu during down is lower than + * IPV6_MIN_MTU stop IPv6 on this interface. + */ + if (dev->mtu < IPV6_MIN_MTU) + addrconf_ifdown(dev, 1); + } + break; + + case NETDEV_CHANGEMTU: + if (idev && dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + break; + } + + if (!idev && dev->mtu >= IPV6_MIN_MTU) { + idev = ipv6_add_dev(dev); + if (idev) + break; + } + + /* + * MTU falled under IPV6_MIN_MTU. + * Stop IPv6 on this interface. + */ + + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + /* + * Remove all addresses from this interface. 
+ */ + addrconf_ifdown(dev, event != NETDEV_DOWN); + break; + + case NETDEV_CHANGENAME: + if (idev) { + snmp6_unregister_dev(idev); + addrconf_sysctl_unregister(idev); + addrconf_sysctl_register(idev); + err = snmp6_register_dev(idev); + if (err) + return notifier_from_errno(err); + } + break; + + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + addrconf_type_change(dev, event); + break; + } + + return NOTIFY_OK; +} + +/* + * addrconf module should be notified of a device going up + */ +static struct notifier_block ipv6_dev_notf = { + .notifier_call = addrconf_notify, +}; + +static void addrconf_type_change(struct net_device *dev, unsigned long event) +{ + struct inet6_dev *idev; + ASSERT_RTNL(); + + idev = __in6_dev_get(dev); + + if (event == NETDEV_POST_TYPE_CHANGE) + ipv6_mc_remap(idev); + else if (event == NETDEV_PRE_TYPE_CHANGE) + ipv6_mc_unmap(idev); +} + +static int addrconf_ifdown(struct net_device *dev, int how) +{ + struct net *net = dev_net(dev); + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int state, i; + + ASSERT_RTNL(); + + rt6_ifdown(net, dev); + neigh_ifdown(&nd_tbl, dev); + + idev = __in6_dev_get(dev); + if (idev == NULL) + return -ENODEV; + + /* + * Step 1: remove reference to ipv6 device from parent device. + * Do not dev_put! 
+ */ + if (how) { + idev->dead = 1; + + /* protected by rtnl_lock */ + RCU_INIT_POINTER(dev->ip6_ptr, NULL); + + /* Step 1.5: remove snmp6 entry */ + snmp6_unregister_dev(idev); + + } + + /* Step 2: clear hash table */ + for (i = 0; i < IN6_ADDR_HSIZE; i++) { + struct hlist_head *h = &inet6_addr_lst[i]; + struct hlist_node *n; + + spin_lock_bh(&addrconf_hash_lock); + restart: + hlist_for_each_entry_rcu(ifa, n, h, addr_lst) { + if (ifa->idev == idev) { + hlist_del_init_rcu(&ifa->addr_lst); + addrconf_del_timer(ifa); + goto restart; + } + } + spin_unlock_bh(&addrconf_hash_lock); + } + + write_lock_bh(&idev->lock); + + /* Step 2: clear flags for stateless addrconf */ + if (!how) + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); + +#ifdef CONFIG_IPV6_PRIVACY + if (how && del_timer(&idev->regen_timer)) + in6_dev_put(idev); + + /* Step 3: clear tempaddr list */ + while (!list_empty(&idev->tempaddr_list)) { + ifa = list_first_entry(&idev->tempaddr_list, + struct inet6_ifaddr, tmp_list); + list_del(&ifa->tmp_list); + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + + if (ifa->ifpub) { + in6_ifa_put(ifa->ifpub); + ifa->ifpub = NULL; + } + spin_unlock_bh(&ifa->lock); + in6_ifa_put(ifa); + write_lock_bh(&idev->lock); + } +#endif + + while (!list_empty(&idev->addr_list)) { + ifa = list_first_entry(&idev->addr_list, + struct inet6_ifaddr, if_list); + addrconf_del_timer(ifa); + + list_del(&ifa->if_list); + + write_unlock_bh(&idev->lock); + + spin_lock_bh(&ifa->state_lock); + state = ifa->state; + ifa->state = INET6_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifa->state_lock); + + if (state != INET6_IFADDR_STATE_DEAD) { + __ipv6_ifa_notify(RTM_DELADDR, ifa); + atomic_notifier_call_chain(&inet6addr_chain, NETDEV_DOWN, ifa); + } + in6_ifa_put(ifa); + + write_lock_bh(&idev->lock); + } + + write_unlock_bh(&idev->lock); + + /* Step 5: Discard multicast list */ + if (how) + ipv6_mc_destroy_dev(idev); + else + ipv6_mc_down(idev); + + idev->tstamp = jiffies; + + /* Last: Shot 
the device (if unregistered) */ + if (how) { + addrconf_sysctl_unregister(idev); + neigh_parms_release(&nd_tbl, idev->nd_parms); + neigh_ifdown(&nd_tbl, dev); + in6_dev_put(idev); + } + return 0; +} + +static void addrconf_rs_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + + read_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) + goto out; + + if (idev->cnf.forwarding) + goto out; + + /* Announcement received after solicitation was sent */ + if (idev->if_flags & IF_RA_RCVD) + goto out; + + spin_lock(&ifp->lock); + if (ifp->probes++ < idev->cnf.rtr_solicits) { + /* The wait after the last probe can be shorter */ + addrconf_mod_timer(ifp, AC_RS, + (ifp->probes == idev->cnf.rtr_solicits) ? + idev->cnf.rtr_solicit_delay : + idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + + ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + } else { + spin_unlock(&ifp->lock); + /* + * Note: we do not support deprecated "all on-link" + * assumption any longer. + */ + printk(KERN_DEBUG "%s: no IPv6 routers present\n", + idev->dev->name); + } + +out: + read_unlock(&idev->lock); + in6_ifa_put(ifp); +} + +/* + * Duplicate Address Detection + */ +static void addrconf_dad_kick(struct inet6_ifaddr *ifp) +{ + unsigned long rand_num; + struct inet6_dev *idev = ifp->idev; + + if (ifp->flags & IFA_F_OPTIMISTIC) + rand_num = 0; + else + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? 
: 1); + + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); +} + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags) +{ + struct inet6_dev *idev = ifp->idev; + struct net_device *dev = idev->dev; + + addrconf_join_solict(dev, &ifp->addr); + + net_srandom(ifp->addr.s6_addr32[3]); + + read_lock_bh(&idev->lock); + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) + goto out; + + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + idev->cnf.accept_dad < 1 || + !(ifp->flags&IFA_F_TENTATIVE) || + ifp->flags & IFA_F_NODAD) { + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); + + addrconf_dad_completed(ifp); + return; + } + + if (!(idev->if_flags & IF_READY)) { + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); + /* + * If the device is not ready: + * - keep it tentative if it is a permanent address. + * - otherwise, kill it. + */ + in6_ifa_hold(ifp); + addrconf_dad_stop(ifp, 0); + return; + } + + /* + * Optimistic nodes can start receiving + * Frames right away + */ + if (ifp->flags & IFA_F_OPTIMISTIC) + ip6_ins_rt(ifp->rt); + + addrconf_dad_kick(ifp); +out: + spin_unlock(&ifp->lock); + read_unlock_bh(&idev->lock); +} + +static void addrconf_dad_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + struct in6_addr mcaddr; + + if (!ifp->probes && addrconf_dad_end(ifp)) + goto out; + + read_lock(&idev->lock); + if (idev->dead || !(idev->if_flags & IF_READY)) { + read_unlock(&idev->lock); + goto out; + } + + spin_lock(&ifp->lock); + if (ifp->state == INET6_IFADDR_STATE_DEAD) { + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + goto out; + } + + if (ifp->probes == 0) { + /* + * DAD was successful + */ + + ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + + 
addrconf_dad_completed(ifp); + + goto out; + } + + ifp->probes--; + addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); + spin_unlock(&ifp->lock); + read_unlock(&idev->lock); + + /* send a neighbour solicitation for our addr */ + addrconf_addr_solict_mult(&ifp->addr, &mcaddr); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); +out: + in6_ifa_put(ifp); +} + +static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +{ + struct net_device *dev = ifp->idev->dev; + + /* + * Configure the address for reception. Now it is valid. + */ + + ipv6_ifa_notify(RTM_NEWADDR, ifp); + + /* If added prefix is link local and we are prepared to process + router advertisements, start sending router solicitations. + */ + + if (((ifp->idev->cnf.accept_ra == 1 && !ifp->idev->cnf.forwarding) || + ifp->idev->cnf.accept_ra == 2) && + ifp->idev->cnf.rtr_solicits > 0 && + (dev->flags&IFF_LOOPBACK) == 0 && + (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] 
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + + spin_lock_bh(&ifp->lock); + ifp->probes = 1; + ifp->idev->if_flags |= IF_RS_SENT; + addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); + spin_unlock_bh(&ifp->lock); + } +} + +static void addrconf_dad_run(struct inet6_dev *idev) +{ + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + spin_lock(&ifp->lock); + if (ifp->flags & IFA_F_TENTATIVE && + ifp->state == INET6_IFADDR_STATE_DAD) + addrconf_dad_kick(ifp); + spin_unlock(&ifp->lock); + } + read_unlock_bh(&idev->lock); +} + +#ifdef CONFIG_PROC_FS +struct if6_iter_state { + struct seq_net_private p; + int bucket; + int offset; +}; + +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) +{ + struct inet6_ifaddr *ifa = NULL; + struct if6_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + int p = 0; + + /* initial bucket if pos is 0 */ + if (pos == 0) { + state->bucket = 0; + state->offset = 0; + } + + for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + struct hlist_node *n; + hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], + addr_lst) { + /* sync with offset */ + if (p < state->offset) { + p++; + continue; + } + state->offset++; + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + } + + /* prepare for next bucket */ + state->offset = 0; + p = 0; + } + return NULL; +} + +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, + struct inet6_ifaddr *ifa) +{ + struct if6_iter_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct hlist_node *n = &ifa->addr_lst; + + hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) { + state->offset++; + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + } + + while (++state->bucket < IN6_ADDR_HSIZE) { + state->offset = 0; + 
hlist_for_each_entry_rcu_bh(ifa, n, + &inet6_addr_lst[state->bucket], addr_lst) { + state->offset++; + if (net_eq(dev_net(ifa->idev->dev), net)) + return ifa; + } + } + + return NULL; +} + +static void *if6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu_bh) +{ + rcu_read_lock_bh(); + return if6_get_first(seq, *pos); +} + +static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct inet6_ifaddr *ifa; + + ifa = if6_get_next(seq, v); + ++*pos; + return ifa; +} + +static void if6_seq_stop(struct seq_file *seq, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} + +static int if6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; + seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n", + &ifp->addr, + ifp->idev->dev->ifindex, + ifp->prefix_len, + ifp->scope, + ifp->flags, + ifp->idev->dev->name); + return 0; +} + +static const struct seq_operations if6_seq_ops = { + .start = if6_seq_start, + .next = if6_seq_next, + .show = if6_seq_show, + .stop = if6_seq_stop, +}; + +static int if6_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &if6_seq_ops, + sizeof(struct if6_iter_state)); +} + +static const struct file_operations if6_fops = { + .owner = THIS_MODULE, + .open = if6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init if6_proc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "if_inet6", S_IRUGO, &if6_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit if6_proc_net_exit(struct net *net) +{ + proc_net_remove(net, "if_inet6"); +} + +static struct pernet_operations if6_proc_net_ops = { + .init = if6_proc_net_init, + .exit = if6_proc_net_exit, +}; + +int __init if6_proc_init(void) +{ + return register_pernet_subsys(&if6_proc_net_ops); +} + +void if6_proc_exit(void) +{ + unregister_pernet_subsys(&if6_proc_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +#if 
defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +/* Check if address is a home address configured on any interface. */ +int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) +{ + int ret = 0; + struct inet6_ifaddr *ifp = NULL; + struct hlist_node *n; + unsigned int hash = ipv6_addr_hash(addr); + + rcu_read_lock_bh(); + hlist_for_each_entry_rcu_bh(ifp, n, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (ipv6_addr_equal(&ifp->addr, addr) && + (ifp->flags & IFA_F_HOMEADDRESS)) { + ret = 1; + break; + } + } + rcu_read_unlock_bh(); + return ret; +} +#endif + +/* + * Periodic address status verification + */ + +static void addrconf_verify(unsigned long foo) +{ + unsigned long now, next, next_sec, next_sched; + struct inet6_ifaddr *ifp; + struct hlist_node *node; + int i; + + rcu_read_lock_bh(); + spin_lock(&addrconf_verify_lock); + now = jiffies; + next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); + + del_timer(&addr_chk_timer); + + for (i = 0; i < IN6_ADDR_HSIZE; i++) { +restart: + hlist_for_each_entry_rcu_bh(ifp, node, + &inet6_addr_lst[i], addr_lst) { + unsigned long age; + + if (ifp->flags & IFA_F_PERMANENT) + continue; + + spin_lock(&ifp->lock); + /* We try to batch several events at once. 
*/ + age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifp->valid_lft != INFINITY_LIFE_TIME && + age >= ifp->valid_lft) { + spin_unlock(&ifp->lock); + in6_ifa_hold(ifp); + ipv6_del_addr(ifp); + goto restart; + } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { + spin_unlock(&ifp->lock); + continue; + } else if (age >= ifp->prefered_lft) { + /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */ + int deprecate = 0; + + if (!(ifp->flags&IFA_F_DEPRECATED)) { + deprecate = 1; + ifp->flags |= IFA_F_DEPRECATED; + } + + if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + next = ifp->tstamp + ifp->valid_lft * HZ; + + spin_unlock(&ifp->lock); + + if (deprecate) { + in6_ifa_hold(ifp); + + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); + goto restart; + } +#ifdef CONFIG_IPV6_PRIVACY + } else if ((ifp->flags&IFA_F_TEMPORARY) && + !(ifp->flags&IFA_F_TENTATIVE)) { + unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + ifp->idev->nd_parms->retrans_time / HZ; + + if (age >= ifp->prefered_lft - regen_advance) { + struct inet6_ifaddr *ifpub = ifp->ifpub; + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + if (!ifp->regen_count && ifpub) { + ifp->regen_count++; + in6_ifa_hold(ifp); + in6_ifa_hold(ifpub); + spin_unlock(&ifp->lock); + + spin_lock(&ifpub->lock); + ifpub->regen_count = 0; + spin_unlock(&ifpub->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; + spin_unlock(&ifp->lock); +#endif + } else { + /* ifp->prefered_lft <= ifp->valid_lft */ + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + spin_unlock(&ifp->lock); + } + } + } + + next_sec = round_jiffies_up(next); + next_sched = 
next; + + /* If rounded timeout is accurate enough, accept it. */ + if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) + next_sched = next_sec; + + /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ + if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) + next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; + + ADBG((KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched)); + + addr_chk_timer.expires = next_sched; + add_timer(&addr_chk_timer); + spin_unlock(&addrconf_verify_lock); + rcu_read_unlock_bh(); +} + +static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local) +{ + struct in6_addr *pfx = NULL; + + if (addr) + pfx = nla_data(addr); + + if (local) { + if (pfx && nla_memcmp(local, pfx, sizeof(*pfx))) + pfx = NULL; + else + pfx = nla_data(local); + } + + return pfx; +} + +static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { + [IFA_ADDRESS] = { .len = sizeof(struct in6_addr) }, + [IFA_LOCAL] = { .len = sizeof(struct in6_addr) }, + [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, +}; + +static int +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *pfx; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_del(net, ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, + u32 prefered_lft, u32 valid_lft) +{ + u32 flags; + clock_t expires; + unsigned long timeout; + + if (!valid_lft || (prefered_lft > valid_lft)) + return -EINVAL; + + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); 
+ valid_lft = timeout; + flags = RTF_EXPIRES; + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; + } + + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } + + spin_lock_bh(&ifp->lock); + ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; + ifp->tstamp = jiffies; + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + + spin_unlock_bh(&ifp->lock); + if (!(ifp->flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, + expires, flags); + addrconf_verify(0); + + return 0; +} + +static int +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *pfx; + struct inet6_ifaddr *ifa; + struct net_device *dev; + u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; + u8 ifa_flags; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + return err; + + ifm = nlmsg_data(nlh); + pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (pfx == NULL) + return -EINVAL; + + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + valid_lft = ci->ifa_valid; + preferred_lft = ci->ifa_prefered; + } else { + preferred_lft = INFINITY_LIFE_TIME; + valid_lft = INFINITY_LIFE_TIME; + } + + dev = __dev_get_by_index(net, ifm->ifa_index); + if (dev == NULL) + return -ENODEV; + + /* We ignore other flags so far. */ + ifa_flags = ifm->ifa_flags & (IFA_F_NODAD | IFA_F_HOMEADDRESS); + + ifa = ipv6_get_ifaddr(net, pfx, dev, 1); + if (ifa == NULL) { + /* + * It would be best to check for !NLM_F_CREATE here but + * userspace alreay relies on not having to provide this. 
+ */ + return inet6_addr_add(net, ifm->ifa_index, pfx, + ifm->ifa_prefixlen, ifa_flags, + preferred_lft, valid_lft); + } + + if (nlh->nlmsg_flags & NLM_F_EXCL || + !(nlh->nlmsg_flags & NLM_F_REPLACE)) + err = -EEXIST; + else + err = inet6_addr_modify(ifa, ifa_flags, preferred_lft, valid_lft); + + in6_ifa_put(ifa); + + return err; +} + +static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u8 flags, + u8 scope, int ifindex) +{ + struct ifaddrmsg *ifm; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = prefixlen; + ifm->ifa_flags = flags; + ifm->ifa_scope = scope; + ifm->ifa_index = ifindex; +} + +static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, + unsigned long tstamp, u32 preferred, u32 valid) +{ + struct ifa_cacheinfo ci; + + ci.cstamp = cstamp_delta(cstamp); + ci.tstamp = cstamp_delta(tstamp); + ci.ifa_prefered = preferred; + ci.ifa_valid = valid; + + return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); +} + +static inline int rt_scope(int ifa_scope) +{ + if (ifa_scope & IFA_HOST) + return RT_SCOPE_HOST; + else if (ifa_scope & IFA_LINK) + return RT_SCOPE_LINK; + else if (ifa_scope & IFA_SITE) + return RT_SCOPE_SITE; + else + return RT_SCOPE_UNIVERSE; +} + +static inline int inet6_ifaddr_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(16) /* IFA_ADDRESS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); +} + +static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + u32 preferred, valid; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope), + ifa->idev->dev->ifindex); + + if (!(ifa->flags&IFA_F_PERMANENT)) { + preferred = ifa->prefered_lft; + valid = ifa->valid_lft; + if (preferred != INFINITY_LIFE_TIME) { + long tval = (jiffies - 
ifa->tstamp)/HZ; + if (preferred > tval) + preferred -= tval; + else + preferred = 0; + if (valid != INFINITY_LIFE_TIME) { + if (valid > tval) + valid -= tval; + else + valid = 0; + } + } + } else { + preferred = INFINITY_LIFE_TIME; + valid = INFINITY_LIFE_TIME; + } + + if (nla_put(skb, IFA_ADDRESS, 16, &ifa->addr) < 0 || + put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, + u32 pid, u32 seq, int event, u16 flags) +{ + struct nlmsghdr *nlh; + u8 scope = RT_SCOPE_UNIVERSE; + int ifindex = ifmca->idev->dev->ifindex; + + if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + scope = RT_SCOPE_SITE; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); + if (nla_put(skb, IFA_MULTICAST, 16, &ifmca->mca_addr) < 0 || + put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + u8 scope = RT_SCOPE_UNIVERSE; + int ifindex = ifaca->aca_idev->dev->ifindex; + + if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) + scope = RT_SCOPE_SITE; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(struct ifaddrmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex); + if (nla_put(skb, IFA_ANYCAST, 16, &ifaca->aca_addr) < 0 || + put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp, + INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +enum addr_type_t { + 
UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +/* Dump idev's addresses of the given type into skb, skipping entries whose index is below s_ip_idx; the updated index is written back through p_ip_idx. Returns the result of the last fill call, or 1 when none was made. Called with rcu_read_lock(). */ +static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, + struct netlink_callback *cb, enum addr_type_t type, + int s_ip_idx, int *p_ip_idx) +{ + struct ifmcaddr6 *ifmca; + struct ifacaddr6 *ifaca; + int err = 1; + int ip_idx = *p_ip_idx; + + read_lock_bh(&idev->lock); + switch (type) { + case UNICAST_ADDR: { + struct inet6_ifaddr *ifa; + + /* unicast address incl. temp addr; note the index is incremented before the comparison here, unlike the two loops below */ + list_for_each_entry(ifa, &idev->addr_list, if_list) { + if (++ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + } + case MULTICAST_ADDR: + /* multicast address */ + for (ifmca = idev->mc_list; ifmca; + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETMULTICAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + case ANYCAST_ADDR: + /* anycast address */ + for (ifaca = idev->ac_list; ifaca; + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_GETANYCAST, + NLM_F_MULTI); + if (err <= 0) + break; + } + break; + default: + break; + } + read_unlock_bh(&idev->lock); + *p_ip_idx = ip_idx; + return err; +} + /* Walk every device in the namespace's index hash and dump its addresses of the given type; cb->args[0], [1] and [2] carry the resume position (hash bucket, device index, address index). */ +static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + enum addr_type_t type) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx, ip_idx; + int s_idx, s_ip_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + rcu_read_lock(); + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + 
hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + ip_idx = 0; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + + if (in6_dump_addrs(idev, skb, cb, type, + s_ip_idx, &ip_idx) <= 0) + goto done; +cont: + idx++; + } + } +done: + rcu_read_unlock(); + cb->args[0] = h; /* save the position reached so the next call can resume from it */ + cb->args[1] = idx; + cb->args[2] = ip_idx; + + return skb->len; +} + /* Dump unicast addresses: thin wrapper around inet6_dump_addr(). */ +static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = UNICAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + /* Dump multicast addresses: thin wrapper around inet6_dump_addr(). */ +static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = MULTICAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + + /* Dump anycast addresses: thin wrapper around inet6_dump_addr(). */ +static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = ANYCAST_ADDR; + + return inet6_dump_addr(skb, cb, type); +} + /* Look up the address named in the request (restricted to the device given by ifa_index when that is non-zero) and unicast an RTM_NEWADDR message describing it back to the requester. Errors: -EINVAL without an address attribute, -EADDRNOTAVAIL when the address is unknown, -ENOBUFS when no skb can be allocated. */ +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr* nlh, + void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct ifaddrmsg *ifm; + struct nlattr *tb[IFA_MAX+1]; + struct in6_addr *addr = NULL; + struct net_device *dev = NULL; + struct inet6_ifaddr *ifa; + struct sk_buff *skb; + int err; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + if (err < 0) + goto errout; + + addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL]); + if (addr == NULL) { + err = -EINVAL; + goto errout; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_index) + dev = __dev_get_by_index(net, ifm->ifa_index); + + ifa = ipv6_get_ifaddr(net, addr, dev, 1); + if (!ifa) { + err = -EADDRNOTAVAIL; + goto errout; + } + + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout_ifa; + } + + err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, RTM_NEWADDR, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ + WARN_ON(err == -EMSGSIZE); + 
kfree_skb(skb); + goto errout_ifa; + } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +errout_ifa: + in6_ifa_put(ifa); +errout: + return err; +} + /* Send an address event for ifa to the RTNLGRP_IPV6_IFADDR group; when the message cannot be built the error is recorded with rtnl_set_sk_err(). */ +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) +{ + struct sk_buff *skb; + struct net *net = dev_net(ifa->idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); +} + /* Copy the settings in cnf into array, indexed by the DEVCONF_* constants; bytes is the size of array and must cover at least DEVCONF_MAX 4-byte entries. */ +static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, + __s32 *array, int bytes) +{ + BUG_ON(bytes < (DEVCONF_MAX * 4)); + /* zero first, so entries compiled out by the #ifdefs in this function read as 0 */ + memset(array, 0, bytes); + array[DEVCONF_FORWARDING] = cnf->forwarding; + array[DEVCONF_HOPLIMIT] = cnf->hop_limit; + array[DEVCONF_MTU6] = cnf->mtu6; + array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; + array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; + array[DEVCONF_AUTOCONF] = cnf->autoconf; + array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; + array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_DELAY] = + jiffies_to_msecs(cnf->rtr_solicit_delay); + array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; +#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; + array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; + array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; + array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; + array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; +#endif + array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; + array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_PINFO] = 
cnf->accept_ra_pinfo; +#ifdef CONFIG_IPV6_ROUTER_PREF + array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; + array[DEVCONF_RTR_PROBE_INTERVAL] = + jiffies_to_msecs(cnf->rtr_probe_interval); +#ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; +#endif +#endif + array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp; + array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; +#endif +#ifdef CONFIG_IPV6_MROUTE + array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; +#endif + array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; + array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; + array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao; +} + /* Attribute space for the per-device IPv6 information: flags, cache info, the DEVCONF array and the two statistics arrays (8 bytes per counter). */ +static inline size_t inet6_ifla6_size(void) +{ + return nla_total_size(4) /* IFLA_INET6_FLAGS */ + + nla_total_size(sizeof(struct ifla_cacheinfo)) + + nla_total_size(DEVCONF_MAX * 4) /* IFLA_INET6_CONF */ + + nla_total_size(IPSTATS_MIB_MAX * 8) /* IFLA_INET6_STATS */ + + nla_total_size(ICMP6_MIB_MAX * 8); /* IFLA_INET6_ICMP6STATS */ +} + /* Buffer size for one link message: the ifinfomsg header, the generic link attributes, and the IPv6 information nested under IFLA_PROTINFO. */ +static inline size_t inet6_if_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ +} + /* Copy counters from mib into stats: slot 0 receives the item count, slots 1..items-1 the counter values, and whatever remains of the bytes-sized buffer is zeroed. */ +static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, + int items, int bytes) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. 
*/ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(atomic_long_read(&mib[i]), &stats[i]); + + memset(&stats[items], 0, pad); +} + /* Same output layout as __snmp6_fill_statsdev(), but each value is obtained from the __percpu mib through snmp_fold_field64(). */ +static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib, + int items, int bytes, size_t syncpoff) +{ + int i; + int pad = bytes - sizeof(u64) * items; + BUG_ON(pad < 0); + + /* Use put_unaligned() because stats may not be aligned for u64. */ + put_unaligned(items, &stats[0]); + for (i = 1; i < items; i++) + put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); + + memset(&stats[items], 0, pad); +} + /* Fill stats for IFLA_INET6_STATS or IFLA_INET6_ICMP6STATS; any other attrtype leaves the buffer untouched. */ +static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, + int bytes) +{ + switch (attrtype) { + case IFLA_INET6_STATS: + __snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6, + IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); + break; + case IFLA_INET6_ICMP6STATS: + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); + break; + } +} + /* Append the IFLA_INET6_* attributes describing idev to skb; returns 0, or -EMSGSIZE through the nla_put_failure label. */ +static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev) +{ + struct nlattr *nla; + struct ifla_cacheinfo ci; + + NLA_PUT_U32(skb, IFLA_INET6_FLAGS, idev->if_flags); + + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = cstamp_delta(idev->tstamp); + ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time); + ci.retrans_time = jiffies_to_msecs(idev->nd_parms->retrans_time); + NLA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); + /* the remaining attributes are reserved first and then filled in place */ + nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32)); + if (nla == NULL) + goto nla_put_failure; + ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla)); + + /* XXX - MC not implemented */ + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (nla == NULL) + goto nla_put_failure; + 
snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + /* Size of the AF_INET6 link attributes for dev, or 0 when dev has no inet6_dev. */ +static size_t inet6_get_link_af_size(const struct net_device *dev) +{ + if (!__in6_dev_get(dev)) + return 0; + + return inet6_ifla6_size(); +} + /* Emit the AF_INET6 link attributes for dev; -ENODATA when dev has no inet6_dev, -EMSGSIZE when skb is too small. */ +static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (!idev) + return -ENODATA; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + return -EMSGSIZE; + + return 0; +} + /* Build a link message for idev's device: generic link attributes followed by the IPv6 attributes nested under IFLA_PROTINFO. The message is cancelled and -EMSGSIZE returned when skb is too small. */ +static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, + u32 pid, u32 seq, int event, unsigned int flags) +{ + struct net_device *dev = idev->dev; + struct ifinfomsg *hdr; + struct nlmsghdr *nlh; + void *protoinfo; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); + if (nlh == NULL) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifi_family = AF_INET6; + hdr->__ifi_pad = 0; + hdr->ifi_type = dev->type; + hdr->ifi_index = dev->ifindex; + hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_change = 0; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + + if (dev->addr_len) + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + protoinfo = nla_nest_start(skb, IFLA_PROTINFO); + if (protoinfo == NULL) + goto nla_put_failure; + + if (inet6_fill_ifla6_attrs(skb, idev) < 0) + goto nla_put_failure; + + nla_nest_end(skb, protoinfo); + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + /* Dump an RTM_NEWLINK message for every device that has an inet6_dev; cb->args[0] and cb->args[1] carry the resume hash bucket and device index. */ +static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct inet6_dev *idev; + struct hlist_head *head; + struct hlist_node *node; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + for (h = s_h; h < 
NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + idev = __in6_dev_get(dev); + if (!idev) + goto cont; + if (inet6_fill_ifinfo(skb, idev, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWLINK, NLM_F_MULTI) <= 0) + goto out; +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + /* Send a link event for idev to the RTNLGRP_IPV6_IFINFO group; when the message cannot be built the error is recorded with rtnl_set_sk_err(). */ +void inet6_ifinfo_notify(int event, struct inet6_dev *idev) +{ + struct sk_buff *skb; + struct net *net = dev_net(idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); +} + /* Buffer size for one prefix message: the prefixmsg header, the prefix address and the prefix cache info. */ +static inline size_t inet6_prefix_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct prefixmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct prefix_cacheinfo)); +} + /* Build a prefix message from pinfo for idev's device; the on-link and autoconf bits of pinfo become IF_PREFIX_ONLINK and IF_PREFIX_AUTOCONF. */ +static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, + struct prefix_info *pinfo, u32 pid, u32 seq, + int event, unsigned int flags) +{ + struct prefixmsg *pmsg; + struct nlmsghdr *nlh; + struct prefix_cacheinfo ci; + + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*pmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + pmsg = nlmsg_data(nlh); + pmsg->prefix_family = AF_INET6; + pmsg->prefix_pad1 = 0; + pmsg->prefix_pad2 = 0; + pmsg->prefix_ifindex = idev->dev->ifindex; + pmsg->prefix_len = pinfo->prefix_len; + pmsg->prefix_type = pinfo->type; + pmsg->prefix_pad3 = 0; + pmsg->prefix_flags = 0; + if (pinfo->onlink) + pmsg->prefix_flags |= IF_PREFIX_ONLINK; + if (pinfo->autoconf) + 
pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + + NLA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); + /* the lifetimes in pinfo are converted with ntohl() before being reported */ + ci.preferred_time = ntohl(pinfo->prefered); + ci.valid_time = ntohl(pinfo->valid); + NLA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + /* Send a prefix event to the RTNLGRP_IPV6_PREFIX group; when the message cannot be built the error is recorded with rtnl_set_sk_err(). */ +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo) +{ + struct sk_buff *skb; + struct net *net = dev_net(idev->dev); + int err = -ENOBUFS; + + skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); +} + /* Announce an address event over netlink, then update the address's route and anycast membership to match; an event of 0 is announced as RTM_NEWADDR. */ +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + inet6_ifa_notify(event ? 
: RTM_NEWADDR, ifp); + + switch (event) { + case RTM_NEWADDR: + /* + * If the address was optimistic + * we inserted the route at the start of + * our DAD process, so we don't need + * to do it again + */ + if (!(ifp->rt->rt6i_node)) + ip6_ins_rt(ifp->rt); + if (ifp->idev->cnf.forwarding) + addrconf_join_anycast(ifp); + break; + case RTM_DELADDR: + if (ifp->idev->cnf.forwarding) + addrconf_leave_anycast(ifp); + addrconf_leave_solict(ifp->idev, &ifp->addr); + dst_hold(&ifp->rt->dst); + /* the route is freed here only when ip6_del_rt() reports a non-zero result */ + if (ip6_del_rt(ifp->rt)) + dst_free(&ifp->rt->dst); + break; + } +} + /* Run __ipv6_ifa_notify() under rcu_read_lock_bh(), unless the owning device is already marked dead. */ +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + rcu_read_lock_bh(); + if (likely(ifp->idev->dead == 0)) + __ipv6_ifa_notify(event, ifp); + rcu_read_unlock_bh(); +} + +#ifdef CONFIG_SYSCTL + /* sysctl handler used for "forwarding": the value is parsed into a local copy, and a written value is applied through addrconf_fixup_forwarding(). */ +static +int addrconf_sysctl_forward(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + ctl_table lctl; + int ret; + + /* + * ctl->data points to idev->cnf.forwarding, we should + * not modify it until we get the rtnl lock. 
+ */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + /* NOTE(review): on a write, the proc_dointvec() result is overwritten just below; confirm that discarding a parse error is intended */ + if (write) + ret = addrconf_fixup_forwarding(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + /* Replay NETDEV_DOWN or NETDEV_UP through addrconf_notify(), according to the device's current disable_ipv6 value. */ +static void dev_disable_change(struct inet6_dev *idev) +{ + if (!idev || !idev->dev) + return; + + if (idev->cnf.disable_ipv6) + addrconf_notify(NULL, NETDEV_DOWN, idev->dev); + else + addrconf_notify(NULL, NETDEV_UP, idev->dev); +} + /* Set disable_ipv6 to newf on every device in net that has an inet6_dev, and notify those whose enabled/disabled state flipped. */ +static void addrconf_disable_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.disable_ipv6) ^ (!newf); + idev->cnf.disable_ipv6 = newf; + if (changed) + dev_disable_change(idev); + } + } + rcu_read_unlock(); +} + /* Store newf in *p under the rtnl lock and propagate it: the "default" entry is only stored, the "all" entry is copied to the default and to every device, and a per-device entry notifies that device when its state flipped. Restarts the syscall when the rtnl lock cannot be taken immediately. */ +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->disable_ipv6) { + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->disable_ipv6) { + net->ipv6.devconf_dflt->disable_ipv6 = newf; + addrconf_disable_change(net, newf); + } else if ((!newf) ^ (!old)) + dev_disable_change((struct inet6_dev *)table->extra1); + + rtnl_unlock(); + return 0; +} + /* sysctl handler used for "disable_ipv6": the value is parsed into a local copy, and a written value is applied through addrconf_disable_ipv6(). */ +static +int addrconf_sysctl_disable(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + ctl_table lctl; + int ret; + + /* + * ctl->data points to idev->cnf.disable_ipv6, we should + * not modify it until we get the rtnl lock. 
+ */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + /* NOTE(review): on a write, the proc_dointvec() result is overwritten just below; confirm that discarding a parse error is intended */ + if (write) + ret = addrconf_disable_ipv6(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + /* Template sysctl table. Its .data pointers refer to the global ipv6_devconf; __addrconf_sysctl_register() copies the table and rebases them onto the devconf being registered. */ +static struct addrconf_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table addrconf_vars[DEVCONF_MAX+1]; + char *dev_name; /* private copy of the directory name, allocated with kstrdup() at registration */ +} addrconf_sysctl __read_mostly = { + .sysctl_header = NULL, + .addrconf_vars = { + { + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_forward, + }, + { + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, /* field is held in jiffies; ipv6_store_devconf() reports it via jiffies_to_msecs() */ + }, + { + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + 
.procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_PRIVACY /* the five temporary-address entries exist only with privacy extensions built in */ + { + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_defrtr", + .data = &ipv6_devconf.accept_ra_defrtr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_pinfo", + .data = &ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_ROUTER_PREF + { + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, /* field is held in jiffies; ipv6_store_devconf() reports it via jiffies_to_msecs() */ + }, +#ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = proc_dointvec, + }, +#endif +#endif + { + .procname = "proxy_ndp", + .data = &ipv6_devconf.proxy_ndp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_source_route", + .data = &ipv6_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + { + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, +#endif +#ifdef CONFIG_IPV6_MROUTE + { + .procname = "mc_forwarding", + .data = &ipv6_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, /* the only entry in this table without write permission */ + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "disable_ipv6", + .data = &ipv6_devconf.disable_ipv6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable, + }, + { + .procname = "accept_dad", + .data = &ipv6_devconf.accept_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "force_tllao", + .data = &ipv6_devconf.force_tllao, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + /* sentinel: its NULL .data ends the rebasing loop in __addrconf_sysctl_register() */ + } + }, +}; + /* Clone the template table for devconf p, rebase every .data pointer from ipv6_devconf onto p, stash idev and net in extra1 and extra2, and register the copy under net/ipv6/conf/<dev_name>. Returns 0, or -ENOBUFS on any failure. */ +static int __addrconf_sysctl_register(struct net *net, char *dev_name, + struct inet6_dev *idev, struct ipv6_devconf *p) +{ + int i; + struct addrconf_sysctl_table *t; + +#define ADDRCONF_CTL_PATH_DEV 3 + + struct ctl_path addrconf_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv6", }, + { .procname = "conf", }, + { /* to be set */ }, + { }, + }; + + + t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); + if (t == NULL) + goto out; + + for (i = 0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; + t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ + t->addrconf_vars[i].extra2 = net; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl 
and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + t->dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!t->dev_name) + goto free; + + addrconf_ctl_path[ADDRCONF_CTL_PATH_DEV].procname = t->dev_name; + + t->sysctl_header = register_net_sysctl_table(net, addrconf_ctl_path, + t->addrconf_vars); + if (t->sysctl_header == NULL) + goto free_procname; + + p->sysctl = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: + kfree(t); +out: + return -ENOBUFS; +} + /* Unregister and free the sysctl table attached to p; does nothing when none is attached. */ +static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + struct addrconf_sysctl_table *t; + + if (p->sysctl == NULL) + return; + + t = p->sysctl; + p->sysctl = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); +} + /* Register the neighbour and addrconf sysctl entries for one device. The result of __addrconf_sysctl_register() is not checked here. */ +static void addrconf_sysctl_register(struct inet6_dev *idev) +{ + neigh_sysctl_register(idev->dev, idev->nd_parms, "ipv6", + &ndisc_ifinfo_sysctl_change); + __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, + idev, &idev->cnf); +} + /* Remove the addrconf and neighbour sysctl entries of one device. */ +static void addrconf_sysctl_unregister(struct inet6_dev *idev) +{ + __addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); +} + + +#endif + /* Per-namespace init: init_net uses the static ipv6_devconf and ipv6_devconf_dflt directly, every other namespace gets kmemdup()ed copies; the "all" and "default" sysctl directories are then registered. Returns 0 or a negative errno. */ +static int __net_init addrconf_init_net(struct net *net) +{ + int err; + struct ipv6_devconf *all, *dflt; + + err = -ENOMEM; + all = &ipv6_devconf; + dflt = &ipv6_devconf_dflt; + + if (!net_eq(net, &init_net)) { + all = kmemdup(all, sizeof(ipv6_devconf), GFP_KERNEL); + if (all == NULL) + goto err_alloc_all; + + dflt = kmemdup(dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL); + if (dflt == NULL) + goto err_alloc_dflt; + } else { + /* these will be inherited by all namespaces */ + dflt->autoconf = ipv6_defaults.autoconf; + dflt->disable_ipv6 = ipv6_defaults.disable_ipv6; + } + + net->ipv6.devconf_all = all; + net->ipv6.devconf_dflt = dflt; + +#ifdef CONFIG_SYSCTL + err = __addrconf_sysctl_register(net, "all", NULL, all); + if (err < 0) + goto err_reg_all; + + err = __addrconf_sysctl_register(net, 
"default", NULL, dflt); + if (err < 0) + goto err_reg_dflt; +#endif + return 0; + +#ifdef CONFIG_SYSCTL +err_reg_dflt: + __addrconf_sysctl_unregister(all); +err_reg_all: + /* init_net uses the static ipv6_devconf and ipv6_devconf_dflt, which must never reach kfree(); same guard as addrconf_exit_net() */ + if (!net_eq(net, &init_net)) + kfree(dflt); +#endif +err_alloc_dflt: + if (!net_eq(net, &init_net)) + kfree(all); +err_alloc_all: + return err; +} + /* Per-namespace exit: unregister both sysctl directories and free the devconf copies, which exist only for namespaces other than init_net. */ +static void __net_exit addrconf_exit_net(struct net *net) +{ +#ifdef CONFIG_SYSCTL + __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); + __addrconf_sysctl_unregister(net->ipv6.devconf_all); +#endif + if (!net_eq(net, &init_net)) { + kfree(net->ipv6.devconf_dflt); + kfree(net->ipv6.devconf_all); + } +} + +static struct pernet_operations addrconf_ops = { + .init = addrconf_init_net, + .exit = addrconf_exit_net, +}; + +/* + * Device notifier + */ + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(register_inet6addr_notifier); + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&inet6addr_chain, nb); +} +EXPORT_SYMBOL(unregister_inet6addr_notifier); + +static struct rtnl_af_ops inet6_ops = { + .family = AF_INET6, + .fill_link_af = inet6_fill_link_af, + .get_link_af_size = inet6_get_link_af_size, +}; + +/* + * Init / cleanup code + */ + +int __init addrconf_init(void) +{ + int i, err; + + err = ipv6_addr_label_init(); + if (err < 0) { + printk(KERN_CRIT "IPv6 Addrconf:" + " cannot initialize default policy table: %d.\n", err); + goto out; + } + + err = register_pernet_subsys(&addrconf_ops); + if (err < 0) + goto out_addrlabel; + + /* The addrconf netdev notifier requires that loopback_dev + * has its ipv6 private information allocated and setup + * before it can bring up and give link-local addresses + * to other devices which are up. + * + * Unfortunately, loopback_dev is not necessarily the first + * entry in the global dev_base list of net devices. In fact, + * it is likely to be the very last entry on that list. 
+ * So this causes the notifier registry below to try and + * give link-local addresses to all devices besides loopback_dev + * first, then loopback_dev, which causes all the non-loopback_dev + * devices to fail to get a link-local address. + * + * So, as a temporary fix, allocate the ipv6 structure for + * loopback_dev first by hand. + * Longer term, all of the dependencies ipv6 has upon the loopback + * device and it being up should be removed. + */ + rtnl_lock(); + if (!ipv6_add_dev(init_net.loopback_dev)) + err = -ENOMEM; + rtnl_unlock(); + if (err) + goto errlo; + /* initialise every bucket of the address hash table */ + for (i = 0; i < IN6_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet6_addr_lst[i]); + + register_netdevice_notifier(&ipv6_dev_notf); + + addrconf_verify(0); + + err = rtnl_af_register(&inet6_ops); + if (err < 0) + goto errout_af; + + err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo, + NULL); + if (err < 0) + goto errout; + + /* Only the first call to __rtnl_register can fail */ + __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, + inet6_dump_ifaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, + inet6_dump_ifmcaddr, NULL); + __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, + inet6_dump_ifacaddr, NULL); + + ipv6_addr_label_rtnl_register(); + + return 0; /* the labels that follow undo the registrations in reverse order */ +errout: + rtnl_af_unregister(&inet6_ops); +errout_af: + unregister_netdevice_notifier(&ipv6_dev_notf); +errlo: + unregister_pernet_subsys(&addrconf_ops); +out_addrlabel: + ipv6_addr_label_cleanup(); +out: + return err; +} + /* Tear down what addrconf_init() set up: unregister the notifier, pernet ops and address labels, then take every inet6 device in init_net down under the rtnl lock. */ +void addrconf_cleanup(void) +{ + struct net_device *dev; + int i; + + unregister_netdevice_notifier(&ipv6_dev_notf); + unregister_pernet_subsys(&addrconf_ops); + ipv6_addr_label_cleanup(); + + rtnl_lock(); + + __rtnl_af_unregister(&inet6_ops); + + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (__in6_dev_get(dev) == NULL) + continue; + 
addrconf_ifdown(dev, 1); + } + addrconf_ifdown(init_net.loopback_dev, 2); + + /* + * Check hash table: every bucket is expected to be empty by now. + */ + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&inet6_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + + del_timer(&addr_chk_timer); + rtnl_unlock(); +} diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c new file mode 100644 index 00000000..399287e5 --- /dev/null +++ b/net/ipv6/addrconf_core.c @@ -0,0 +1,80 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ + +#include <linux/export.h> +#include <net/ipv6.h> + /* the scope value occupies the bits above bit 15 of an address-type word */ +#define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) + /* Turn a scope value into an address-type word: the scope goes into the upper bits, and node-, link- and site-local scopes also set the matching IPV6_ADDR_* type flag. */ +static inline unsigned ipv6_addr_scope2type(unsigned scope) +{ + switch(scope) { + case IPV6_ADDR_SCOPE_NODELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_NODELOCAL) | + IPV6_ADDR_LOOPBACK); + case IPV6_ADDR_SCOPE_LINKLOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL) | + IPV6_ADDR_LINKLOCAL); + case IPV6_ADDR_SCOPE_SITELOCAL: + return (IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL) | + IPV6_ADDR_SITELOCAL); + } + return IPV6_ADDR_SCOPE_TYPE(scope); +} + /* Classify addr: the result combines IPV6_ADDR_* type flags with the scope placed by IPV6_ADDR_SCOPE_TYPE(). */ +int __ipv6_addr_type(const struct in6_addr *addr) +{ + __be32 st; + + st = addr->s6_addr32[0]; + + /* Consider all addresses with the first three bits different from + 000 and 111 as unicasts. 
+ */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + /* multicast */ + /* addr-select 3.1 */ + return (IPV6_ADDR_MULTICAST | + ipv6_addr_scope2type(IPV6_ADDR_MC_SCOPE(addr))); + } + /* fe80::/10 is link-local, fec0::/10 is site-local, fc00::/7 is treated as global unicast */ + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_SITELOCAL)); /* addr-select 3.1 */ + if ((st & htonl(0xFE000000)) == htonl(0xFC000000)) + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* RFC 4193 */ + /* upper 64 bits all zero: unspecified, loopback, v4-compatible or v4-mapped */ + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_LINKLOCAL)); /* addr-select 3.4 */ + + return (IPV6_ADDR_COMPATv4 | IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return (IPV6_ADDR_MAPPED | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.3 */ + } + /* everything not matched above is treated as global unicast */ + return (IPV6_ADDR_UNICAST | + IPV6_ADDR_SCOPE_TYPE(IPV6_ADDR_SCOPE_GLOBAL)); /* addr-select 3.4 */ +} +EXPORT_SYMBOL(__ipv6_addr_type); + + diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c new file mode 100644 index 00000000..2d8ddba9 --- /dev/null +++ b/net/ipv6/addrlabel.c @@ -0,0 +1,602 @@ +/* + * IPv6 Address Label subsystem + * for the IPv6 "Default" Source Address Selection + * + * Copyright (C)2007 USAGI/WIDE Project + */ +/* + * Author: + * YOSHIFUJI Hideaki @ USAGI/WIDE Project 
<yoshfuji@linux-ipv6.org> + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/in6.h> +#include <linux/slab.h> +#include <net/addrconf.h> +#include <linux/if_addrlabel.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +#if 0 +#define ADDRLABEL(x...) printk(x) +#else +#define ADDRLABEL(x...) do { ; } while(0) +#endif + +/* + * Policy Table + */ +struct ip6addrlbl_entry +{ +#ifdef CONFIG_NET_NS + struct net *lbl_net; +#endif + struct in6_addr prefix; + int prefixlen; + int ifindex; + int addrtype; + u32 label; + struct hlist_node list; + atomic_t refcnt; + struct rcu_head rcu; +}; + +static struct ip6addrlbl_table +{ + struct hlist_head head; + spinlock_t lock; + u32 seq; +} ip6addrlbl_table; + +static inline +struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) +{ + return read_pnet(&lbl->lbl_net); +} + +/* + * Default policy table (RFC3484 + extensions) + * + * prefix addr_type label + * ------------------------------------------------------------------------- + * ::1/128 LOOPBACK 0 + * ::/0 N/A 1 + * 2002::/16 N/A 2 + * ::/96 COMPATv4 3 + * ::ffff:0:0/96 V4MAPPED 4 + * fc00::/7 N/A 5 ULA (RFC 4193) + * 2001::/32 N/A 6 Teredo (RFC 4380) + * 2001:10::/28 N/A 7 ORCHID (RFC 4843) + * + * Note: 0xffffffff is used if we do not have any policies. 
+ */ + +#define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL + +static const __net_initdata struct ip6addrlbl_init_table +{ + const struct in6_addr *prefix; + int prefixlen; + u32 label; +} ip6addrlbl_init_table[] = { + { /* ::/0 */ + .prefix = &in6addr_any, + .label = 1, + },{ /* fc00::/7 */ + .prefix = &(struct in6_addr){{{ 0xfc }}}, + .prefixlen = 7, + .label = 5, + },{ /* 2002::/16 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x02 }}}, + .prefixlen = 16, + .label = 2, + },{ /* 2001::/32 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x01 }}}, + .prefixlen = 32, + .label = 6, + },{ /* 2001:10::/28 */ + .prefix = &(struct in6_addr){{{ 0x20, 0x01, 0x00, 0x10 }}}, + .prefixlen = 28, + .label = 7, + },{ /* ::ffff:0:0 */ + .prefix = &(struct in6_addr){{{ [10] = 0xff, [11] = 0xff }}}, + .prefixlen = 96, + .label = 4, + },{ /* ::/96 */ + .prefix = &in6addr_any, + .prefixlen = 96, + .label = 3, + },{ /* ::1/128 */ + .prefix = &in6addr_loopback, + .prefixlen = 128, + .label = 0, + } +}; + +/* Object management */ +static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) +{ +#ifdef CONFIG_NET_NS + release_net(p->lbl_net); +#endif + kfree(p); +} + +static void ip6addrlbl_free_rcu(struct rcu_head *h) +{ + ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); +} + +static inline int ip6addrlbl_hold(struct ip6addrlbl_entry *p) +{ + return atomic_inc_not_zero(&p->refcnt); +} + +static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) +{ + if (atomic_dec_and_test(&p->refcnt)) + call_rcu(&p->rcu, ip6addrlbl_free_rcu); +} + +/* Find label */ +static int __ip6addrlbl_match(struct net *net, + struct ip6addrlbl_entry *p, + const struct in6_addr *addr, + int addrtype, int ifindex) +{ + if (!net_eq(ip6addrlbl_net(p), net)) + return 0; + if (p->ifindex && p->ifindex != ifindex) + return 0; + if (p->addrtype && p->addrtype != addrtype) + return 0; + if (!ipv6_prefix_equal(addr, &p->prefix, p->prefixlen)) + return 0; + return 1; +} + +static struct ip6addrlbl_entry 
*__ipv6_addr_label(struct net *net, + const struct in6_addr *addr, + int type, int ifindex) +{ + struct hlist_node *pos; + struct ip6addrlbl_entry *p; + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (__ip6addrlbl_match(net, p, addr, type, ifindex)) + return p; + } + return NULL; +} + +u32 ipv6_addr_label(struct net *net, + const struct in6_addr *addr, int type, int ifindex) +{ + u32 label; + struct ip6addrlbl_entry *p; + + type &= IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK; + + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, type, ifindex); + label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT; + rcu_read_unlock(); + + ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", + __func__, addr, type, ifindex, label); + + return label; +} + +/* allocate one entry */ +static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, + const struct in6_addr *prefix, + int prefixlen, int ifindex, + u32 label) +{ + struct ip6addrlbl_entry *newp; + int addrtype; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label); + + addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); + + switch (addrtype) { + case IPV6_ADDR_MAPPED: + if (prefixlen > 96) + return ERR_PTR(-EINVAL); + if (prefixlen < 96) + addrtype = 0; + break; + case IPV6_ADDR_COMPATv4: + if (prefixlen != 96) + addrtype = 0; + break; + case IPV6_ADDR_LOOPBACK: + if (prefixlen != 128) + addrtype = 0; + break; + } + + newp = kmalloc(sizeof(*newp), GFP_KERNEL); + if (!newp) + return ERR_PTR(-ENOMEM); + + ipv6_addr_prefix(&newp->prefix, prefix, prefixlen); + newp->prefixlen = prefixlen; + newp->ifindex = ifindex; + newp->addrtype = addrtype; + newp->label = label; + INIT_HLIST_NODE(&newp->list); +#ifdef CONFIG_NET_NS + newp->lbl_net = hold_net(net); +#endif + atomic_set(&newp->refcnt, 1); + return newp; +} + +/* add a label */ +static int 
__ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) +{ + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", + __func__, + newp, replace); + + if (hlist_empty(&ip6addrlbl_table.head)) { + hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); + } else { + struct hlist_node *pos, *n; + struct ip6addrlbl_entry *p = NULL; + hlist_for_each_entry_safe(p, pos, n, + &ip6addrlbl_table.head, list) { + if (p->prefixlen == newp->prefixlen && + net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && + p->ifindex == newp->ifindex && + ipv6_addr_equal(&p->prefix, &newp->prefix)) { + if (!replace) { + ret = -EEXIST; + goto out; + } + hlist_replace_rcu(&p->list, &newp->list); + ip6addrlbl_put(p); + goto out; + } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || + (p->prefixlen < newp->prefixlen)) { + hlist_add_before_rcu(&newp->list, &p->list); + goto out; + } + } + hlist_add_after_rcu(&p->list, &newp->list); + } +out: + if (!ret) + ip6addrlbl_table.seq++; + return ret; +} + +/* add a label */ +static int ip6addrlbl_add(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex, u32 label, int replace) +{ + struct ip6addrlbl_entry *newp; + int ret = 0; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label, + replace); + + newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); + if (IS_ERR(newp)) + return PTR_ERR(newp); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_add(newp, replace); + spin_unlock(&ip6addrlbl_table.lock); + if (ret) + ip6addrlbl_free(newp); + return ret; +} + +/* remove a label */ +static int __ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *pos, *n; + int ret = -ESRCH; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); + + 
hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + if (p->prefixlen == prefixlen && + net_eq(ip6addrlbl_net(p), net) && + p->ifindex == ifindex && + ipv6_addr_equal(&p->prefix, prefix)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + ret = 0; + break; + } + } + return ret; +} + +static int ip6addrlbl_del(struct net *net, + const struct in6_addr *prefix, int prefixlen, + int ifindex) +{ + struct in6_addr prefix_buf; + int ret; + + ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", + __func__, prefix, prefixlen, ifindex); + + ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); + spin_lock(&ip6addrlbl_table.lock); + ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); + spin_unlock(&ip6addrlbl_table.lock); + return ret; +} + +/* add default label */ +static int __net_init ip6addrlbl_net_init(struct net *net) +{ + int err = 0; + int i; + + ADDRLABEL(KERN_DEBUG "%s()\n", __func__); + + for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { + int ret = ip6addrlbl_add(net, + ip6addrlbl_init_table[i].prefix, + ip6addrlbl_init_table[i].prefixlen, + 0, + ip6addrlbl_init_table[i].label, 0); + /* XXX: should we free all rules when we catch an error? 
*/ + if (ret && (!err || err != -ENOMEM)) + err = ret; + } + return err; +} + +static void __net_exit ip6addrlbl_net_exit(struct net *net) +{ + struct ip6addrlbl_entry *p = NULL; + struct hlist_node *pos, *n; + + /* Remove all labels belonging to the exiting net */ + spin_lock(&ip6addrlbl_table.lock); + hlist_for_each_entry_safe(p, pos, n, &ip6addrlbl_table.head, list) { + if (net_eq(ip6addrlbl_net(p), net)) { + hlist_del_rcu(&p->list); + ip6addrlbl_put(p); + } + } + spin_unlock(&ip6addrlbl_table.lock); +} + +static struct pernet_operations ipv6_addr_label_ops = { + .init = ip6addrlbl_net_init, + .exit = ip6addrlbl_net_exit, +}; + +int __init ipv6_addr_label_init(void) +{ + spin_lock_init(&ip6addrlbl_table.lock); + + return register_pernet_subsys(&ipv6_addr_label_ops); +} + +void ipv6_addr_label_cleanup(void) +{ + unregister_pernet_subsys(&ipv6_addr_label_ops); +} + +static const struct nla_policy ifal_policy[IFAL_MAX+1] = { + [IFAL_ADDRESS] = { .len = sizeof(struct in6_addr), }, + [IFAL_LABEL] = { .len = sizeof(u32), }, +}; + +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, + void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifaddrlblmsg *ifal; + struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *pfx; + u32 label; + int err = 0; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen > 128) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + pfx = nla_data(tb[IFAL_ADDRESS]); + if (!pfx) + return -EINVAL; + + if (!tb[IFAL_LABEL]) + return -EINVAL; + label = nla_get_u32(tb[IFAL_LABEL]); + if (label == IPV6_ADDR_LABEL_DEFAULT) + return -EINVAL; + + switch(nlh->nlmsg_type) { + case RTM_NEWADDRLABEL: + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + + err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen, + ifal->ifal_index, label, + 
nlh->nlmsg_flags & NLM_F_REPLACE); + break; + case RTM_DELADDRLABEL: + err = ip6addrlbl_del(net, pfx, ifal->ifal_prefixlen, + ifal->ifal_index); + break; + default: + err = -EOPNOTSUPP; + } + return err; +} + +static inline void ip6addrlbl_putmsg(struct nlmsghdr *nlh, + int prefixlen, int ifindex, u32 lseq) +{ + struct ifaddrlblmsg *ifal = nlmsg_data(nlh); + ifal->ifal_family = AF_INET6; + ifal->ifal_prefixlen = prefixlen; + ifal->ifal_flags = 0; + ifal->ifal_index = ifindex; + ifal->ifal_seq = lseq; +}; + +static int ip6addrlbl_fill(struct sk_buff *skb, + struct ip6addrlbl_entry *p, + u32 lseq, + u32 pid, u32 seq, int event, + unsigned int flags) +{ + struct nlmsghdr *nlh = nlmsg_put(skb, pid, seq, event, + sizeof(struct ifaddrlblmsg), flags); + if (!nlh) + return -EMSGSIZE; + + ip6addrlbl_putmsg(nlh, p->prefixlen, p->ifindex, lseq); + + if (nla_put(skb, IFAL_ADDRESS, 16, &p->prefix) < 0 || + nla_put_u32(skb, IFAL_LABEL, p->label) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + return nlmsg_end(skb, nlh); +} + +static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct ip6addrlbl_entry *p; + struct hlist_node *pos; + int idx = 0, s_idx = cb->args[0]; + int err; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, pos, &ip6addrlbl_table.head, list) { + if (idx >= s_idx && + net_eq(ip6addrlbl_net(p), net)) { + if ((err = ip6addrlbl_fill(skb, p, + ip6addrlbl_table.seq, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDRLABEL, + NLM_F_MULTI)) <= 0) + break; + } + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + return skb->len; +} + +static inline int ip6addrlbl_msgsize(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + + nla_total_size(16) /* IFAL_ADDRESS */ + + nla_total_size(4); /* IFAL_LABEL */ +} + +/* + * RTM_GETADDRLABEL handler: look up the label entry matching the + * requested /128 address (and optional ifindex) in the caller's netns + * and unicast it back to the requester as an RTM_NEWADDRLABEL message. + * + * Returns the rtnl_unicast() result on success, or a negative errno: + * -EINVAL for a malformed request or unknown ifindex, -ESRCH when no + * entry matches, -ENOBUFS when the reply skb cannot be allocated. + */ +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, + void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct ifaddrlblmsg *ifal; + 
struct nlattr *tb[IFAL_MAX+1]; + struct in6_addr *addr; + u32 lseq; + int err = 0; + struct ip6addrlbl_entry *p; + struct sk_buff *skb; + + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + if (err < 0) + return err; + + ifal = nlmsg_data(nlh); + + if (ifal->ifal_family != AF_INET6 || + ifal->ifal_prefixlen != 128) + return -EINVAL; + + if (ifal->ifal_index && + !__dev_get_by_index(net, ifal->ifal_index)) + return -EINVAL; + + if (!tb[IFAL_ADDRESS]) + return -EINVAL; + + addr = nla_data(tb[IFAL_ADDRESS]); + if (!addr) + return -EINVAL; + + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + /* + * ip6addrlbl_hold() is atomic_inc_not_zero(): a non-zero return means + * we now own a reference. Forget the entry only when the hold failed, + * i.e. its refcount already dropped to zero and it is being freed. + */ + if (p && !ip6addrlbl_hold(p)) + p = NULL; + lseq = ip6addrlbl_table.seq; + rcu_read_unlock(); + + if (!p) { + err = -ESRCH; + goto out; + } + + if (!(skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL))) { + ip6addrlbl_put(p); + return -ENOBUFS; + } + + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + + /* The reply has been built (or failed); the reference taken above is no longer needed. */ + ip6addrlbl_put(p); + + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto out; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +out: + return err; +} + +void __init ipv6_addr_label_rtnl_register(void) +{ + __rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel, + NULL, NULL); + __rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get, + ip6addrlbl_dump, NULL); +} + diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c new file mode 100644 index 00000000..22ebbb97 --- /dev/null +++ b/net/ipv6/af_inet6.c @@ -0,0 +1,1363 @@ +/* + * PF_INET6 socket protocol family + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Adapted from linux/net/ipv4/af_inet.c + * + * Fixes: + * piggy, Karl Knutson : Socket protocol table + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, 
cleanups + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/slab.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmpv6.h> +#include <linux/netfilter_ipv6.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <net/tcp.h> +#include <net/ipip.h> +#include <net/ping.h> +#include <net/protocol.h> +#include <net/inet_common.h> +#include <net/route.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#ifdef CONFIG_IPV6_TUNNEL +#include <net/ip6_tunnel.h> +#endif + +#include <asm/uaccess.h> +#include <linux/mroute6.h> + +#ifdef CONFIG_ANDROID_PARANOID_NETWORK +#include <linux/android_aid.h> + +static inline int current_has_network(void) +{ + return in_egroup_p(AID_INET) || capable(CAP_NET_RAW); +} +#else +static inline int current_has_network(void) +{ + return 1; +} +#endif + +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); +MODULE_LICENSE("GPL"); + +/* The inetsw6 table contains everything that inet6_create needs to + * build a new socket. 
+ */ +static struct list_head inetsw6[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw6_lock); + +struct ipv6_params ipv6_defaults = { + .disable_ipv6 = 0, + .autoconf = 1, +}; + +static int disable_ipv6_mod = 0; + +module_param_named(disable, disable_ipv6_mod, int, 0444); +MODULE_PARM_DESC(disable, "Disable IPv6 module such that it is non-functional"); + +module_param_named(disable_ipv6, ipv6_defaults.disable_ipv6, int, 0444); +MODULE_PARM_DESC(disable_ipv6, "Disable IPv6 on all interfaces"); + +module_param_named(autoconf, ipv6_defaults.autoconf, int, 0444); +MODULE_PARM_DESC(autoconf, "Enable IPv6 address autoconfiguration on all interfaces"); + +static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +{ + const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} + +/* + * Create an AF_INET6 socket of type sock->type for the given protocol: + * find the matching inetsw6 entry (requesting a protocol module if none + * is registered yet), allocate the sock and initialise its IPv6 and + * IPv4 state. Returns 0 on success or a negative errno. + */ +static int inet6_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct inet_sock *inet; + struct ipv6_pinfo *np; + struct sock *sk; + struct inet_protosw *answer; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int try_loading_module = 0; + int err; + + if (!current_has_network()) + return -EACCES; + + /* + * Reject protocol numbers outside [0, IPPROTO_MAX): the value ends up + * in sk->sk_protocol and, for SOCK_RAW, in inet->inet_num, so an + * out-of-range number must never get that far. + */ + if (protocol < 0 || protocol >= IPPROTO_MAX) + return -EINVAL; + + if (sock->type != SOCK_RAW && + sock->type != SOCK_DGRAM && + !inet_ehash_secret) + build_ehash_secret(); + + /* Look for the requested type/protocol pair. */ +lookup_protocol: + err = -ESOCKTNOSUPPORT; + rcu_read_lock(); + list_for_each_entry_rcu(answer, &inetsw6[sock->type], list) { + + err = 0; + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. */ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + err = -EPROTONOSUPPORT; + } + + if (err) { + if (try_loading_module < 2) { + rcu_read_unlock(); + /* + * Be more specific, e.g. 
net-pf-10-proto-132-type-1 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP-type-SOCK_STREAM) + */ + if (++try_loading_module == 1) + request_module("net-pf-%d-proto-%d-type-%d", + PF_INET6, protocol, sock->type); + /* + * Fall back to generic, e.g. net-pf-10-proto-132 + * (net-pf-PF_INET6-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET6, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + + err = -EPERM; + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + goto out_rcu_unlock; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + WARN_ON(answer_prot->slab == NULL); + + err = -ENOBUFS; + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot); + if (sk == NULL) + goto out; + + sock_init_data(sock, sk); + + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; + + if (SOCK_RAW == sock->type) { + inet->inet_num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_INET6; + sk->sk_protocol = protocol; + + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + + inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); + np->hop_limit = -1; + np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; + np->mc_loop = 1; + np->pmtudisc = IPV6_PMTUDISC_WANT; + np->ipv6only = net->ipv6.sysctl.bindv6only; + + /* Init the ipv4 part of the socket since we can have sockets + * using v6 API for ipv4. 
+ */ + inet->uc_ttl = -1; + + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + inet->rcv_tos = 0; + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + /* + * Increment only the relevant sk_prot->socks debug field, this changes + * the previous behaviour of incrementing both the equivalent to + * answer->prot->socks (inet6_sock_nr) and inet_sock_nr. + * + * This allows better debug granularity as we'll know exactly how many + * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6 + * transport protocol socks. -acme + */ + sk_refcnt_debug_inc(sk); + + if (inet->inet_num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically shares. + */ + inet->inet_sport = htons(inet->inet_num); + sk->sk_prot->hash(sk); + } + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } +out: + return err; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + int addr_type = 0; + int err = 0; + + /* If the socket has its own bind function then use it. 
*/ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (addr->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; + + snum = ntohs(addr->sin6_port); + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + if (sk->sk_state != TCP_CLOSE || inet->inet_num) { + err = -EINVAL; + goto out; + } + + /* Check if the address belongs to the host. */ + if (addr_type == IPV6_ADDR_MAPPED) { + int chk_addr_ret; + + /* Binding to v4-mapped address on a v6-only socket + * makes no sense + */ + if (np->ipv6only) { + err = -EINVAL; + goto out; + } + + /* Reproduce AF_INET checks to make the bindings consistent */ + v4addr = addr->sin6_addr.s6_addr32[3]; + chk_addr_ret = inet_addr_type(net, v4addr); + if (!sysctl_ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && + v4addr != htonl(INADDR_ANY) && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + rcu_read_lock(); + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another one + * is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out_unlock; + } + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out_unlock; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. 
+ */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!(inet->freebind || inet->transparent) && + !ipv6_chk_addr(net, &addr->sin6_addr, + dev, 0)) { + err = -EADDRNOTAVAIL; + goto out_unlock; + } + } + rcu_read_unlock(); + } + } + + inet->inet_rcv_saddr = v4addr; + inet->inet_saddr = v4addr; + + np->rcv_saddr = addr->sin6_addr; + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + np->saddr = addr->sin6_addr; + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + + if (addr_type != IPV6_ADDR_ANY) { + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (addr_type != IPV6_ADDR_MAPPED) + np->ipv6only = 1; + } + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->inet_sport = htons(inet->inet_num); + inet->inet_dport = 0; + inet->inet_daddr = 0; +out: + release_sock(sk); + return err; +out_unlock: + rcu_read_unlock(); + goto out; +} + +EXPORT_SYMBOL(inet6_bind); + +int inet6_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Free ac lists */ + ipv6_sock_ac_close(sk); + + return inet_release(sock); +} + +EXPORT_SYMBOL(inet6_release); + +void inet6_destroy_sock(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* Release rx options */ + + if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + kfree_skb(skb); + + if ((skb = xchg(&np->rxpmtu, NULL)) != NULL) + kfree_skb(skb); + + /* Free flowlabels */ + fl6_free_socklist(sk); + + /* Free tx options */ + + if ((opt = xchg(&np->opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); +} + +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + +/* + * This does both peername and sockname. 
+ */ + +int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (peer) { + if (!inet->inet_dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin6_port = inet->inet_dport; + sin->sin6_addr = np->daddr; + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { + if (ipv6_addr_any(&np->rcv_saddr)) + sin->sin6_addr = np->saddr; + else + sin->sin6_addr = np->rcv_saddr; + + sin->sin6_port = inet->inet_sport; + } + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = sk->sk_bound_dev_if; + *uaddr_len = sizeof(*sin); + return 0; +} + +EXPORT_SYMBOL(inet6_getname); + +int inet6_killaddr_ioctl(struct net *net, void __user *arg) { + struct in6_ifreq ireq; + struct sockaddr_in6 sin6; + + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ireq.ifr6_addr; + return tcp_nuke_addr(net, (struct sockaddr *) &sin6); +} + +int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + + switch(cmd) + { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + case SIOCGSTAMPNS: + return sock_get_timestampns(sk, (struct timespec __user *)arg); + + case SIOCADDRT: + case SIOCDELRT: + + return ipv6_route_ioctl(net, cmd, (void __user *)arg); + + case SIOCSIFADDR: + return addrconf_add_ifaddr(net, (void __user *) arg); + case SIOCDIFADDR: + return addrconf_del_ifaddr(net, (void __user *) arg); + case SIOCSIFDSTADDR: + return addrconf_set_dstaddr(net, 
(void __user *) arg); + case SIOCKILLADDR: + return inet6_killaddr_ioctl(net, (void __user *) arg); + default: + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); + } + /*NOTREACHED*/ + return 0; +} + +EXPORT_SYMBOL(inet6_ioctl); + +const struct proto_ops inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = inet_accept, /* ok */ + .getname = inet6_getname, + .poll = tcp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = inet_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, + .splice_read = tcp_splice_read, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +const struct proto_ops inet6_dgram_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = udp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = inet_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static const struct net_proto_family 
inet6_family_ops = { + .family = PF_INET6, + .create = inet6_create, + .owner = THIS_MODULE, +}; + +int inet6_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + struct list_head *last_perm; + int protocol = p->protocol; + int ret; + + spin_lock_bh(&inetsw6_lock); + + ret = -EINVAL; + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + ret = -EPERM; + last_perm = &inetsw6[p->type]; + list_for_each(lh, &inetsw6[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. 
+ */ + list_add_rcu(&p->list, last_perm); + ret = 0; +out: + spin_unlock_bh(&inetsw6_lock); + return ret; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +EXPORT_SYMBOL(inet6_register_protosw); + +void +inet6_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw6_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw6_lock); + + synchronize_net(); + } +} + +EXPORT_SYMBOL(inet6_unregister_protosw); + +int inet6_sk_rebuild_header(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dst_entry *dst; + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p, final; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; + fl6.flowlabel = np->flow_label; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) { + sk->sk_route_caps = 0; + sk->sk_err_soft = -PTR_ERR(dst); + return PTR_ERR(dst); + } + + __ip6_dst_store(sk, dst, NULL, NULL); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & + *(__be32 
*)skb_network_header(skb)) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + +static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) +{ + const struct inet6_protocol *ops = NULL; + + for (;;) { + struct ipv6_opt_hdr *opth; + int len; + + if (proto != NEXTHDR_HOP) { + ops = rcu_dereference(inet6_protos[proto]); + + if (unlikely(!ops)) + break; + + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) + break; + } + + if (unlikely(!pskb_may_pull(skb, 8))) + break; + + opth = (void *)skb->data; + len = ipv6_optlen(opth); + + if (unlikely(!pskb_may_pull(skb, len))) + break; + + proto = opth->nexthdr; + __skb_pull(skb, len); + } + + return proto; +} + +static int ipv6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + const struct inet6_protocol *ops; + int err = -EINVAL; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + err = -EPROTONOSUPPORT; + + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[ + ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr)]); + + if (likely(ops && ops->gso_send_check)) { + skb_reset_transport_header(skb); + err = ops->gso_send_check(skb); + } + rcu_read_unlock(); + +out: + return err; +} + +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct ipv6hdr *ipv6h; + const struct inet6_protocol *ops; + int proto; + struct frag_hdr *fptr; + unsigned int unfrag_ip6hlen; + u8 *prevhdr; + int offset = 0; + + if (!(features & NETIF_F_V6_CSUM)) + features &= ~NETIF_F_SG; + + if (unlikely(skb_shinfo(skb)->gso_type & + ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0))) + goto out; + + if (unlikely(!pskb_may_pull(skb, 
sizeof(*ipv6h)))) + goto out; + + ipv6h = ipv6_hdr(skb); + __skb_pull(skb, sizeof(*ipv6h)); + segs = ERR_PTR(-EPROTONOSUPPORT); + + proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[proto]); + if (likely(ops && ops->gso_segment)) { + skb_reset_transport_header(skb); + segs = ops->gso_segment(skb, features); + } + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; + + for (skb = segs; skb; skb = skb->next) { + ipv6h = ipv6_hdr(skb); + ipv6h->payload_len = htons(skb->len - skb->mac_len - + sizeof(*ipv6h)); + if (proto == IPPROTO_UDP) { + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + fptr = (struct frag_hdr *)(skb_network_header(skb) + + unfrag_ip6hlen); + fptr->frag_off = htons(offset); + if (skb->next != NULL) + fptr->frag_off |= htons(IP6_MF); + offset += (ntohs(ipv6h->payload_len) - + sizeof(struct frag_hdr)); + } + } + +out: + return segs; +} + +struct ipv6_gro_cb { + struct napi_gro_cb napi; + int proto; +}; + +#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb) + +static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct inet6_protocol *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct ipv6hdr *iph; + unsigned int nlen; + unsigned int hlen; + unsigned int off; + int flush = 1; + int proto; + __wsum csum; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*iph); + iph = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + iph = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!iph)) + goto out; + } + + skb_gro_pull(skb, sizeof(*iph)); + skb_set_transport_header(skb, skb_gro_offset(skb)); + + flush += ntohs(iph->payload_len) != skb_gro_len(skb); + + rcu_read_lock(); + proto = iph->nexthdr; + ops = rcu_dereference(inet6_protos[proto]); + if (!ops || !ops->gro_receive) { + __pskb_pull(skb, skb_gro_offset(skb)); + proto = ipv6_gso_pull_exthdrs(skb, proto); + skb_gro_pull(skb, -skb_transport_offset(skb)); + 
skb_reset_transport_header(skb); + __skb_push(skb, skb_gro_offset(skb)); + + ops = rcu_dereference(inet6_protos[proto]); + if (!ops || !ops->gro_receive) + goto out_unlock; + + iph = ipv6_hdr(skb); + } + + IPV6_GRO_CB(skb)->proto = proto; + + flush--; + nlen = skb_network_header_len(skb); + + for (p = *head; p; p = p->next) { + struct ipv6hdr *iph2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + iph2 = ipv6_hdr(p); + + /* All fields must match except length. */ + if (nlen != skb_network_header_len(p) || + memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) || + memcmp(&iph->nexthdr, &iph2->nexthdr, + nlen - offsetof(struct ipv6hdr, nexthdr))) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + NAPI_GRO_CB(p)->flush |= flush; + } + + NAPI_GRO_CB(skb)->flush |= flush; + + csum = skb->csum; + skb_postpull_rcsum(skb, iph, skb_network_header_len(skb)); + + pp = ops->gro_receive(head, skb); + + skb->csum = csum; + +out_unlock: + rcu_read_unlock(); + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int ipv6_gro_complete(struct sk_buff *skb) +{ + const struct inet6_protocol *ops; + struct ipv6hdr *iph = ipv6_hdr(skb); + int err = -ENOSYS; + + iph->payload_len = htons(skb->len - skb_network_offset(skb) - + sizeof(*iph)); + + rcu_read_lock(); + ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]); + if (WARN_ON(!ops || !ops->gro_complete)) + goto out_unlock; + + err = ops->gro_complete(skb); + +out_unlock: + rcu_read_unlock(); + + return err; +} + +static struct packet_type ipv6_packet_type __read_mostly = { + .type = cpu_to_be16(ETH_P_IPV6), + .func = ipv6_rcv, + .gso_send_check = ipv6_gso_send_check, + .gso_segment = ipv6_gso_segment, + .gro_receive = ipv6_gro_receive, + .gro_complete = ipv6_gro_complete, +}; + +static int __init ipv6_packet_init(void) +{ + dev_add_pack(&ipv6_packet_type); + return 0; +} + +static void ipv6_packet_cleanup(void) +{ + dev_remove_pack(&ipv6_packet_type); +} + +static int __net_init 
ipv6_init_mibs(struct net *net) +{ + if (snmp_mib_init((void __percpu **)net->mib.udp_stats_in6, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + return -ENOMEM; + if (snmp_mib_init((void __percpu **)net->mib.udplite_stats_in6, + sizeof(struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udplite_mib; + if (snmp_mib_init((void __percpu **)net->mib.ipv6_statistics, + sizeof(struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp_mib_init((void __percpu **)net->mib.icmpv6_statistics, + sizeof(struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp_mib; + net->mib.icmpv6msg_statistics = kzalloc(sizeof(struct icmpv6msg_mib), + GFP_KERNEL); + if (!net->mib.icmpv6msg_statistics) + goto err_icmpmsg_mib; + return 0; + +err_icmpmsg_mib: + snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); +err_icmp_mib: + snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); +err_ip_mib: + snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); +err_udplite_mib: + snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + return -ENOMEM; +} + +static void ipv6_cleanup_mibs(struct net *net) +{ + snmp_mib_free((void __percpu **)net->mib.udp_stats_in6); + snmp_mib_free((void __percpu **)net->mib.udplite_stats_in6); + snmp_mib_free((void __percpu **)net->mib.ipv6_statistics); + snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics); + kfree(net->mib.icmpv6msg_statistics); +} + +static int __net_init inet6_net_init(struct net *net) +{ + int err = 0; + + net->ipv6.sysctl.bindv6only = 0; + net->ipv6.sysctl.icmpv6_time = 1*HZ; + + err = ipv6_init_mibs(net); + if (err) + return err; +#ifdef CONFIG_PROC_FS + err = udp6_proc_init(net); + if (err) + goto out; + err = tcp6_proc_init(net); + if (err) + goto proc_tcp6_fail; + err = ac6_proc_init(net); + if (err) + goto proc_ac6_fail; +#endif + return err; + +#ifdef CONFIG_PROC_FS +proc_ac6_fail: + tcp6_proc_exit(net); +proc_tcp6_fail: + 
udp6_proc_exit(net); +out: + ipv6_cleanup_mibs(net); + return err; +#endif +} + +static void __net_exit inet6_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + udp6_proc_exit(net); + tcp6_proc_exit(net); + ac6_proc_exit(net); +#endif + ipv6_cleanup_mibs(net); +} + +static struct pernet_operations inet6_net_ops = { + .init = inet6_net_init, + .exit = inet6_net_exit, +}; + +/* + * inet6_init - module entry point for the IPv6 stack. + * + * Registers the TCP, UDP, UDP-Lite, RAW and ping protos, the PF_INET6 + * socket family, the per-net operations and then each IPv6 subsystem in + * turn. If a step fails, control jumps into the unwind ladder at the + * bottom of the function, which undoes the steps that already succeeded + * in reverse order and returns the error. Each unwind label is named + * after the first thing it undoes, so a failing step must jump to the + * label that undoes the step *before* it. + * + * Returns 0 on success. When disable_ipv6_mod is set only the inetsw6 + * list heads are initialised and 0 is returned. + */ +static int __init inet6_init(void) +{ + struct sk_buff *dummy_skb; + struct list_head *r; + int err = 0; + + BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); + + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_ipv6_mod) { + printk(KERN_INFO + "IPv6: Loaded, but administratively disabled, " + "reboot required to enable\n"); + goto out; + } + + err = proto_register(&tcpv6_prot, 1); + if (err) + goto out; + + err = proto_register(&udpv6_prot, 1); + if (err) + goto out_unregister_tcp_proto; + + err = proto_register(&udplitev6_prot, 1); + if (err) + goto out_unregister_udp_proto; + + err = proto_register(&rawv6_prot, 1); + if (err) + goto out_unregister_udplite_proto; + + /* pingv6_prot did not register: start unwinding at rawv6_prot. */ + err = proto_register(&pingv6_prot, 1); + if (err) + goto out_unregister_raw_proto; + + /* We MUST register RAW sockets before we create the ICMP6, + * IGMP6, or NDISC control sockets. + */ + /* rawv6_init() failed: pingv6_prot is registered and must be undone. */ + err = rawv6_init(); + if (err) + goto out_unregister_ping_proto; + + /* Register the family here so that the init calls below will + * be able to create sockets. (?? is this dangerous ??) 
+ */ + err = sock_register(&inet6_family_ops); + if (err) + goto out_sock_register_fail; + +#ifdef CONFIG_SYSCTL + err = ipv6_static_sysctl_register(); + if (err) + goto static_sysctl_fail; +#endif + tcpv6_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; + + /* + * ipngwg API draft makes clear that the correct semantics + * for TCP and UDP is to consider one TCP and UDP instance + * in a host available by both INET and INET6 APIs and + * able to communicate via both network protocols. + */ + + err = register_pernet_subsys(&inet6_net_ops); + if (err) + goto register_pernet_fail; + err = icmpv6_init(); + if (err) + goto icmp_fail; + err = ip6_mr_init(); + if (err) + goto ipmr_fail; + err = ndisc_init(); + if (err) + goto ndisc_fail; + err = igmp6_init(); + if (err) + goto igmp_fail; + err = ipv6_netfilter_init(); + if (err) + goto netfilter_fail; + /* Create /proc/foo6 entries. */ +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (raw6_proc_init()) + goto proc_raw6_fail; + if (udplite6_proc_init()) + goto proc_udplite6_fail; + if (ipv6_misc_proc_init()) + goto proc_misc6_fail; + if (if6_proc_init()) + goto proc_if6_fail; +#endif + err = ip6_route_init(); + if (err) + goto ip6_route_fail; + err = ip6_flowlabel_init(); + if (err) + goto ip6_flowlabel_fail; + err = addrconf_init(); + if (err) + goto addrconf_fail; + + /* Init v6 extension headers. */ + err = ipv6_exthdrs_init(); + if (err) + goto ipv6_exthdrs_fail; + + err = ipv6_frag_init(); + if (err) + goto ipv6_frag_fail; + + /* Init v6 transport protocols. 
*/ + err = udpv6_init(); + if (err) + goto udpv6_fail; + + err = udplitev6_init(); + if (err) + goto udplitev6_fail; + + err = tcpv6_init(); + if (err) + goto tcpv6_fail; + + err = ipv6_packet_init(); + if (err) + goto ipv6_packet_fail; + + err = pingv6_init(); + if (err) + goto pingv6_fail; + +#ifdef CONFIG_SYSCTL + err = ipv6_sysctl_register(); + if (err) + goto sysctl_fail; +#endif +out: + return err; + + /* Unwind ladder: undo completed steps in reverse order of init. */ +#ifdef CONFIG_SYSCTL +sysctl_fail: + pingv6_exit(); +#endif +pingv6_fail: + ipv6_packet_cleanup(); +ipv6_packet_fail: + tcpv6_exit(); +tcpv6_fail: + udplitev6_exit(); +udplitev6_fail: + udpv6_exit(); +udpv6_fail: + ipv6_frag_exit(); +ipv6_frag_fail: + ipv6_exthdrs_exit(); +ipv6_exthdrs_fail: + addrconf_cleanup(); +addrconf_fail: + ip6_flowlabel_cleanup(); +ip6_flowlabel_fail: + ip6_route_cleanup(); +ip6_route_fail: +#ifdef CONFIG_PROC_FS + if6_proc_exit(); +proc_if6_fail: + ipv6_misc_proc_exit(); +proc_misc6_fail: + udplite6_proc_exit(); +proc_udplite6_fail: + raw6_proc_exit(); +proc_raw6_fail: +#endif + ipv6_netfilter_fini(); +netfilter_fail: + igmp6_cleanup(); +igmp_fail: + ndisc_cleanup(); +ndisc_fail: + ip6_mr_cleanup(); +ipmr_fail: + icmpv6_cleanup(); +icmp_fail: + unregister_pernet_subsys(&inet6_net_ops); +register_pernet_fail: +#ifdef CONFIG_SYSCTL + ipv6_static_sysctl_unregister(); +static_sysctl_fail: +#endif + sock_unregister(PF_INET6); + rtnl_unregister_all(PF_INET6); +out_sock_register_fail: + rawv6_exit(); +out_unregister_ping_proto: + proto_unregister(&pingv6_prot); +out_unregister_raw_proto: + proto_unregister(&rawv6_prot); +out_unregister_udplite_proto: + proto_unregister(&udplitev6_prot); +out_unregister_udp_proto: + proto_unregister(&udpv6_prot); +out_unregister_tcp_proto: + proto_unregister(&tcpv6_prot); + goto out; +} +module_init(inet6_init); + +static void __exit inet6_exit(void) +{ + if (disable_ipv6_mod) + return; + + /* First of all disallow new sockets creation. 
*/ + sock_unregister(PF_INET6); + /* Disallow any further netlink messages */ + rtnl_unregister_all(PF_INET6); + +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + udpv6_exit(); + udplitev6_exit(); + tcpv6_exit(); + + /* Cleanup code parts. */ + ipv6_packet_cleanup(); + ipv6_frag_exit(); + ipv6_exthdrs_exit(); + addrconf_cleanup(); + ip6_flowlabel_cleanup(); + ip6_route_cleanup(); +#ifdef CONFIG_PROC_FS + + /* Cleanup code parts. */ + if6_proc_exit(); + ipv6_misc_proc_exit(); + udplite6_proc_exit(); + raw6_proc_exit(); +#endif + ipv6_netfilter_fini(); + igmp6_cleanup(); + ndisc_cleanup(); + ip6_mr_cleanup(); + icmpv6_cleanup(); + rawv6_exit(); + + unregister_pernet_subsys(&inet6_net_ops); +#ifdef CONFIG_SYSCTL + ipv6_static_sysctl_unregister(); +#endif + proto_unregister(&rawv6_prot); + proto_unregister(&udplitev6_prot); + proto_unregister(&udpv6_prot); + proto_unregister(&tcpv6_prot); + + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} +module_exit(inet6_exit); + +MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c new file mode 100644 index 00000000..2ae79dbe --- /dev/null +++ b/net/ipv6/ah6.c @@ -0,0 +1,757 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * + * This file is derived from net/ipv4/ah.c. + */ + +#include <crypto/hash.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/ip.h> +#include <net/ah.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/string.h> +#include <linux/scatterlist.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +#define IPV6HDR_BASELEN 8 + +struct tmp_ext { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + struct in6_addr saddr; +#endif + struct in6_addr daddr; + char hdrs[0]; +}; + +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline struct tmp_ext *ah_tmp_ext(void *base) +{ + return base + IPV6HDR_BASELEN; +} + +static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash 
*ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} + +static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +{ + u8 *opt = (u8 *)opthdr; + int len = ipv6_optlen(opthdr); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + if (opt[off] & 0x20) + memset(&opt[off+2], 0, opt[off+1]); + break; + } + + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; + +bad: + return 0; +} + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +/** + * ipv6_rearrange_destopt - rearrange IPv6 destination options header + * @iph: IPv6 header + * @destopt: destination options header + */ +static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) +{ + u8 *opt = (u8 *)destopt; + int len = ipv6_optlen(destopt); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + + /* Rearrange the source address in @iph and the + * addresses in home address option for final source. + * See 11.3.2 of RFC 3775 for details. 
+ */ + if (opt[off] == IPV6_TLV_HAO) { + struct in6_addr final_addr; + struct ipv6_destopt_hao *hao; + + hao = (struct ipv6_destopt_hao *)&opt[off]; + if (hao->length != sizeof(hao->addr)) { + if (net_ratelimit()) + printk(KERN_WARNING "destopt hao: invalid header length: %u\n", hao->length); + goto bad; + } + final_addr = hao->addr; + hao->addr = iph->saddr; + iph->saddr = final_addr; + } + break; + } + + off += optlen; + len -= optlen; + } + /* Note: ok if len == 0 */ +bad: + return; +} +#else +static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *destopt) {} +#endif + +/** + * ipv6_rearrange_rthdr - rearrange IPv6 routing header + * @iph: IPv6 header + * @rthdr: routing header + * + * Rearrange the destination address in @iph and the addresses in @rthdr + * so that they appear in the order they will at the final destination. + * See Appendix A2 of RFC 2402 for details. + */ +static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) +{ + int segments, segments_left; + struct in6_addr *addrs; + struct in6_addr final_addr; + + segments_left = rthdr->segments_left; + if (segments_left == 0) + return; + rthdr->segments_left = 0; + + /* The value of rthdr->hdrlen has been verified either by the system + * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming + * packets. So we can assume that it is even and that segments is + * greater than or equal to segments_left. + * + * For the same reason we can assume that this option is of type 0. 
+ */ + segments = rthdr->hdrlen >> 1; + + addrs = ((struct rt0_hdr *)rthdr)->addr; + final_addr = addrs[segments - 1]; + + addrs += segments - segments_left; + memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); + + addrs[0] = iph->daddr; + iph->daddr = final_addr; +} + +static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) +{ + union { + struct ipv6hdr *iph; + struct ipv6_opt_hdr *opth; + struct ipv6_rt_hdr *rth; + char *raw; + } exthdr = { .iph = iph }; + char *end = exthdr.raw + len; + int nexthdr = iph->nexthdr; + + exthdr.iph++; + + while (exthdr.raw < end) { + switch (nexthdr) { + case NEXTHDR_DEST: + if (dir == XFRM_POLICY_OUT) + ipv6_rearrange_destopt(iph, exthdr.opth); + case NEXTHDR_HOP: + if (!zero_out_mutable_opts(exthdr.opth)) { + LIMIT_NETDEBUG( + KERN_WARNING "overrun %sopts\n", + nexthdr == NEXTHDR_HOP ? + "hop" : "dest"); + return -EINVAL; + } + break; + + case NEXTHDR_ROUTING: + ipv6_rearrange_rthdr(iph, exthdr.rth); + break; + + default : + return 0; + } + + nexthdr = exthdr.opth->nexthdr; + exthdr.raw += ipv6_optlen(exthdr.opth); + } + + return 0; +} + +static void ah6_output_done(struct crypto_async_request *base, int err) +{ + int extlen; + u8 *iph_base; + u8 *icv; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct ipv6hdr *top_iph = ipv6_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + struct tmp_ext *iph_ext; + + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + iph_base = AH_SKB_CB(skb)->tmp; + iph_ext = ah_tmp_ext(iph_base); + icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen); + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(&top_iph->saddr, iph_ext, extlen); +#else + memcpy(&top_iph->daddr, iph_ext, extlen); +#endif + } + + 
kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int nfrags; + int extlen; + u8 *iph_base; + u8 *icv; + u8 nexthdr; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct ipv6hdr *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + struct tmp_ext *iph_ext; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + skb_push(skb, -skb_network_offset(skb)); + extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); + if (extlen) + extlen += sizeof(*iph_ext); + + err = -ENOMEM; + iph_base = ah_alloc_tmp(ahash, nfrags, IPV6HDR_BASELEN + extlen); + if (!iph_base) + goto out; + + iph_ext = ah_tmp_ext(iph_base); + icv = ah_tmp_icv(ahash, iph_ext, extlen); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + ah = ip_auth_hdr(skb); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + top_iph = ipv6_hdr(skb); + top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); + + nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_AH; + + /* When there are no extension headers, we only need to save the first + * 8 bytes of the base IP header. 
+ */ + memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(iph_ext, &top_iph->saddr, extlen); +#else + memcpy(iph_ext, &top_iph->daddr, extlen); +#endif + err = ipv6_clear_mutable_options(top_iph, + extlen - sizeof(*iph_ext) + + sizeof(*top_iph), + XFRM_POLICY_OUT); + if (err) + goto out_free; + } + + ah->nexthdr = nexthdr; + + top_iph->priority = 0; + top_iph->flow_lbl[0] = 0; + top_iph->flow_lbl[1] = 0; + top_iph->flow_lbl[2] = 0; + top_iph->hop_limit = 0; + + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah6_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph_base; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + memcpy(top_iph, iph_base, IPV6HDR_BASELEN); + + if (extlen) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + memcpy(&top_iph->saddr, iph_ext, extlen); +#else + memcpy(&top_iph->daddr, iph_ext, extlen); +#endif + } + +out_free: + kfree(iph_base); +out: + return err; +} + +static void ah6_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int hdr_len = skb_network_header_len(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, hdr_len); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, 
ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + err = ah->nexthdr; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); + skb_set_transport_header(skb, -hdr_len); +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + + + +static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) +{ + /* + * Before process AH + * [IPv6][Ext1][Ext2][AH][Dest][Payload] + * |<-------------->| hdr_len + * + * To erase AH: + * Keeping copy of cleared headers. After AH processing, + * Moving the pointer of skb->network_header by using skb_pull as long + * as AH header length. Then copy back the copy as long as hdr_len + * If destination header following AH exists, copy it into after [Ext2]. + * + * |<>|[IPv6][Ext1][Ext2][Dest][Payload] + * There is offset of AH before IPv6 header after the process. + */ + + u8 *auth_data; + u8 *icv; + u8 *work_iph; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct ip_auth_hdr *ah; + struct ipv6hdr *ip6h; + struct ah_data *ahp; + u16 hdr_len; + u16 ah_hlen; + int nexthdr; + int nfrags; + int err = -ENOMEM; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. 
*/ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + hdr_len = skb_network_header_len(skb); + ah = (struct ip_auth_hdr *)skb->data; + ahp = x->data; + ahash = ahp->ahash; + + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + ah = (struct ip_auth_hdr *)skb->data; + ip6h = ipv6_hdr(skb); + + skb_push(skb, hdr_len); + + work_iph = ah_alloc_tmp(ahash, nfrags, hdr_len + ahp->icv_trunc_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth(work_iph, hdr_len); + icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memcpy(work_iph, ip6h, hdr_len); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + + if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN)) + goto out_free; + + ip6h->priority = 0; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + ip6h->hop_limit = 0; + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah6_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + goto out_free; + } + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? 
-EBADMSG: 0; + if (err) + goto out_free; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, hdr_len); + skb->transport_header = skb->network_header; + __skb_pull(skb, ah_hlen + hdr_len); + + err = nexthdr; + +out_free: + kfree(work_iph); +out: + return err; +} + +static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + if (!x) + return; + + NETDEBUG(KERN_DEBUG "pmtu discovery on SA AH/%08x/%pI6\n", + ntohl(ah->spi), &iph->daddr); + + xfrm_state_put(x); +} + +static int ah6_init_state(struct xfrm_state *x) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + struct crypto_ahash *ahash; + + if (!x->aalg) + goto error; + + if (x->encap) + goto error; + + ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) + goto error; + + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) + goto error; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_hash(). 
+ */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_ahash_digestsize(ahash)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_ahash_digestsize(ahash), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + switch (x->props.mode) { + case XFRM_MODE_BEET: + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct ipv6hdr); + break; + default: + goto error; + } + x->data = ahp; + + return 0; + +error: + if (ahp) { + crypto_free_ahash(ahp->ahash); + kfree(ahp); + } + return -EINVAL; +} + +static void ah6_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + crypto_free_ahash(ahp->ahash); + kfree(ahp); +} + +static const struct xfrm_type ah6_type = +{ + .description = "AH6", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = ah6_init_state, + .destructor = ah6_destroy, + .input = ah6_input, + .output = ah6_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; + +static const struct inet6_protocol ah6_protocol = { + .handler = xfrm6_rcv, + .err_handler = ah6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ah6_init(void) +{ + if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + xfrm_unregister_type(&ah6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit ah6_fini(void) +{ + if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + 
printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + + if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + +} + +module_init(ah6_init); +module_exit(ah6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c new file mode 100644 index 00000000..db00d27f --- /dev/null +++ b/net/ipv6/anycast.c @@ -0,0 +1,520 @@ +/* + * Anycast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * David L Stevens (dlstevens@us.ibm.com) + * + * based heavily on net/ipv6/mcast.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> + +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/if_inet6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> + +#include <net/checksum.h> + +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); + +/* Big ac list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_ac_lock); + + +/* + * socket join an anycast group + */ + +int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + 
struct inet6_dev *idev; + struct ipv6_ac_socklist *pac; + struct net *net = sock_net(sk); + int ishost = !net->ipv6.devconf_all->forwarding; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (ipv6_addr_is_multicast(addr)) + return -EINVAL; + if (ipv6_chk_addr(net, addr, NULL, 0)) + return -EINVAL; + + pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); + if (pac == NULL) + return -ENOMEM; + pac->acl_next = NULL; + pac->acl_addr = *addr; + + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(net, addr, NULL, 0, 0); + if (rt) { + dev = rt->dst.dev; + dst_release(&rt->dst); + } else if (ishost) { + err = -EADDRNOTAVAIL; + goto error; + } else { + /* router, no matching interface: just pick one */ + dev = dev_get_by_flags_rcu(net, IFF_UP, + IFF_UP | IFF_LOOPBACK); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (dev == NULL) { + err = -ENODEV; + goto error; + } + + idev = __in6_dev_get(dev); + if (!idev) { + if (ifindex) + err = -ENODEV; + else + err = -EADDRNOTAVAIL; + goto error; + } + /* reset ishost, now that we have a specific device */ + ishost = !idev->cnf.forwarding; + + pac->acl_ifindex = dev->ifindex; + + /* XXX + * For hosts, allow link-local or matching prefix anycasts. + * This obviates the need for propagating anycast routes while + * still allowing some non-router anycast participation. 
+ */ + if (!ipv6_chk_prefix(addr, dev)) { + if (ishost) + err = -EADDRNOTAVAIL; + if (err) + goto error; + } + + err = ipv6_dev_ac_inc(dev, addr); + if (!err) { + write_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + write_unlock_bh(&ipv6_sk_ac_lock); + pac = NULL; + } + +error: + rcu_read_unlock(); + if (pac) + sock_kfree_s(sk, pac, sizeof(*pac)); + return err; +} + +/* + * socket leave an anycast group + */ +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev; + struct ipv6_ac_socklist *pac, *prev_pac; + struct net *net = sock_net(sk); + + write_lock_bh(&ipv6_sk_ac_lock); + prev_pac = NULL; + for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { + if ((ifindex == 0 || pac->acl_ifindex == ifindex) && + ipv6_addr_equal(&pac->acl_addr, addr)) + break; + prev_pac = pac; + } + if (!pac) { + write_unlock_bh(&ipv6_sk_ac_lock); + return -ENOENT; + } + if (prev_pac) + prev_pac->acl_next = pac->acl_next; + else + np->ipv6_ac_list = pac->acl_next; + + write_unlock_bh(&ipv6_sk_ac_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + rcu_read_unlock(); + + sock_kfree_s(sk, pac, sizeof(*pac)); + return 0; +} + +void ipv6_sock_ac_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct ipv6_ac_socklist *pac; + struct net *net = sock_net(sk); + int prev_index; + + write_lock_bh(&ipv6_sk_ac_lock); + pac = np->ipv6_ac_list; + np->ipv6_ac_list = NULL; + write_unlock_bh(&ipv6_sk_ac_lock); + + prev_index = 0; + rcu_read_lock(); + while (pac) { + struct ipv6_ac_socklist *next = pac->acl_next; + + if (pac->acl_ifindex != prev_index) { + dev = dev_get_by_index_rcu(net, pac->acl_ifindex); + prev_index = pac->acl_ifindex; + } + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + sock_kfree_s(sk, pac, sizeof(*pac)); + pac 
= next; + } + rcu_read_unlock(); +} + +static void aca_put(struct ifacaddr6 *ac) +{ + if (atomic_dec_and_test(&ac->aca_refcnt)) { + in6_dev_put(ac->aca_idev); + dst_release(&ac->aca_rt->dst); + kfree(ac); + } +} + +/* + * device anycast group inc (add if not found) + */ +int ipv6_dev_ac_inc(struct net_device *dev, const struct in6_addr *addr) +{ + struct ifacaddr6 *aca; + struct inet6_dev *idev; + struct rt6_info *rt; + int err; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + err = -ENODEV; + goto out; + } + + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) { + aca->aca_users++; + err = 0; + goto out; + } + } + + /* + * not found: create a new one. + */ + + aca = kzalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); + + if (aca == NULL) { + err = -ENOMEM; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, true); + if (IS_ERR(rt)) { + kfree(aca); + err = PTR_ERR(rt); + goto out; + } + + aca->aca_addr = *addr; + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 2); + spin_lock_init(&aca->aca_lock); + + aca->aca_next = idev->ac_list; + idev->ac_list = aca; + write_unlock_bh(&idev->lock); + + ip6_ins_rt(rt); + + addrconf_join_solict(dev, &aca->aca_addr); + + aca_put(aca); + return 0; +out: + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return err; +} + +/* + * device anycast group decrement + */ +int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct ifacaddr6 *aca, *prev_aca; + + write_lock_bh(&idev->lock); + prev_aca = NULL; + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + prev_aca = aca; + } + if (!aca) { + write_unlock_bh(&idev->lock); + return -ENOENT; + } + if (--aca->aca_users > 0) { + 
write_unlock_bh(&idev->lock); + return 0; + } + if (prev_aca) + prev_aca->aca_next = aca->aca_next; + else + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->dst); + ip6_del_rt(aca->aca_rt); + + aca_put(aca); + return 0; +} + +/* called with rcu_read_lock() */ +static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev = __in6_dev_get(dev); + + if (idev == NULL) + return -ENODEV; + return __ipv6_dev_ac_dec(idev, addr); +} + +/* + * check if the interface has this anycast address + * called with rcu_read_lock() + */ +static int ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev; + struct ifacaddr6 *aca; + + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (aca = idev->ac_list; aca; aca = aca->aca_next) + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + read_unlock_bh(&idev->lock); + return aca != NULL; + } + return 0; +} + +/* + * check if given interface (or any, if dev==0) has this anycast address + */ +int ipv6_chk_acast_addr(struct net *net, struct net_device *dev, + const struct in6_addr *addr) +{ + int found = 0; + + rcu_read_lock(); + if (dev) + found = ipv6_chk_acast_dev(dev, addr); + else + for_each_netdev_rcu(net, dev) + if (ipv6_chk_acast_dev(dev, addr)) { + found = 1; + break; + } + rcu_read_unlock(); + return found; +} + + +#ifdef CONFIG_PROC_FS +struct ac6_iter_state { + struct seq_net_private p; + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) + +static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) +{ + struct ifacaddr6 *im = NULL; + struct ac6_iter_state *state = ac6_seq_private(seq); + struct net *net = seq_file_net(seq); + + state->idev = NULL; + for_each_netdev_rcu(net, state->dev) { + struct inet6_dev *idev; + idev = 
__in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->ac_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + } + return im; +} + +static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + im = im->aca_next; + while (!im) { + if (likely(state->idev != NULL)) + read_unlock_bh(&state->idev->lock); + + state->dev = next_net_device_rcu(state->dev); + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = __in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->ac_list; + } + return im; +} + +static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifacaddr6 *im = ac6_get_first(seq); + if (im) + while (pos && (im = ac6_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return ac6_get_idx(seq, *pos); +} + +static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifacaddr6 *im = ac6_get_next(seq, v); + + ++*pos; + return im; +} + +static void ac6_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + state->idev = NULL; + } + rcu_read_unlock(); +} + +static int ac6_seq_show(struct seq_file *seq, void *v) +{ + struct ifacaddr6 *im = (struct ifacaddr6 *)v; + struct ac6_iter_state *state = ac6_seq_private(seq); + + seq_printf(seq, "%-4d %-15s %pi6 %5d\n", + state->dev->ifindex, state->dev->name, + &im->aca_addr, im->aca_users); + return 0; +} + +static const struct seq_operations ac6_seq_ops = { + .start = ac6_seq_start, + .next = ac6_seq_next, + .stop = ac6_seq_stop, + .show = ac6_seq_show, +}; + +static int ac6_seq_open(struct inode 
*inode, struct file *file) +{ + return seq_open_net(inode, file, &ac6_seq_ops, + sizeof(struct ac6_iter_state)); +} + +static const struct file_operations ac6_seq_fops = { + .owner = THIS_MODULE, + .open = ac6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +int __net_init ac6_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "anycast6", S_IRUGO, &ac6_seq_fops)) + return -ENOMEM; + + return 0; +} + +void ac6_proc_exit(struct net *net) +{ + proc_net_remove(net, "anycast6"); +} +#endif + diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c new file mode 100644 index 00000000..76832c8d --- /dev/null +++ b/net/ipv6/datagram.c @@ -0,0 +1,872 @@ +/* + * common UDP/RAW code + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/route.h> +#include <linux/slab.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/tcp_states.h> + +#include <linux/errqueue.h> +#include <asm/uaccess.h> + +static inline int ipv6_mapped_addr_any(const struct in6_addr *a) +{ + return (ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0)); +} + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p, final; + struct dst_entry *dst; + struct flowi6 fl6; + struct ip6_flowlabel *flowlabel = NULL; + struct ipv6_txoptions *opt; + int addr_type; + int err; + + if (usin->sin6_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + err = ip4_datagram_connect(sk, uaddr, addr_len); + goto ipv4_connected; + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl6, 0, sizeof(fl6)); + if (np->sndflow) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + usin->sin6_addr = flowlabel->dst; + } + } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if (addr_type == IPV6_ADDR_ANY) { + /* + * connect to self + */ + usin->sin6_addr.s6_addr[15] = 0x01; + } + + daddr = &usin->sin6_addr; + + if (addr_type == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + + if (__ipv6_only_sock(sk)) { + err = -ENETUNREACH; + goto out; + } + sin.sin_family = 
AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + sin.sin_port = usin->sin6_port; + + err = ip4_datagram_connect(sk, + (struct sockaddr*) &sin, + sizeof(sin)); + +ipv4_connected: + if (err) + goto out; + + ipv6_addr_set_v4mapped(inet->inet_daddr, &np->daddr); + + if (ipv6_addr_any(&np->saddr) || + ipv6_mapped_addr_any(&np->saddr)) + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + + if (ipv6_addr_any(&np->rcv_saddr) || + ipv6_mapped_addr_any(&np->rcv_saddr)) { + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &np->rcv_saddr); + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + + goto out; + } + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) { + err = -EINVAL; + goto out; + } + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) + sk->sk_bound_dev_if = np->mcast_oif; + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + } + + np->daddr = *daddr; + np->flow_label = fl6.flowlabel; + + inet->inet_dport = usin->sin6_port; + + /* + * Check for a route to destination an obtain the + * destination cache for it. + */ + + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + + if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl6.flowi6_oif = np->mcast_oif; + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + opt = flowlabel ? 
flowlabel->opt : np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; + } + + /* source address lookup done in ip6_dst_lookup */ + + if (ipv6_addr_any(&np->saddr)) + np->saddr = fl6.saddr; + + if (ipv6_addr_any(&np->rcv_saddr)) { + np->rcv_saddr = fl6.saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); + } + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6.daddr, &np->daddr) ? + &np->daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? + &np->saddr : +#endif + NULL); + + sk->sk_state = TCP_ESTABLISHED; +out: + fl6_sock_release(flowlabel); + return err; +} + +void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr *icmph = icmp6_hdr(skb); + struct sock_exterr_skb *serr; + + if (!np->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb->protocol = htons(ETH_P_IPV6); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; + serr->ee.ee_type = icmph->icmp6_type; + serr->ee.ee_code = icmph->icmp6_code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - + skb_network_header(skb); + serr->port = port; + + __skb_pull(skb, payload - skb->data); + skb_reset_transport_header(skb); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct ipv6hdr *iph; + struct sk_buff *skb; + + if (!np->recverr) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb->protocol = htons(ETH_P_IPV6); + + 
skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + iph->daddr = fl6->daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); + serr->port = fl6->fl6_dport; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + struct ip6_mtuinfo *mtu_info; + + if (!np->rxopt.bits.rxpmtu) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + iph->daddr = fl6->daddr; + + mtu_info = IP6CBMTU(skb); + + mtu_info->ip6m_mtu = mtu; + mtu_info->ip6m_addr.sin6_family = AF_INET6; + mtu_info->ip6m_addr.sin6_port = 0; + mtu_info->ip6m_addr.sin6_flowinfo = 0; + mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; + mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; + + __skb_pull(skb, skb_tail_pointer(skb) - skb->data); + skb_reset_transport_header(skb); + + skb = xchg(&np->rxpmtu, skb); + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in6 *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in6 offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; 
+ } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + const unsigned char *nh = skb_network_header(skb); + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = serr->port; + sin->sin6_scope_id = 0; + if (skb->protocol == htons(ETH_P_IPV6)) { + sin->sin6_addr = + *(struct in6_addr *)(nh + serr->addr_offset); + if (np->sndflow) + sin->sin6_flowinfo = + (*(__be32 *)(nh + serr->addr_offset - 24) & + IPV6_FLOWINFO_MASK); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), + &sin->sin6_addr); + } + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin6_family = AF_UNSPEC; + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (skb->protocol == htons(ETH_P_IPV6)) { + sin->sin6_addr = ipv6_hdr(skb)->saddr; + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + struct inet_sock *inet = inet_sk(sk); + + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin->sin6_addr); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + } + + put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_bh(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_bh(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + 
spin_unlock_bh(&sk->sk_error_queue.lock); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + +/* + * Handle IPV6_RECVPATHMTU + */ +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct sockaddr_in6 *sin; + struct ip6_mtuinfo mtu_info; + int err; + int copied; + + err = -EAGAIN; + skb = xchg(&np->rxpmtu, NULL); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = 0; + sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; + sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; + } + + put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); + + err = copied; + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + unsigned char *nh = skb_network_header(skb); + + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + + if (np->rxopt.bits.rxhlim) { + int hlim = ipv6_hdr(skb)->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + + if (np->rxopt.bits.rxtclass) { + int tclass = ipv6_tclass(ipv6_hdr(skb)); + put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); + } + + if (np->rxopt.bits.rxflow && (*(__be32 *)nh & IPV6_FLOWINFO_MASK)) { + __be32 flowinfo = *(__be32 *)nh & IPV6_FLOWINFO_MASK; + put_cmsg(msg, 
SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } + + /* HbH is allowed only once */ + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = nh + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + + if (opt->lastopt && + (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { + /* + * Silly enough, but we need to reparse in order to + * report extension headers (except for HbH) + * in order. + * + * Also note that IPV6_RECVRTHDRDSTOPTS is NOT + * (and WILL NOT be) defined because + * IPV6_RECVDSTOPTS is more generic. --yoshfuji + */ + unsigned int off = sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + + while (off <= opt->lastopt) { + unsigned len; + u8 *ptr = nh + off; + + switch(nexthdr) { + case IPPROTO_DSTOPTS: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + if (np->rxopt.bits.dstopts) + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); + break; + case IPPROTO_ROUTING: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + if (np->rxopt.bits.srcrt) + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); + break; + case IPPROTO_AH: + nexthdr = ptr[0]; + len = (ptr[1] + 2) << 2; + break; + default: + nexthdr = ptr[0]; + len = (ptr[1] + 1) << 3; + break; + } + + off += len; + } + } + + /* socket options in old style */ + if (np->rxopt.bits.rxoinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + src_info.ipi6_addr = ipv6_hdr(skb)->daddr; + put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxohlim) { + int hlim = ipv6_hdr(skb)->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); + } + if (np->rxopt.bits.ohopopts && opt->hop) { + u8 *ptr = nh + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.odstopts && opt->dst0) { + u8 *ptr = nh + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.osrcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = 
(struct ipv6_rt_hdr *)(nh + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.odstopts && opt->dst1) { + u8 *ptr = nh + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; + __be16 *ports = (__be16 *) skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 <= skb->len) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ipv6_hdr(skb)->daddr; + sin6.sin6_port = ports[1]; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = 0; + + put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); + } + } + return 0; +} + +int datagram_send_ctl(struct net *net, struct sock *sk, + struct msghdr *msg, struct flowi6 *fl6, + struct ipv6_txoptions *opt, + int *hlimit, int *tclass, int *dontfrag) +{ + struct in6_pktinfo *src_info; + struct cmsghdr *cmsg; + struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; + int len; + int err = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + int addr_type; + + if (!CMSG_OK(msg, cmsg)) { + err = -EINVAL; + goto exit_f; + } + + if (cmsg->cmsg_level != SOL_IPV6) + continue; + + switch (cmsg->cmsg_type) { + case IPV6_PKTINFO: + case IPV6_2292PKTINFO: + { + struct net_device *dev = NULL; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { + err = -EINVAL; + goto exit_f; + } + + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + + if (src_info->ipi6_ifindex) { + if (fl6->flowi6_oif && + src_info->ipi6_ifindex != fl6->flowi6_oif) + return -EINVAL; + fl6->flowi6_oif = src_info->ipi6_ifindex; + } + + addr_type = __ipv6_addr_type(&src_info->ipi6_addr); + + rcu_read_lock(); + if (fl6->flowi6_oif) { + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (!dev) { + 
rcu_read_unlock(); + return -ENODEV; + } + } else if (addr_type & IPV6_ADDR_LINKLOCAL) { + rcu_read_unlock(); + return -EINVAL; + } + + if (addr_type != IPV6_ADDR_ANY) { + int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; + if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + !ipv6_chk_addr(net, &src_info->ipi6_addr, + strict ? dev : NULL, 0)) + err = -EINVAL; + else + fl6->saddr = src_info->ipi6_addr; + } + + rcu_read_unlock(); + + if (err) + goto exit_f; + + break; + } + + case IPV6_FLOWINFO: + if (cmsg->cmsg_len < CMSG_LEN(4)) { + err = -EINVAL; + goto exit_f; + } + + if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + err = -EINVAL; + goto exit_f; + } + } + fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); + break; + + case IPV6_2292HOPOPTS: + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_2292DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if 
(cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (cmsg->cmsg_type == IPV6_DSTOPTS) { + opt->opt_flen += len; + opt->dst1opt = hdr; + } else { + opt->opt_nflen += len; + opt->dst0opt = hdr; + } + break; + + case IPV6_2292RTHDR: + case IPV6_RTHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); + + switch (rthdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) { + err = -EINVAL; + goto exit_f; + } + break; +#endif + default: + err = -EINVAL; + goto exit_f; + } + + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + + /* segments left must also match */ + if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { + err = -EINVAL; + goto exit_f; + } + + opt->opt_nflen += len; + opt->srcrt = rthdr; + + if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + + break; + + case IPV6_2292HOPLIMIT: + case IPV6_HOPLIMIT: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + err = -EINVAL; + goto exit_f; + } + + *hlimit = *(int *)CMSG_DATA(cmsg); + if (*hlimit < -1 || *hlimit > 0xff) { + err = -EINVAL; + goto exit_f; + } + + break; + + case IPV6_TCLASS: + { + int tc; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + goto exit_f; + } + + tc = *(int *)CMSG_DATA(cmsg); + if (tc < -1 || tc > 0xff) + goto exit_f; + + err = 0; + *tclass = tc; + + break; + } + + case IPV6_DONTFRAG: + { + int df; + + err = -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + goto exit_f; + } + + df = *(int *)CMSG_DATA(cmsg); + if (df < 0 || df > 1) + goto 
exit_f; + + err = 0; + *dontfrag = df; + + break; + } + default: + LIMIT_NETDEBUG(KERN_DEBUG "invalid cmsg type: %d\n", + cmsg->cmsg_type); + err = -EINVAL; + goto exit_f; + } + } + +exit_f: + return err; +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c new file mode 100644 index 00000000..65dd5433 --- /dev/null +++ b/net/ipv6/esp6.c @@ -0,0 +1,674 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * + * This file is derived from net/ipv4/esp.c + */ + +#include <crypto/aead.h> +#include <crypto/authenc.h> +#include <linux/err.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <linux/scatterlist.h> +#include <linux/kernel.h> +#include <linux/pfkeyv2.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <linux/icmpv6.h> + +struct esp_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) + +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); + +/* + * Allocate an AEAD request 
structure with extra space for SG and IV. + * + * For alignment considerations the upper 32 bits of the sequence number are + * placed at the front, if present. Followed by the IV, the request and finally + * the SG list. + * + * TODO: Use spare space in skb for this where possible. + */ +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) +{ + unsigned int len; + + len = seqihlen; + + len += crypto_aead_ivsize(aead); + + if (len) { + len += crypto_aead_alignmask(aead) & + ~(crypto_tfm_ctx_alignment() - 1); + len = ALIGN(len, crypto_tfm_ctx_alignment()); + } + + len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} + +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) +{ + return crypto_aead_ivsize(aead) ? 
+ PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; +} + +static inline struct aead_givcrypt_request *esp_tmp_givreq( + struct crypto_aead *aead, u8 *iv) +{ + struct aead_givcrypt_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_givcrypt_set_tfm(req, aead); + return req; +} + +static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv) +{ + struct aead_request *req; + + req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead), + crypto_tfm_ctx_alignment()); + aead_request_set_tfm(req, aead); + return req; +} + +static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, + struct aead_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static inline struct scatterlist *esp_givreq_sg( + struct crypto_aead *aead, struct aead_givcrypt_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_aead_reqsize(aead), + __alignof__(struct scatterlist)); +} + +static void esp_output_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + kfree(ESP_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + +static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct aead_givcrypt_request *req; + struct scatterlist *sg; + struct scatterlist *asg; + struct sk_buff *trailer; + void *tmp; + int blksize; + int clen; + int alen; + int plen; + int tfclen; + int nfrags; + int assoclen; + int sglists; + int seqhilen; + u8 *iv; + u8 *tail; + __be32 *seqhi; + struct esp_data *esp = x->data; + + /* skb is pure payload to encrypt */ + err = -ENOMEM; + + aead = esp->aead; + alen = crypto_aead_authsize(aead); + + tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, 
esp6_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + clen = ALIGN(skb->len + 2 + tfclen, blksize); + if (esp->padlen) + clen = ALIGN(clen, esp->padlen); + plen = clen - skb->len - tfclen; + + err = skb_cow_data(skb, tfclen + plen + alen, &trailer); + if (err < 0) + goto error; + nfrags = err; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto error; + + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_givreq(aead, iv); + asg = esp_givreq_sg(aead, req); + sg = asg + sglists; + + /* Fill padding... */ + tail = skb_tail_pointer(trailer); + if (tfclen) { + memset(tail, 0, tfclen); + tail += tfclen; + } + do { + int i; + for (i = 0; i < plen - 2; i++) + tail[i] = i + 1; + } while (0); + tail[plen - 2] = plen - 2; + tail[plen - 1] = *skb_mac_header(skb); + pskb_put(skb, trailer, clen - skb->len + alen); + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, + esph->enc_data + crypto_aead_ivsize(aead) - skb->data, + clen + alen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_givcrypt_set_callback(req, 0, esp_output_done, skb); + aead_givcrypt_set_crypt(req, sg, sg, clen, iv); + aead_givcrypt_set_assoc(req, asg, assoclen); + aead_givcrypt_set_giv(req, esph->enc_data, + 
XFRM_SKB_CB(skb)->seq.output.low); + + ESP_SKB_CB(skb)->tmp = tmp; + err = crypto_aead_givencrypt(req); + if (err == -EINPROGRESS) + goto error; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + + kfree(tmp); + +error: + return err; +} + +static int esp_input_done2(struct sk_buff *skb, int err) +{ + struct xfrm_state *x = xfrm_input_state(skb); + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + int alen = crypto_aead_authsize(aead); + int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); + int elen = skb->len - hlen; + int hdr_len = skb_network_header_len(skb); + int padlen; + u8 nexthdr[2]; + + kfree(ESP_SKB_CB(skb)->tmp); + + if (unlikely(err)) + goto out; + + if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) + BUG(); + + err = -EINVAL; + padlen = nexthdr[0]; + if (padlen + 2 + alen >= elen) { + LIMIT_NETDEBUG(KERN_WARNING "ipsec esp packet is garbage " + "padlen=%d, elen=%d\n", padlen + 2, elen - alen); + goto out; + } + + /* ... check padding bits here. Silly. 
:-) */ + + pskb_trim(skb, skb->len - alen - padlen - 2); + __skb_pull(skb, hlen); + skb_set_transport_header(skb, -hdr_len); + + err = nexthdr[1]; + + /* RFC4303: Drop dummy packets without any error */ + if (err == IPPROTO_NONE) + err = -EINVAL; + +out: + return err; +} + +static void esp_input_done(struct crypto_async_request *base, int err) +{ + struct sk_buff *skb = base->data; + + xfrm_input_resume(skb, esp_input_done2(skb, err)); +} + +static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct crypto_aead *aead = esp->aead; + struct aead_request *req; + struct sk_buff *trailer; + int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); + int nfrags; + int assoclen; + int sglists; + int seqhilen; + int ret = 0; + void *tmp; + __be32 *seqhi; + u8 *iv; + struct scatterlist *sg; + struct scatterlist *asg; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) { + ret = -EINVAL; + goto out; + } + + if (elen <= 0) { + ret = -EINVAL; + goto out; + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + ret = -ENOMEM; + + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); + if (!tmp) + goto out; + + ESP_SKB_CB(skb)->tmp = tmp; + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); + req = esp_tmp_req(aead, iv); + asg = esp_req_sg(aead, req); + sg = asg + sglists; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr *)skb->data; + + /* Get ivec. This can be wrong, check against another impls. 
*/ + iv = esph->enc_data; + + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); + + aead_request_set_callback(req, 0, esp_input_done, skb); + aead_request_set_crypt(req, sg, sg, elen, iv); + aead_request_set_assoc(req, asg, assoclen); + + ret = crypto_aead_decrypt(req); + if (ret == -EINPROGRESS) + goto out; + + ret = esp_input_done2(skb, ret); + +out: + return ret; +} + +static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4); + u32 align = max_t(u32, blksize, esp->padlen); + unsigned int net_adj; + + if (x->props.mode != XFRM_MODE_TUNNEL) + net_adj = sizeof(struct ipv6hdr); + else + net_adj = 0; + + return ((mtu - x->props.header_len - crypto_aead_authsize(esp->aead) - + net_adj) & ~(align - 1)) + (net_adj - 2); +} + +static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + esph->spi, IPPROTO_ESP, AF_INET6); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%pI6\n", + ntohl(esph->spi), &iph->daddr); + xfrm_state_put(x); +} + +static void esp6_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + crypto_free_aead(esp->aead); + kfree(esp); +} + +static 
int esp_init_aead(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + int err; + + aead = crypto_alloc_aead(x->aead->alg_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + err = crypto_aead_setkey(aead, x->aead->alg_key, + (x->aead->alg_key_len + 7) / 8); + if (err) + goto error; + + err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8); + if (err) + goto error; + +error: + return err; +} + +static int esp_init_authenc(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + struct crypto_aead *aead; + struct crypto_authenc_key_param *param; + struct rtattr *rta; + char *key; + char *p; + char authenc_name[CRYPTO_MAX_ALG_NAME]; + unsigned int keylen; + int err; + + err = -EINVAL; + if (x->ealg == NULL) + goto error; + + err = -ENAMETOOLONG; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } + + aead = crypto_alloc_aead(authenc_name, 0, 0); + err = PTR_ERR(aead); + if (IS_ERR(aead)) + goto error; + + esp->aead = aead; + + keylen = (x->aalg ? 
(x->aalg->alg_key_len + 7) / 8 : 0) + + (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param)); + err = -ENOMEM; + key = kmalloc(keylen, GFP_KERNEL); + if (!key) + goto error; + + p = key; + rta = (void *)p; + rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM; + rta->rta_len = RTA_LENGTH(sizeof(*param)); + param = RTA_DATA(rta); + p += RTA_SPACE(sizeof(*param)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8); + p += (x->aalg->alg_key_len + 7) / 8; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + err = -EINVAL; + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_aead_authsize(aead)) { + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_aead_authsize(aead), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto free_key; + } + + err = crypto_aead_setauthsize( + aead, x->aalg->alg_trunc_len / 8); + if (err) + goto free_key; + } + + param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8); + memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8); + + err = crypto_aead_setkey(aead, key, keylen); + +free_key: + kfree(key); + +error: + return err; +} + +static int esp6_init_state(struct xfrm_state *x) +{ + struct esp_data *esp; + struct crypto_aead *aead; + u32 align; + int err; + + if (x->encap) + return -EINVAL; + + esp = kzalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + x->data = esp; + + if (x->aead) + err = esp_init_aead(x); + else + err = esp_init_authenc(x); + + if (err) + goto error; + + aead = esp->aead; + + esp->padlen = 0; + + x->props.header_len = sizeof(struct ip_esp_hdr) + + crypto_aead_ivsize(aead); + switch (x->props.mode) { + case XFRM_MODE_BEET: + if (x->sel.family != AF_INET6) + x->props.header_len += IPV4_BEET_PHMAXLEN + + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + break; + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct 
ipv6hdr); + break; + default: + goto error; + } + + align = ALIGN(crypto_aead_blocksize(aead), 4); + if (esp->padlen) + align = max_t(u32, align, esp->padlen); + x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead); + +error: + return err; +} + +static const struct xfrm_type esp6_type = +{ + .description = "ESP6", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .flags = XFRM_TYPE_REPLAY_PROT, + .init_state = esp6_init_state, + .destructor = esp6_destroy, + .get_mtu = esp6_get_mtu, + .input = esp6_input, + .output = esp6_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; + +static const struct inet6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .err_handler = esp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init esp6_init(void) +{ + if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + xfrm_unregister_type(&esp6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit esp6_fini(void) +{ + if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); +} + +module_init(esp6_init); +module_exit(esp6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c new file mode 100644 index 00000000..3d641b6e --- /dev/null +++ b/net/ipv6/exthdrs.c @@ -0,0 +1,896 @@ +/* + * Extension Header handling for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Andi Kleen <ak@muc.de> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * yoshfuji : ensure not to overrun while parsing + * tlv options. + * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). + * YOSHIFUJI Hideaki @USAGI Register inbound extension header + * handlers as inet6_protocol{}. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/dst.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +#include <net/xfrm.h> +#endif + +#include <asm/uaccess.h> + +int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +{ + const unsigned char *nh = skb_network_header(skb); + int packet_len = skb->tail - skb->network_header; + struct ipv6_opt_hdr *hdr; + int len; + + if (offset + 2 > packet_len) + goto bad; + hdr = (struct ipv6_opt_hdr *)(nh + offset); + len = ((hdr->hdrlen + 1) << 3); + + if (offset + len > packet_len) + goto bad; + + offset += 2; + len -= 2; + + while (len > 0) { + int opttype = nh[offset]; + int optlen; + + if (opttype == type) + return offset; + + switch (opttype) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + optlen = nh[offset + 1] + 2; + if (optlen > len) + goto bad; + break; + } + offset += optlen; + len -= optlen; + } + /* not_found */ + bad: + return -1; +} +EXPORT_SYMBOL_GPL(ipv6_find_tlv); + +/* + * Parsing tlv encoded headers. + * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. 
+ */ + +struct tlvtype_proc { + int type; + int (*func)(struct sk_buff *skb, int offset); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +{ + switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); + return 0; + } + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +{ + struct tlvtype_proc *curr; + const unsigned char *nh = skb_network_header(skb); + int off = skb_network_header_len(skb); + int len = (skb_transport_header(skb)[1] + 1) << 3; + + if (skb_transport_offset(skb) + len > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = nh[off + 1] + 2; + + switch (nh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + if (optlen > len) + goto bad; + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == nh[off]) { + /* type specific length/alignment + checks will be performed in the + func(). */ + if (curr->func(skb, off) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, off) == 0) + return 0; + } + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; +bad: + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. 
+ *****************************/ + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static int ipv6_dest_hao(struct sk_buff *skb, int optoff) +{ + struct ipv6_destopt_hao *hao; + struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct in6_addr tmp_addr; + int ret; + + if (opt->dsthao) { + LIMIT_NETDEBUG(KERN_DEBUG "hao duplicated\n"); + goto discard; + } + opt->dsthao = opt->dst1; + opt->dst1 = 0; + + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); + + if (hao->length != 16) { + LIMIT_NETDEBUG( + KERN_DEBUG "hao invalid option length = %d\n", hao->length); + goto discard; + } + + if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { + LIMIT_NETDEBUG( + KERN_DEBUG "hao is not an unicast addr: %pI6\n", &hao->addr); + goto discard; + } + + ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, + (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); + if (unlikely(ret < 0)) + goto discard; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto discard; + + /* update all variable using below by copied skbuff */ + hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + + optoff); + ipv6h = ipv6_hdr(skb); + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + + tmp_addr = ipv6h->saddr; + ipv6h->saddr = hao->addr; + hao->addr = tmp_addr; + + if (skb->tstamp.tv64 == 0) + __net_timestamp(skb); + + return 1; + + discard: + kfree_skb(skb); + return 0; +} +#endif + +static struct tlvtype_proc tlvprocdestopt_lst[] = { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + { + .type = IPV6_TLV_HAO, + .func = ipv6_dest_hao, + }, +#endif + {-1, NULL} +}; + +static int ipv6_destopt_rcv(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + __u16 dstbuf; +#endif + struct dst_entry *dst = skb_dst(skb); + + if (!pskb_may_pull(skb, skb_transport_offset(skb) 
+ 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + opt->lastopt = opt->dst1 = skb_network_header_len(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + dstbuf = opt->dst1; +#endif + + if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + opt = IP6CB(skb); +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + opt->nhoff = dstbuf; +#else + opt->nhoff = opt->dst1; +#endif + return 1; + } + + IP6_INC_STATS_BH(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + return -1; +} + +/******************************** + Routing header. + ********************************/ + +/* called with rcu_read_lock() */ +static int ipv6_rthdr_rcv(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); + struct in6_addr *addr = NULL; + struct in6_addr daddr; + struct inet6_dev *idev; + int n, i; + struct ipv6_rt_hdr *hdr; + struct rt0_hdr *rthdr; + struct net *net = dev_net(skb->dev); + int accept_source_route = net->ipv6.devconf_all->accept_source_route; + + idev = __in6_dev_get(skb->dev); + if (idev && accept_source_route > idev->cnf.accept_source_route) + accept_source_route = idev->cnf.accept_source_route; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || + !pskb_may_pull(skb, (skb_transport_offset(skb) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); + + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || + skb->pkt_type != PACKET_HOST) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + +looped_back: + if (hdr->segments_left == 
0) { + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + /* Silently discard type 2 header unless it was + * processed by own + */ + if (!addr) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + break; + } + + opt->lastopt = opt->srcrt = skb_network_header_len(skb); + skb->transport_header += (hdr->hdrlen + 1) << 3; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); + return 1; + } + + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (accept_source_route < 0) + goto unknown_rh; + /* Silently discard invalid RTH type 2 */ + if (hdr->hdrlen != 2 || hdr->segments_left != 1) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + goto unknown_rh; + } + + /* + * This is the routing header forwarding algorithm from + * RFC 2460, page 16. + */ + + n = hdr->hdrlen >> 1; + + if (hdr->segments_left > n) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); + return -1; + } + + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. 
+ */ + if (skb_cloned(skb)) { + /* the copy is a forwarded packet */ + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -1; + } + hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + + i = n - --hdr->segments_left; + + rthdr = (struct rt0_hdr *) hdr; + addr = rthdr->addr; + addr += i - 1; + + switch (hdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, + (xfrm_address_t *)&ipv6_hdr(skb)->saddr, + IPPROTO_ROUTING) < 0) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + break; +#endif + default: + break; + } + + if (ipv6_addr_is_multicast(addr)) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + + daddr = *addr; + *addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = daddr; + + skb_dst_drop(skb); + ip6_route_input(skb); + if (skb_dst(skb)->error) { + skb_push(skb, skb->data - skb_network_header(skb)); + dst_input(skb); + return -1; + } + + if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { + if (ipv6_hdr(skb)->hop_limit <= 1) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0); + kfree_skb(skb); + return -1; + } + ipv6_hdr(skb)->hop_limit--; + goto looped_back; + } + + skb_push(skb, skb->data - skb_network_header(skb)); + dst_input(skb); + return -1; + +unknown_rh: + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, 
ICMPV6_HDR_FIELD, + (&hdr->type) - skb_network_header(skb)); + return -1; +} + +static const struct inet6_protocol rthdr_protocol = { + .handler = ipv6_rthdr_rcv, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, +}; + +static const struct inet6_protocol destopt_protocol = { + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, +}; + +static const struct inet6_protocol nodata_protocol = { + .handler = dst_discard, + .flags = INET6_PROTO_NOPOLICY, +}; + +int __init ipv6_exthdrs_init(void) +{ + int ret; + + ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); + if (ret) + goto out; + + ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + if (ret) + goto out_rthdr; + + ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); + if (ret) + goto out_destopt; + +out: + return ret; +out_rthdr: + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +out_destopt: + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + goto out; +}; + +void ipv6_exthdrs_exit(void) +{ + inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); + inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); + inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); +} + +/********************************** + Hop-by-hop options. + **********************************/ + +/* + * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). + */ +static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb) +{ + return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev); +} + +static inline struct net *ipv6_skb_net(struct sk_buff *skb) +{ + return skb_dst(skb) ? 
dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); +} + +/* Router Alert as of RFC 2711 */ + +static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + + if (nh[optoff + 1] == 2) { + IP6CB(skb)->ra = optoff; + return 1; + } + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", + nh[optoff + 1]); + kfree_skb(skb); + return 0; +} + +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + struct net *net = ipv6_skb_net(skb); + u32 pkt_len; + + if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { + LIMIT_NETDEBUG(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", + nh[optoff+1]); + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); + if (pkt_len <= IPV6_MAXPLEN) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); + return 0; + } + if (ipv6_hdr(skb)->payload_len) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) + goto drop; + + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +static struct tlvtype_proc tlvprochopopt_lst[] = { + { + .type = IPV6_TLV_ROUTERALERT, + .func = ipv6_hop_ra, + }, + { + .type = IPV6_TLV_JUMBO, + .func = ipv6_hop_jumbo, + }, + { -1, } +}; + +int ipv6_parse_hopopts(struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = IP6CB(skb); + + /* + * skb_network_header(skb) is equal to skb->data, and + * skb_network_header_len(skb) is always equal to + * sizeof(struct ipv6hdr) by definition of + * hop-by-hop options. 
+ */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || + !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + + ((skb_transport_header(skb)[1] + 1) << 3)))) { + kfree_skb(skb); + return -1; + } + + opt->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { + skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; + opt = IP6CB(skb); + opt->nhoff = sizeof(struct ipv6hdr); + return 1; + } + return -1; +} + +/* + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. + */ + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; + + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + phdr->addr[hops - 1] = **addr_p; + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) { + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + /* + * IPV6_RTHDRDSTOPTS is ignored + * unless IPV6_RTHDR is set (RFC3542). 
+ */ + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + } + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +EXPORT_SYMBOL(ipv6_push_nfrag_opts); + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} + +EXPORT_SYMBOL_GPL(ipv6_dup_options); + +static int ipv6_renew_option(void *ohdr, + struct ipv6_opt_hdr __user *newopt, int newoptlen, + int inherit, + struct ipv6_opt_hdr **hdr, + char **p) +{ + if (inherit) { + if (ohdr) { + memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); + *hdr = (struct ipv6_opt_hdr *)*p; + *p += CMSG_ALIGN(ipv6_optlen(*(struct ipv6_opt_hdr **)hdr)); + } + } else { + if (newopt) { + if (copy_from_user(*p, newopt, newoptlen)) + return -EFAULT; + *hdr = (struct ipv6_opt_hdr *)*p; + if (ipv6_optlen(*(struct ipv6_opt_hdr **)hdr) > newoptlen) + return -EINVAL; + *p += CMSG_ALIGN(newoptlen); + } + } + return 0; +} + +struct ipv6_txoptions * +ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, + int newtype, + struct ipv6_opt_hdr __user *newopt, int newoptlen) +{ + int tot_len = 0; + char *p; + struct ipv6_txoptions *opt2; + int err; + + if (opt) { + if (newtype != IPV6_HOPOPTS && opt->hopopt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); + if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); + 
if (newtype != IPV6_RTHDR && opt->srcrt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); + if (newtype != IPV6_DSTOPTS && opt->dst1opt) + tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); + } + + if (newopt && newoptlen) + tot_len += CMSG_ALIGN(newoptlen); + + if (!tot_len) + return NULL; + + tot_len += sizeof(*opt2); + opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); + if (!opt2) + return ERR_PTR(-ENOBUFS); + + memset(opt2, 0, tot_len); + + opt2->tot_len = tot_len; + p = (char *)(opt2 + 1); + + err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, + newtype != IPV6_HOPOPTS, + &opt2->hopopt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, + newtype != IPV6_RTHDRDSTOPTS, + &opt2->dst0opt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, + newtype != IPV6_RTHDR, + (struct ipv6_opt_hdr **)&opt2->srcrt, &p); + if (err) + goto out; + + err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen, + newtype != IPV6_DSTOPTS, + &opt2->dst1opt, &p); + if (err) + goto out; + + opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); + opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); + + return opt2; +out: + sock_kfree_s(sk, opt2, opt2->tot_len); + return ERR_PTR(err); +} + +struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt) +{ + /* + * ignore the dest before srcrt unless srcrt is being included. + * --yoshfuji + */ + if (opt && opt->dst0opt && !opt->srcrt) { + if (opt_space != opt) { + memcpy(opt_space, opt, sizeof(*opt_space)); + opt = opt_space; + } + opt->opt_nflen -= ipv6_optlen(opt->dst0opt); + opt->dst0opt = NULL; + } + + return opt; +} + +/** + * fl6_update_dst - update flowi destination address with info given + * by srcrt option, if any. 
+ * + * @fl6: flowi6 for which daddr is to be updated + * @opt: struct ipv6_txoptions in which to look for srcrt opt + * @orig: copy of original daddr address if modified + * + * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * and initial value of fl6->daddr set in orig + */ +struct in6_addr *fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (!opt || !opt->srcrt) + return NULL; + + *orig = fl6->daddr; + fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; + return orig; +} + +EXPORT_SYMBOL_GPL(fl6_update_dst); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c new file mode 100644 index 00000000..72957f4a --- /dev/null +++ b/net/ipv6/exthdrs_core.c @@ -0,0 +1,114 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ +#include <linux/export.h> +#include <net/ipv6.h> + +/* + * find out if nexthdr is a well-known extension header or a protocol + */ + +int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST); +} + +/* + * Skip any extension headers. This is used by the ICMP module. + * + * Note that strictly speaking this conflicts with RFC 2460 4.0: + * ...The contents and semantics of each extension header determine whether + * or not to proceed to the next header. Therefore, extension headers must + * be processed strictly in the order they appear in the packet; a + * receiver must not, for example, scan through a packet looking for a + * particular kind of extension header and process that header prior to + * processing all preceding ones. + * + * We do exactly this. This is a protocol bug. 
We can't decide after a + * seeing an unknown discard-with-error flavour TLV option if it's a + * ICMP error message or not (errors should never be send in reply to + * ICMP error messages). + * + * But I see no other way to do this. This might need to be reexamined + * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr". + * "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - Reports the offset field of the final fragment header so it is + * possible to tell whether this is a first fragment, later fragment, + * or not fragmented. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. 
+ * + * --ANK (980726) + */ + +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, + __be16 *frag_offp) +{ + u8 nexthdr = *nexthdrp; + + *frag_offp = 0; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + if (nexthdr == NEXTHDR_NONE) + return -1; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -1; + if (nexthdr == NEXTHDR_FRAGMENT) { + __be16 _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -1; + + *frag_offp = *fp; + if (ntohs(*frag_offp) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ipv6_skip_exthdr); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c new file mode 100644 index 00000000..b6c57315 --- /dev/null +++ b/net/ipv6/fib6_rules.c @@ -0,0 +1,308 @@ +/* + * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules + * + * Copyright (C)2003-2006 Helsinki University of Technology + * Copyright (C)2003-2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. 
+ * + * Authors + * Thomas Graf <tgraf@suug.ch> + * Ville Nuorvala <vnuorval@tcs.hut.fi> + */ + +#include <linux/netdevice.h> +#include <linux/export.h> + +#include <net/fib_rules.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/netlink.h> + +struct fib6_rule +{ + struct fib_rule common; + struct rt6key src; + struct rt6key dst; + u8 tclass; +}; + +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) +{ + struct fib_lookup_arg arg = { + .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, + }; + + fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + + if (arg.result) + return arg.result; + + dst_hold(&net->ipv6.ip6_null_entry->dst); + return &net->ipv6.ip6_null_entry->dst; +} + +static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowi6 *flp6 = &flp->u.ip6; + struct rt6_info *rt = NULL; + struct fib6_table *table; + struct net *net = rule->fr_net; + pol_lookup_t lookup = arg->lookup_ptr; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + rt = net->ipv6.ip6_null_entry; + goto discard_pkt; + default: + case FR_ACT_BLACKHOLE: + rt = net->ipv6.ip6_blk_hole_entry; + goto discard_pkt; + case FR_ACT_PROHIBIT: + rt = net->ipv6.ip6_prohibit_entry; + goto discard_pkt; + } + + table = fib6_get_table(net, rule->table); + if (table) + rt = lookup(net, table, flp6, flags); + + if (rt != net->ipv6.ip6_null_entry) { + struct fib6_rule *r = (struct fib6_rule *)rule; + + /* + * If we need to find a source address for this traffic, + * we check the result if it meets requirement of the rule. 
+ */ + if ((rule->flags & FIB_RULE_FIND_SADDR) && + r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { + struct in6_addr saddr; + + if (ipv6_dev_get_saddr(net, + ip6_dst_idev(&rt->dst)->dev, + &flp6->daddr, + rt6_flags2srcprefs(flags), + &saddr)) + goto again; + if (!ipv6_prefix_equal(&saddr, &r->src.addr, + r->src.plen)) + goto again; + flp6->saddr = saddr; + } + goto out; + } +again: + dst_release(&rt->dst); + rt = NULL; + goto out; + +discard_pkt: + dst_hold(&rt->dst); +out: + arg->result = rt; + return rt == NULL ? -EAGAIN : 0; +} + + +static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) +{ + struct fib6_rule *r = (struct fib6_rule *) rule; + struct flowi6 *fl6 = &fl->u.ip6; + + if (r->dst.plen && + !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) + return 0; + + /* + * If FIB_RULE_FIND_SADDR is set and we do not have a + * source address for the traffic, we defer check for + * source address. + */ + if (r->src.plen) { + if (flags & RT6_LOOKUP_F_HAS_SADDR) { + if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, + r->src.plen)) + return 0; + } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) + return 0; + } + + if (r->tclass && r->tclass != ((ntohl(fl6->flowlabel) >> 20) & 0xff)) + return 0; + + return 1; +} + +static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { + FRA_GENERIC_POLICY, +}; + +static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + int err = -EINVAL; + struct net *net = sock_net(skb->sk); + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + if (rule->action == FR_ACT_TO_TBL) { + if (rule->table == RT6_TABLE_UNSPEC) + goto errout; + + if (fib6_new_table(net, rule->table) == NULL) { + err = -ENOBUFS; + goto errout; + } + } + + if (frh->src_len) + nla_memcpy(&rule6->src.addr, tb[FRA_SRC], + sizeof(struct in6_addr)); + + if (frh->dst_len) + nla_memcpy(&rule6->dst.addr, tb[FRA_DST], + sizeof(struct in6_addr)); + + 
rule6->src.plen = frh->src_len; + rule6->dst.plen = frh->dst_len; + rule6->tclass = frh->tos; + + err = 0; +errout: + return err; +} + +static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + if (frh->src_len && (rule6->src.plen != frh->src_len)) + return 0; + + if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) + return 0; + + if (frh->tos && (rule6->tclass != frh->tos)) + return 0; + + if (frh->src_len && + nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) + return 0; + + if (frh->dst_len && + nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) + return 0; + + return 1; +} + +static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + struct fib6_rule *rule6 = (struct fib6_rule *) rule; + + frh->dst_len = rule6->dst.plen; + frh->src_len = rule6->src.plen; + frh->tos = rule6->tclass; + + if (rule6->dst.plen) + NLA_PUT(skb, FRA_DST, sizeof(struct in6_addr), + &rule6->dst.addr); + + if (rule6->src.plen) + NLA_PUT(skb, FRA_SRC, sizeof(struct in6_addr), + &rule6->src.addr); + + return 0; + +nla_put_failure: + return -ENOBUFS; +} + +static u32 fib6_rule_default_pref(struct fib_rules_ops *ops) +{ + return 0x3FFF; +} + +static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) +{ + return nla_total_size(16) /* dst */ + + nla_total_size(16); /* src */ +} + +static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = { + .family = AF_INET6, + .rule_size = sizeof(struct fib6_rule), + .addr_size = sizeof(struct in6_addr), + .action = fib6_rule_action, + .match = fib6_rule_match, + .configure = fib6_rule_configure, + .compare = fib6_rule_compare, + .fill = fib6_rule_fill, + .default_pref = fib6_rule_default_pref, + .nlmsg_payload = fib6_rule_nlmsg_payload, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = fib6_rule_policy, + .owner = THIS_MODULE, + .fro_net = &init_net, +}; + 
+static int __net_init fib6_rules_net_init(struct net *net) +{ + struct fib_rules_ops *ops; + int err = -ENOMEM; + + ops = fib_rules_register(&fib6_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv6.fib6_rules_ops = ops; + + + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, 0, + RT6_TABLE_LOCAL, 0); + if (err) + goto out_fib6_rules_ops; + + err = fib_default_rule_add(net->ipv6.fib6_rules_ops, + 0x7FFE, RT6_TABLE_MAIN, 0); + if (err) + goto out_fib6_rules_ops; + +out: + return err; + +out_fib6_rules_ops: + fib_rules_unregister(ops); + goto out; +} + +static void __net_exit fib6_rules_net_exit(struct net *net) +{ + fib_rules_unregister(net->ipv6.fib6_rules_ops); +} + +static struct pernet_operations fib6_rules_net_ops = { + .init = fib6_rules_net_init, + .exit = fib6_rules_net_exit, +}; + +int __init fib6_rules_init(void) +{ + return register_pernet_subsys(&fib6_rules_net_ops); +} + + +void fib6_rules_cleanup(void) +{ + unregister_pernet_subsys(&fib6_rules_net_ops); +} diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c new file mode 100644 index 00000000..ba0c1479 --- /dev/null +++ b/net/ipv6/icmp.c @@ -0,0 +1,997 @@ +/* + * Internet Control Message Protocol (ICMPv6) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on net/ipv4/icmp.c + * + * RFC 1885 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + * Andi Kleen add rate limits. never reply to a icmp. + * add more length checks and other fixes. + * yoshfuji : ensure to sent parameter problem for + * fragments. + * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. 
+ * Randy Dunlap and + * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support + * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/slab.h> + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmpv6.h> + +#include <net/ip.h> +#include <net/sock.h> + +#include <net/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/ping.h> +#include <net/protocol.h> +#include <net/raw.h> +#include <net/rawv6.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/icmp.h> +#include <net/xfrm.h> +#include <net/inet_common.h> + +#include <asm/uaccess.h> + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. 
+ */ +static inline struct sock *icmpv6_sk(struct net *net) +{ + return net->ipv6.icmp_sk[smp_processor_id()]; +} + +static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ + struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); + + if (!(type & ICMPV6_INFOMSG_MASK)) + if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) + ping_err(skb, offset, info); +} + +static int icmpv6_rcv(struct sk_buff *skb); + +static const struct inet6_protocol icmpv6_protocol = { + .handler = icmpv6_rcv, + .err_handler = icmpv6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) +{ + struct sock *sk; + + local_bh_disable(); + + sk = icmpv6_sk(net); + if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { + /* This can happen if the output path (f.e. SIT or + * ip6ip6 tunnel) signals dst_link_failure() for an + * outgoing ICMP6 packet. + */ + local_bh_enable(); + return NULL; + } + return sk; +} + +static __inline__ void icmpv6_xmit_unlock(struct sock *sk) +{ + spin_unlock_bh(&sk->sk_lock.slock); +} + +/* + * Slightly more convenient version of icmpv6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) +{ + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos); + kfree_skb(skb); +} + +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. 
in the middle of some exthdr) + * + * --ANK (980726) + */ + +static int is_ineligible(struct sk_buff *skb) +{ + int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; + int len = skb->len - ptr; + __u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + if (len < 0) + return 1; + + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); + if (ptr < 0) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + u8 _type, *tp; + tp = skb_header_pointer(skb, + ptr+offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + if (tp == NULL || + !(*tp & ICMPV6_INFOMSG_MASK)) + return 1; + } + return 0; +} + +/* + * Check the ICMP output rate limit + */ +static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) +{ + struct dst_entry *dst; + struct net *net = sock_net(sk); + bool res = false; + + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return true; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return true; + + /* + * Look up the output route. + * XXX: perhaps the expire for routing entries cloned by + * this lookup should be more aggressive (not longer than timeout). + */ + dst = ip6_route_output(net, sk, fl6); + if (dst->error) { + IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTNOROUTES); + } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + res = true; + } else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = net->ipv6.sysctl.icmpv6_time; + + /* Give more bandwidth to wider prefixes. 
*/ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + res = inet_peer_xrlim_allow(rt->rt6i_peer, tmo); + } + dst_release(dst); + return res; +} + +/* + * an inline helper for the "simple" if statement below + * checks if parameter problem report is caused by an + * unrecognized IPv6 option that has the Option Type + * highest-order two bits set to 10 + */ + +static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +{ + u8 _optval, *op; + + offset += skb_network_offset(skb); + op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); + if (op == NULL) + return 1; + return (*op & 0xC0) == 0x80; +} + +int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) +{ + struct sk_buff *skb; + struct icmp6hdr *icmp6h; + int err = 0; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + icmp6h = icmp6_hdr(skb); + memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); + icmp6h->icmp6_cksum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial(icmp6h, + sizeof(struct icmp6hdr), skb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, + skb->csum); + } else { + __wsum tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_partial(icmp6h, + sizeof(struct icmp6hdr), tmp_csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, + &fl6->daddr, + len, fl6->flowi6_proto, + tmp_csum); + } + ip6_push_pending_frames(sk); +out: + return err; +} + +struct icmpv6_msg { + struct sk_buff *skb; + int offset; + uint8_t type; +}; + +static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct icmpv6_msg *msg = (struct icmpv6_msg *) from; + struct sk_buff *org_skb = msg->skb; + __wsum csum = 0; + + csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, 
+ to, len, csum); + skb->csum = csum_block_add(skb->csum, csum, odd); + if (!(msg->type & ICMPV6_INFOMSG_MASK)) + nf_ct_attach(skb, org_skb); + return 0; +} + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) +static void mip6_addr_swap(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct inet6_skb_parm *opt = IP6CB(skb); + struct ipv6_destopt_hao *hao; + struct in6_addr tmp; + int off; + + if (opt->dsthao) { + off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); + if (likely(off >= 0)) { + hao = (struct ipv6_destopt_hao *) + (skb_network_header(skb) + off); + tmp = iph->saddr; + iph->saddr = hao->addr; + hao->addr = tmp; + } + } +} +#else +static inline void mip6_addr_swap(struct sk_buff *skb) {} +#endif + +struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, + struct sock *sk, struct flowi6 *fl6) +{ + struct dst_entry *dst, *dst2; + struct flowi6 fl2; + int err; + + err = ip6_dst_lookup(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + + /* + * We won't send icmp if the destination is known + * anycast. + */ + if (((struct rt6_info *)dst)->rt6i_flags & RTF_ANYCAST) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: acast source\n"); + dst_release(dst); + return ERR_PTR(-EINVAL); + } + + /* No need to clone since we're just using its address. 
*/ + dst2 = dst; + + dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); + if (!IS_ERR(dst)) { + if (dst != dst2) + return dst; + } else { + if (PTR_ERR(dst) == -EPERM) + dst = NULL; + else + return dst; + } + + err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6); + if (err) + goto relookup_failed; + + err = ip6_dst_lookup(sk, &dst2, &fl2); + if (err) + goto relookup_failed; + + dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); + if (!IS_ERR(dst2)) { + dst_release(dst); + dst = dst2; + } else { + err = PTR_ERR(dst2); + if (err == -EPERM) { + dst_release(dst); + return dst2; + } else + goto relookup_failed; + } + +relookup_failed: + if (dst) + return dst; + return ERR_PTR(err); +} + +/* + * Send an ICMP message in response to a packet in error + */ +void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) +{ + struct net *net = dev_net(skb->dev); + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct sock *sk; + struct ipv6_pinfo *np; + const struct in6_addr *saddr = NULL; + struct dst_entry *dst; + struct icmp6hdr tmp_hdr; + struct flowi6 fl6; + struct icmpv6_msg msg; + int iif = 0; + int addr_type = 0; + int len; + int hlimit; + int err = 0; + + if ((u8 *)hdr < skb->head || + (skb->network_header + sizeof(*hdr)) > skb->tail) + return; + + /* + * Make sure we respect the rules + * i.e. RFC 1885 2.4(e) + * Rule (e.1) is enforced by not using icmpv6_send + * in any code that processes icmp errors. 
+ */ + addr_type = ipv6_addr_type(&hdr->daddr); + + if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0)) + saddr = &hdr->daddr; + + /* + * Dest addr check + */ + + if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) { + if (type != ICMPV6_PKT_TOOBIG && + !(type == ICMPV6_PARAMPROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) + return; + + saddr = NULL; + } + + addr_type = ipv6_addr_type(&hdr->saddr); + + /* + * Source addr check + */ + + if (addr_type & IPV6_ADDR_LINKLOCAL) + iif = skb->dev->ifindex; + + /* + * Must not send error if the source does not uniquely + * identify a single node (RFC2463 Section 2.4). + * We check unspecified / multicast addresses here, + * and anycast addresses will be checked later. + */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n"); + return; + } + + /* + * Never answer to a ICMP packet. + */ + if (is_ineligible(skb)) { + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6_send: no reply to icmp error\n"); + return; + } + + mip6_addr_swap(skb); + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = hdr->saddr; + if (saddr) + fl6.saddr = *saddr; + fl6.flowi6_oif = iif; + fl6.fl6_icmp_type = type; + fl6.fl6_icmp_code = code; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) + return; + np = inet6_sk(sk); + + if (!icmpv6_xrlim_allow(sk, type, &fl6)) + goto out; + + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; + tmp_hdr.icmp6_cksum = 0; + tmp_hdr.icmp6_pointer = htonl(info); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + dst = icmpv6_route_lookup(net, skb, sk, &fl6); + if (IS_ERR(dst)) + goto out; + + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + 
hlimit = ip6_dst_hoplimit(dst); + + msg.skb = skb; + msg.offset = skb_network_offset(skb); + msg.type = type; + + len = skb->len - msg.offset; + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + if (len < 0) { + LIMIT_NETDEBUG(KERN_DEBUG "icmp: len problem\n"); + goto out_dst_release; + } + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, + np->tclass, NULL, &fl6, (struct rt6_info*)dst, + MSG_DONTWAIT, np->dontfrag); + if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); + } + rcu_read_unlock(); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(sk); +} +EXPORT_SYMBOL(icmpv6_send); + +static void icmpv6_echo_reply(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct sock *sk; + struct inet6_dev *idev; + struct ipv6_pinfo *np; + const struct in6_addr *saddr = NULL; + struct icmp6hdr *icmph = icmp6_hdr(skb); + struct icmp6hdr tmp_hdr; + struct flowi6 fl6; + struct icmpv6_msg msg; + struct dst_entry *dst; + int err = 0; + int hlimit; + + saddr = &ipv6_hdr(skb)->daddr; + + if (!ipv6_unicast_destination(skb)) + saddr = NULL; + + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); + tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_ICMPV6; + fl6.daddr = ipv6_hdr(skb)->saddr; + if (saddr) + fl6.saddr = *saddr; + fl6.flowi6_oif = skb->dev->ifindex; + fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + sk = icmpv6_xmit_lock(net); + if (sk == NULL) + return; + np = inet6_sk(sk); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + err 
= ip6_dst_lookup(sk, &dst, &fl6); + if (err) + goto out; + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); + if (IS_ERR(dst)) + goto out; + + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + idev = __in6_dev_get(skb->dev); + + msg.skb = skb; + msg.offset = 0; + msg.type = ICMPV6_ECHO_REPLY; + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, np->tclass, NULL, &fl6, + (struct rt6_info*)dst, MSG_DONTWAIT, + np->dontfrag); + + if (err) { + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + ip6_flush_pending_frames(sk); + } else { + err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); + } + dst_release(dst); +out: + icmpv6_xmit_unlock(sk); +} + +void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) +{ + const struct inet6_protocol *ipprot; + int inner_offset; + int hash; + u8 nexthdr; + __be16 frag_off; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return; + + nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; + if (ipv6_ext_hdr(nexthdr)) { + /* now skip over extension headers */ + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + if (inner_offset<0) + return; + } else { + inner_offset = sizeof(struct ipv6hdr); + } + + /* Checkin header including 8 bytes of inner protocol header. */ + if (!pskb_may_pull(skb, inner_offset+8)) + return; + + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. 
+ --ANK (980726) + */ + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + rcu_read_lock(); + ipprot = rcu_dereference(inet6_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, NULL, type, code, inner_offset, info); + rcu_read_unlock(); + + raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info); +} + +/* + * Handle icmp messages + */ + +static int icmpv6_rcv(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); + const struct in6_addr *saddr, *daddr; + const struct ipv6hdr *orig_hdr; + struct icmp6hdr *hdr; + u8 type; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + struct sec_path *sp = skb_sec_path(skb); + int nh; + + if (!(sp && sp->xvec[sp->len - 1]->props.flags & + XFRM_STATE_ICMP)) + goto drop_no_count; + + if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(*orig_hdr))) + goto drop_no_count; + + nh = skb_network_offset(skb); + skb_set_network_header(skb, sizeof(*hdr)); + + if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + goto drop_no_count; + + skb_set_network_header(skb, nh); + } + + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS); + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + + /* Perform checksum. 
*/ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold(csum_ipv6_magic(saddr, daddr, skb->len, + IPPROTO_ICMPV6, 0)); + if (__skb_checksum_complete(skb)) { + LIMIT_NETDEBUG(KERN_DEBUG + "ICMPv6 checksum failed [%pI6c > %pI6c]\n", + saddr, daddr); + goto discard_it; + } + } + + if (!pskb_pull(skb, sizeof(*hdr))) + goto discard_it; + + hdr = icmp6_hdr(skb); + + type = hdr->icmp6_type; + + ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type); + + switch (type) { + case ICMPV6_ECHO_REQUEST: + icmpv6_echo_reply(skb); + break; + + case ICMPV6_ECHO_REPLY: + ping_rcv(skb); + break; + + case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto discard_it; + hdr = icmp6_hdr(skb); + orig_hdr = (struct ipv6hdr *) (hdr + 1); + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, + ntohl(hdr->icmp6_mtu)); + + /* + * Drop through to notify + */ + + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + break; + + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + ndisc_rcv(skb); + break; + + case ICMPV6_MGM_QUERY: + igmp6_event_query(skb); + break; + + case ICMPV6_MGM_REPORT: + igmp6_event_report(skb); + break; + + case ICMPV6_MGM_REDUCTION: + case ICMPV6_NI_QUERY: + case ICMPV6_NI_REPLY: + case ICMPV6_MLD2_REPORT: + case ICMPV6_DHAAD_REQUEST: + case ICMPV6_DHAAD_REPLY: + case ICMPV6_MOBILE_PREFIX_SOL: + case ICMPV6_MOBILE_PREFIX_ADV: + break; + + default: + LIMIT_NETDEBUG(KERN_DEBUG "icmpv6: msg of unknown 
type\n"); + + /* informational */ + if (type & ICMPV6_INFOMSG_MASK) + break; + + /* + * error of unknown type. + * must pass to upper level + */ + + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + } + + kfree_skb(skb); + return 0; + +discard_it: + ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); +drop_no_count: + kfree_skb(skb); + return 0; +} + +void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, + u8 type, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + int oif) +{ + memset(fl6, 0, sizeof(*fl6)); + fl6->saddr = *saddr; + fl6->daddr = *daddr; + fl6->flowi6_proto = IPPROTO_ICMPV6; + fl6->fl6_icmp_type = type; + fl6->fl6_icmp_code = 0; + fl6->flowi6_oif = oif; + security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); +} + +/* + * Special lock-class for __icmpv6_sk: + */ +static struct lock_class_key icmpv6_socket_sk_dst_lock_key; + +static int __net_init icmpv6_sk_init(struct net *net) +{ + struct sock *sk; + int err, i, j; + + net->ipv6.icmp_sk = + kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + if (net->ipv6.icmp_sk == NULL) + return -ENOMEM; + + for_each_possible_cpu(i) { + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the ICMP6 control socket " + "(err %d).\n", + err); + goto fail; + } + + net->ipv6.icmp_sk[i] = sk; + + /* + * Split off their lock-class, because sk->sk_dst_lock + * gets used from softirqs, which is safe for + * __icmpv6_sk (because those never get directly used + * via userspace syscalls), but unsafe for normal sockets. + */ + lockdep_set_class(&sk->sk_dst_lock, + &icmpv6_socket_sk_dst_lock_key); + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. 
+ */ + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); + } + return 0; + + fail: + for (j = 0; j < i; j++) + inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]); + kfree(net->ipv6.icmp_sk); + return err; +} + +static void __net_exit icmpv6_sk_exit(struct net *net) +{ + int i; + + for_each_possible_cpu(i) { + inet_ctl_sock_destroy(net->ipv6.icmp_sk[i]); + } + kfree(net->ipv6.icmp_sk); +} + +static struct pernet_operations icmpv6_sk_ops = { + .init = icmpv6_sk_init, + .exit = icmpv6_sk_exit, +}; + +int __init icmpv6_init(void) +{ + int err; + + err = register_pernet_subsys(&icmpv6_sk_ops); + if (err < 0) + return err; + + err = -EAGAIN; + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) + goto fail; + return 0; + +fail: + printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + unregister_pernet_subsys(&icmpv6_sk_ops); + return err; +} + +void icmpv6_cleanup(void) +{ + unregister_pernet_subsys(&icmpv6_sk_ops); + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); +} + + +static const struct icmp6_err { + int err; + int fatal; +} tab_unreach[] = { + { /* NOROUTE */ + .err = ENETUNREACH, + .fatal = 0, + }, + { /* ADM_PROHIBITED */ + .err = EACCES, + .fatal = 1, + }, + { /* Was NOT_NEIGHBOUR, now reserved */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* ADDR_UNREACH */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* PORT_UNREACH */ + .err = ECONNREFUSED, + .fatal = 1, + }, +}; + +int icmpv6_err_convert(u8 type, u8 code, int *err) +{ + int fatal = 0; + + *err = EPROTO; + + switch (type) { + case ICMPV6_DEST_UNREACH: + fatal = 1; + if (code <= ICMPV6_PORT_UNREACH) { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } + break; + + case ICMPV6_PKT_TOOBIG: + *err = EMSGSIZE; + break; + + case ICMPV6_PARAMPROB: + *err = EPROTO; + fatal = 1; + break; + + case ICMPV6_TIME_EXCEED: + *err = EHOSTUNREACH; + break; + } + + return fatal; +} + +EXPORT_SYMBOL(icmpv6_err_convert); + +#ifdef CONFIG_SYSCTL +ctl_table ipv6_icmp_table_template[] = { + { + 
.procname = "ratelimit", + .data = &init_net.ipv6.sysctl.icmpv6_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { }, +}; + +struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_icmp_table_template, + sizeof(ipv6_icmp_table_template), + GFP_KERNEL); + + if (table) + table[0].data = &net->ipv6.sysctl.icmpv6_time; + + return table; +} +#endif + diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 00000000..02dd203d --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,255 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/jhash.h> +#include <linux/slab.h> + +#include <net/addrconf.h> +#include <net/inet_connection_sock.h> +#include <net/inet_ecn.h> +#include <net/inet_hashtables.h> +#include <net/ip6_route.h> +#include <net/sock.h> +#include <net/inet6_connection_sock.h> + +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + /* + * See comment in inet_csk_bind_conflict about sock lookup + * vs net namespaces issues. 
+ */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + +struct dst_entry *inet6_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct inet6_request_sock *treq = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *final_p, final; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = treq->rmt_addr; + final_p = fl6_update_dst(&fl6, np->opt, &final); + fl6.saddr = treq->loc_addr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) + return NULL; + + return dst; +} + +/* + * request_sock (formerly open request) hash tables. 
+ */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport, + const u32 rnd, const u32 synq_hsize) +{ + u32 c; + + c = jhash_3words((__force u32)raddr->s6_addr32[0], + (__force u32)raddr->s6_addr32[1], + (__force u32)raddr->s6_addr32[2], + rnd); + + c = jhash_2words((__force u32)raddr->s6_addr32[3], + (__force u32)rport, + c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __be16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet6_request_sock *treq = inet6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + WARN_ON(req->sk != NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct 
sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = np->daddr; + sin6->sin6_port = inet_sk(sk)->inet_dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +static inline +void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + __ip6_dst_store(sk, dst, daddr, saddr); + +#ifdef CONFIG_XFRM + { + struct rt6_info *rt = (struct rt6_info *)dst; + rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); + } +#endif +} + +static inline +struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst; + + dst = __sk_dst_check(sk, cookie); + +#ifdef CONFIG_XFRM + if (dst) { + struct rt6_info *rt = (struct rt6_info *)dst; + if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { + __sk_dst_reset(sk); + dst = NULL; + } + } +#endif + + return dst; +} + +int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi6 fl6; + struct dst_entry *dst; + struct in6_addr *final_p, final; + int res; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = sk->sk_protocol; + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; + fl6.flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl6.flowlabel); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_sport = inet->inet_sport; + fl6.fl6_dport = inet->inet_dport; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + dst = __inet6_csk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + + if 
(IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); + } + + __inet6_csk_dst_store(sk, dst, NULL, NULL); + } + + rcu_read_lock(); + skb_dst_set_noref(skb, dst); + + /* Restore final destination back after routing done */ + fl6.daddr = np->daddr; + + res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c new file mode 100644 index 00000000..73f1a00a --- /dev/null +++ b/net/ipv6/inet6_hashtables.c @@ -0,0 +1,304 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET6 transport hashtables + * + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo <acme@mandriva.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/random.h> + +#include <net/inet_connection_sock.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/secure_seq.h> +#include <net/ip.h> + +int __inet6_hash(struct sock *sk, struct inet_timewait_sock *tw) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + int twrefcnt = 0; + + WARN_ON(!sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + spin_lock(&ilb->lock); + __sk_nulls_add_node_rcu(sk, &ilb->head); + spin_unlock(&ilb->lock); + } else { + unsigned int hash; + struct hlist_nulls_head *list; + spinlock_t *lock; + + sk->sk_hash = hash = inet6_sk_ehashfn(sk); + list = &inet_ehash_bucket(hashinfo, hash)->chain; + lock = inet_ehash_lockp(hashinfo, hash); + spin_lock(lock); + __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } + spin_unlock(lock); + } + + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; +} +EXPORT_SYMBOL(__inet6_hash); + +/* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ +struct sock *__inet6_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk; + const struct hlist_nulls_node *node; + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. 
+ */ + unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + unsigned int slot = hash & hashinfo->ehash_mask; + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + + + rcu_read_lock(); +begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) + goto begintw; + if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + sock_put(sk); + goto begin; + } + goto out; + } + } + if (get_nulls_value(node) != slot) + goto begin; + +begintw: + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_nulls_for_each_rcu(sk, node, &head->twchain) { + if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) { + sk = NULL; + goto out; + } + if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) { + sock_put(sk); + goto begintw; + } + goto out; + } + } + if (get_nulls_value(node) != slot) + goto begintw; + sk = NULL; +out: + rcu_read_unlock(); + return sk; +} +EXPORT_SYMBOL(__inet6_lookup_established); + +static inline int compute_score(struct sock *sk, struct net *net, + const unsigned short hnum, + const struct in6_addr *daddr, + const int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && + sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + +struct sock *inet6_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, + const unsigned short hnum, const int dif) +{ + struct sock *sk; + const struct hlist_nulls_node *node; + struct sock *result; + int 
score, hiscore; + unsigned int hash = inet_lhashfn(net, hnum); + struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + + rcu_read_lock(); +begin: + result = NULL; + hiscore = -1; + sk_nulls_for_each(sk, node, &ilb->head) { + score = compute_score(sk, net, hnum, daddr, dif); + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) + goto begin; + if (result) { + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, daddr, + dif) < hiscore)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} + +EXPORT_SYMBOL_GPL(inet6_lookup_listener); + +struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + const struct in6_addr *saddr, const __be16 sport, + const struct in6_addr *daddr, const __be16 dport, + const int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); + struct net *net = sock_net(sk); + const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, + inet->inet_dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, 
hash); + spinlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; + const struct hlist_nulls_node *node; + struct inet_timewait_sock *tw; + int twrefcnt = 0; + + spin_lock(lock); + + /* Check TIME-WAIT sockets first. */ + sk_nulls_for_each(sk2, node, &head->twchain) { + tw = inet_twsk(sk2); + + if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_nulls_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->inet_num = lport; + inet->inet_sport = htons(lport); + sk->sk_hash = hash; + WARN_ON(!sk_unhashed(sk)); + __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } + spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + if (twp) { + *twp = tw; + } else if (tw) { + /* Silly. Should hash-dance instead... 
*/ + inet_twsk_deschedule(tw, death_row); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + spin_unlock(lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->inet_dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + return __inet_hash_connect(death_row, sk, inet6_sk_port_offset(sk), + __inet6_check_established, __inet6_hash); +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c new file mode 100644 index 00000000..92bb9cba --- /dev/null +++ b/net/ipv6/ip6_fib.c @@ -0,0 +1,1713 @@ +/* + * Linux INET6 implementation + * Forwarding Information Database + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. + */ +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/slab.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> + +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) 
do { ; } while (0) +#endif + +static struct kmem_cache * fib6_node_kmem __read_mostly; + +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + struct net *net; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +static DEFINE_RWLOCK(fib6_walker_lock); + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#else +#define FWS_INIT FWS_L +#endif + +static void fib6_prune_clones(struct net *net, struct fib6_node *fn, + struct rt6_info *rt); +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static int fib6_walk(struct fib6_walker_t *w); +static int fib6_walk_continue(struct fib6_walker_t *w); + +/* + * A routing update causes an increase of the serial number on the + * affected subtree. This allows for cached routes to be asynchronously + * tested when modifications are made to the destination cache as a + * result of redirects, path MTU changes, etc. + */ + +static __u32 rt_sernum; + +static void fib6_gc_timer_cb(unsigned long arg); + +static LIST_HEAD(fib6_walkers); +#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) + +static inline void fib6_walker_link(struct fib6_walker_t *w) +{ + write_lock_bh(&fib6_walker_lock); + list_add(&w->lh, &fib6_walkers); + write_unlock_bh(&fib6_walker_lock); +} + +static inline void fib6_walker_unlink(struct fib6_walker_t *w) +{ + write_lock_bh(&fib6_walker_lock); + list_del(&w->lh); + write_unlock_bh(&fib6_walker_lock); +} +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if ((__s32)n <= 0) + rt_sernum = n = 1; + return n; +} + +/* + * Auxiliary address test functions for the radix tree. 
+ * + * These assume a 32bit processor (although it will work on + * 64bit processors) + */ + +/* + * test bit + */ +#if defined(__LITTLE_ENDIAN) +# define BITOP_BE32_SWIZZLE (0x1F & ~7) +#else +# define BITOP_BE32_SWIZZLE 0 +#endif + +static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +{ + const __be32 *addr = token; + /* + * Here, + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * is optimized version of + * htonl(1 << ((~fn_bit)&0x1F)) + * See include/asm-generic/bitops/le.h. + */ + return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & + addr[fn_bit >> 5]; +} + +static __inline__ struct fib6_node * node_alloc(void) +{ + struct fib6_node *fn; + + fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + + return fn; +} + +static __inline__ void node_free(struct fib6_node * fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static __inline__ void rt6_release(struct rt6_info *rt) +{ + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->dst); +} + +static void fib6_link_table(struct net *net, struct fib6_table *tb) +{ + unsigned int h; + + /* + * Initialize table lock at a single place to give lockdep a key, + * tables aren't visible prior to being linked to the list. + */ + rwlock_init(&tb->tb6_lock); + + h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); + + /* + * No protection necessary, this is the only list mutatation + * operation, tables never disappear once they exist. 
+ */ + hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); +} + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) +{ + struct fib6_table *table; + + table = kzalloc(sizeof(*table), GFP_ATOMIC); + if (table) { + table->tb6_id = id; + table->tb6_root.leaf = net->ipv6.ip6_null_entry; + table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; + } + + return table; +} + +struct fib6_table *fib6_new_table(struct net *net, u32 id) +{ + struct fib6_table *tb; + + if (id == 0) + id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); + if (tb) + return tb; + + tb = fib6_alloc_table(net, id); + if (tb) + fib6_link_table(net, tb); + + return tb; +} + +struct fib6_table *fib6_get_table(struct net *net, u32 id) +{ + struct fib6_table *tb; + struct hlist_head *head; + struct hlist_node *node; + unsigned int h; + + if (id == 0) + id = RT6_TABLE_MAIN; + h = id & (FIB6_TABLE_HASHSZ - 1); + rcu_read_lock(); + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { + if (tb->tb6_id == id) { + rcu_read_unlock(); + return tb; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void __net_init fib6_tables_init(struct net *net) +{ + fib6_link_table(net, net->ipv6.fib6_main_tbl); + fib6_link_table(net, net->ipv6.fib6_local_tbl); +} +#else + +struct fib6_table *fib6_new_table(struct net *net, u32 id) +{ + return fib6_get_table(net, id); +} + +struct fib6_table *fib6_get_table(struct net *net, u32 id) +{ + return net->ipv6.fib6_main_tbl; +} + +struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + int flags, pol_lookup_t lookup) +{ + return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); +} + +static void __net_init fib6_tables_init(struct net *net) +{ + fib6_link_table(net, net->ipv6.fib6_main_tbl); +} + +#endif + +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt 
= rt->dst.rt6_next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend walking */ + w->leaf = rt; + return 1; + } + WARN_ON(res == 0); + } + w->leaf = NULL; + return 0; +} + +static void fib6_dump_end(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[2]; + + if (w) { + if (cb->args[4]) { + cb->args[4] = 0; + fib6_walker_unlink(w); + } + cb->args[2] = 0; + kfree(w); + } + cb->done = (void*)cb->args[3]; + cb->args[1] = 3; +} + +static int fib6_dump_done(struct netlink_callback *cb) +{ + fib6_dump_end(cb); + return cb->done ? cb->done(cb) : 0; +} + +static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct fib6_walker_t *w; + int res; + + w = (void *)cb->args[2]; + w->root = &table->tb6_root; + + if (cb->args[4] == 0) { + w->count = 0; + w->skip = 0; + + read_lock_bh(&table->tb6_lock); + res = fib6_walk(w); + read_unlock_bh(&table->tb6_lock); + if (res > 0) { + cb->args[4] = 1; + cb->args[5] = w->root->fn_sernum; + } + } else { + if (cb->args[5] != w->root->fn_sernum) { + /* Begin at the root if the tree changed */ + cb->args[5] = w->root->fn_sernum; + w->state = FWS_INIT; + w->node = w->root; + w->skip = w->count; + } else + w->skip = 0; + + read_lock_bh(&table->tb6_lock); + res = fib6_walk_continue(w); + read_unlock_bh(&table->tb6_lock); + if (res <= 0) { + fib6_walker_unlink(w); + cb->args[4] = 0; + } + } + + return res; +} + +static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int h, s_h; + unsigned int e = 0, s_e; + struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + struct fib6_table *tb; + struct hlist_node *node; + struct hlist_head *head; + int res = 0; + + s_h = cb->args[0]; + s_e = cb->args[1]; + + w = (void *)cb->args[2]; + if (!w) { + /* New dump: + * + * 1. hook callback destructor. 
/*
 *	Routing Table
 *
 *	Return the appropriate node for a routing tree "add" operation,
 *	either by creating and inserting a new node or by returning an
 *	existing one.
 *
 *	@root:		node the descent starts from
 *	@addr:		address part of the prefix being added
 *	@addrlen:	length handed to __ipv6_addr_diff() when a split is needed
 *	@plen:		prefix length of the route being added
 *	@offset:	byte offset, inside the object fn->leaf points to, of the
 *			struct rt6key that is compared against @addr
 *	@allow_create:	zero when the caller did not request creation; a
 *			warning is printed but the node is still created
 *			unless @replace_required is also set
 *	@replace_required: non-zero when an existing match is mandatory
 *
 *	Returns the node for @addr/@plen, ERR_PTR(-ENOENT) when
 *	@replace_required is set, @allow_create is clear and no exact match
 *	exists, or NULL when node_alloc() fails.
 */

static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr,
				     int addrlen, int plen,
				     int offset, int allow_create,
				     int replace_required)
{
	struct fib6_node *fn, *in, *ln;
	struct fib6_node *pn = NULL;
	struct rt6key *key;
	int	bit;
	__be32	dir = 0;
	/* One serial number is stamped on every node touched by this call. */
	__u32	sernum = fib6_new_sernum();

	RT6_TRACE("fib6_add_1\n");

	/* insert node in tree */

	fn = root;

	do {
		/* Key of the route currently hanging off this node. */
		key = (struct rt6key *)((u8 *)fn->leaf + offset);

		/*
		 *	Prefix match
		 *
		 *	Either the new prefix is shorter than this node's or
		 *	the first fn_bit bits differ: the new node has to be
		 *	placed above fn.
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
			if (!allow_create) {
				if (replace_required) {
					pr_warn("IPv6: Can't replace route, "
						"no match found\n");
					return ERR_PTR(-ENOENT);
				}
				pr_warn("IPv6: NLM_F_CREATE should be set "
					"when creating new route\n");
			}
			goto insert_above;
		}

		/*
		 *	Exact match ?
		 */

		if (plen == fn->fn_bit) {
			/* clean up an intermediate node */
			if (!(fn->fn_flags & RTN_RTINFO)) {
				/* Drop the borrowed leaf; the caller installs a real one. */
				rt6_release(fn->leaf);
				fn->leaf = NULL;
			}

			fn->fn_sernum = sernum;

			return fn;
		}

		/*
		 *	We have more bits to go
		 */

		/* Try to walk down on tree. */
		fn->fn_sernum = sernum;
		/* Remember which branch was taken; used below to link the new node. */
		dir = addr_bit_set(addr, fn->fn_bit);
		pn = fn;
		fn = dir ? fn->right: fn->left;
	} while (fn);

	if (!allow_create) {
		/* We should not create new node because
		 * NLM_F_REPLACE was specified without NLM_F_CREATE
		 * I assume it is safe to require NLM_F_CREATE when
		 * REPLACE flag is used! Later we may want to remove the
		 * check for replace_required, because according
		 * to netlink specification, NLM_F_CREATE
		 * MUST be specified if new route is created.
		 * That would keep IPv6 consistent with IPv4
		 */
		if (replace_required) {
			pr_warn("IPv6: Can't replace route, no match found\n");
			return ERR_PTR(-ENOENT);
		}
		pr_warn("IPv6: NLM_F_CREATE should be set "
			"when creating new route\n");
	}
	/*
	 *	We walked to the bottom of tree.
	 *	Create new leaf node without children.
	 */

	ln = node_alloc();

	if (!ln)
		return NULL;
	ln->fn_bit = plen;

	ln->parent = pn;
	ln->fn_sernum = sernum;

	/* Hang the new leaf on the branch the descent fell off. */
	if (dir)
		pn->right = ln;
	else
		pn->left  = ln;

	return ln;


insert_above:
	/*
	 * split since we don't have a common prefix anymore or
	 * we have a less significant route.
	 * we've to insert an intermediate node on the list
	 * this new node will point to the one we need to create
	 * and the current
	 */

	/*
	 * NOTE(review): dir still holds the branch taken on the last descent
	 * step, i.e. the side of pn that fn hangs on - this relies on the
	 * mismatch never being detected at the starting node; confirm.
	 */
	pn = fn->parent;

	/* find 1st bit in difference between the 2 addrs.

	   See comment in __ipv6_addr_diff: bit may be an invalid value,
	   but if it is >= plen, the value is ignored in any case.
	 */

	bit = __ipv6_addr_diff(addr, &key->addr, addrlen);

	/*
	 *		(intermediate)[in]
	 *	          /	   \
	 *	(new leaf node)[ln] (old node)[fn]
	 */
	if (plen > bit) {
		/* Allocate both nodes up front so failure leaves the tree untouched. */
		in = node_alloc();
		ln = node_alloc();

		if (!in || !ln) {
			if (in)
				node_free(in);
			if (ln)
				node_free(ln);
			return NULL;
		}

		/*
		 * new intermediate node.
		 * RTN_RTINFO will
		 * be off since that an address that chooses one of
		 * the branches would not match less specific routes
		 * in the other branch
		 */

		in->fn_bit = bit;

		in->parent = pn;
		/* The intermediate node borrows fn's leaf and takes a reference on it. */
		in->leaf = fn->leaf;
		atomic_inc(&in->leaf->rt6i_ref);

		in->fn_sernum = sernum;

		/* update parent pointer */
		if (dir)
			pn->right = in;
		else
			pn->left  = in;

		ln->fn_bit = plen;

		ln->parent = in;
		fn->parent = in;

		ln->fn_sernum = sernum;

		/* The first differing bit decides which side the new leaf goes on. */
		if (addr_bit_set(addr, bit)) {
			in->right = ln;
			in->left  = fn;
		} else {
			in->left  = ln;
			in->right = fn;
		}
	} else { /* plen <= bit */

		/*
		 *		(new leaf node)[ln]
		 *	          /	   \
		 *	     (old node)[fn] NULL
		 */

		ln = node_alloc();

		if (!ln)
			return NULL;

		ln->fn_bit = plen;

		ln->parent = pn;

		ln->fn_sernum = sernum;

		if (dir)
			pn->right = ln;
		else
			pn->left  = ln;

		/* The old node becomes the only child, on the side its own key selects. */
		if (addr_bit_set(&key->addr, plen))
			ln->right = fn;
		else
			ln->left  = fn;

		fn->parent = ln;
	}
	return ln;
}
+ */ + +static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + struct nl_info *info) +{ + struct rt6_info *iter = NULL; + struct rt6_info **ins; + int replace = (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_REPLACE)); + int add = (!info->nlh || + (info->nlh->nlmsg_flags & NLM_F_CREATE)); + int found = 0; + + ins = &fn->leaf; + + for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + /* + * Search for duplicates + */ + + if (iter->rt6i_metric == rt->rt6i_metric) { + /* + * Same priority level + */ + if (info->nlh && + (info->nlh->nlmsg_flags & NLM_F_EXCL)) + return -EEXIST; + if (replace) { + found++; + break; + } + + if (iter->dst.dev == rt->dst.dev && + iter->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&iter->rt6i_gateway, + &rt->rt6i_gateway)) { + if (!(iter->rt6i_flags & RTF_EXPIRES)) + return -EEXIST; + if (!(rt->rt6i_flags & RTF_EXPIRES)) + rt6_clean_expires(iter); + else + rt6_set_expires(iter, rt->dst.expires); + return -EEXIST; + } + } + + if (iter->rt6i_metric > rt->rt6i_metric) + break; + + ins = &iter->dst.rt6_next; + } + + /* Reset round-robin state, if necessary */ + if (ins == &fn->leaf) + fn->rr_ptr = NULL; + + /* + * insert node + */ + if (!replace) { + if (!add) + pr_warn("IPv6: NLM_F_CREATE should be set when creating new route\n"); + +add: + rt->dst.rt6_next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + info->nl_net->ipv6.rt6_stats->fib_rt_entries++; + + if (!(fn->fn_flags & RTN_RTINFO)) { + info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } + + } else { + if (!found) { + if (add) + goto add; + pr_warn("IPv6: NLM_F_REPLACE set, but no existing node found!\n"); + return -ENOENT; + } + *ins = rt; + rt->rt6i_node = fn; + rt->dst.rt6_next = iter->dst.rt6_next; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, info); + rt6_release(iter); + if (!(fn->fn_flags & RTN_RTINFO)) { + 
info->nl_net->ipv6.rt6_stats->fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } + } + + return 0; +} + +static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +{ + if (!timer_pending(&net->ipv6.ip6_fib_timer) && + (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); +} + +void fib6_force_start_gc(struct net *net) +{ + if (!timer_pending(&net->ipv6.ip6_fib_timer)) + mod_timer(&net->ipv6.ip6_fib_timer, + jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); +} + +/* + * Add routing information to the routing tree. + * <destination addr>/<source addr> + * with source addr info in sub-trees + */ + +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +{ + struct fib6_node *fn, *pn = NULL; + int err = -ENOMEM; + int allow_create = 1; + int replace_required = 0; + + if (info->nlh) { + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) + allow_create = 0; + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) + replace_required = 1; + } + if (!allow_create && !replace_required) + pr_warn("IPv6: RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), + allow_create, replace_required); + + if (IS_ERR(fn)) { + err = PTR_ERR(fn); + fn = NULL; + } + + if (!fn) + goto out; + + pn = fn; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen) { + struct fib6_node *sn; + + if (!fn->subtree) { + struct fib6_node *sfn; + + /* + * Create subtree. 
+ * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + + /* Create subtree root node */ + sfn = node_alloc(); + if (!sfn) + goto st_failure; + + sfn->leaf = info->nl_net->ipv6.ip6_null_entry; + atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + sfn->fn_flags = RTN_ROOT; + sfn->fn_sernum = fib6_new_sernum(); + + /* Now add the first leaf node to new subtree */ + + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); + + if (!sn) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. + */ + node_free(sfn); + goto st_failure; + } + + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src), + allow_create, replace_required); + + if (IS_ERR(sn)) { + err = PTR_ERR(sn); + sn = NULL; + } + if (!sn) + goto st_failure; + } + + if (!fn->leaf) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + fn = sn; + } +#endif + + err = fib6_add_rt2node(fn, rt, info); + if (!err) { + fib6_start_gc(info->nl_net, rt); + if (!(rt->rt6i_flags & RTF_CACHE)) + fib6_prune_clones(info->nl_net, pn, rt); + } + +out: + if (err) { +#ifdef CONFIG_IPV6_SUBTREES + /* + * If fib6_add_1 has cleared the old leaf pointer in the + * super-tree leaf node we have to find a new one for it. 
+ */ + if (pn != fn && pn->leaf == rt) { + pn->leaf = NULL; + atomic_dec(&rt->rt6i_ref); + } + if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn->leaf = fib6_find_prefix(info->nl_net, pn); +#if RT6_DEBUG >= 2 + if (!pn->leaf) { + WARN_ON(pn->leaf == NULL); + pn->leaf = info->nl_net->ipv6.ip6_null_entry; + } +#endif + atomic_inc(&pn->leaf->rt6i_ref); + } +#endif + dst_free(&rt->dst); + } + return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shoot it. + */ +st_failure: + if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + fib6_repair_tree(info->nl_net, fn); + dst_free(&rt->dst); + return err; +#endif +} + +/* + * Routing tree lookup + * + */ + +struct lookup_args { + int offset; /* key offset on rt6_info */ + const struct in6_addr *addr; /* search key */ +}; + +static struct fib6_node * fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) +{ + struct fib6_node *fn; + __be32 dir; + + if (unlikely(args->offset == 0)) + return NULL; + + /* + * Descend on a tree + */ + + fn = root; + + for (;;) { + struct fib6_node *next; + + dir = addr_bit_set(args->addr, fn->fn_bit); + + next = dir ? 
fn->right : fn->left; + + if (next) { + fn = next; + continue; + } + break; + } + + while (fn) { + if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct rt6key *key; + + key = (struct rt6key *) ((u8 *) fn->leaf + + args->offset); + + if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { +#ifdef CONFIG_IPV6_SUBTREES + if (fn->subtree) + fn = fib6_lookup_1(fn->subtree, args + 1); +#endif + if (!fn || fn->fn_flags & RTN_RTINFO) + return fn; + } + } + + if (fn->fn_flags & RTN_ROOT) + break; + + fn = fn->parent; + } + + return NULL; +} + +struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct fib6_node *fn; + struct lookup_args args[] = { + { + .offset = offsetof(struct rt6_info, rt6i_dst), + .addr = daddr, + }, +#ifdef CONFIG_IPV6_SUBTREES + { + .offset = offsetof(struct rt6_info, rt6i_src), + .addr = saddr, + }, +#endif + { + .offset = 0, /* sentinel */ + } + }; + + fn = fib6_lookup_1(root, daddr ? args : args + 1); + if (!fn || fn->fn_flags & RTN_TL_ROOT) + fn = root; + + return fn; +} + +/* + * Get node with specified destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len) +{ + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + offsetof(struct rt6_info, 
rt6i_dst)); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + WARN_ON(saddr == NULL); + if (fn && fn->subtree) + fn = fib6_locate_1(fn->subtree, saddr, src_len, + offsetof(struct rt6_info, rt6i_src)); + } +#endif + + if (fn && fn->fn_flags & RTN_RTINFO) + return fn; + + return NULL; +} + + +/* + * Deletion + * + */ + +static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +{ + if (fn->fn_flags & RTN_ROOT) + return net->ipv6.ip6_null_entry; + + while (fn) { + if (fn->left) + return fn->left->leaf; + if (fn->right) + return fn->right->leaf; + + fn = FIB6_SUBTREE(fn); + } + return NULL; +} + +/* + * Called to trim the tree of intermediate nodes when possible. "fn" + * is the node we want to try and remove. + */ + +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_node *fn) +{ + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; + + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; + + WARN_ON(fn->fn_flags & RTN_RTINFO); + WARN_ON(fn->fn_flags & RTN_TL_ROOT); + WARN_ON(fn->leaf != NULL); + + children = 0; + child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; + + if (children == 3 || FIB6_SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. 
fn) may have one child */ + || (children && fn->fn_flags & RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(net, fn); +#if RT6_DEBUG >= 2 + if (!fn->leaf) { + WARN_ON(!fn->leaf); + fn->leaf = net->ipv6.ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return fn->parent; + } + + pn = fn->parent; +#ifdef CONFIG_IPV6_SUBTREES + if (FIB6_SUBTREE(pn) == fn) { + WARN_ON(!(fn->fn_flags & RTN_ROOT)); + FIB6_SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + WARN_ON(fn->fn_flags & RTN_ROOT); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else + WARN_ON(1); +#endif + if (child) + child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif + + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (!child) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? 
FWS_U : FWS_INIT; + } + } + } + } + read_unlock(&fib6_walker_lock); + + node_free(fn); + if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) + return pn; + + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } +} + +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, + struct nl_info *info) +{ + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + struct net *net = info->nl_net; + + RT6_TRACE("fib6_del_route\n"); + + /* Unlink it */ + *rtp = rt->dst.rt6_next; + rt->rt6i_node = NULL; + net->ipv6.rt6_stats->fib_rt_entries--; + net->ipv6.rt6_stats->fib_discarded_routes++; + + /* Reset round-robin state, if necessary */ + if (fn->rr_ptr == rt) + fn->rr_ptr = NULL; + + /* Adjust walkers */ + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->dst.rt6_next; + if (!w->leaf) + w->state = FWS_U; + } + } + read_unlock(&fib6_walker_lock); + + rt->dst.rt6_next = NULL; + + /* If it was last route, expunge its radix tree node */ + if (!fn->leaf) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + fn = fib6_repair_tree(net, fn); + } + + if (atomic_read(&rt->rt6i_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(net, fn); + atomic_inc(&fn->leaf->rt6i_ref); + rt6_release(rt); + } + fn = fn->parent; + } + /* No more references are possible at this point. 
*/ + BUG_ON(atomic_read(&rt->rt6i_ref) != 1); + } + + inet6_rt_notify(RTM_DELROUTE, rt, info); + rt6_release(rt); +} + +int fib6_del(struct rt6_info *rt, struct nl_info *info) +{ + struct net *net = info->nl_net; + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; + +#if RT6_DEBUG >= 2 + if (rt->dst.obsolete>0) { + WARN_ON(fn != NULL); + return -ENOENT; + } +#endif + if (!fn || rt == net->ipv6.ip6_null_entry) + return -ENOENT; + + WARN_ON(!(fn->fn_flags & RTN_RTINFO)); + + if (!(rt->rt6i_flags & RTF_CACHE)) { + struct fib6_node *pn = fn; +#ifdef CONFIG_IPV6_SUBTREES + /* clones of this route might be in another subtree */ + if (rt->rt6i_src.plen) { + while (!(pn->fn_flags & RTN_ROOT)) + pn = pn->parent; + pn = pn->parent; + } +#endif + fib6_prune_clones(info->nl_net, pn, rt); + } + + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp, info); + return 0; + } + } + return -ENOENT; +} + +/* + * Tree traversal function. + * + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. 
/*
 *	Run (or resume) the walk described by @w, starting from w->node in
 *	state w->state.
 *
 *	Returns 0 when w->node is NULL or the walk has climbed back to
 *	w->root, otherwise the first non-zero value returned by w->func.
 *
 *	The switch below relies on deliberate fall-through: each state, once
 *	it has nothing left to do for the current node, sets the next state
 *	and drops into the following case.
 */
static int fib6_walk_continue(struct fib6_walker_t *w)
{
	struct fib6_node *fn, *pn;

	for (;;) {
		fn = w->node;
		if (!fn)
			return 0;

		/*
		 * Pruning walk: a non-root node that carries routes is not
		 * descended into; jump straight to visiting its routes.
		 */
		if (w->prune && fn != w->root &&
		    fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
			w->state = FWS_C;
			w->leaf = fn->leaf;
		}
		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			/* Enter the subtree, if any, before the children. */
			if (FIB6_SUBTREE(fn)) {
				w->node = FIB6_SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
			/* fall through */
#endif
		case FWS_L:
			/* Descend left; the child starts over in FWS_INIT. */
			if (fn->left) {
				w->node = fn->left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			/* fall through */
		case FWS_R:
			/* Descend right; the child starts over in FWS_INIT. */
			if (fn->right) {
				w->node = fn->right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = fn->leaf;
			/* fall through */
		case FWS_C:
			/*
			 * Visit this node's routes.  The state stays FWS_C, so
			 * this case is re-entered until w->leaf becomes NULL.
			 */
			if (w->leaf && fn->fn_flags & RTN_RTINFO) {
				int err;

				/* Still inside the portion to skip: count only. */
				if (w->count < w->skip) {
					w->count++;
					continue;
				}

				err = w->func(w);
				if (err)
					return err;

				w->count++;
				continue;
			}
			w->state = FWS_U;
			/* fall through */
		case FWS_U:
			/* Climb; the next state depends on where we came from. */
			if (fn == w->root)
				return 0;
			pn = fn->parent;
			w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
			/* Back from the subtree: continue with the left child. */
			if (FIB6_SUBTREE(pn) == fn) {
				WARN_ON(!(fn->fn_flags & RTN_ROOT));
				w->state = FWS_L;
				continue;
			}
#endif
			/* Back from the left child: the right one is next. */
			if (pn->left == fn) {
				w->state = FWS_R;
				continue;
			}
			/* Back from the right child: visit the parent's routes. */
			if (pn->right == fn) {
				w->state = FWS_C;
				w->leaf = w->node->leaf;
				continue;
			}
#if RT6_DEBUG >= 2
			WARN_ON(1);
#endif
		}
	}
}
"fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); +#endif + continue; + } + return 0; + } + WARN_ON(res != 0); + } + w->leaf = rt; + return 0; +} + +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. + */ + +static void fib6_clean_tree(struct net *net, struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; + + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.w.count = 0; + c.w.skip = 0; + c.func = func; + c.arg = arg; + c.net = net; + + fib6_walk(&c.w); +} + +void fib6_clean_all_ro(struct net *net, int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_table *table; + struct hlist_node *node; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + read_lock_bh(&table->tb6_lock); + fib6_clean_tree(net, &table->tb6_root, + func, prune, arg); + read_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} +void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_table *table; + struct hlist_node *node; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { + write_lock_bh(&table->tb6_lock); + fib6_clean_tree(net, &table->tb6_root, + func, prune, arg); + write_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone 
%p\n", rt); + return -1; + } + + return 0; +} + +static void fib6_prune_clones(struct net *net, struct fib6_node *fn, + struct rt6_info *rt) +{ + fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ + +static struct fib6_gc_args +{ + int timeout; + int more; +} gc_args; + +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; + + /* + * check addrconf expiration here. + * Routes are expired even if they are in use. + * + * Also age clones. Note, that clones are aged out + * only if they are not in use now. + */ + + if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { + if (time_after(now, rt->dst.expires)) { + RT6_TRACE("expiring %p\n", rt); + return -1; + } + gc_args.more++; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->dst.__refcnt) == 0 && + time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { + RT6_TRACE("aging clone %p\n", rt); + return -1; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } + } + gc_args.more++; + } + + return 0; +} + +static DEFINE_SPINLOCK(fib6_gc_lock); + +void fib6_run_gc(unsigned long expires, struct net *net) +{ + if (expires != ~0UL) { + spin_lock_bh(&fib6_gc_lock); + gc_args.timeout = expires ? 
/*
 *	Timer callback for ip6_fib_timer.  @arg carries the struct net the
 *	timer was set up for; the collector is run with an expiry of 0.
 */
static void fib6_gc_timer_cb(unsigned long arg)
{
	struct net *net = (struct net *)arg;

	fib6_run_gc(0, net);
}
+out_fib6_main_tbl: + kfree(net->ipv6.fib6_main_tbl); +#endif +out_fib_table_hash: + kfree(net->ipv6.fib_table_hash); +out_rt6_stats: + kfree(net->ipv6.rt6_stats); +out_timer: + return -ENOMEM; + } + +static void fib6_net_exit(struct net *net) +{ + rt6_ifdown(net, NULL); + del_timer_sync(&net->ipv6.ip6_fib_timer); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(net->ipv6.fib6_local_tbl); +#endif + kfree(net->ipv6.fib6_main_tbl); + kfree(net->ipv6.fib_table_hash); + kfree(net->ipv6.rt6_stats); +} + +static struct pernet_operations fib6_net_ops = { + .init = fib6_net_init, + .exit = fib6_net_exit, +}; + +int __init fib6_init(void) +{ + int ret = -ENOMEM; + + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!fib6_node_kmem) + goto out; + + ret = register_pernet_subsys(&fib6_net_ops); + if (ret) + goto out_kmem_cache_create; + + ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, + NULL); + if (ret) + goto out_unregister_subsys; +out: + return ret; + +out_unregister_subsys: + unregister_pernet_subsys(&fib6_net_ops); +out_kmem_cache_create: + kmem_cache_destroy(fib6_node_kmem); + goto out; +} + +void fib6_gc_cleanup(void) +{ + unregister_pernet_subsys(&fib6_net_ops); + kmem_cache_destroy(fib6_node_kmem); +} diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c new file mode 100644 index 00000000..b7867a12 --- /dev/null +++ b/net/ipv6/ip6_flowlabel.c @@ -0,0 +1,783 @@ +/* + * ip6_flowlabel.c IPv6 flowlabel manager. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in6.h> +#include <linux/route.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <net/net_namespace.h> +#include <net/sock.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/rawv6.h> +#include <net/icmp.h> +#include <net/transp_v6.h> + +#include <asm/uaccess.h> + +#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified + in old IPv6 RFC. Well, it was reasonable value. + */ +#define FL_MAX_LINGER 60 /* Maximal linger timeout */ + +/* FL hash table */ + +#define FL_MAX_PER_SOCK 32 +#define FL_MAX_SIZE 4096 +#define FL_HASH_MASK 255 +#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) + +static atomic_t fl_size = ATOMIC_INIT(0); +static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; + +static void ip6_fl_gc(unsigned long dummy); +static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0); + +/* FL hash table lock: it protects only of GC */ + +static DEFINE_RWLOCK(ip6_fl_lock); + +/* Big socket sock */ + +static DEFINE_RWLOCK(ip6_sk_fl_lock); + + +static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label) +{ + struct ip6_flowlabel *fl; + + for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + if (fl->label == label && net_eq(fl->fl_net, net)) + return fl; + } + return NULL; +} + +static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) +{ + struct ip6_flowlabel *fl; + + read_lock_bh(&ip6_fl_lock); + fl = __fl_lookup(net, label); + if (fl) + atomic_inc(&fl->users); + read_unlock_bh(&ip6_fl_lock); + return fl; +} + + +static void fl_free(struct ip6_flowlabel *fl) +{ + if (fl) { + release_net(fl->fl_net); 
+ kfree(fl->opt); + } + kfree(fl); +} + +static void fl_release(struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_fl_lock); + + fl->lastuse = jiffies; + if (atomic_dec_and_test(&fl->users)) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (fl->opt && fl->share == IPV6_FL_S_EXCL) { + struct ipv6_txoptions *opt = fl->opt; + fl->opt = NULL; + kfree(opt); + } + if (!timer_pending(&ip6_fl_gc_timer) || + time_after(ip6_fl_gc_timer.expires, ttd)) + mod_timer(&ip6_fl_gc_timer, ttd); + } + write_unlock_bh(&ip6_fl_lock); +} + +static void ip6_fl_gc(unsigned long dummy) +{ + int i; + unsigned long now = jiffies; + unsigned long sched = 0; + + write_lock(&ip6_fl_lock); + + for (i=0; i<=FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl=*flp) != NULL) { + if (atomic_read(&fl->users) == 0) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (time_after_eq(now, ttd)) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + if (!sched || time_before(ttd, sched)) + sched = ttd; + } + flp = &fl->next; + } + } + if (!sched && atomic_read(&fl_size)) + sched = now + FL_MAX_LINGER; + if (sched) { + mod_timer(&ip6_fl_gc_timer, sched); + } + write_unlock(&ip6_fl_lock); +} + +static void __net_exit ip6_fl_purge(struct net *net) +{ + int i; + + write_lock(&ip6_fl_lock); + for (i = 0; i <= FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl = *flp) != NULL) { + if (net_eq(fl->fl_net, net) && + atomic_read(&fl->users) == 0) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + flp = &fl->next; + } + } + write_unlock(&ip6_fl_lock); +} + +static struct ip6_flowlabel *fl_intern(struct net *net, + struct ip6_flowlabel *fl, __be32 label) +{ + struct ip6_flowlabel *lfl; + + fl->label = label & IPV6_FLOWLABEL_MASK; + + 
write_lock_bh(&ip6_fl_lock); + if (label == 0) { + for (;;) { + fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + if (fl->label) { + lfl = __fl_lookup(net, fl->label); + if (lfl == NULL) + break; + } + } + } else { + /* + * we dropper the ip6_fl_lock, so this entry could reappear + * and we need to recheck with it. + * + * OTOH no need to search the active socket first, like it is + * done in ipv6_flowlabel_opt - sock is locked, so new entry + * with the same label can only appear on another sock + */ + lfl = __fl_lookup(net, fl->label); + if (lfl != NULL) { + atomic_inc(&lfl->users); + write_unlock_bh(&ip6_fl_lock); + return lfl; + } + } + + fl->lastuse = jiffies; + fl->next = fl_ht[FL_HASH(fl->label)]; + fl_ht[FL_HASH(fl->label)] = fl; + atomic_inc(&fl_size); + write_unlock_bh(&ip6_fl_lock); + return NULL; +} + + + +/* Socket flowlabel lists */ + +struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, __be32 label) +{ + struct ipv6_fl_socklist *sfl; + struct ipv6_pinfo *np = inet6_sk(sk); + + label &= IPV6_FLOWLABEL_MASK; + + read_lock_bh(&ip6_sk_fl_lock); + for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + struct ip6_flowlabel *fl = sfl->fl; + if (fl->label == label) { + fl->lastuse = jiffies; + atomic_inc(&fl->users); + read_unlock_bh(&ip6_sk_fl_lock); + return fl; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + return NULL; +} + +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + +void fl6_free_socklist(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + + while ((sfl = np->ipv6_fl_list) != NULL) { + np->ipv6_fl_list = sfl->next; + fl_release(sfl->fl); + kfree(sfl); + } +} + +/* Service routines */ + + +/* + It is the only difficult place. flowlabel enforces equal headers + before and including routing header, however user may supply options + following rthdr. 
+ */ + +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, + struct ip6_flowlabel * fl, + struct ipv6_txoptions * fopt) +{ + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + + if (fl_opt != NULL) { + opt_space->hopopt = fl_opt->hopopt; + opt_space->dst0opt = fl_opt->dst0opt; + opt_space->srcrt = fl_opt->srcrt; + opt_space->opt_nflen = fl_opt->opt_nflen; + } else { + if (fopt->opt_nflen == 0) + return fopt; + opt_space->hopopt = NULL; + opt_space->dst0opt = NULL; + opt_space->srcrt = NULL; + opt_space->opt_nflen = 0; + } + opt_space->dst1opt = fopt->dst1opt; + opt_space->opt_flen = fopt->opt_flen; + return opt_space; +} + +static unsigned long check_linger(unsigned long ttl) +{ + if (ttl < FL_MIN_LINGER) + return FL_MIN_LINGER*HZ; + if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) + return 0; + return ttl*HZ; +} + +static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) +{ + linger = check_linger(linger); + if (!linger) + return -EPERM; + expires = check_linger(expires); + if (!expires) + return -EPERM; + fl->lastuse = jiffies; + if (time_before(fl->linger, linger)) + fl->linger = linger; + if (time_before(expires, fl->linger)) + expires = fl->linger; + if (time_before(fl->expires, fl->lastuse + expires)) + fl->expires = fl->lastuse + expires; + return 0; +} + +static struct ip6_flowlabel * +fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, + char __user *optval, int optlen, int *err_p) +{ + struct ip6_flowlabel *fl = NULL; + int olen; + int addr_type; + int err; + + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + err = -EINVAL; + if (olen > 64 * 1024) + goto done; + + err = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (fl == NULL) + goto done; + + if (olen > 0) { + struct msghdr msg; + struct flowi6 flowi6; + int junk; + + err = -ENOMEM; + fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); + if (fl->opt == 
NULL) + goto done; + + memset(fl->opt, 0, sizeof(*fl->opt)); + fl->opt->tot_len = sizeof(*fl->opt) + olen; + err = -EFAULT; + if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + goto done; + + msg.msg_controllen = olen; + msg.msg_control = (void*)(fl->opt+1); + memset(&flowi6, 0, sizeof(flowi6)); + + err = datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, &junk, + &junk, &junk); + if (err) + goto done; + err = -EINVAL; + if (fl->opt->opt_flen) + goto done; + if (fl->opt->opt_nflen == 0) { + kfree(fl->opt); + fl->opt = NULL; + } + } + + fl->fl_net = hold_net(net); + fl->expires = jiffies; + err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); + if (err) + goto done; + fl->share = freq->flr_share; + addr_type = ipv6_addr_type(&freq->flr_dst); + if ((addr_type & IPV6_ADDR_MAPPED) || + addr_type == IPV6_ADDR_ANY) { + err = -EINVAL; + goto done; + } + fl->dst = freq->flr_dst; + atomic_set(&fl->users, 1); + switch (fl->share) { + case IPV6_FL_S_EXCL: + case IPV6_FL_S_ANY: + break; + case IPV6_FL_S_PROCESS: + fl->owner = current->pid; + break; + case IPV6_FL_S_USER: + fl->owner = current_euid(); + break; + default: + err = -EINVAL; + goto done; + } + return fl; + +done: + fl_free(fl); + *err_p = err; + return NULL; +} + +static int mem_check(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + int room = FL_MAX_SIZE - atomic_read(&fl_size); + int count = 0; + + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) + return 0; + + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + count++; + + if (room <= 0 || + ((count >= FL_MAX_PER_SOCK || + (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + !capable(CAP_NET_ADMIN))) + return -ENOBUFS; + + return 0; +} + +static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +{ + if (h1 == h2) + return 0; + if (h1 == NULL || h2 == NULL) + return 1; + if (h1->hdrlen != h2->hdrlen) + return 1; + return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - 
sizeof(*h1)); +} + +static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +{ + if (o1 == o2) + return 0; + if (o1 == NULL || o2 == NULL) + return 1; + if (o1->opt_nflen != o2->opt_nflen) + return 1; + if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) + return 1; + if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) + return 1; + if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) + return 1; + return 0; +} + +static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_sk_fl_lock); + sfl->fl = fl; + sfl->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl; + write_unlock_bh(&ip6_sk_fl_lock); +} + +int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +{ + int uninitialized_var(err); + struct net *net = sock_net(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_flowlabel_req freq; + struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl, **sflp; + struct ip6_flowlabel *fl, *fl1 = NULL; + + + if (optlen < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + write_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + write_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree(sfl); + return 0; + } + } + write_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; + + case IPV6_FL_A_RENEW: + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); + read_unlock_bh(&ip6_sk_fl_lock); + return err; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (freq.flr_share == IPV6_FL_S_NONE && 
capable(CAP_NET_ADMIN)) { + fl = fl_lookup(net, freq.flr_label); + if (fl) { + err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); + fl_release(fl); + return err; + } + } + return -ESRCH; + + case IPV6_FL_A_GET: + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + + fl = fl_create(net, sk, &freq, optval, optlen, &err); + if (fl == NULL) + return err; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq.flr_label) { + err = -EEXIST; + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_flags&IPV6_FL_F_EXCL) { + read_unlock_bh(&ip6_sk_fl_lock); + goto done; + } + fl1 = sfl->fl; + atomic_inc(&fl1->users); + break; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (fl1 == NULL) + fl1 = fl_lookup(net, freq.flr_label); + if (fl1) { +recheck: + err = -EEXIST; + if (freq.flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + fl1->owner != fl->owner) + goto release; + + err = -EINVAL; + if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || + ipv6_opt_cmp(fl1->opt, fl->opt)) + goto release; + + err = -ENOMEM; + if (sfl1 == NULL) + goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + fl_link(np, sfl1, fl1); + fl_free(fl); + return 0; + +release: + fl_release(fl1); + goto done; + } + } + err = -ENOENT; + if (!(freq.flr_flags&IPV6_FL_F_CREATE)) + goto done; + + err = -ENOMEM; + if (sfl1 == NULL || (err = mem_check(sk)) != 0) + goto done; + + fl1 = fl_intern(net, fl, freq.flr_label); + if (fl1 != NULL) + goto recheck; + + if (!freq.flr_label) { + if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label))) { + /* Intentionally ignore fault. 
*/ + } + } + + fl_link(np, sfl1, fl); + return 0; + + default: + return -EINVAL; + } + +done: + fl_free(fl); + kfree(sfl1); + return err; +} + +#ifdef CONFIG_PROC_FS + +struct ip6fl_iter_state { + struct seq_net_private p; + int bucket; +}; + +#define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) + +static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) +{ + struct ip6_flowlabel *fl = NULL; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); + + for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { + fl = fl_ht[state->bucket]; + + while (fl && !net_eq(fl->fl_net, net)) + fl = fl->next; + if (fl) + break; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + struct net *net = seq_file_net(seq); + + fl = fl->next; +try_again: + while (fl && !net_eq(fl->fl_net, net)) + fl = fl->next; + + while (!fl) { + if (++state->bucket <= FL_HASH_MASK) { + fl = fl_ht[state->bucket]; + goto try_again; + } else + break; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_flowlabel *fl = ip6fl_get_first(seq); + if (fl) + while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) + --pos; + return pos ? NULL : fl; +} + +static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(ip6_fl_lock) +{ + read_lock_bh(&ip6_fl_lock); + return *pos ? 
ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_flowlabel *fl; + + if (v == SEQ_START_TOKEN) + fl = ip6fl_get_first(seq); + else + fl = ip6fl_get_next(seq, v); + ++*pos; + return fl; +} + +static void ip6fl_seq_stop(struct seq_file *seq, void *v) + __releases(ip6_fl_lock) +{ + read_unlock_bh(&ip6_fl_lock); +} + +static int ip6fl_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-5s %-1s %-6s %-6s %-6s %-8s %-32s %s\n", + "Label", "S", "Owner", "Users", "Linger", "Expires", "Dst", "Opt"); + else { + struct ip6_flowlabel *fl = v; + seq_printf(seq, + "%05X %-1d %-6d %-6d %-6ld %-8ld %pi6 %-4d\n", + (unsigned)ntohl(fl->label), + fl->share, + (unsigned)fl->owner, + atomic_read(&fl->users), + fl->linger/HZ, + (long)(fl->expires - jiffies)/HZ, + &fl->dst, + fl->opt ? fl->opt->opt_nflen : 0); + } + return 0; +} + +static const struct seq_operations ip6fl_seq_ops = { + .start = ip6fl_seq_start, + .next = ip6fl_seq_next, + .stop = ip6fl_seq_stop, + .show = ip6fl_seq_show, +}; + +static int ip6fl_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip6fl_seq_ops, + sizeof(struct ip6fl_iter_state)); +} + +static const struct file_operations ip6fl_seq_fops = { + .owner = THIS_MODULE, + .open = ip6fl_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init ip6_flowlabel_proc_init(struct net *net) +{ + if (!proc_net_fops_create(net, "ip6_flowlabel", + S_IRUGO, &ip6fl_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit ip6_flowlabel_proc_fini(struct net *net) +{ + proc_net_remove(net, "ip6_flowlabel"); +} +#else +static inline int ip6_flowlabel_proc_init(struct net *net) +{ + return 0; +} +static inline void ip6_flowlabel_proc_fini(struct net *net) +{ +} +#endif + +static void __net_exit ip6_flowlabel_net_exit(struct net *net) +{ + 
ip6_fl_purge(net); + ip6_flowlabel_proc_fini(net); +} + +static struct pernet_operations ip6_flowlabel_net_ops = { + .init = ip6_flowlabel_proc_init, + .exit = ip6_flowlabel_net_exit, +}; + +int ip6_flowlabel_init(void) +{ + return register_pernet_subsys(&ip6_flowlabel_net_ops); +} + +void ip6_flowlabel_cleanup(void) +{ + del_timer(&ip6_fl_gc_timer); + unregister_pernet_subsys(&ip6_flowlabel_net_ops); +} diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c new file mode 100644 index 00000000..1ca5d45a --- /dev/null +++ b/net/ipv6/ip6_input.c @@ -0,0 +1,344 @@ +/* + * IPv6 input + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Ian P. Morris <I.P.Morris@soton.ac.uk> + * + * Based in linux/net/ipv4/ip_input.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* Changes + * + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). 
+ */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <linux/mroute6.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/xfrm.h> + + + +inline int ip6_rcv_finish( struct sk_buff *skb) +{ + if (skb_dst(skb) == NULL) + ip6_route_input(skb); + + return dst_input(skb); +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +{ + const struct ipv6hdr *hdr; + u32 pkt_len; + struct inet6_dev *idev; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type == PACKET_OTHERHOST) { + kfree_skb(skb); + return NET_RX_DROP; + } + + rcu_read_lock(); + + idev = __in6_dev_get(skb->dev); + + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || + !idev || unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + goto drop; + } + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + + /* + * Store incoming device index. When the packet will + * be queued, we cannot refer to skb->dev anymore. + * + * BTW, when we send a packet for our own local address on a + * non-loopback interface (e.g. ethX), it is being delivered + * via the loopback interface (lo) here; skb->dev = loopback_dev. + * It, however, should be considered as if it is being + * arrived via the sending interface (ethX), because of the + * nature of scoping architecture. --yoshfuji + */ + IP6CB(skb)->iif = skb_dst(skb) ? 
ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + + if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) + goto err; + + hdr = ipv6_hdr(skb); + + if (hdr->version != 6) + goto err; + + /* + * RFC4291 2.5.3 + * A packet received on an interface with a destination address + * of loopback must be dropped. + */ + if (!(dev->flags & IFF_LOOPBACK) && + ipv6_addr_loopback(&hdr->daddr)) + goto err; + + /* + * RFC4291 2.7 + * Multicast addresses must not be used as source addresses in IPv6 + * packets or appear in any Routing header. + */ + if (ipv6_addr_is_multicast(&hdr->saddr)) + goto err; + + skb->transport_header = skb->network_header + sizeof(*hdr); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + pkt_len = ntohs(hdr->payload_len); + + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { + IP6_INC_STATS_BH(net, + idev, IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + goto drop; + } + hdr = ipv6_hdr(skb); + } + + if (hdr->nexthdr == NEXTHDR_HOP) { + if (ipv6_parse_hopopts(skb) < 0) { + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + rcu_read_unlock(); + return NET_RX_DROP; + } + } + + rcu_read_unlock(); + + /* Must drop socket now because of tproxy. 
*/ + skb_orphan(skb); + + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, dev, NULL, + ip6_rcv_finish); +err: + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); +drop: + rcu_read_unlock(); + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Deliver the packet to the host + */ + + +static int ip6_input_finish(struct sk_buff *skb) +{ + const struct inet6_protocol *ipprot; + unsigned int nhoff; + int nexthdr, raw; + u8 hash; + struct inet6_dev *idev; + struct net *net = dev_net(skb_dst(skb)->dev); + + /* + * Parse extension headers + */ + + rcu_read_lock(); +resubmit: + idev = ip6_dst_idev(skb_dst(skb)); + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + nhoff = IP6CB(skb)->nhoff; + nexthdr = skb_network_header(skb)[nhoff]; + + raw = raw6_local_deliver(skb, nexthdr); + + hash = nexthdr & (MAX_INET_PROTOS - 1); + if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + int ret; + + if (ipprot->flags & INET6_PROTO_FINAL) { + const struct ipv6hdr *hdr; + + /* Free reference early: we don't need it any more, + and it may hold ip_conntrack module loaded + indefinitely. 
*/ + nf_reset(skb); + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + hdr = ipv6_hdr(skb); + if (ipv6_addr_is_multicast(&hdr->daddr) && + !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, + &hdr->saddr) && + !ipv6_is_mld(skb, nexthdr)) + goto discard; + } + if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && + !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + ret = ipprot->handler(skb); + if (ret > 0) + goto resubmit; + else if (ret == 0) + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP6_INC_STATS_BH(net, idev, + IPSTATS_MIB_INUNKNOWNPROTOS); + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_UNK_NEXTHDR, nhoff); + } + } else + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + rcu_read_unlock(); + return 0; + +discard: + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + + +int ip6_input(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + ip6_input_finish); +} + +int ip6_mc_input(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + int deliver; + + IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + skb->len); + + hdr = ipv6_hdr(skb); + deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + +#ifdef CONFIG_IPV6_MROUTE + /* + * IPv6 multicast router mode is now supported ;) + */ + if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) && + likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { + /* + * Okay, we try to forward - split and duplicate + * packets. 
+ */ + struct sk_buff *skb2; + struct inet6_skb_parm *opt = IP6CB(skb); + + /* Check for MLD */ + if (unlikely(opt->ra)) { + /* Check if this is a mld message */ + u8 *ptr = skb_network_header(skb) + opt->ra; + struct icmp6hdr *icmp6; + u8 nexthdr = hdr->nexthdr; + __be16 frag_off; + int offset; + + /* Check if the value of Router Alert + * is for MLD (0x0000). + */ + if ((ptr[2] | ptr[3]) == 0) { + deliver = 0; + + if (!ipv6_ext_hdr(nexthdr)) { + /* BUG */ + goto out; + } + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), + &nexthdr, &frag_off); + if (offset < 0) + goto out; + + if (nexthdr != IPPROTO_ICMPV6) + goto out; + + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) + goto out; + + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); + + switch (icmp6->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + deliver = 1; + break; + } + goto out; + } + /* unknown RA - process it normally */ + } + + if (deliver) + skb2 = skb_clone(skb, GFP_ATOMIC); + else { + skb2 = skb; + skb = NULL; + } + + if (skb2) { + ip6_mr_input(skb2); + } + } +out: +#endif + if (likely(deliver)) + ip6_input(skb); + else { + /* discard */ + kfree_skb(skb); + } + + return 0; +} diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c new file mode 100644 index 00000000..13e5399b --- /dev/null +++ b/net/ipv6/ip6_output.c @@ -0,0 +1,1686 @@ +/* + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/net/ipv4/ip_output.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. 
+ * ip6_forward does not confuse sniffers. + * etc. + * + * H. von Brand : Added missing #include <linux/string.h> + * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in6.h> +#include <linux/tcp.h> +#include <linux/route.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/rawv6.h> +#include <net/icmp.h> +#include <net/xfrm.h> +#include <net/checksum.h> +#include <linux/mroute6.h> + +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + +int __ip6_local_out(struct sk_buff *skb) +{ + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + ipv6_hdr(skb)->payload_len = htons(len); + + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + skb_dst(skb)->dev, dst_output); +} + +int ip6_local_out(struct sk_buff *skb) +{ + int err; + + err = __ip6_local_out(skb); + if (likely(err == 1)) + err = dst_output(skb); + + return err; +} +EXPORT_SYMBOL_GPL(ip6_local_out); + +/* dev_loopback_xmit for use with netfilter. 
*/ +static int ip6_dev_loopback_xmit(struct sk_buff *newskb) +{ + skb_reset_mac_header(newskb); + __skb_pull(newskb, skb_network_offset(newskb)); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(newskb)); + + netif_rx_ni(newskb); + return 0; +} + +static int ip6_finish_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + struct neighbour *neigh; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + + if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) && + ((mroute6_socket(dev_net(dev), skb) && + !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || + ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr))) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + + /* Do not check for IFF_ALLMULTI; multicast routing + is not supported in any case. + */ + if (newskb) + NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, + newskb, NULL, newskb->dev, + ip6_dev_loopback_xmit); + + if (ipv6_hdr(skb)->hop_limit == 0) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + } + + IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST, + skb->len); + } + + rcu_read_lock(); + neigh = dst_get_neighbour_noref(dst); + if (neigh) { + int res = neigh_output(neigh, skb); + + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); + IP6_INC_STATS_BH(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; +} + +static int ip6_finish_output(struct sk_buff *skb) +{ + if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb))) + return ip6_fragment(skb, ip6_finish_output2); + else + return ip6_finish_output2(skb); +} + +int ip6_output(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct inet6_dev *idev = 
ip6_dst_idev(skb_dst(skb)); + if (unlikely(idev->cnf.disable_ipv6)) { + IP6_INC_STATS(dev_net(dev), idev, + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + + return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); +} + +/* + * xmit an sk_buff (used by TCP, SCTP and DCCP) + */ + +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, + struct ipv6_txoptions *opt, int tclass) +{ + struct net *net = sock_net(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *first_hop = &fl6->daddr; + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr; + u8 proto = fl6->flowi6_proto; + int seg_len = skb->len; + int hlimit = -1; + u32 mtu; + + if (opt) { + unsigned int head_room; + + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. + */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + if (skb2 == NULL) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); + + /* + * Fill in the IPv6 header + */ + if (np) + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + + *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel; + + hdr->payload_len = htons(seg_len); + hdr->nexthdr = proto; + hdr->hop_limit = hlimit; + + hdr->saddr = fl6->saddr; + hdr->daddr = *first_hop; + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + mtu 
= dst_mtu(dst); + if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) { + IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUT, skb->len); + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + dst->dev, dst_output); + } + + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; +} + +EXPORT_SYMBOL(ip6_xmit); + +/* + * To avoid extra problems ND packets are send through this + * routine. It's code duplication but I really want to avoid + * extra checks since ipv6_build_header is used by TCP (which + * is for us performance critical) + */ + +int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, + const struct in6_addr *saddr, const struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + skb_reset_network_header(skb); + skb_put(skb, sizeof(struct ipv6hdr)); + hdr = ipv6_hdr(skb); + + *(__be32*)hdr = htonl(0x60000000); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = np->hop_limit; + + hdr->saddr = *saddr; + hdr->daddr = *daddr; + + return 0; +} + +static int ip6_call_ra_chain(struct sk_buff *skb, int sel) +{ + struct ip6_ra_chain *ra; + struct sock *last = NULL; + + read_lock(&ip6_ra_lock); + for (ra = ip6_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && ra->sel == sel && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + rawv6_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + rawv6_rcv(last, skb); + read_unlock(&ip6_ra_lock); + return 1; + } + read_unlock(&ip6_ra_lock); + return 0; +} + +static int ip6_forward_proxy_check(struct 
sk_buff *skb) +{ + struct ipv6hdr *hdr = ipv6_hdr(skb); + u8 nexthdr = hdr->nexthdr; + __be16 frag_off; + int offset; + + if (ipv6_ext_hdr(nexthdr)) { + offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); + if (offset < 0) + return 0; + } else + offset = sizeof(struct ipv6hdr); + + if (nexthdr == IPPROTO_ICMPV6) { + struct icmp6hdr *icmp6; + + if (!pskb_may_pull(skb, (skb_network_header(skb) + + offset + 1 - skb->data))) + return 0; + + icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); + + switch (icmp6->icmp6_type) { + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + /* For reaction involving unicast neighbor discovery + * message destined to the proxied address, pass it to + * input function. + */ + return 1; + default: + break; + } + } + + /* + * The proxying router can't forward traffic sent to a link-local + * address, so signal the sender and discard the packet. This + * behavior is clarified by the MIPv6 specification. 
+ */ + if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { + dst_link_failure(skb); + return -1; + } + + return 0; +} + +static inline int ip6_forward_finish(struct sk_buff *skb) +{ + return dst_output(skb); +} + +int ip6_forward(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(dst->dev); + u32 mtu; + + if (net->ipv6.devconf_all->forwarding == 0) + goto error; + + if (skb_warn_if_lro(skb)) + goto drop; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb_forward_csum(skb); + + /* + * We DO NOT make any processing on + * RA packets, pushing them to user level AS IS + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we + * cannot make anything clever here. + * + * We are not end-node, so that if packet contains + * AH/ESP, we cannot make anything. + * Defragmentation also would be mistake, RA packets + * cannot be fragmented, because there is no warranty + * that different fragments will go along one path. --ANK + */ + if (opt->ra) { + u8 *ptr = skb_network_header(skb) + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + return 0; + } + + /* + * check and decrement ttl + */ + if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + + kfree_skb(skb); + return -ETIMEDOUT; + } + + /* XXX: idev->cnf.proxy_ndp? 
*/ + if (net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + int proxied = ip6_forward_proxy_check(skb); + if (proxied > 0) + return ip6_input(skb); + else if (proxied < 0) { + IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + + if (!xfrm6_route_forward(skb)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS); + goto drop; + } + dst = skb_dst(skb); + + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. + We don't send redirects to frames decapsulated from IPsec. + */ + if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) { + struct in6_addr *target = NULL; + struct rt6_info *rt; + + /* + * incoming and outgoing devices are the same + * send a redirect. + */ + + rt = (struct rt6_info *) dst; + if (rt->rt6i_flags & RTF_GATEWAY) + target = &rt->rt6i_gateway; + else + target = &hdr->daddr; + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + ndisc_send_redirect(skb, target); + } else { + int addrtype = ipv6_addr_type(&hdr->saddr); + + /* This check is security critical. 
*/ + if (addrtype == IPV6_ADDR_ANY || + addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) + goto error; + if (addrtype & IPV6_ADDR_LINKLOCAL) { + icmpv6_send(skb, ICMPV6_DEST_UNREACH, + ICMPV6_NOT_NEIGHBOUR, 0); + goto error; + } + } + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > mtu && !skb_is_gso(skb)) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(net, + ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + goto drop; + } + + hdr = ipv6_hdr(skb); + + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + ip6_forward_finish); + +error: + IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + skb_dst_drop(to); + skb_dst_set(to, dst_clone(skb_dst(from))); + to->dev = from->dev; + to->mark = from->mark; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif + nf_copy(to, from); +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + to->nf_trace = from->nf_trace; +#endif + skb_copy_secmark(to, from); +} + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = + (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); + unsigned int packet_len = skb->tail - 
skb->network_header; + int found_rhdr = 0; + *nexthdr = &ipv6_hdr(skb)->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + break; + case NEXTHDR_ROUTING: + found_rhdr = 1; + break; + case NEXTHDR_DEST: +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) + break; +#endif + if (found_rhdr) + return offset; + break; + default : + return offset; + } + + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + + offset); + } + + return offset; +} + +void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static atomic_t ipv6_fragmentation_id; + int old, new; + + if (rt && !(rt->dst.flags & DST_NOPEER)) { + struct inet_peer *peer; + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + peer = rt->rt6i_peer; + if (peer) { + fhdr->identification = htonl(inet_getid(peer, 0)); + return; + } + } + do { + old = atomic_read(&ipv6_fragmentation_id); + new = old + 1; + if (!new) + new = 1; + } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old); + fhdr->identification = htonl(new); +} + +int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info*)skb_dst(skb); + struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + int hroom, troom; + __be32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + struct net *net = dev_net(skb_dst(skb)->dev); + + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = ip6_skb_dst_mtu(skb); + + /* We must not fragment if the socket is set to force MTU discovery + * or if the skb it not generated by a local socket. 
+ */ + if (!skb->local_df && skb->len > mtu) { + skb->dev = skb_dst(skb)->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (np && np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + mtu -= hlen + sizeof(struct frag_hdr); + + if (skb_has_frag_list(skb)) { + int first_len = skb_pagelen(skb); + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path_clean; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path_clean; + + BUG_ON(frag->sk); + if (skb->sk) { + frag->sk = skb->sk; + frag->destructor = sock_wfree; + } + skb->truesize -= frag->truesize; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + /* BUILD HEADER */ + + *prevhdr = NEXTHDR_FRAGMENT; + tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + return -ENOMEM; + } + + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), tmp_hdr, hlen); + + ipv6_select_ident(fh, rt); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - + sizeof(struct ipv6hdr)); + + dst_hold(&rt->dst); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. 
*/ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), tmp_hdr, + hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + ipv6_hdr(frag)->payload_len = + htons(frag->len - + sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = output(skb); + if(!err) + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGCREATES); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGOKS); + dst_release(&rt->dst); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), + IPSTATS_MIB_FRAGFAILS); + dst_release(&rt->dst); + return err; + +slow_path_clean: + skb_walk_frags(skb, frag2) { + if (frag2 == frag) + break; + frag2->sk = NULL; + frag2->destructor = NULL; + skb->truesize += frag2->truesize; + } + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + hroom = LL_RESERVED_SPACE(rt->dst.dev); + troom = rt->dst.dev->needed_tailroom; + + /* + * Keep copying data until we run out. + */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. 
+ */ + + if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + + hroom + troom, GFP_ATOMIC)) == NULL) { + NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n"); + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, hroom); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); + frag->transport_header = (frag->network_header + hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); + + /* + * Build fragment header. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + if (!frag_id) { + ipv6_select_ident(fh, rt); + frag_id = fh->identification; + } else + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len)) + BUG(); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - + sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. 
+ */ + err = output(frag); + if (err) + goto fail; + + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGCREATES); + } + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGOKS); + kfree_skb(skb); + return err; + +fail: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return err; +} + +static inline int ip6_rt_check(const struct rt6key *rt_key, + const struct in6_addr *fl_addr, + const struct in6_addr *addr_cache) +{ + return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && + (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); +} + +static struct dst_entry *ip6_sk_dst_check(struct sock *sk, + struct dst_entry *dst, + const struct flowi6 *fl6) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct rt6_info *rt = (struct rt6_info *)dst; + + if (!dst) + goto out; + + /* Yes, checking route validity in not connected + * case is not very simple. Take into account, + * that we do not support routing by source, TOS, + * and MSG_DONTROUTE --ANK (980726) + * + * 1. ip6_rt_check(): If route was host route, + * check that cached destination is current. + * If it is network route, we still may + * check its validity using saved pointer + * to the last used address: daddr_cache. + * We do not want to save whole address now, + * (because main consumer of this service + * is tcp, which has not this problem), + * so that the last trick works only on connected + * sockets. + * 2. oif also should be the same. 
+ */ + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || +#ifdef CONFIG_IPV6_SUBTREES + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || +#endif + (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + dst_release(dst); + dst = NULL; + } + +out: + return dst; +} + +static int ip6_dst_lookup_tail(struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6) +{ + struct net *net = sock_net(sk); +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + struct neighbour *n; +#endif + int err; + + if (*dst == NULL) + *dst = ip6_route_output(net, sk, fl6); + + if ((err = (*dst)->error)) + goto out_err_release; + + if (ipv6_addr_any(&fl6->saddr)) { + struct rt6_info *rt = (struct rt6_info *) *dst; + err = ip6_route_get_saddr(net, rt, &fl6->daddr, + sk ? inet6_sk(sk)->srcprefs : 0, + &fl6->saddr); + if (err) + goto out_err_release; + } + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * Here if the dst entry we've looked up + * has a neighbour entry that is in the INCOMPLETE + * state and the src address from the flow is + * marked as OPTIMISTIC, we release the found + * dst entry and replace it instead with the + * dst entry of the nexthop router + */ + rcu_read_lock(); + n = dst_get_neighbour_noref(*dst); + if (n && !(n->nud_state & NUD_VALID)) { + struct inet6_ifaddr *ifp; + struct flowi6 fl_gw6; + int redirect; + + rcu_read_unlock(); + ifp = ipv6_get_ifaddr(net, &fl6->saddr, + (*dst)->dev, 1); + + redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); + if (ifp) + in6_ifa_put(ifp); + + if (redirect) { + /* + * We need to get the dst entry for the + * default router instead + */ + dst_release(*dst); + memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); + memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(net, sk, &fl_gw6); + if ((err = (*dst)->error)) + goto out_err_release; + } + } else { + rcu_read_unlock(); + } +#endif + + return 0; + +out_err_release: + if (err == -ENETUNREACH) + IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES); + 
dst_release(*dst); + *dst = NULL; + return err; +} + +/** + * ip6_dst_lookup - perform route lookup on flow + * @sk: socket which provides route info + * @dst: pointer to dst_entry * for result + * @fl6: flow to lookup + * + * This function performs a route lookup on the given flow. + * + * It returns zero on success, or a standard errno code on error. + */ +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) +{ + *dst = NULL; + return ip6_dst_lookup_tail(sk, dst, fl6); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + +/** + * ip6_dst_lookup_flow - perform route lookup on flow with ipsec + * @sk: socket which provides route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * @can_sleep: we are in a sleepable context + * + * This function performs a route lookup on the given flow. + * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst, + bool can_sleep) +{ + struct dst_entry *dst = NULL; + int err; + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + if (can_sleep) + fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); + +/** + * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow + * @sk: socket which provides the dst cache and route info + * @fl6: flow to lookup + * @final_dst: final destination address for ipsec lookup + * @can_sleep: we are in a sleepable context + * + * This function performs a route lookup on the given flow with the + * possibility of using the cached route in the socket if it is valid. + * It will take the socket dst lock when operating on the dst cache. + * As a result, this function can only be used in process context. 
+ * + * It returns a valid dst pointer on success, or a pointer encoded + * error code. + */ +struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, + const struct in6_addr *final_dst, + bool can_sleep) +{ + struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); + int err; + + dst = ip6_sk_dst_check(sk, dst, fl6); + + err = ip6_dst_lookup_tail(sk, &dst, fl6); + if (err) + return ERR_PTR(err); + if (final_dst) + fl6->daddr = *final_dst; + if (can_sleep) + fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP; + + return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); +} +EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); + +static inline int ip6_ufo_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int hh_len, int fragheaderlen, + int transhdrlen, int mtu,unsigned int flags, + struct rt6_info *rt) + +{ + struct sk_buff *skb; + int err; + + /* There is support for UDP large send offload by network + * device, so create one single skb packet containing complete + * udp datagram + */ + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + skb = sock_alloc_send_skb(sk, + hh_len + fragheaderlen + transhdrlen + 20, + (flags & MSG_DONTWAIT), &err); + if (skb == NULL) + return err; + + /* reserve space for Hardware header */ + skb_reserve(skb, hh_len); + + /* create space for UDP/IP header */ + skb_put(skb,fragheaderlen + transhdrlen); + + /* initialize network header pointer */ + skb_reset_network_header(skb); + + /* initialize protocol header pointer */ + skb->transport_header = skb->network_header + fragheaderlen; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + } + + err = skb_append_datato_frags(sk,skb, getfrag, from, + (length - transhdrlen)); + if (!err) { + struct frag_hdr fhdr; + + /* Specify the length of each IPv6 datagram fragment. + * It has to be a multiple of 8. 
+ */ + skb_shinfo(skb)->gso_size = (mtu - fragheaderlen - + sizeof(struct frag_hdr)) & ~7; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + ipv6_select_ident(&fhdr, rt); + skb_shinfo(skb)->ip6_frag_id = fhdr.identification; + __skb_queue_tail(&sk->sk_write_queue, skb); + + return 0; + } + /* There is not enough support to do UDP LSO, + * so follow normal path + */ + kfree_skb(skb); + + return err; +} + +static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, + gfp_t gfp) +{ + return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; +} + +static void ip6_append_data_mtu(int *mtu, + int *maxfraglen, + unsigned int fragheaderlen, + struct sk_buff *skb, + struct rt6_info *rt) +{ + if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { + if (skb == NULL) { + /* first fragment, reserve header_len */ + *mtu = *mtu - rt->dst.header_len; + + } else { + /* + * this fragment is not first, the headers + * space is regarded as data space. 
+ */ + *mtu = dst_mtu(rt->dst.path); + } + *maxfraglen = ((*mtu - fragheaderlen) & ~7) + + fragheaderlen - sizeof(struct frag_hdr); + } +} + +int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, + int offset, int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, int dontfrag) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_cork *cork; + struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen; + int exthdrlen; + int dst_exthdrlen; + int hh_len; + int mtu; + int copy; + int err; + int offset = 0; + int csummode = CHECKSUM_NONE; + __u8 tx_flags = 0; + + if (flags&MSG_PROBE) + return 0; + cork = &inet->cork.base; + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking + */ + if (opt) { + if (WARN_ON(np->cork.opt)) + return -EINVAL; + + np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); + if (unlikely(np->cork.opt == NULL)) + return -ENOBUFS; + + np->cork.opt->tot_len = opt->tot_len; + np->cork.opt->opt_flen = opt->opt_flen; + np->cork.opt->opt_nflen = opt->opt_nflen; + + np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, + sk->sk_allocation); + if (opt->dst0opt && !np->cork.opt->dst0opt) + return -ENOBUFS; + + np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, + sk->sk_allocation); + if (opt->dst1opt && !np->cork.opt->dst1opt) + return -ENOBUFS; + + np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, + sk->sk_allocation); + if (opt->hopopt && !np->cork.opt->hopopt) + return -ENOBUFS; + + np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, + sk->sk_allocation); + if (opt->srcrt && !np->cork.opt->srcrt) + return -ENOBUFS; + + /* need source address above miyazawa*/ + } + dst_hold(&rt->dst); + cork->dst = &rt->dst; + inet->cork.fl.u.ip6 = *fl6; + np->cork.hop_limit = hlimit; + np->cork.tclass = tclass; + if (rt->dst.flags & 
DST_XFRM_TUNNEL) + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + if (np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + cork->fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->flags |= IPCORK_ALLFRAG; + cork->length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len; + length += exthdrlen; + transhdrlen += exthdrlen; + dst_exthdrlen = rt->dst.header_len; + } else { + rt = (struct rt6_info *)cork->dst; + fl6 = &inet->cork.fl.u.ip6; + opt = np->cork.opt; + transhdrlen = 0; + exthdrlen = 0; + dst_exthdrlen = 0; + mtu = cork->fragsize; + } + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); + + fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + + (opt ? opt->opt_nflen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + + if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { + if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { + ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen); + return -EMSGSIZE; + } + } + + /* For UDP, check if TX timestamp is enabled */ + if (sk->sk_type == SOCK_DGRAM) { + err = sock_tx_timestamp(sk, &tx_flags); + if (err) + goto error; + } + + /* + * Let's try using as much space as possible. + * Use MTU if total length of the message fits into the MTU. + * Otherwise, we need to reserve fragment header and + * fragment alignment (= 8-15 octets, in total). + * + * Note that we may need to "move" the data from the tail + * of the buffer to the new fragment when we split + * the message. + * + * FIXME: It may be fragmented into multiple chunks + * at once if non-fragmentable extension headers + * are too large. 
+ * --yoshfuji + */ + + cork->length += length; + if (length > mtu) { + int proto = sk->sk_protocol; + if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){ + ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen); + return -EMSGSIZE; + } + + if (proto == IPPROTO_UDP && + (rt->dst.dev->features & NETIF_F_UFO)) { + + err = ip6_ufo_append_data(sk, getfrag, from, length, + hh_len, fragheaderlen, + transhdrlen, mtu, flags, rt); + if (err) + goto error; + return 0; + } + } + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; +alloc_new_skb: + /* There's no room in the current skb */ + if (skb) + fraggap = skb->len - maxfraglen; + else + fraggap = 0; + /* update mtu and maxfraglen if necessary */ + if (skb == NULL || skb_prev == NULL) + ip6_append_data_mtu(&mtu, &maxfraglen, + fragheaderlen, skb, rt); + + skb_prev = skb; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + + if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; + if ((flags & MSG_MORE) && + !(rt->dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + alloclen += dst_exthdrlen; + + if (datalen != length + fraggap) { + /* + * this is not the last fragment, the trailer + * space is regarded as data space. + */ + datalen += rt->dst.trailer_len; + } + + alloclen += rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; + + /* + * We just reserve space for fragment header. 
+ * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloclen += sizeof(struct frag_hdr); + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + else { + /* Only the initial fragment + * is time stamped. + */ + tx_flags = 0; + } + } + if (skb == NULL) + goto error; + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + /* reserve for fragmentation and ipsec header */ + skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + + dst_exthdrlen); + + if (sk->sk_type == SOCK_DGRAM) + skb_shinfo(skb)->tx_flags = tx_flags; + + /* + * Find where to start putting bytes + */ + data = skb_put(skb, fraglen); + skb_set_network_header(skb, exthdrlen); + data += fragheaderlen; + skb->transport_header = (skb->network_header + + fragheaderlen); + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + pskb_trim_unique(skb_prev, maxfraglen); + } + copy = datalen - transhdrlen - fraggap; + + if (copy < 0) { + err = -EINVAL; + kfree_skb(skb); + goto error; + } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + dst_exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + 
offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != skb_frag_page(frag)) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + skb_frag_ref(skb, i); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if(i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, + skb_frag_address(frag) + skb_frag_size(frag), + offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + skb_frag_size_add(frag, copy); + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + atomic_add(copy, &sk->sk_wmem_alloc); + } + offset += copy; + length -= copy; + } + return 0; +error: + cork->length -= length; + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) +{ + if (np->cork.opt) { + kfree(np->cork.opt->dst0opt); + kfree(np->cork.opt->dst1opt); + kfree(np->cork.opt->hopopt); + kfree(np->cork.opt->srcrt); + kfree(np->cork.opt); + np->cork.opt = NULL; + } + + if (inet->cork.base.dst) { + dst_release(inet->cork.base.dst); + inet->cork.base.dst = NULL; + inet->cork.base.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); +} + +int ip6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, 
*tmp_skb; + struct sk_buff **tail_skb; + struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + struct ipv6hdr *hdr; + struct ipv6_txoptions *opt = np->cork.opt; + struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst; + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + unsigned char proto = fl6->flowi6_proto; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb_network_header(skb)) + __skb_pull(skb, skb_network_offset(skb)); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb_network_header_len(skb)); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; + skb->truesize += tmp_skb->truesize; + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; + } + + /* Allow local fragmentation. 
*/ + if (np->pmtudisc < IPV6_PMTUDISC_DO) + skb->local_df = 1; + + *final_dst = fl6->daddr; + __skb_pull(skb, skb_network_header_len(skb)); + if (opt && opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt && opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + hdr = ipv6_hdr(skb); + + *(__be32*)hdr = fl6->flowlabel | + htonl(0x60000000 | ((int)np->cork.tclass << 20)); + + hdr->hop_limit = np->cork.hop_limit; + hdr->nexthdr = proto; + hdr->saddr = fl6->saddr; + hdr->daddr = *final_dst; + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + + skb_dst_set(skb, dst_clone(&rt->dst)); + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + if (proto == IPPROTO_ICMPV6) { + struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + + ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type); + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + } + + err = ip6_local_out(skb); + if (err) { + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto error; + } + +out: + ip6_cork_release(inet, np); + return err; +error: + IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + goto out; +} + +void ip6_flush_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { + if (skb_dst(skb)) + IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + } + + ip6_cork_release(inet_sk(sk), inet6_sk(sk)); +} diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c new file mode 100644 index 00000000..aa21da6a --- /dev/null +++ b/net/ipv6/ip6_tunnel.c @@ -0,0 +1,1592 @@ +/* + * IPv6 tunneling device + * Linux INET6 implementation + * + * Authors: + * Ville Nuorvala <vnuorval@tcs.hut.fi> + * Yasuyuki Kozakai <kozakai@linux-ipv6.org> + * + * Based on: + * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c + * + * RFC 2473 + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/sockios.h> +#include <linux/icmp.h> +#include <linux/if.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/if_tunnel.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/route.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv6.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <linux/atomic.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/xfrm.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +MODULE_AUTHOR("Ville Nuorvala"); +MODULE_DESCRIPTION("IPv6 tunneling device"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("ip6tnl0"); + +#ifdef IP6_TNL_DEBUG +#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__) +#else +#define IP6_TNL_TRACE(x...) 
do {;} while(0) +#endif + +#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) +#define IPV6_TCLASS_SHIFT 20 + +#define HASH_SIZE 32 + +#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ + (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ + (HASH_SIZE - 1)) + +static int ip6_tnl_dev_init(struct net_device *dev); +static void ip6_tnl_dev_setup(struct net_device *dev); + +static int ip6_tnl_net_id __read_mostly; +struct ip6_tnl_net { + /* the IPv6 tunnel fallback device */ + struct net_device *fb_tnl_dev; + /* lists for storing tunnels in use */ + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; +}; + +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +} __attribute__((aligned(4*sizeof(unsigned long)))); + +static struct net_device_stats *ip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + +/* + * Locking : hash tables are protected by RCU and RTNL + */ + +static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +{ + struct dst_entry *dst = t->dst_cache; + + if (dst && dst->obsolete && + dst->ops->check(dst, t->dst_cookie) == NULL) { + t->dst_cache = NULL; + dst_release(dst); + return NULL; + } + + return dst; +} + +static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +{ + dst_release(t->dst_cache); + t->dst_cache = NULL; +} + 
+static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + dst_release(t->dst_cache); + t->dst_cache = dst; +} + +/** + * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ + +#define for_each_ip6_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +static struct ip6_tnl * +ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) +{ + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); + struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(ip6n->tnls_wc[0]); + if (t && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * ip6_tnl_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * ip6_tnl_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. 
+ * + * Return: head of IPv6 tunnel list + **/ + +static struct ip6_tnl __rcu ** +ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + unsigned h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote) ^ HASH(local); + } + return &ip6n->tnls[prio][h]; +} + +/** + * ip6_tnl_link - add tunnel to hash table + * @t: tunnel to be added + **/ + +static void +ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +/** + * ip6_tnl_unlink - remove tunnel from hash table + * @t: tunnel to be removed + **/ + +static void +ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) +{ + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void ip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +/** + * ip6_tnl_create() - create a new tunnel + * @p: tunnel parameters + * @pt: pointer to new tunnel + * + * Description: + * Create tunnel matching given parameters. 
+ * + * Return: + * created tunnel or NULL + **/ + +static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (p->name[0]) + strlcpy(name, p->name, IFNAMSIZ); + else + sprintf(name, "ip6tnl%%d"); + + dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup); + if (dev == NULL) + goto failed; + + dev_net_set(dev, net); + + t = netdev_priv(dev); + t->parms = *p; + err = ip6_tnl_dev_init(dev); + if (err < 0) + goto failed_free; + + if ((err = register_netdevice(dev)) < 0) + goto failed_free; + + strcpy(t->parms.name, dev->name); + + dev_hold(dev); + ip6_tnl_link(ip6n, t); + return t; + +failed_free: + ip6_dev_free(dev); +failed: + return NULL; +} + +/** + * ip6_tnl_locate - find or create tunnel matching given parameters + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * ip6_tnl_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. 
+ * + * Return: + * matching tunnel or NULL + **/ + +static struct ip6_tnl *ip6_tnl_locate(struct net *net, + struct ip6_tnl_parm *p, int create) +{ + const struct in6_addr *remote = &p->raddr; + const struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; + struct ip6_tnl *t; + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + for (tp = ip6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) + return t; + } + if (!create) + return NULL; + return ip6_tnl_create(net, p); +} + +/** + * ip6_tnl_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * ip6_tnl_dev_uninit() removes tunnel from its list + **/ + +static void +ip6_tnl_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + if (dev == ip6n->fb_tnl_dev) + RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); + else + ip6_tnl_unlink(ip6n, t); + ip6_tnl_dst_reset(t); + dev_put(dev); +} + +/** + * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * @skb: received socket buffer + * + * Return: + * 0 if none was found, + * else index to encapsulation limit + **/ + +static __u16 +parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw; + __u8 nexthdr = ipv6h->nexthdr; + __u16 off = sizeof (*ipv6h); + + while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { + __u16 optlen = 0; + struct ipv6_opt_hdr *hdr; + if (raw + off + sizeof (*hdr) > skb->data && + !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + break; + + hdr = (struct ipv6_opt_hdr *) (raw + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; + if (frag_hdr->frag_off) + break; + optlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { 
+ optlen = (hdr->hdrlen + 2) << 2; + } else { + optlen = ipv6_optlen(hdr); + } + if (nexthdr == NEXTHDR_DEST) { + __u16 i = off + 2; + while (1) { + struct ipv6_tlv_tnl_enc_lim *tel; + + /* No more room for encapsulation limit */ + if (i + sizeof (*tel) > off + optlen) + break; + + tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + /* return index of option if found and valid */ + if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && + tel->length == 1) + return i; + /* else jump to next option */ + if (tel->type) + i += tel->length + 2; + else + i++; + } + } + nexthdr = hdr->nexthdr; + off += optlen; + } + return 0; +} + +/** + * ip6_tnl_err - tunnel error handler + * + * Description: + * ip6_tnl_err() should handle errors in the tunnel according + * to the specifications in RFC 2473. + **/ + +static int +ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, + u8 *type, u8 *code, int *msg, __u32 *info, int offset) +{ + const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data; + struct ip6_tnl *t; + int rel_msg = 0; + u8 rel_type = ICMPV6_DEST_UNREACH; + u8 rel_code = ICMPV6_ADDR_UNREACH; + __u32 rel_info = 0; + __u16 len; + int err = -ENOENT; + + /* If the packet doesn't contain the original IPv6 header we are + in trouble since we might need the source address for further + processing of the error. 
*/ + + rcu_read_lock(); + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, + &ipv6h->saddr)) == NULL) + goto out; + + if (t->parms.proto != ipproto && t->parms.proto != 0) + goto out; + + err = 0; + + switch (*type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Path to destination invalid " + "or inactive!\n", t->parms.name); + rel_msg = 1; + break; + case ICMPV6_TIME_EXCEED: + if ((*code) == ICMPV6_EXC_HOPLIMIT) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small hop limit or " + "routing loop in tunnel!\n", + t->parms.name); + rel_msg = 1; + } + break; + case ICMPV6_PARAMPROB: + teli = 0; + if ((*code) == ICMPV6_HDR_FIELD) + teli = parse_tlv_tnl_enc_lim(skb, skb->data); + + if (teli && teli == *info - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small encapsulation " + "limit or routing loop in " + "tunnel!\n", t->parms.name); + rel_msg = 1; + } + } else if (net_ratelimit()) { + printk(KERN_WARNING + "%s: Recipient unable to parse tunneled " + "packet!\n ", t->parms.name); + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = *info - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + + if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) { + rel_type = ICMPV6_PKT_TOOBIG; + rel_code = 0; + rel_info = mtu; + rel_msg = 1; + } + break; + } + + *type = rel_type; + *code = rel_code; + *info = rel_info; + *msg = rel_msg; + +out: + rcu_read_unlock(); + return err; +} + +static int +ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + __u32 rel_info = ntohl(info); + int err; + struct sk_buff *skb2; + const struct iphdr *eiph; + struct rtable *rt; + struct flowi4 fl4; + + err = ip6_tnl_err(skb, 
IPPROTO_IPIP, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg == 0) + return 0; + + switch (rel_type) { + case ICMPV6_DEST_UNREACH: + if (rel_code != ICMPV6_ADDR_UNREACH) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + case ICMPV6_PKT_TOOBIG: + if (rel_code != 0) + return 0; + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_FRAG_NEEDED; + break; + default: + return 0; + } + + if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) + return 0; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + return 0; + + skb_dst_drop(skb2); + + skb_pull(skb2, offset); + skb_reset_network_header(skb2); + eiph = ip_hdr(skb2); + + /* Try to guess incoming interface */ + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->saddr, 0, + 0, 0, + IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + if (IS_ERR(rt)) + goto out; + + skb2->dev = rt->dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, + eiph->daddr, eiph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(eiph->tos), 0); + if (IS_ERR(rt) || + rt->dst.dev->type != ARPHRD_TUNNEL) { + if (!IS_ERR(rt)) + ip_rt_put(rt); + goto out; + } + skb_dst_set(skb2, &rt->dst); + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, + skb2->dev) || + skb_dst(skb2)->dev->type != ARPHRD_TUNNEL) + goto out; + } + + /* change mtu on this route */ + if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb_dst(skb2))) + goto out; + + skb_dst(skb2)->ops->update_pmtu(skb_dst(skb2), rel_info); + } + + icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); + +out: + kfree_skb(skb2); + return 0; +} + +static int +ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; 
+ __u32 rel_info = ntohl(info); + int err; + + err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + if (err < 0) + return err; + + if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { + struct rt6_info *rt; + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + if (!skb2) + return 0; + + skb_dst_drop(skb2); + skb_pull(skb2, offset); + skb_reset_network_header(skb2); + + /* Try to guess incoming interface */ + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, 0); + + if (rt && rt->dst.dev) + skb2->dev = rt->dst.dev; + + icmpv6_send(skb2, rel_type, rel_code, rel_info); + + if (rt) + dst_release(&rt->dst); + + kfree_skb(skb2); + } + + return 0; +} + +static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; + + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); + + if (INET_ECN_is_ce(dsfield)) + IP_ECN_set_ce(ip_hdr(skb)); +} + +static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); + + if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h))) + IP6_ECN_set_ce(ipv6_hdr(skb)); +} + +/* called with rcu_read_lock() */ +static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ret = 0; + struct net *net = dev_net(t->dev); + + if (p->flags & IP6_TNL_F_CAP_RCV) { + struct net_device *ldev = NULL; + + if (p->link) + ldev = dev_get_by_index_rcu(net, p->link); + + if ((ipv6_addr_is_multicast(&p->laddr) || + likely(ipv6_chk_addr(net, &p->laddr, ldev, 0))) && + likely(!ipv6_chk_addr(net, &p->raddr, NULL, 0))) + ret = 1; + + } + return ret; +} + +/** + * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally + * 
@skb: received socket buffer + * @protocol: ethernet protocol ID + * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN + * + * Return: 0 + **/ + +static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, + __u8 ipproto, + void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + + rcu_read_lock(); + + if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, + &ipv6h->daddr)) != NULL) { + struct pcpu_tstats *tstats; + + if (t->parms.proto != ipproto && t->parms.proto != 0) { + rcu_read_unlock(); + goto discard; + } + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + rcu_read_unlock(); + goto discard; + } + + if (!ip6_tnl_rcv_ctl(t)) { + t->dev->stats.rx_dropped++; + rcu_read_unlock(); + goto discard; + } + secpath_reset(skb); + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + skb->protocol = htons(protocol); + skb->pkt_type = PACKET_HOST; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + + tstats = this_cpu_ptr(t->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, t->dev); + + dscp_ecn_decapsulate(t, ipv6h, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + return 1; + +discard: + kfree_skb(skb); + return 0; +} + +static int ip4ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP, + ip4ip6_dscp_ecn_decapsulate); +} + +static int ip6ip6_rcv(struct sk_buff *skb) +{ + return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6, + ip6ip6_dscp_ecn_decapsulate); +} + +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +{ + memset(opt, 0, sizeof(struct ipv6_tel_txoption)); + + opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; + opt->dst_opt[3] = 1; + opt->dst_opt[4] = encap_limit; + opt->dst_opt[5] 
= IPV6_TLV_PADN; + opt->dst_opt[6] = 1; + + opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; + opt->ops.opt_nflen = 8; +} + +/** + * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline int +ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ret = 0; + struct net *net = dev_net(t->dev); + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + struct net_device *ldev = NULL; + + rcu_read_lock(); + if (p->link) + ldev = dev_get_by_index_rcu(net, p->link); + + if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0))) + printk(KERN_WARNING + "%s xmit: Local address not yet configured!\n", + p->name); + else if (!ipv6_addr_is_multicast(&p->raddr) && + unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0))) + printk(KERN_WARNING + "%s xmit: Routing loop! " + "Remote address found on this node!\n", + p->name); + else + ret = 1; + rcu_read_unlock(); + } + return ret; +} +/** + * ip6_tnl_xmit2 - encapsulate packet and send + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * @dsfield: dscp code for outer header + * @fl: flow of tunneled packet + * @encap_limit: encapsulation limit + * @pmtu: Path MTU is stored if packet is too big + * + * Description: + * Build new header and do some sanity checks on the packet before sending + * it. + * + * Return: + * 0 on success + * -1 fail + * %-EMSGSIZE message too big. return mtu in this case. 
+ **/ + +static int ip6_tnl_xmit2(struct sk_buff *skb, + struct net_device *dev, + __u8 dsfield, + struct flowi6 *fl6, + int encap_limit, + __u32 *pmtu) +{ + struct net *net = dev_net(dev); + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6_tel_txoption opt; + struct dst_entry *dst = NULL, *ndst = NULL; + struct net_device *tdev; + int mtu; + unsigned int max_headroom = sizeof(struct ipv6hdr); + u8 proto; + int err = -1; + int pkt_len; + + if (!fl6->flowi6_mark) + dst = ip6_tnl_dst_check(t); + if (!dst) { + ndst = ip6_route_output(net, NULL, fl6); + + if (ndst->error) + goto tx_err_link_failure; + ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); + if (IS_ERR(ndst)) { + err = PTR_ERR(ndst); + ndst = NULL; + goto tx_err_link_failure; + } + dst = ndst; + } + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + mtu = dst_mtu(dst) - sizeof (*ipv6h); + if (encap_limit >= 0) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + if (skb->len > mtu) { + *pmtu = mtu; + err = -EMSGSIZE; + goto tx_err_dst_release; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. 
+ */ + max_headroom += LL_RESERVED_SPACE(tdev); + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb; + + if (!(new_skb = skb_realloc_headroom(skb, max_headroom))) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + kfree_skb(skb); + skb = new_skb; + } + skb_dst_drop(skb); + if (fl6->flowi6_mark) { + skb_dst_set(skb, dst); + ndst = NULL; + } else { + skb_dst_set_noref(skb, dst); + } + skb->transport_header = skb->network_header; + + proto = fl6->flowi6_proto; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); + } + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ipv6h = ipv6_hdr(skb); + *(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000); + dsfield = INET_ECN_encapsulate(0, dsfield); + ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6h->saddr = fl6->saddr; + ipv6h->daddr = fl6->daddr; + nf_reset(skb); + pkt_len = skb->len; + err = ip6_local_out(skb); + + if (net_xmit_eval(err) == 0) { + struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats); + + tstats->tx_bytes += pkt_len; + tstats->tx_packets++; + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + if (ndst) + ip6_tnl_dst_store(t, ndst); + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(ndst); + return err; +} + +static inline int +ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + const struct iphdr *iph = ip_hdr(skb); + int encap_limit = -1; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t)) + return -1; + + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = 
t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + dsfield = ipv4_get_dsfield(iph); + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ + if (err == -EMSGSIZE) + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + return -1; + } + + return 0; +} + +static inline int +ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int encap_limit = -1; + __u16 offset; + struct flowi6 fl6; + __u8 dsfield; + __u32 mtu; + int err; + + if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || + !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h)) + return -1; + + offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; + + memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + + dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + + err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (err != 0) { + if 
(err == -EMSGSIZE) + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + return -1; + } + + return 0; +} + +static netdev_tx_t +ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net_device_stats *stats = &t->dev->stats; + int ret; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ret = ip4ip6_tnl_xmit(skb, dev); + break; + case htons(ETH_P_IPV6): + ret = ip6ip6_tnl_xmit(skb, dev); + break; + default: + goto tx_err; + } + + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ip6_tnl_set_cap(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + int ltype = ipv6_addr_type(&p->laddr); + int rtype = ipv6_addr_type(&p->raddr); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); + + if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && + !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && + (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { + if (ltype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_RCV; + } +} + +static void ip6_tnl_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct ip6_tnl_parm *p = &t->parms; + struct flowi6 *fl6 = &t->fl.u.ip6; + + memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + /* Set up flowi template */ + fl6->saddr = p->laddr; + fl6->daddr = p->raddr; + fl6->flowi6_oif = p->link; + fl6->flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + ip6_tnl_set_cap(t); + + if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= 
~IFF_POINTOPOINT; + + dev->iflink = p->link; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + int strict = (ipv6_addr_type(&p->raddr) & + (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); + + struct rt6_info *rt = rt6_lookup(dev_net(dev), + &p->raddr, &p->laddr, + p->link, strict); + + if (rt == NULL) + return; + + if (rt->dst.dev) { + dev->hard_header_len = rt->dst.dev->hard_header_len + + sizeof (struct ipv6hdr); + + dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dst_release(&rt->dst); + } +} + +/** + * ip6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * + * Description: + * ip6_tnl_change() updates the tunnel parameters + **/ + +static int +ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +{ + t->parms.laddr = p->laddr; + t->parms.raddr = p->raddr; + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + t->parms.link = p->link; + t->parms.proto = p->proto; + ip6_tnl_dst_reset(t); + ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * ip6_tnl_ioctl() is used for managing IPv6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6tnl0", created during module + * initialization, can be used for creating other tunnel devices. 
+ * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ + +static int +ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip6_tnl_parm p; + struct ip6_tnl *t = NULL; + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6n->fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + err = -EFAULT; + break; + } + t = ip6_tnl_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + memcpy(&p, &t->parms, sizeof (p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + err = -EFAULT; + } + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + break; + err = -EINVAL; + if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && + p.proto != 0) + break; + t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL); + if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else + t = netdev_priv(dev); + + ip6_tnl_unlink(ip6n, t); + synchronize_net(); + err = ip6_tnl_change(t, &p); + ip6_tnl_link(ip6n, t); + netdev_state_change(dev); + } + if (t) { + err = 0; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p))) + err = -EFAULT; + + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + + if (dev == ip6n->fb_tnl_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) + break; + err = -ENOENT; + if ((t = ip6_tnl_locate(net, &p, 0)) == NULL) + break; + err = -EPERM; + if (t->dev == ip6n->fb_tnl_dev) + break; + dev = t->dev; + } + err = 0; + unregister_netdevice(dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * ip6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ + +static int +ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) { + return -EINVAL; + } + dev->mtu = new_mtu; + return 0; +} + + +static const struct net_device_ops ip6_tnl_netdev_ops = { + .ndo_uninit = ip6_tnl_dev_uninit, + .ndo_start_xmit = ip6_tnl_xmit, + .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats = ip6_get_stats, +}; + + +/** + * ip6_tnl_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ + +static void ip6_tnl_dev_setup(struct net_device *dev) +{ + struct ip6_tnl *t; + + dev->netdev_ops = &ip6_tnl_netdev_ops; + dev->destructor = ip6_dev_free; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + t = netdev_priv(dev); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu-=8; + dev->flags |= IFF_NOARP; + dev->addr_len = sizeof(struct in6_addr); + dev->features |= NETIF_F_NETNS_LOCAL; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; +} + + +/** + * ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static 
inline int +ip6_tnl_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + + t->dev = dev; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; +} + +/** + * ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static int ip6_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; + ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ + +static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = netdev_priv(dev); + struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; + + t->parms.proto = IPPROTO_IPV6; + dev_hold(dev); + rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; +} + +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { + .handler = ip4ip6_rcv, + .err_handler = ip4ip6_err, + .priority = 1, +}; + +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, + .priority = 1, +}; + +static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) +{ + int h; + struct ip6_tnl *t; + LIST_HEAD(list); + + for (h = 0; h < HASH_SIZE; h++) { + t = rtnl_dereference(ip6n->tnls_r_l[h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, &list); + t = rtnl_dereference(t->next); + } + } + + t = rtnl_dereference(ip6n->tnls_wc[0]); + unregister_netdevice_queue(t->dev, &list); + unregister_netdevice_many(&list); +} + +static int __net_init ip6_tnl_init_net(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl *t = NULL; + int err; + + ip6n->tnls[0] = ip6n->tnls_wc; + 
ip6n->tnls[1] = ip6n->tnls_r_l; + + err = -ENOMEM; + ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6_tnl_dev_setup); + + if (!ip6n->fb_tnl_dev) + goto err_alloc_dev; + dev_net_set(ip6n->fb_tnl_dev, net); + + err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + err = register_netdev(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; + + t = netdev_priv(ip6n->fb_tnl_dev); + + strcpy(t->parms.name, ip6n->fb_tnl_dev->name); + return 0; + +err_register: + ip6_dev_free(ip6n->fb_tnl_dev); +err_alloc_dev: + return err; +} + +static void __net_exit ip6_tnl_exit_net(struct net *net) +{ + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + + rtnl_lock(); + ip6_tnl_destroy_tunnels(ip6n); + rtnl_unlock(); +} + +static struct pernet_operations ip6_tnl_net_ops = { + .init = ip6_tnl_init_net, + .exit = ip6_tnl_exit_net, + .id = &ip6_tnl_net_id, + .size = sizeof(struct ip6_tnl_net), +}; + +/** + * ip6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ + +static int __init ip6_tunnel_init(void) +{ + int err; + + err = register_pernet_device(&ip6_tnl_net_ops); + if (err < 0) + goto out_pernet; + + err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); + if (err < 0) { + printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n"); + goto out_ip4ip6; + } + + err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); + if (err < 0) { + printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n"); + goto out_ip6ip6; + } + + return 0; + +out_ip6ip6: + xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); +out_ip4ip6: + unregister_pernet_device(&ip6_tnl_net_ops); +out_pernet: + return err; +} + +/** + * ip6_tunnel_cleanup - free resources and unregister protocol + **/ + +static void __exit ip6_tunnel_cleanup(void) +{ + if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) + printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n"); + + if 
(xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) + printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n"); + + unregister_pernet_device(&ip6_tnl_net_ops); +} + +module_init(ip6_tunnel_init); +module_exit(ip6_tunnel_cleanup); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c new file mode 100644 index 00000000..8110362e --- /dev/null +++ b/net/ipv6/ip6mr.c @@ -0,0 +1,2281 @@ +/* + * Linux IPv6 multicast routing support for BSD pim6sd + * Based on net/ipv4/ipmr.c. + * + * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> + * LSIIT Laboratory, Strasbourg, France + * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> + * 6WIND, Paris, France + * Copyright (C)2007,2008 USAGI/WIDE Project + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/fcntl.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/raw.h> +#include <linux/notifier.h> +#include <linux/if_arp.h> +#include <net/checksum.h> +#include <net/netlink.h> +#include <net/fib_rules.h> + +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <linux/mroute6.h> +#include <linux/pim.h> +#include <net/addrconf.h> +#include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/ip6_checksum.h> + +struct mr6_table { + struct list_head list; +#ifdef CONFIG_NET_NS + struct net *net; +#endif + u32 id; + struct sock *mroute6_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc6_unres_queue; + struct list_head mfc6_cache_array[MFC6_LINES]; + struct mif_device vif6_table[MAXMIFS]; + int maxvif; + atomic_t cache_resolve_queue_len; + int mroute_do_assert; + int mroute_do_pim; +#ifdef CONFIG_IPV6_PIMSM_V2 + int mroute_reg_vif_num; +#endif +}; + +struct ip6mr_rule { + struct fib_rule common; +}; + +struct ip6mr_result { + struct mr6_table *mrt; +}; + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. 
Hash table of resolved + entries is changed only in process context and protected + with weak lock mrt_lock. Queue of unresolved entries is protected + with strong spinlock mfc_unres_lock. + + In this case data path is free of exclusive locks at all. + */ + +static struct kmem_cache *mrt_cachep __read_mostly; + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr6_table *mrt); + +static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache); +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert); +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm); +static int ip6mr_rtm_dumproute(struct sk_buff *skb, + struct netlink_callback *cb); +static void mroute_clean_tables(struct mr6_table *mrt); +static void ipmr_expire_process(unsigned long arg); + +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +#define ip6mr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + + ip6mr_for_each_table(mrt, net) { + if (mrt->id == id) + return mrt; + } + return NULL; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + struct ip6mr_result res; + struct fib_lookup_arg arg = { .result = &res, }; + int err; + + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(flp6), 0, &arg); + if (err < 0) + return err; + *mrt = res.mrt; + return 0; +} + +static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct ip6mr_result *res = arg->result; + struct mr6_table *mrt; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case 
FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + mrt = ip6mr_get_table(rule->fr_net, rule->table); + if (mrt == NULL) + return -EAGAIN; + res->mrt = mrt; + return 0; +} + +static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) +{ + return 1; +} + +static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { + FRA_GENERIC_POLICY, +}; + +static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh, struct nlattr **tb) +{ + return 0; +} + +static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, + struct nlattr **tb) +{ + return 1; +} + +static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, + struct fib_rule_hdr *frh) +{ + frh->dst_len = 0; + frh->src_len = 0; + frh->tos = 0; + return 0; +} + +static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .rule_size = sizeof(struct ip6mr_rule), + .addr_size = sizeof(struct in6_addr), + .action = ip6mr_rule_action, + .match = ip6mr_rule_match, + .configure = ip6mr_rule_configure, + .compare = ip6mr_rule_compare, + .default_pref = fib_default_rule_pref, + .fill = ip6mr_rule_fill, + .nlgroup = RTNLGRP_IPV6_RULE, + .policy = ip6mr_rule_policy, + .owner = THIS_MODULE, +}; + +static int __net_init ip6mr_rules_init(struct net *net) +{ + struct fib_rules_ops *ops; + struct mr6_table *mrt; + int err; + + ops = fib_rules_register(&ip6mr_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + INIT_LIST_HEAD(&net->ipv6.mr6_tables); + + mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) { + err = -ENOMEM; + goto err1; + } + + err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); + if (err < 0) + goto err2; + + net->ipv6.mr6_rules_ops = ops; + return 0; + +err2: + kfree(mrt); +err1: + fib_rules_unregister(ops); + return err; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + struct mr6_table *mrt, *next; + + 
list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { + list_del(&mrt->list); + ip6mr_free_table(mrt); + } + fib_rules_unregister(net->ipv6.mr6_rules_ops); +} +#else +#define ip6mr_for_each_table(mrt, net) \ + for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) + +static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +{ + return net->ipv6.mrt6; +} + +static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, + struct mr6_table **mrt) +{ + *mrt = net->ipv6.mrt6; + return 0; +} + +static int __net_init ip6mr_rules_init(struct net *net) +{ + net->ipv6.mrt6 = ip6mr_new_table(net, RT6_TABLE_DFLT); + return net->ipv6.mrt6 ? 0 : -ENOMEM; +} + +static void __net_exit ip6mr_rules_exit(struct net *net) +{ + ip6mr_free_table(net->ipv6.mrt6); +} +#endif + +static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +{ + struct mr6_table *mrt; + unsigned int i; + + mrt = ip6mr_get_table(net, id); + if (mrt != NULL) + return mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (mrt == NULL) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + /* Forwarding cache */ + for (i = 0; i < MFC6_LINES; i++) + INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); + + INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + + setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, + (unsigned long)mrt); + +#ifdef CONFIG_IPV6_PIMSM_V2 + mrt->mroute_reg_vif_num = -1; +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); +#endif + return mrt; +} + +static void ip6mr_free_table(struct mr6_table *mrt) +{ + del_timer(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + +#ifdef CONFIG_PROC_FS + +struct ipmr_mfc_iter { + struct seq_net_private p; + struct mr6_table *mrt; + struct list_head *cache; + int ct; +}; + + +static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, + struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mr6_table *mrt = it->mrt; + struct mfc6_cache *mfc; + + read_lock(&mrt_lock); + 
for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + } + read_unlock(&mrt_lock); + + spin_lock_bh(&mfc_unres_lock); + it->cache = &mrt->mfc6_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + +/* + * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif + */ + +struct ipmr_vif_iter { + struct seq_net_private p; + struct mr6_table *mrt; + int ct; +}; + +static struct mif_device *ip6mr_vif_seq_idx(struct net *net, + struct ipmr_vif_iter *iter, + loff_t pos) +{ + struct mr6_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(mrt_lock) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + iter->mrt = mrt; + + read_lock(&mrt_lock); + return *pos ? 
ip6mr_vif_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip6mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!MIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif6_table[iter->ct]; + } + return NULL; +} + +static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) + __releases(mrt_lock) +{ + read_unlock(&mrt_lock); +} + +static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) +{ + struct ipmr_vif_iter *iter = seq->private; + struct mr6_table *mrt = iter->mrt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); + } else { + const struct mif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", + vif - mrt->vif6_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags); + } + return 0; +} + +static const struct seq_operations ip6mr_vif_seq_ops = { + .start = ip6mr_vif_seq_start, + .next = ip6mr_vif_seq_next, + .stop = ip6mr_vif_seq_stop, + .show = ip6mr_vif_seq_show, +}; + +static int ip6mr_vif_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ip6mr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); +} + +static const struct file_operations ip6mr_vif_fops = { + .owner = THIS_MODULE, + .open = ip6mr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return ERR_PTR(-ENOENT); + + it->mrt = mrt; + 
return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc6_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr6_table *mrt = it->mrt; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(net, seq->private, 0); + + if (mfc->list.next != it->cache) + return list_entry(mfc->list.next, struct mfc6_cache, list); + + if (it->cache == &mrt->mfc6_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); + + while (++it->ct < MFC6_LINES) { + it->cache = &mrt->mfc6_cache_array[it->ct]; + if (list_empty(it->cache)) + continue; + return list_first_entry(it->cache, struct mfc6_cache, list); + } + + /* exhausted cache_array, show unresolved */ + read_unlock(&mrt_lock); + it->cache = &mrt->mfc6_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mfc6_cache, list); + + end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc6_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == mrt->mfc6_cache_array) + read_unlock(&mrt_lock); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group " + "Origin " + "Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc6_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + struct mr6_table *mrt = it->mrt; + + seq_printf(seq, "%pI6 %pI6 %-3hd", + &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, + mfc->mf6c_parent); + + if (it->cache != &mrt->mfc6_unres_queue) { + seq_printf(seq, " %8lu %8lu %8lu", + mfc->mfc_un.res.pkt, + 
mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + for (n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++) { + if (MIF_EXISTS(mrt, n) && + mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } else { + /* unresolved mfc_caches don't contain + * pkt, bytes and wrong_if values + */ + seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); + } + seq_putc(seq, '\n'); + } + return 0; +} + +static const struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); +} + +static const struct file_operations ip6mr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif + +#ifdef CONFIG_IPV6_PIMSM_V2 + +static int pim6_rcv(struct sk_buff *skb) +{ + struct pimreghdr *pim; + struct ipv6hdr *encap; + struct net_device *reg_dev = NULL; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int reg_vif_num; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct pimreghdr *)skb_transport_header(skb); + if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) || + (pim->flags & PIM_NULL_REGISTER) || + (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + sizeof(*pim), IPPROTO_PIM, + csum_partial((void *)pim, sizeof(*pim), 0)) && + csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + /* check if the inner packet is destined to mcast group */ + encap = (struct ipv6hdr *)(skb_transport_header(skb) + + sizeof(*pim)); + + if (!ipv6_addr_is_multicast(&encap->daddr) || + encap->payload_len == 0 || + ntohs(encap->payload_len) + 
sizeof(*pim) > skb->len) + goto drop; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + goto drop; + reg_vif_num = mrt->mroute_reg_vif_num; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = mrt->vif6_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac_header = skb->network_header; + skb_pull(skb, (u8 *)encap - skb->data); + skb_reset_network_header(skb); + skb->protocol = htons(ETH_P_IPV6); + skb->ip_summed = CHECKSUM_NONE; + skb->pkt_type = PACKET_HOST; + + skb_tunnel_rx(skb, reg_dev); + + netif_rx(skb); + + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} + +static const struct inet6_protocol pim6_protocol = { + .handler = pim6_rcv, +}; + +/* Service routines creating virtual interfaces: PIMREG */ + +static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_oif = dev->ifindex, + .flowi6_iif = skb->skb_iif, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops reg_vif_netdev_ops = { + .ndo_start_xmit = reg_vif_xmit, +}; + +/* + * Setup callback for the PIM register pseudo-device (passed to alloc_netdev() + * by ip6mr_reg_vif()): fills in type, MTU, flags, and points the device at + * reg_vif_netdev_ops so transmitted packets go through reg_vif_xmit(). + */ +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; + dev->flags = IFF_NOARP; + dev->netdev_ops = &reg_vif_netdev_ops; + dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; +} + +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) +{ + struct net_device *dev; + char name[IFNAMSIZ]; + + if (mrt->id == RT6_TABLE_DFLT) + sprintf(name, "pim6reg"); 
+ else + sprintf(name, "pim6reg%u", mrt->id); + + dev = alloc_netdev(0, name, reg_vif_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + if (dev_open(dev)) + goto failure; + + dev_hold(dev); + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) +{ + struct mif_device *v; + struct net_device *dev; + struct inet6_dev *in6_dev; + + if (vifi < 0 || vifi >= mrt->maxvif) + return -EADDRNOTAVAIL; + + v = &mrt->vif6_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vifi == mrt->mroute_reg_vif_num) + mrt->mroute_reg_vif_num = -1; +#endif + + if (vifi + 1 == mrt->maxvif) { + int tmp; + for (tmp = vifi - 1; tmp >= 0; tmp--) { + if (MIF_EXISTS(mrt, tmp)) + break; + } + mrt->maxvif = tmp + 1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding--; + + if (v->flags & MIFF_REGISTER) + unregister_netdevice_queue(dev, head); + + dev_put(dev); + return 0; +} + +static inline void ip6mr_cache_free(struct mfc6_cache *c) +{ + kmem_cache_free(mrt_cachep, c); +} + +/* Destroy an unresolved cache entry, killing queued skbs + and reporting error to netlink readers. 
+ */ + +static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) +{ + struct net *net = read_pnet(&mrt->net); + struct sk_buff *skb; + + atomic_dec(&mrt->cache_resolve_queue_len); + + while((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + if (ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else + kfree_skb(skb); + } + + ip6mr_cache_free(c); +} + + +/* Timer process for all the unresolved queue. */ + +static void ipmr_do_expire_process(struct mr6_table *mrt) +{ + unsigned long now = jiffies; + unsigned long expires = 10 * HZ; + struct mfc6_cache *c, *next; + + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + if (time_after(c->mfc_un.unres.expires, now)) { + /* not yet... */ + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + continue; + } + + list_del(&c->list); + ip6mr_destroy_unres(mrt, c); + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); +} + +static void ipmr_expire_process(unsigned long arg) +{ + struct mr6_table *mrt = (struct mr6_table *)arg; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); + return; + } + + if (!list_empty(&mrt->mfc6_unres_queue)) + ipmr_do_expire_process(mrt); + + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. 
*/ + +static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, + unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXMIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXMIFS); + + for (vifi = 0; vifi < mrt->maxvif; vifi++) { + if (MIF_EXISTS(mrt, vifi) && + ttls[vifi] && ttls[vifi] < 255) { + cache->mfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int mif6_add(struct net *net, struct mr6_table *mrt, + struct mif6ctl *vifc, int mrtsock) +{ + int vifi = vifc->mif6c_mifi; + struct mif_device *v = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct inet6_dev *in6_dev; + int err; + + /* Is vif busy ? */ + if (MIF_EXISTS(mrt, vifi)) + return -EADDRINUSE; + + switch (vifc->mif6c_flags) { +#ifdef CONFIG_IPV6_PIMSM_V2 + case MIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (mrt->mroute_reg_vif_num >= 0) + return -EADDRINUSE; + dev = ip6mr_reg_vif(net, mrt); + if (!dev) + return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } + break; +#endif + case 0: + dev = dev_get_by_index(net, vifc->mif6c_pifi); + if (!dev) + return -EADDRNOTAVAIL; + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } + break; + default: + return -EINVAL; + } + + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding++; + + /* + * Fill in the VIF structures + */ + v->rate_limit = vifc->vifc_rate_limit; + v->flags = vifc->mif6c_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold = vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags & MIFF_REGISTER) + v->link = dev->iflink; + + /* And finish update 
writing critical data */ + write_lock_bh(&mrt_lock); + v->dev = dev; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (v->flags & MIFF_REGISTER) + mrt->mroute_reg_vif_num = vifi; +#endif + if (vifi + 1 > mrt->maxvif) + mrt->maxvif = vifi + 1; + write_unlock_bh(&mrt_lock); + return 0; +} + +static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp) +{ + int line = MFC6_HASH(mcastgrp, origin); + struct mfc6_cache *c; + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, origin) && + ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) + return c; + } + return NULL; +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc6_cache *ip6mr_cache_alloc(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); + if (c == NULL) + return NULL; + c->mfc_un.res.minvif = MAXMIFS; + return c; +} + +static struct mfc6_cache *ip6mr_cache_alloc_unres(void) +{ + struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); + if (c == NULL) + return NULL; + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10 * HZ; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, + struct mfc6_cache *uc, struct mfc6_cache *c) +{ + struct sk_buff *skb; + + /* + * Play the pending entries through our router + */ + + while((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (ipv6_hdr(skb)->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct ipv6hdr)); + + if (__ip6mr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr *)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + 
rtnl_unicast(skb, net, NETLINK_CB(skb).pid); + } else + ip6_mr_forward(net, mrt, skb, c); + } +} + +/* + * Bounce a cache query up to pim6sd. We could use netlink for this but pim6sd + * expects the following bizarre scheme. + * + * Called under mrt_lock. + */ + +static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, + mifi_t mifi, int assert) +{ + struct sk_buff *skb; + struct mrt6msg *msg; + int ret; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) + +sizeof(*msg)); + else +#endif + skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); + + if (!skb) + return -ENOBUFS; + + /* I suppose that internal messages + * do not require checksums */ + + skb->ip_summed = CHECKSUM_UNNECESSARY; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (assert == MRT6MSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix length etc. + And all this only to mangle msg->im6_msgtype and + to set msg->im6_mbz to "mbz" :-) + */ + skb_push(skb, -skb_network_offset(pkt)); + + skb_push(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + msg->im6_mbz = 0; + msg->im6_msgtype = MRT6MSG_WHOLEPKT; + msg->im6_mif = mrt->mroute_reg_vif_num; + msg->im6_pad = 0; + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else +#endif + { + /* + * Copy the IP header + */ + + skb_put(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); + + /* + * Add our header + */ + skb_put(skb, sizeof(*msg)); + skb_reset_transport_header(skb); + msg = (struct mrt6msg *)skb_transport_header(skb); + + msg->im6_mbz = 0; + msg->im6_msgtype = assert; + msg->im6_mif = mifi; + msg->im6_pad = 0; + msg->im6_src = ipv6_hdr(pkt)->saddr; + msg->im6_dst = ipv6_hdr(pkt)->daddr; + + skb_dst_set(skb, 
dst_clone(skb_dst(pkt))); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (mrt->mroute6_sk == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + /* + * Deliver to user space multicast routing algorithms + */ + ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); + if (ret < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute6: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! + */ + +static int +ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) +{ + bool found = false; + int err; + struct mfc6_cache *c; + + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && + ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { + found = true; + break; + } + } + + if (!found) { + /* + * Create a new entry if allowable + */ + + if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || + (c = ip6mr_cache_alloc_unres()) == NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* + * Fill in the new cache entry + */ + c->mf6c_parent = -1; + c->mf6c_origin = ipv6_hdr(skb)->saddr; + c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; + + /* + * Reflect first query at pim6sd + */ + err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); + if (err < 0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + ip6mr_cache_free(c); + kfree_skb(skb); + return err; + } + + atomic_inc(&mrt->cache_resolve_queue_len); + list_add(&c->list, &mrt->mfc6_unres_queue); + + ipmr_do_expire_process(mrt); + } + + /* + * See if we can append the packet + */ + if (c->mfc_un.unres.unresolved.qlen > 3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC6 
cache manipulation by user space + */ + +static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc) +{ + int line; + struct mfc6_cache *c, *next; + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + ip6mr_cache_free(c); + return 0; + } + } + return -ENOENT; +} + +static int ip6mr_device_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct mr6_table *mrt; + struct mif_device *v; + int ct; + LIST_HEAD(list); + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + + ip6mr_for_each_table(mrt, net) { + v = &mrt->vif6_table[0]; + for (ct = 0; ct < mrt->maxvif; ct++, v++) { + if (v->dev == dev) + mif6_delete(mrt, ct, &list); + } + } + unregister_netdevice_many(&list); + + return NOTIFY_DONE; +} + +static struct notifier_block ip6_mr_notifier = { + .notifier_call = ip6mr_device_event +}; + +/* + * Setup for IP multicast routing + */ + +static int __net_init ip6mr_net_init(struct net *net) +{ + int err; + + err = ip6mr_rules_init(net); + if (err < 0) + goto fail; + +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (!proc_net_fops_create(net, "ip6_mr_vif", 0, &ip6mr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(net, "ip6_mr_cache", 0, &ip6mr_mfc_fops)) + goto proc_cache_fail; +#endif + + return 0; + +#ifdef CONFIG_PROC_FS +proc_cache_fail: + proc_net_remove(net, "ip6_mr_vif"); +proc_vif_fail: + ip6mr_rules_exit(net); +#endif +fail: + return err; +} + +static void __net_exit ip6mr_net_exit(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ip6_mr_cache"); + proc_net_remove(net, "ip6_mr_vif"); +#endif + 
ip6mr_rules_exit(net); +} + +static struct pernet_operations ip6mr_net_ops = { + .init = ip6mr_net_init, + .exit = ip6mr_net_exit, +}; + +int __init ip6_mr_init(void) +{ + int err; + + mrt_cachep = kmem_cache_create("ip6_mrt_cache", + sizeof(struct mfc6_cache), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!mrt_cachep) + return -ENOMEM; + + err = register_pernet_subsys(&ip6mr_net_ops); + if (err) + goto reg_pernet_fail; + + err = register_netdevice_notifier(&ip6_mr_notifier); + if (err) + goto reg_notif_fail; +#ifdef CONFIG_IPV6_PIMSM_V2 + if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { + printk(KERN_ERR "ip6_mr_init: can't add PIM protocol\n"); + err = -EAGAIN; + goto add_proto_fail; + } +#endif + rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL, + ip6mr_rtm_dumproute, NULL); + return 0; +#ifdef CONFIG_IPV6_PIMSM_V2 +add_proto_fail: + unregister_netdevice_notifier(&ip6_mr_notifier); +#endif +reg_notif_fail: + unregister_pernet_subsys(&ip6mr_net_ops); +reg_pernet_fail: + kmem_cache_destroy(mrt_cachep); + return err; +} + +void ip6_mr_cleanup(void) +{ + unregister_netdevice_notifier(&ip6_mr_notifier); + unregister_pernet_subsys(&ip6mr_net_ops); + kmem_cache_destroy(mrt_cachep); +} + +static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, + struct mf6cctl *mfc, int mrtsock) +{ + bool found = false; + int line; + struct mfc6_cache *uc, *c; + unsigned char ttls[MAXMIFS]; + int i; + + if (mfc->mf6cc_parent >= MAXMIFS) + return -ENFILE; + + memset(ttls, 255, MAXMIFS); + for (i = 0; i < MAXMIFS; i++) { + if (IF_ISSET(i, &mfc->mf6cc_ifset)) + ttls[i] = 1; + + } + + line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + + list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { + if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && + ipv6_addr_equal(&c->mf6c_mcastgrp, &mfc->mf6cc_mcastgrp.sin6_addr)) { + found = true; + break; + } + } + + if (found) { + write_lock_bh(&mrt_lock); + c->mf6c_parent = 
mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + return 0; + } + + if (!ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) + return -EINVAL; + + c = ip6mr_cache_alloc(); + if (c == NULL) + return -ENOMEM; + + c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; + c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; + c->mf6c_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, c, ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + write_lock_bh(&mrt_lock); + list_add(&c->list, &mrt->mfc6_cache_array[line]); + write_unlock_bh(&mrt_lock); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. + */ + found = false; + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && + ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { + list_del(&uc->list); + atomic_dec(&mrt->cache_resolve_queue_len); + found = true; + break; + } + } + if (list_empty(&mrt->mfc6_unres_queue)) + del_timer(&mrt->ipmr_expire_timer); + spin_unlock_bh(&mfc_unres_lock); + + if (found) { + ip6mr_cache_resolve(net, mrt, uc, c); + ip6mr_cache_free(uc); + } + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct mr6_table *mrt) +{ + int i; + LIST_HEAD(list); + struct mfc6_cache *c, *next; + + /* + * Shut down all active vif entries + */ + for (i = 0; i < mrt->maxvif; i++) { + if (!(mrt->vif6_table[i].flags & VIFF_STATIC)) + mif6_delete(mrt, i, &list); + } + unregister_netdevice_many(&list); + + /* + * Wipe the cache + */ + for (i = 0; i < MFC6_LINES; i++) { + list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { + if (c->mfc_flags & MFC_STATIC) + continue; + write_lock_bh(&mrt_lock); + list_del(&c->list); + write_unlock_bh(&mrt_lock); + + ip6mr_cache_free(c); + } + } + + if 
(atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_del(&c->list); + ip6mr_destroy_unres(mrt, c); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) +{ + int err = 0; + struct net *net = sock_net(sk); + + rtnl_lock(); + write_lock_bh(&mrt_lock); + if (likely(mrt->mroute6_sk == NULL)) { + mrt->mroute6_sk = sk; + net->ipv6.devconf_all->mc_forwarding++; + } + else + err = -EADDRINUSE; + write_unlock_bh(&mrt_lock); + + rtnl_unlock(); + + return err; +} + +int ip6mr_sk_done(struct sock *sk) +{ + int err = -EACCES; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + rtnl_lock(); + ip6mr_for_each_table(mrt, net) { + if (sk == mrt->mroute6_sk) { + write_lock_bh(&mrt_lock); + mrt->mroute6_sk = NULL; + net->ipv6.devconf_all->mc_forwarding--; + write_unlock_bh(&mrt_lock); + + mroute_clean_tables(mrt); + err = 0; + break; + } + } + rtnl_unlock(); + + return err; +} + +struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +{ + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->skb_iif, + .flowi6_oif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + + if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) + return NULL; + + return mrt->mroute6_sk; +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen) +{ + int ret; + struct mif6ctl vif; + struct mf6cctl mfc; + mifi_t mifi; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + if (optname != MRT6_INIT) { + if (sk != mrt->mroute6_sk && !capable(CAP_NET_ADMIN)) + return -EACCES; + } + + switch (optname) { + case MRT6_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + if (optlen < sizeof(int)) + return -EINVAL; + + return ip6mr_sk_init(mrt, sk); + + case MRT6_DONE: + return ip6mr_sk_done(sk); + + case MRT6_ADD_MIF: + if (optlen < sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif, optval, sizeof(vif))) + return -EFAULT; + if (vif.mif6c_mifi >= MAXMIFS) + return -ENFILE; + rtnl_lock(); + ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); + rtnl_unlock(); + return ret; + + case MRT6_DEL_MIF: + if (optlen < sizeof(mifi_t)) + return -EINVAL; + if (copy_from_user(&mifi, optval, sizeof(mifi_t))) + return -EFAULT; + rtnl_lock(); + ret = mif6_delete(mrt, mifi, NULL); + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. 
+ */ + case MRT6_ADD_MFC: + case MRT6_DEL_MFC: + if (optlen < sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc, optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname == MRT6_DEL_MFC) + ret = ip6mr_mfc_delete(mrt, &mfc); + else + ret = ip6mr_mfc_add(net, mrt, &mfc, sk == mrt->mroute6_sk); + rtnl_unlock(); + return ret; + + /* + * Control PIM assert (to activate pim will activate assert) + */ + case MRT6_ASSERT: + { + int v; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + mrt->mroute_do_assert = !!v; + return 0; + } + +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + { + int v; + if (get_user(v, (int __user *)optval)) + return -EFAULT; + v = !!v; + rtnl_lock(); + ret = 0; + if (v != mrt->mroute_do_pim) { + mrt->mroute_do_pim = v; + mrt->mroute_do_assert = v; + } + rtnl_unlock(); + return ret; + } + +#endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + case MRT6_TABLE: + { + u32 v; + + if (optlen != sizeof(u32)) + return -EINVAL; + if (get_user(v, (u32 __user *)optval)) + return -EFAULT; + if (sk == mrt->mroute6_sk) + return -EBUSY; + + rtnl_lock(); + ret = 0; + if (!ip6mr_new_table(net, v)) + ret = -ENOMEM; + raw6_sk(sk)->ip6mr_table = v; + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT6_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, + int __user *optlen) +{ + int olr; + int val; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? 
: RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (optname) { + case MRT6_VERSION: + val = 0x0305; + break; +#ifdef CONFIG_IPV6_PIMSM_V2 + case MRT6_PIM: + val = mrt->mroute_do_pim; + break; +#endif + case MRT6_ASSERT: + val = mrt->mroute_do_assert; + break; + default: + return -ENOPROTOOPT; + } + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if (put_user(olr, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. + */ + +int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req6 sr; + struct sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +struct 
compat_sioc_sg_req6 { + struct sockaddr_in6 src; + struct sockaddr_in6 grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_mif_req6 { + mifi_t mifi; + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req6 sr; + struct compat_sioc_mif_req6 vr; + struct mif_device *vif; + struct mfc6_cache *c; + struct net *net = sock_net(sk); + struct mr6_table *mrt; + + mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETMIFCNT_IN6: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.mifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif6_table[vr.mifi]; + if (MIF_EXISTS(mrt, vr.mifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT_IN6: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif + +static inline int ip6mr_forward2_finish(struct sk_buff *skb) +{ + IP6_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTFORWDATAGRAMS); + return dst_output(skb); +} + +/* + * Processing handlers for ip6mr_forward + */ + +static int ip6mr_forward2(struct net 
*net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *c, int vifi) +{ + struct ipv6hdr *ipv6h; + struct mif_device *vif = &mrt->vif6_table[vifi]; + struct net_device *dev; + struct dst_entry *dst; + struct flowi6 fl6; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IPV6_PIMSM_V2 + if (vif->flags & MIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out += skb->len; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; + ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); + goto out_free; + } +#endif + + ipv6h = ipv6_hdr(skb); + + fl6 = (struct flowi6) { + .flowi6_oif = vif->link, + .daddr = ipv6h->daddr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + goto out_free; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. + */ + dev = vif->dev; + skb->dev = dev; + vif->pkt_out++; + vif->bytes_out += skb->len; + + /* We are about to write */ + /* XXX: extension headers? 
*/ + if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev))) + goto out_free; + + ipv6h = ipv6_hdr(skb); + ipv6h->hop_limit--; + + IP6CB(skb)->flags |= IP6SKB_FORWARDED; + + return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dev, + ip6mr_forward2_finish); + +out_free: + kfree_skb(skb); + return 0; +} + +static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) +{ + int ct; + + for (ct = mrt->maxvif - 1; ct >= 0; ct--) { + if (mrt->vif6_table[ct].dev == dev) + break; + } + return ct; +} + +static int ip6_mr_forward(struct net *net, struct mr6_table *mrt, + struct sk_buff *skb, struct mfc6_cache *cache) +{ + int psend = -1; + int vif, ct; + + vif = cache->mf6c_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (mrt->vif6_table[vif].dev != skb->dev) { + int true_vifi; + + cache->mfc_un.res.wrong_if++; + true_vifi = ip6mr_find_vif(mrt, skb->dev); + + if (true_vifi >= 0 && mrt->mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... 
--ANK + */ + (mrt->mroute_do_pim || + cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); + } + goto dont_forward; + } + + mrt->vif6_table[vif].pkt_in++; + mrt->vif6_table[vif].bytes_in += skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { + if (ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_forward2(net, mrt, skb2, cache, psend); + } + psend = ct; + } + } + if (psend != -1) { + ip6mr_forward2(net, mrt, skb, cache, psend); + return 0; + } + +dont_forward: + kfree_skb(skb); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip6_mr_input(struct sk_buff *skb) +{ + struct mfc6_cache *cache; + struct net *net = dev_net(skb->dev); + struct mr6_table *mrt; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .flowi6_mark = skb->mark, + }; + int err; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + + /* + * No usable cache entry + */ + if (cache == NULL) { + int vif; + + vif = ip6mr_find_vif(mrt, skb->dev); + if (vif >= 0) { + int err = ip6mr_cache_unresolved(mrt, vif, skb); + read_unlock(&mrt_lock); + + return err; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + ip6_mr_forward(net, mrt, skb, cache); + + read_unlock(&mrt_lock); + + return 0; +} + + +static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, + struct mfc6_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + u8 *b = skb_tail_pointer(skb); + struct rtattr *mp_head; + + /* If cache is unresolved, don't try to 
parse IIF and OIF */ + if (c->mf6c_parent >= MAXMIFS) + return -ENOENT; + + if (MIF_EXISTS(mrt, c->mf6c_parent)) + RTA_PUT(skb, RTA_IIF, 4, &mrt->vif6_table[c->mf6c_parent].dev->ifindex); + + mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +int ip6mr_get_route(struct net *net, + struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + int err; + struct mr6_table *mrt; + struct mfc6_cache *cache; + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + + mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); + if (mrt == NULL) + return -ENOENT; + + read_lock(&mrt_lock); + cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); + + if (!cache) { + struct sk_buff *skb2; + struct ipv6hdr *iph; + struct net_device *dev; + int vif; + + if (nowait) { + read_unlock(&mrt_lock); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ip6mr_find_vif(mrt, dev)) < 0) { + read_unlock(&mrt_lock); + return -ENODEV; + } + + /* really correct? 
*/
+		skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
+		if (!skb2) {
+			read_unlock(&mrt_lock);
+			return -ENOMEM;
+		}
+
+		skb_reset_transport_header(skb2);
+
+		skb_put(skb2, sizeof(struct ipv6hdr));
+		skb_reset_network_header(skb2);
+
+		iph = ipv6_hdr(skb2);
+		iph->version = 0;
+		iph->priority = 0;
+		iph->flow_lbl[0] = 0;
+		iph->flow_lbl[1] = 0;
+		iph->flow_lbl[2] = 0;
+		iph->payload_len = 0;
+		iph->nexthdr = IPPROTO_NONE;
+		iph->hop_limit = 0;
+		iph->saddr = rt->rt6i_src.addr;
+		iph->daddr = rt->rt6i_dst.addr;
+
+		err = ip6mr_cache_unresolved(mrt, vif, skb2);
+		read_unlock(&mrt_lock);
+
+		return err;
+	}
+
+	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+		cache->mfc_flags |= MFC_NOTIFY;
+
+	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
+	read_unlock(&mrt_lock);
+	return err;
+}
+
+/*
+ *	Append one RTM_NEWROUTE message describing cache entry @c of table
+ *	@mrt to @skb, as part of a multi-part dump addressed to @pid/@seq.
+ *
+ *	The message carries the table id, the 128-bit origin (RTA_SRC) and
+ *	group (RTA_DST), and the interface attributes added by
+ *	__ip6mr_fill_mroute().
+ *
+ *	Returns the result of nlmsg_end() on success. Returns -EMSGSIZE when
+ *	the header or an attribute does not fit or __ip6mr_fill_mroute()
+ *	fails; a partially built message is cancelled first.
+ */
+static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+			     u32 pid, u32 seq, struct mfc6_cache *c)
+{
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
+	if (nlh == NULL)
+		return -EMSGSIZE;
+
+	rtm = nlmsg_data(nlh);
+	/*
+	 * This is the IPv6 multicast routing family: the dump handler is
+	 * registered under RTNL_FAMILY_IP6MR and the addresses below are
+	 * 128 bits wide, so the IPv4 value RTNL_FAMILY_IPMR is wrong here.
+	 */
+	rtm->rtm_family   = RTNL_FAMILY_IP6MR;
+	rtm->rtm_dst_len  = 128;
+	rtm->rtm_src_len  = 128;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = mrt->id;
+	NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+	rtm->rtm_protocol = RTPROT_UNSPEC;
+	rtm->rtm_flags    = 0;
+
+	NLA_PUT(skb, RTA_SRC, 16, &c->mf6c_origin);
+	NLA_PUT(skb, RTA_DST, 16, &c->mf6c_mcastgrp);
+
+	if (__ip6mr_fill_mroute(mrt, skb, c, rtm) < 0)
+		goto nla_put_failure;
+
+	return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct mr6_table *mrt;
+	struct mfc6_cache *mfc;
+	unsigned int t = 0, s_t;
+	unsigned int h = 0, s_h;
+	unsigned int e = 0, s_e;
+
+	s_t = cb->args[0];
+	s_h = cb->args[1];
+	s_e = cb->args[2];
+
+
read_lock(&mrt_lock); + ip6mr_for_each_table(mrt, net) { + if (t < s_t) + goto next_table; + if (t > s_t) + s_h = 0; + for (h = s_h; h < MFC6_LINES; h++) { + list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { + if (e < s_e) + goto next_entry; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + mfc) < 0) + goto done; +next_entry: + e++; + } + e = s_e = 0; + } + s_h = 0; +next_table: + t++; + } +done: + read_unlock(&mrt_lock); + + cb->args[2] = e; + cb->args[1] = h; + cb->args[0] = t; + + return skb->len; +} diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c new file mode 100644 index 00000000..bba658d9 --- /dev/null +++ b/net/ipv6/ipcomp6.c @@ -0,0 +1,218 @@ +/* + * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Author Mitsuru KANDA <mk@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * [Memo] + * + * Outbound: + * The compression of IP datagram MUST be done before AH/ESP processing, + * fragmentation, and the addition of Hop-by-Hop/Routing header. + * + * Inbound: + * The decompression of IP datagram MUST be done after the reassembly, + * AH/ESP processing. 
+ */ +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/ipcomp.h> +#include <linux/crypto.h> +#include <linux/err.h> +#include <linux/pfkeyv2.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <linux/smp.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/rtnetlink.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/mutex.h> +/* + * ICMPv6 error handler for IPComp. Only ICMPV6_DEST_UNREACH and + * ICMPV6_PKT_TOOBIG are considered. The 16-bit CPI found at skb->data + offset + * is widened into an SPI and used, with the destination address of the IPv6 + * header at skb->data, to look up the xfrm state. When a state is found the + * handler only logs a debug line and drops the reference it took. + */ +static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct net *net = dev_net(skb->dev); + __be32 spi; + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct ip_comp_hdr *ipcomph = + (struct ip_comp_hdr *)(skb->data + offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + return; + + spi = htonl(ntohs(ipcomph->cpi)); + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%pI6\n", + spi, &iph->daddr); + xfrm_state_put(x); +} +/* + * Allocate and initialise the IPPROTO_IPV6 tunnel state that accompanies the + * IPComp state x. The SPI is allocated from x's source address; destination, + * selector, mode, source address and mark are copied from x. On success the + * state is returned with tunnel_users set to 1. Returns NULL if allocation, + * SPI allocation or xfrm_init_state() fails; a partially set up state is + * marked XFRM_STATE_DEAD and released first. + */ +static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + struct xfrm_state *t = NULL; + + t = xfrm_state_alloc(net); + if (!t) + goto out; + + t->id.proto = IPPROTO_IPV6; + t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); + if (!t->id.spi) + goto error; + + memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET6; + t->props.mode = x->props.mode; + memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + memcpy(&t->mark, &x->mark, sizeof(t->mark)); + + if (xfrm_init_state(t)) + goto error; + + atomic_set(&t->tunnel_users, 1); + +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto 
out; +} +/* + * Attach a tunnel state to x. An existing IPPROTO_IPV6 state is reused when + * an SPI is already registered for x's source address and the lookup by + * destination address succeeds; otherwise a new one is created and inserted. + * x->tunnel is set and the tunnel's tunnel_users count is incremented. + * Returns 0, or -EINVAL when a new tunnel state could not be created. + */ +static int ipcomp6_tunnel_attach(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + int err = 0; + struct xfrm_state *t = NULL; + __be32 spi; + u32 mark = x->mark.m & x->mark.v; + + spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr); + if (spi) + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr, + spi, IPPROTO_IPV6, AF_INET6); + if (!t) { + t = ipcomp6_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); + +out: + return err; +} +/* + * xfrm_type init_state hook. header_len is 0 in transport mode and + * sizeof(struct ipv6hdr) in tunnel mode; any other mode is rejected with + * -EINVAL. After the common ipcomp_init_state() succeeds, tunnel mode also + * attaches the companion tunnel state. Returns 0 or a negative errno. + */ +static int ipcomp6_init_state(struct xfrm_state *x) +{ + int err = -EINVAL; + + x->props.header_len = 0; + switch (x->props.mode) { + case XFRM_MODE_TRANSPORT: + break; + case XFRM_MODE_TUNNEL: + x->props.header_len += sizeof(struct ipv6hdr); + break; + default: + goto out; + } + + err = ipcomp_init_state(x); + if (err) + goto out; + + if (x->props.mode == XFRM_MODE_TUNNEL) { + err = ipcomp6_tunnel_attach(x); + if (err) + goto out; + } + + err = 0; +out: + return err; +} +/* xfrm type for IPPROTO_COMP; registered for AF_INET6 by ipcomp6_init(). */ +static const struct xfrm_type ipcomp6_type = +{ + .description = "IPCOMP6", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp6_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output, + .hdr_offset = xfrm6_find_1stfragopt, +}; +/* inet6 protocol entry for IPPROTO_COMP; errors are routed to ipcomp6_err(). */ +static const struct inet6_protocol ipcomp6_protocol = +{ + .handler = xfrm6_rcv, + .err_handler = ipcomp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; +/* + * Module init: register the xfrm type, then the inet6 protocol handler. If + * the protocol cannot be added the xfrm type is unregistered again. Either + * failure is reported as -EAGAIN. + */ +static int __init ipcomp6_init(void) +{ + if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); + return -EAGAIN; + } + return 0; +} +/* + * Module exit: remove the protocol handler, then the xfrm type (the reverse + * of ipcomp6_init()). Failures are only logged. + */ +static void __exit ipcomp6_fini(void) +{ + if (inet6_del_protocol(&ipcomp6_protocol, 
IPPROTO_COMP) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp6_init); +module_exit(ipcomp6_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); +MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>"); + +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c new file mode 100644 index 00000000..63dd1f89 --- /dev/null +++ b/net/ipv6/ipv6_sockglue.c @@ -0,0 +1,1326 @@ +/* + * IPv6 BSD socket options interface + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/net/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * FIXME: Make the setsockopt code POSIX compliant: That is + * + * o Truncate getsockopt returns + * o Return an optlen of the truncated length if need be + * + * Changes: + * David L Stevens <dlstevens@us.ibm.com>: + * - added multicast source filtering API for MLDv2 + */ + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/mroute6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/netfilter.h> +#include <linux/slab.h> + +#include <net/sock.h> +#include <net/snmp.h> +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/udplite.h> +#include <net/xfrm.h> +#include <net/compat.h> + +#include <asm/uaccess.h> + +struct ip6_ra_chain *ip6_ra_chain; +DEFINE_RWLOCK(ip6_ra_lock); + +int ip6_ra_control(struct sock *sk, int sel) +{ + struct ip6_ra_chain *ra, *new_ra, **rap; + + /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) + return -ENOPROTOOPT; + + new_ra = (sel>=0) ? 
kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip6_ra_lock); + for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (sel>=0) { + write_unlock_bh(&ip6_ra_lock); + kfree(new_ra); + return -EADDRINUSE; + } + + *rap = ra->next; + write_unlock_bh(&ip6_ra_lock); + + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip6_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->sel = sel; + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip6_ra_lock); + return 0; +} + +static +struct ipv6_txoptions *ipv6_update_options(struct sock *sk, + struct ipv6_txoptions *opt) +{ + if (inet_sk(sk)->is_icsk) { + if (opt && + !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { + struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + } + opt = xchg(&inet6_sk(sk)->opt, opt); + } else { + spin_lock(&sk->sk_dst_lock); + opt = xchg(&inet6_sk(sk)->opt, opt); + spin_unlock(&sk->sk_dst_lock); + } + sk_dst_reset(sk); + + return opt; +} + +static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int val, valbool; + int retv = -ENOPROTOOPT; + + if (optval == NULL) + val=0; + else { + if (optlen >= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else + val = 0; + } + + valbool = (val!=0); + + if (ip6_mroute_opt(optname)) + return ip6_mroute_setsockopt(sk, optname, optval, optlen); + + lock_sock(sk); + + switch (optname) { + + case IPV6_ADDRFORM: + if (optlen < sizeof(int)) + goto e_inval; + if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + + if (sk->sk_type == SOCK_RAW) + break; + + if (sk->sk_protocol == IPPROTO_UDP || + 
sk->sk_protocol == IPPROTO_UDPLITE) { + struct udp_sock *up = udp_sk(sk); + if (up->pending == AF_INET6) { + retv = -EBUSY; + break; + } + } else if (sk->sk_protocol != IPPROTO_TCP) + break; + + if (sk->sk_state != TCP_ESTABLISHED) { + retv = -ENOTCONN; + break; + } + + if (ipv6_only_sock(sk) || + !ipv6_addr_v4mapped(&np->daddr)) { + retv = -EADDRNOTAVAIL; + break; + } + + fl6_free_socklist(sk); + ipv6_sock_mc_close(sk); + + /* + * Sock is moving from IPv6 to IPv4 (sk_prot), so + * remove it from the refcnt debug socks count in the + * original family... + */ + sk_refcnt_debug_dec(sk); + + if (sk->sk_protocol == IPPROTO_TCP) { + struct inet_connection_sock *icsk = inet_csk(sk); + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, &tcp_prot, 1); + local_bh_enable(); + sk->sk_prot = &tcp_prot; + icsk->icsk_af_ops = &ipv4_specific; + sk->sk_socket->ops = &inet_stream_ops; + sk->sk_family = PF_INET; + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + } else { + struct proto *prot = &udp_prot; + + if (sk->sk_protocol == IPPROTO_UDPLITE) + prot = &udplite_prot; + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + sock_prot_inuse_add(net, prot, 1); + local_bh_enable(); + sk->sk_prot = prot; + sk->sk_socket->ops = &inet_dgram_ops; + sk->sk_family = PF_INET; + } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + kfree_skb(pktopt); + + sk->sk_destruct = inet_sock_destruct; + /* + * ... and add it to the refcnt debug socks count + * in the new family. 
-acme + */ + sk_refcnt_debug_inc(sk); + module_put(THIS_MODULE); + retv = 0; + break; + } + goto e_inval; + + case IPV6_V6ONLY: + if (optlen < sizeof(int) || + inet_sk(sk)->inet_num) + goto e_inval; + np->ipv6only = valbool; + retv = 0; + break; + + case IPV6_RECVPKTINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxinfo = valbool; + retv = 0; + break; + + case IPV6_2292PKTINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxoinfo = valbool; + retv = 0; + break; + + case IPV6_RECVHOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_2292HOPLIMIT: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxohlim = valbool; + retv = 0; + break; + + case IPV6_RECVRTHDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.srcrt = valbool; + retv = 0; + break; + + case IPV6_2292RTHDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.osrcrt = valbool; + retv = 0; + break; + + case IPV6_RECVHOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_2292HOPOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.ohopopts = valbool; + retv = 0; + break; + + case IPV6_RECVDSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_2292DSTOPTS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.odstopts = valbool; + retv = 0; + break; + + case IPV6_TCLASS: + if (optlen < sizeof(int)) + goto e_inval; + if (val < -1 || val > 0xff) + goto e_inval; + /* RFC 3542, 6.5: default traffic class of 0x0 */ + if (val == -1) + val = 0; + np->tclass = val; + retv = 0; + break; + + case IPV6_RECVTCLASS: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxtclass = valbool; + retv = 0; + break; + + case IPV6_FLOWINFO: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxflow = valbool; + retv = 0; + 
break; + + case IPV6_RECVPATHMTU: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxpmtu = valbool; + retv = 0; + break; + + case IPV6_TRANSPARENT: + if (valbool && !capable(CAP_NET_ADMIN) && !capable(CAP_NET_RAW)) { + retv = -EPERM; + break; + } + if (optlen < sizeof(int)) + goto e_inval; + /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ + inet_sk(sk)->transparent = valbool; + retv = 0; + break; + + case IPV6_RECVORIGDSTADDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxorigdstaddr = valbool; + retv = 0; + break; + + case IPV6_HOPOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + { + struct ipv6_txoptions *opt; + + /* remove any sticky options header with a zero option + * length, per RFC3542. + */ + if (optlen == 0) + optval = NULL; + else if (optval == NULL) + goto e_inval; + else if (optlen < sizeof(struct ipv6_opt_hdr) || + optlen & 0x7 || optlen > 8 * 255) + goto e_inval; + + /* hop-by-hop / destination options are privileged option */ + retv = -EPERM; + if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW)) + break; + + opt = ipv6_renew_options(sk, np->opt, optname, + (struct ipv6_opt_hdr __user *)optval, + optlen); + if (IS_ERR(opt)) { + retv = PTR_ERR(opt); + break; + } + + /* routing header option needs extra check */ + retv = -EINVAL; + if (optname == IPV6_RTHDR && opt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = opt->srcrt; + switch (rthdr->type) { +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPV6_SRCRT_TYPE_2: + if (rthdr->hdrlen != 2 || + rthdr->segments_left != 1) + goto sticky_done; + + break; +#endif + default: + goto sticky_done; + } + } + + retv = 0; + opt = ipv6_update_options(sk, opt); +sticky_done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + + case IPV6_PKTINFO: + { + struct in6_pktinfo pkt; + + if (optlen == 0) + goto e_inval; + else if (optlen < sizeof(struct in6_pktinfo) || optval == NULL) + goto 
e_inval; + + if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) { + retv = -EFAULT; + break; + } + if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + goto e_inval; + + np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; + np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; + retv = 0; + break; + } + + case IPV6_2292PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + struct flowi6 fl6; + int junk; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + + if (optlen == 0) + goto update; + + /* 1K is probably excessive + * 1K is surely not enough, 2K per standard header is 16K. + */ + retv = -EINVAL; + if (optlen > 64*1024) + break; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, &junk, + &junk); + if (retv) + goto done; +update: + retv = 0; + opt = ipv6_update_options(sk, opt); +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + case IPV6_UNICAST_HOPS: + if (optlen < sizeof(int)) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->hop_limit = val; + retv = 0; + break; + + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->mcast_hops = (val == -1 ? 
IPV6_DEFAULT_MCASTHOPS : val); + retv = 0; + break; + + case IPV6_MULTICAST_LOOP: + if (optlen < sizeof(int)) + goto e_inval; + if (val != valbool) + goto e_inval; + np->mc_loop = valbool; + retv = 0; + break; + + case IPV6_UNICAST_IF: + { + struct net_device *dev = NULL; + int ifindex; + + if (optlen != sizeof(int)) + goto e_inval; + + ifindex = (__force int)ntohl((__force __be32)val); + if (ifindex == 0) { + np->ucast_oif = 0; + retv = 0; + break; + } + + dev = dev_get_by_index(net, ifindex); + retv = -EADDRNOTAVAIL; + if (!dev) + break; + dev_put(dev); + + retv = -EINVAL; + if (sk->sk_bound_dev_if) + break; + + np->ucast_oif = ifindex; + retv = 0; + break; + } + + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + break; + if (optlen < sizeof(int)) + goto e_inval; + + if (val) { + struct net_device *dev; + + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) + goto e_inval; + + dev = dev_get_by_index(net, val); + if (!dev) { + retv = -ENODEV; + break; + } + dev_put(dev); + } + np->mcast_oif = val; + retv = 0; + break; + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + { + struct ipv6_mreq mreq; + + if (optlen < sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EPROTO; + if (inet_sk(sk)->is_icsk) + break; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_ADD_MEMBERSHIP) + retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + else + retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + break; + } + case IPV6_JOIN_ANYCAST: + case IPV6_LEAVE_ANYCAST: + { + struct ipv6_mreq mreq; + + if (optlen < sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_JOIN_ANYCAST) + retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + else + retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + break; + } + 
case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in6 *psin6; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(struct group_req))) + break; + if (greq.gr_group.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + psin6 = (struct sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + retv = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + retv = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + int omode, add; + + if (optlen < sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + retv = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, + &psin6->sin6_addr); + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + retv = ip6_mc_source(add, omode, sk, &greqs); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_mld_max_msf; + struct group_filter *gsf; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + retv = -ENOBUFS; + break; + } + gsf = kmalloc(optlen,GFP_KERNEL); + if (!gsf) { + retv = -ENOBUFS; + break; + } + retv = -EFAULT; + if 
(copy_from_user(gsf, optval, optlen)) { + kfree(gsf); + break; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) { + kfree(gsf); + retv = -ENOBUFS; + break; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + kfree(gsf); + retv = -EINVAL; + break; + } + retv = ip6_mc_msfilter(sk, gsf); + kfree(gsf); + + break; + } + case IPV6_ROUTER_ALERT: + if (optlen < sizeof(int)) + goto e_inval; + retv = ip6_ra_control(sk, val); + break; + case IPV6_MTU_DISCOVER: + if (optlen < sizeof(int)) + goto e_inval; + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) + goto e_inval; + np->pmtudisc = val; + retv = 0; + break; + case IPV6_MTU: + if (optlen < sizeof(int)) + goto e_inval; + if (val && val < IPV6_MIN_MTU) + goto e_inval; + np->frag_size = val; + retv = 0; + break; + case IPV6_RECVERR: + if (optlen < sizeof(int)) + goto e_inval; + np->recverr = valbool; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + retv = 0; + break; + case IPV6_FLOWINFO_SEND: + if (optlen < sizeof(int)) + goto e_inval; + np->sndflow = valbool; + retv = 0; + break; + case IPV6_FLOWLABEL_MGR: + retv = ipv6_flowlabel_opt(sk, optval, optlen); + break; + case IPV6_IPSEC_POLICY: + case IPV6_XFRM_POLICY: + retv = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + retv = xfrm_user_policy(sk, optname, optval, optlen); + break; + + case IPV6_ADDR_PREFERENCES: + { + unsigned int pref = 0; + unsigned int prefmask = ~0; + + if (optlen < sizeof(int)) + goto e_inval; + + retv = -EINVAL; + + /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ + switch (val & (IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP| + IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { + case IPV6_PREFER_SRC_PUBLIC: + pref |= IPV6_PREFER_SRC_PUBLIC; + break; + case IPV6_PREFER_SRC_TMP: + pref |= IPV6_PREFER_SRC_TMP; + break; + case IPV6_PREFER_SRC_PUBTMP_DEFAULT: + break; + case 0: + goto pref_skip_pubtmp; + default: + goto e_inval; + } + + prefmask &= 
~(IPV6_PREFER_SRC_PUBLIC| + IPV6_PREFER_SRC_TMP); +pref_skip_pubtmp: + + /* check HOME/COA conflicts */ + switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { + case IPV6_PREFER_SRC_HOME: + break; + case IPV6_PREFER_SRC_COA: + pref |= IPV6_PREFER_SRC_COA; + case 0: + goto pref_skip_coa; + default: + goto e_inval; + } + + prefmask &= ~IPV6_PREFER_SRC_COA; +pref_skip_coa: + + /* check CGA/NONCGA conflicts */ + switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { + case IPV6_PREFER_SRC_CGA: + case IPV6_PREFER_SRC_NONCGA: + case 0: + break; + default: + goto e_inval; + } + + np->srcprefs = (np->srcprefs & prefmask) | pref; + retv = 0; + + break; + } + case IPV6_MINHOPCOUNT: + if (optlen < sizeof(int)) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + np->min_hopcount = val; + break; + case IPV6_DONTFRAG: + np->dontfrag = valbool; + retv = 0; + break; + } + + release_sock(sk); + + return retv; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +int ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && + optname != IPV6_XFRM_POLICY) { + lock_sock(sk); + err = nf_setsockopt(sk, PF_INET6, optname, optval, + optlen); + release_sock(sk); + } +#endif + return err; +} + +EXPORT_SYMBOL(ipv6_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) { + if (udp_prot.compat_setsockopt != NULL) + return udp_prot.compat_setsockopt(sk, level, optname, 
+ optval, optlen); + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + } + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) + return compat_mc_setsockopt(sk, level, optname, optval, optlen, + ipv6_setsockopt); + + err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && + optname != IPV6_XFRM_POLICY) { + lock_sock(sk); + err = compat_nf_setsockopt(sk, PF_INET6, optname, + optval, optlen); + release_sock(sk); + } +#endif + return err; +} + +EXPORT_SYMBOL(compat_ipv6_setsockopt); +#endif + +static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, + int optname, char __user *optval, int len) +{ + struct ipv6_opt_hdr *hdr; + + if (!opt) + return 0; + + switch(optname) { + case IPV6_HOPOPTS: + hdr = opt->hopopt; + break; + case IPV6_RTHDRDSTOPTS: + hdr = opt->dst0opt; + break; + case IPV6_RTHDR: + hdr = (struct ipv6_opt_hdr *)opt->srcrt; + break; + case IPV6_DSTOPTS: + hdr = opt->dst1opt; + break; + default: + return -EINVAL; /* should not happen */ + } + + if (!hdr) + return 0; + + len = min_t(unsigned int, len, ipv6_optlen(hdr)); + if (copy_to_user(optval, hdr, len)) + return -EFAULT; + return len; +} + +static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen, unsigned flags) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int len; + int val; + + if (ip6_mroute_opt(optname)) + return ip6_mroute_getsockopt(sk, optname, optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_ADDRFORM: + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_UDPLITE && + sk->sk_protocol != IPPROTO_TCP) + return -ENOPROTOOPT; + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + val = sk->sk_family; + break; + 
case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + return -EFAULT; + if (gsf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + + case IPV6_2292PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = flags; + + lock_sock(sk); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + release_sock(sk); + + if (skb) { + int err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + } else { + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + src_info.ipi6_addr = np->mcast_oif ? np->daddr : np->sticky_pktinfo.ipi6_addr; + put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxhlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + if (np->rxopt.bits.rxtclass) { + int tclass = np->rcv_tclass; + put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); + } + if (np->rxopt.bits.rxoinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif : + np->sticky_pktinfo.ipi6_ifindex; + src_info.ipi6_addr = np->mcast_oif ? 
np->daddr : np->sticky_pktinfo.ipi6_addr; + put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxohlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); + } + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IPV6_MTU: + { + struct dst_entry *dst; + + val = 0; + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + val = dst_mtu(dst); + rcu_read_unlock(); + if (!val) + return -ENOTCONN; + break; + } + + case IPV6_V6ONLY: + val = np->ipv6only; + break; + + case IPV6_RECVPKTINFO: + val = np->rxopt.bits.rxinfo; + break; + + case IPV6_2292PKTINFO: + val = np->rxopt.bits.rxoinfo; + break; + + case IPV6_RECVHOPLIMIT: + val = np->rxopt.bits.rxhlim; + break; + + case IPV6_2292HOPLIMIT: + val = np->rxopt.bits.rxohlim; + break; + + case IPV6_RECVRTHDR: + val = np->rxopt.bits.srcrt; + break; + + case IPV6_2292RTHDR: + val = np->rxopt.bits.osrcrt; + break; + + case IPV6_HOPOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + { + + lock_sock(sk); + len = ipv6_getsockopt_sticky(sk, np->opt, + optname, optval, len); + release_sock(sk); + /* check if ipv6_getsockopt_sticky() returns err code */ + if (len < 0) + return len; + return put_user(len, optlen); + } + + case IPV6_RECVHOPOPTS: + val = np->rxopt.bits.hopopts; + break; + + case IPV6_2292HOPOPTS: + val = np->rxopt.bits.ohopopts; + break; + + case IPV6_RECVDSTOPTS: + val = np->rxopt.bits.dstopts; + break; + + case IPV6_2292DSTOPTS: + val = np->rxopt.bits.odstopts; + break; + + case IPV6_TCLASS: + val = np->tclass; + break; + + case IPV6_RECVTCLASS: + val = np->rxopt.bits.rxtclass; + break; + + case IPV6_FLOWINFO: + val = np->rxopt.bits.rxflow; + break; + + case IPV6_RECVPATHMTU: + val = np->rxopt.bits.rxpmtu; + break; + + case IPV6_PATHMTU: + { + struct dst_entry *dst; + struct ip6_mtuinfo mtuinfo; + + if (len < sizeof(mtuinfo)) + return -EINVAL; + + len = sizeof(mtuinfo); + 
memset(&mtuinfo, 0, sizeof(mtuinfo)); + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + mtuinfo.ip6m_mtu = dst_mtu(dst); + rcu_read_unlock(); + if (!mtuinfo.ip6m_mtu) + return -ENOTCONN; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &mtuinfo, len)) + return -EFAULT; + + return 0; + break; + } + + case IPV6_TRANSPARENT: + val = inet_sk(sk)->transparent; + break; + + case IPV6_RECVORIGDSTADDR: + val = np->rxopt.bits.rxorigdstaddr; + break; + + case IPV6_UNICAST_HOPS: + case IPV6_MULTICAST_HOPS: + { + struct dst_entry *dst; + + if (optname == IPV6_UNICAST_HOPS) + val = np->hop_limit; + else + val = np->mcast_hops; + + if (val < 0) { + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) + val = ip6_dst_hoplimit(dst); + rcu_read_unlock(); + } + + if (val < 0) + val = sock_net(sk)->ipv6.devconf_all->hop_limit; + break; + } + + case IPV6_MULTICAST_LOOP: + val = np->mc_loop; + break; + + case IPV6_MULTICAST_IF: + val = np->mcast_oif; + break; + + case IPV6_UNICAST_IF: + val = (__force int)htonl((__u32) np->ucast_oif); + break; + + case IPV6_MTU_DISCOVER: + val = np->pmtudisc; + break; + + case IPV6_RECVERR: + val = np->recverr; + break; + + case IPV6_FLOWINFO_SEND: + val = np->sndflow; + break; + + case IPV6_ADDR_PREFERENCES: + val = 0; + + if (np->srcprefs & IPV6_PREFER_SRC_TMP) + val |= IPV6_PREFER_SRC_TMP; + else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC) + val |= IPV6_PREFER_SRC_PUBLIC; + else { + /* XXX: should we return system default? 
*/ + val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; + } + + if (np->srcprefs & IPV6_PREFER_SRC_COA) + val |= IPV6_PREFER_SRC_COA; + else + val |= IPV6_PREFER_SRC_HOME; + break; + + case IPV6_MINHOPCOUNT: + val = np->min_hopcount; + break; + + case IPV6_DONTFRAG: + val = np->dontfrag; + break; + + default: + return -ENOPROTOOPT; + } + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +int ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + + if(level != SOL_IPV6) + return -ENOPROTOOPT; + + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = nf_getsockopt(sk, PF_INET6, optname, optval, + &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + } +#endif + return err; +} + +EXPORT_SYMBOL(ipv6_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int err; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) { + if (udp_prot.compat_getsockopt != NULL) + return udp_prot.compat_getsockopt(sk, level, optname, + optval, optlen); + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + } + + if (level != SOL_IPV6) + return -ENOPROTOOPT; + + if (optname == MCAST_MSFILTER) + return compat_mc_getsockopt(sk, level, optname, optval, optlen, + ipv6_getsockopt); + + err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, + MSG_CMSG_COMPAT); +#ifdef CONFIG_NETFILTER + /* we need to exclude all possible ENOPROTOOPTs 
except default case */ + if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + err = compat_nf_getsockopt(sk, PF_INET6, + optname, optval, &len); + release_sock(sk); + if (err >= 0) + err = put_user(len, optlen); + } +#endif + return err; +} + +EXPORT_SYMBOL(compat_ipv6_getsockopt); +#endif + diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c new file mode 100644 index 00000000..b2869cab --- /dev/null +++ b/net/ipv6/mcast.c @@ -0,0 +1,2668 @@ +/* + * Multicast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * yoshfuji : fix format of router-alert option + * YOSHIFUJI Hideaki @USAGI: + * Fixed source address for MLD message based on + * <draft-ietf-magma-mld-source-05.txt>. + * YOSHIFUJI Hideaki @USAGI: + * - Ignore Queries for invalid addresses. + * - MLD for link-local addresses. 
+ * David L Stevens <dlstevens@us.ibm.com>: + * - MLDv2 support + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/times.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <net/mld.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/if_inet6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/inet_common.h> + +#include <net/ip6_checksum.h> + +/* Set to 3 to get tracing... */ +#define MCAST_DEBUG 2 + +#if MCAST_DEBUG >= 3 +#define MDBG(x) printk x +#else +#define MDBG(x) +#endif + +/* Ensure that we have struct in6_addr aligned on 32bit word. 
*/ +static void *__mld2_query_bugs[] __attribute__((__unused__)) = { + BUILD_BUG_ON_NULL(offsetof(struct mld2_query, mld2q_srcs) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_report, mld2r_grec) % 4), + BUILD_BUG_ON_NULL(offsetof(struct mld2_grec, grec_mca) % 4) +}; + +static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; + +/* Big mc list lock for all the sockets */ +static DEFINE_SPINLOCK(ipv6_sk_mc_lock); + +static void igmp6_join_group(struct ifmcaddr6 *ma); +static void igmp6_leave_group(struct ifmcaddr6 *ma); +static void igmp6_timer_handler(unsigned long data); + +static void mld_gq_timer_expire(unsigned long data); +static void mld_ifc_timer_expire(unsigned long data); +static void mld_ifc_event(struct inet6_dev *idev); +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *addr); +static void mld_clear_delrec(struct inet6_dev *idev); +static int sf_setstate(struct ifmcaddr6 *pmc); +static void sf_markstate(struct ifmcaddr6 *pmc); +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta); +static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta); +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev); + + +#define IGMP6_UNSOLICITED_IVAL (10*HZ) +#define MLD_QRV_DEFAULT 2 + +#define MLD_V1_SEEN(idev) (dev_net((idev)->dev)->ipv6.devconf_all->force_mld_version == 1 || \ + (idev)->cnf.force_mld_version == 1 || \ + ((idev)->mc_v1_seen && \ + time_before(jiffies, (idev)->mc_v1_seen))) + +#define IPV6_MLD_MAX_MSF 64 + +int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; + +/* + * socket join on multicast group + */ + +#define for_each_pmc_rcu(np, pmc) \ + for (pmc = 
rcu_dereference(np->ipv6_mc_list); \ + pmc != NULL; \ + pmc = rcu_dereference(pmc->next)) + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct net_device *dev = NULL; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + int err; + + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + rcu_read_lock(); + for_each_pmc_rcu(np, mc_lst) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + rcu_read_unlock(); + return -EADDRINUSE; + } + } + rcu_read_unlock(); + + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + + if (mc_lst == NULL) + return -ENOMEM; + + mc_lst->next = NULL; + mc_lst->addr = *addr; + + rcu_read_lock(); + if (ifindex == 0) { + struct rt6_info *rt; + rt = rt6_lookup(net, addr, NULL, 0, 0); + if (rt) { + dev = rt->dst.dev; + dst_release(&rt->dst); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (dev == NULL) { + rcu_read_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return -ENODEV; + } + + mc_lst->ifindex = dev->ifindex; + mc_lst->sfmode = MCAST_EXCLUDE; + rwlock_init(&mc_lst->sflock); + mc_lst->sflist = NULL; + + /* + * now add/increase the group membership on the device + */ + + err = ipv6_dev_mc_inc(dev, addr); + + if (err) { + rcu_read_unlock(); + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return err; + } + + spin_lock(&ipv6_sk_mc_lock); + mc_lst->next = np->ipv6_mc_list; + rcu_assign_pointer(np->ipv6_mc_list, mc_lst); + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_unlock(); + + return 0; +} + +/* + * socket leave on multicast group + */ +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + struct ipv6_mc_socklist __rcu **lnk; + struct net *net = sock_net(sk); + + spin_lock(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; + (mc_lst = 
rcu_dereference_protected(*lnk, + lockdep_is_held(&ipv6_sk_mc_lock))) !=NULL ; + lnk = &mc_lst->next) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + struct net_device *dev; + + *lnk = mc_lst->next; + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + if (dev != NULL) { + struct inet6_dev *idev = __in6_dev_get(dev); + + (void) ip6_mc_leave_src(sk, mc_lst, idev); + if (idev) + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); + return 0; + } + } + spin_unlock(&ipv6_sk_mc_lock); + + return -EADDRNOTAVAIL; +} + +/* called with rcu_read_lock() */ +static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + + if (ifindex == 0) { + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + + if (rt) { + dev = rt->dst.dev; + dst_release(&rt->dst); + } + } else + dev = dev_get_by_index_rcu(net, ifindex); + + if (!dev) + return NULL; + idev = __in6_dev_get(dev); + if (!idev) + return NULL; + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); + return NULL; + } + return idev; +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + struct net *net = sock_net(sk); + + spin_lock(&ipv6_sk_mc_lock); + while ((mc_lst = rcu_dereference_protected(np->ipv6_mc_list, + lockdep_is_held(&ipv6_sk_mc_lock))) != NULL) { + struct net_device *dev; + + np->ipv6_mc_list = mc_lst->next; + spin_unlock(&ipv6_sk_mc_lock); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = __in6_dev_get(dev); + + (void) ip6_mc_leave_src(sk, mc_lst, idev); + if (idev) + __ipv6_dev_mc_dec(idev, 
&mc_lst->addr); + } else + (void) ip6_mc_leave_src(sk, mc_lst, NULL); + rcu_read_unlock(); + + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); + + spin_lock(&ipv6_sk_mc_lock); + } + spin_unlock(&ipv6_sk_mc_lock); +} + +int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr) +{ + struct in6_addr *source, *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); + int i, j, rv; + int leavegroup = 0; + int pmclocked = 0; + int err; + + source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; + group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = -EADDRNOTAVAIL; + + for_each_pmc_rcu(inet6, pmc) { + if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) { + err = -EINVAL; + goto done; + } + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip6_mc_add_src(idev, group, omode, 0, NULL, 0); + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sfmode = omode; + } + + write_lock(&pmc->sflock); + pmclocked = 1; + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; /* err = -EADDRNOTAVAIL */ + rv = !0; + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, + sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; /* err = -EADDRNOTAVAIL */ + + /* special case - (INCLUDE, empty) == LEAVE_GROUP */ + if 
(psl->sl_count == 1 && omode == MCAST_INCLUDE) { + leavegroup = 1; + goto done; + } + + /* update the interface filter */ + ip6_mc_del_src(idev, group, omode, 1, source, 1); + + for (j=i+1; j<psl->sl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_mld_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip6_sf_socklist *newpsl; + int count = IP6_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP6_SFBLOCK; + if (psl) { + for (i=0; i<psl->sl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = *source; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip6_mc_add_src(idev, group, omode, 1, source, 1); +done: + if (pmclocked) + write_unlock(&pmc->sflock); + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + if (leavegroup) + return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); + return err; +} + +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +{ + const struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *newpsl, *psl; + struct net *net = sock_net(sk); + int leavegroup = 0; + int i, err; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if 
(!ipv6_addr_is_multicast(group)) + return -EINVAL; + if (gsf->gf_fmode != MCAST_INCLUDE && + gsf->gf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); + + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = 0; + + if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { + leavegroup = 1; + goto done; + } + + for_each_pmc_rcu(inet6, pmc) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) { /* must have a prior join */ + err = -EINVAL; + goto done; + } + if (gsf->gf_numsrc) { + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), + GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; + for (i=0; i<newpsl->sl_count; ++i) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + newpsl->sl_addr[i] = psin6->sin6_addr; + } + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, + newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else { + newpsl = NULL; + (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + } + + write_lock(&pmc->sflock); + psl = pmc->sflist; + if (psl) { + (void) ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } else + (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = gsf->gf_fmode; + write_unlock(&pmc->sflock); + err = 0; +done: + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + if (leavegroup) + err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); + return err; +} + +int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + const struct in6_addr *group; + struct ipv6_mc_socklist 
*pmc; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + struct net *net = sock_net(sk); + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + rcu_read_lock(); + idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); + + if (!idev) { + rcu_read_unlock(); + return -ENODEV; + } + + err = -EADDRNOTAVAIL; + /* + * changes to the ipv6_mc_list require the socket lock and + * a read lock on ip6_sk_mc_lock. We have the socket lock, + * so reading the list is safe. + */ + + for_each_pmc_rcu(inet6, pmc) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(group, &pmc->addr)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + count = psl ? psl->sl_count : 0; + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + /* changes to psl require the socket lock, a read lock on + * on ipv6_sk_mc_lock and a write lock on pmc->sflock. We + * have the socket lock, so reading here is safe. 
+ */ + for (i=0; i<copycount; i++) { + struct sockaddr_in6 *psin6; + struct sockaddr_storage ss; + + psin6 = (struct sockaddr_in6 *)&ss; + memset(&ss, 0, sizeof(ss)); + psin6->sin6_family = AF_INET6; + psin6->sin6_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + read_unlock_bh(&idev->lock); + rcu_read_unlock(); + return err; +} + +int inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, + const struct in6_addr *src_addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc; + struct ip6_sf_socklist *psl; + int rv = 1; + + rcu_read_lock(); + for_each_pmc_rcu(np, mc) { + if (ipv6_addr_equal(&mc->addr, mc_addr)) + break; + } + if (!mc) { + rcu_read_unlock(); + return 1; + } + read_lock(&mc->sflock); + psl = mc->sflist; + if (!psl) { + rv = mc->sfmode == MCAST_EXCLUDE; + } else { + int i; + + for (i=0; i<psl->sl_count; i++) { + if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) + break; + } + if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + rv = 0; + if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + rv = 0; + } + read_unlock(&mc->sflock); + rcu_read_unlock(); + + return rv; +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static void igmp6_group_added(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (!(mc->mca_flags&MAF_LOADED)) { + mc->mca_flags |= MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_add(dev, buf); + } + spin_unlock_bh(&mc->mca_lock); + + if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) + return; + + if (MLD_V1_SEEN(mc->idev)) { + igmp6_join_group(mc); + return; + } + /* else v2 */ + + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); +} + +static void igmp6_group_dropped(struct ifmcaddr6 *mc) +{ + struct 
net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (mc->mca_flags&MAF_LOADED) { + mc->mca_flags &= ~MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_del(dev, buf); + } + + if (mc->mca_flags & MAF_NOREPORT) + goto done; + spin_unlock_bh(&mc->mca_lock); + + if (!mc->idev->dead) + igmp6_leave_group(mc); + + spin_lock_bh(&mc->mca_lock); + if (del_timer(&mc->mca_timer)) + atomic_dec(&mc->mca_refcnt); +done: + ip6_mc_clear_src(mc); + spin_unlock_bh(&mc->mca_lock); +} + +/* + * deleted ifmcaddr6 manipulation + */ +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) +{ + struct ifmcaddr6 *pmc; + + /* this is an "ifmcaddr6" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC); + if (!pmc) + return; + + spin_lock_bh(&im->mca_lock); + spin_lock_init(&pmc->mca_lock); + pmc->idev = im->idev; + in6_dev_hold(idev); + pmc->mca_addr = im->mca_addr; + pmc->mca_crcount = idev->mc_qrv; + pmc->mca_sfmode = im->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + struct ip6_sf_list *psf; + + pmc->mca_tomb = im->mca_tomb; + pmc->mca_sources = im->mca_sources; + im->mca_tomb = im->mca_sources = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->mca_crcount; + } + spin_unlock_bh(&im->mca_lock); + + spin_lock_bh(&idev->mc_lock); + pmc->next = idev->mc_tomb; + idev->mc_tomb = pmc; + spin_unlock_bh(&idev->mc_lock); +} + +static void mld_del_delrec(struct inet6_dev *idev, const struct in6_addr *pmca) +{ + struct ifmcaddr6 *pmc, *pmc_prev; + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&idev->mc_lock); + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + if 
(ipv6_addr_equal(&pmc->mca_addr, pmca)) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + idev->mc_tomb = pmc->next; + } + spin_unlock_bh(&idev->mc_lock); + + if (pmc) { + for (psf=pmc->mca_tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in6_dev_put(pmc->idev); + kfree(pmc); + } +} + +static void mld_clear_delrec(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *nextpmc; + + spin_lock_bh(&idev->mc_lock); + pmc = idev->mc_tomb; + idev->mc_tomb = NULL; + spin_unlock_bh(&idev->mc_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree(pmc); + } + + /* clear dead sources, too */ + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->mca_lock); + psf = pmc->mca_tomb; + pmc->mca_tomb = NULL; + spin_unlock_bh(&pmc->mca_lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock_bh(&idev->lock); +} + + +/* + * device multicast group inc (add if not found) + */ +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + struct inet6_dev *idev; + + /* we need to take a reference on idev */ + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENODEV; + } + + for (mc = idev->mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, addr)) { + mc->mca_users++; + write_unlock_bh(&idev->lock); + ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, + NULL, 0); + in6_dev_put(idev); + return 0; + } + } + + /* + * not found: create a new one. 
+ */ + + mc = kzalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); + + if (mc == NULL) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENOMEM; + } + + setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc); + + mc->mca_addr = *addr; + mc->idev = idev; /* (reference taken) */ + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 2); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + mc->next = idev->mc_list; + idev->mc_list = mc; + write_unlock_bh(&idev->lock); + + mld_del_delrec(idev, &mc->mca_addr); + igmp6_group_added(mc); + ma_put(mc); + return 0; +} + +/* + * device multicast group del + */ +int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) +{ + struct ifmcaddr6 *ma, **map; + + write_lock_bh(&idev->lock); + for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addr)) { + if (--ma->mca_users == 0) { + *map = ma->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(ma); + + ma_put(ma); + return 0; + } + write_unlock_bh(&idev->lock); + return 0; + } + } + write_unlock_bh(&idev->lock); + + return -ENOENT; +} + +int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) +{ + struct inet6_dev *idev; + int err; + + rcu_read_lock(); + + idev = __in6_dev_get(dev); + if (!idev) + err = -ENODEV; + else + err = __ipv6_dev_mc_dec(idev, addr); + + rcu_read_unlock(); + return err; +} + +/* + * identify MLD packets for MLD filter exceptions + */ +int ipv6_is_mld(struct sk_buff *skb, int nexthdr) +{ + struct icmp6hdr *pic; + + if (nexthdr != IPPROTO_ICMPV6) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) + 
return 0; + + pic = icmp6_hdr(skb); + + switch (pic->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + return 1; + default: + break; + } + return 0; +} + +/* + * check if the interface/address pair is valid + */ +int ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, + const struct in6_addr *src_addr) +{ + struct inet6_dev *idev; + struct ifmcaddr6 *mc; + int rv = 0; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (mc = idev->mc_list; mc; mc=mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (mc) { + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; + + spin_lock_bh(&mc->mca_lock); + for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + mc->mca_sfcount[MCAST_EXCLUDE]; + else + rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + spin_unlock_bh(&mc->mca_lock); + } else + rv = 1; /* don't filter unspecified source */ + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return rv; +} + +static void mld_gq_start_timer(struct inet6_dev *idev) +{ + int tv = net_random() % idev->mc_maxdelay; + + idev->mc_gq_running = 1; + if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +{ + int tv = net_random() % delay; + + if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +/* + * IGMP handling (alias multicast ICMPv6 messages) + */ + +static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) +{ + unsigned long delay = resptime; + + /* Do not start timer for these addresses */ + if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || + IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + return; + + if 
(del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } + ma->mca_timer.expires = jiffies + delay; + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING; +} + +/* mark EXCLUDE-mode sources */ +static int mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) { + /* skip inactive filters */ + if (psf->sf_count[MCAST_INCLUDE] || + pmc->mca_sfcount[MCAST_EXCLUDE] != + psf->sf_count[MCAST_EXCLUDE]) + continue; + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + scount++; + break; + } + } + } + pmc->mca_flags &= ~MAF_GSQUERY; + if (scount == nsrcs) /* all sources excluded */ + return 0; + return 1; +} + +static int mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + const struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + if (pmc->mca_sfmode == MCAST_EXCLUDE) + return mld_xmarksources(pmc, nsrcs, srcs); + + /* mark INCLUDE-mode sources */ + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) { + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + psf->sf_gsresp = 1; + scount++; + break; + } + } + } + if (!scount) { + pmc->mca_flags &= ~MAF_GSQUERY; + return 0; + } + pmc->mca_flags |= MAF_GSQUERY; + return 1; +} + +/* called with rcu_read_lock() */ +int igmp6_event_query(struct sk_buff *skb) +{ + struct mld2_query *mlh2 = NULL; + struct ifmcaddr6 *ma; + const struct in6_addr *group; + unsigned long max_delay; + struct inet6_dev *idev; + struct mld_msg *mld; + int group_type; + int mark = 0; + int len; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return 
-EINVAL; + + /* compute payload length excluding extension headers */ + len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); + len -= skb_network_header_len(skb); + + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = __in6_dev_get(skb->dev); + + if (idev == NULL) + return 0; + + mld = (struct mld_msg *)icmp6_hdr(skb); + group = &mld->mld_mca; + group_type = ipv6_addr_type(group); + + if (group_type != IPV6_ADDR_ANY && + !(group_type&IPV6_ADDR_MULTICAST)) + return -EINVAL; + + if (len == 24) { + int switchback; + /* MLDv1 router present */ + + /* Translate milliseconds to jiffies */ + max_delay = (ntohs(mld->mld_maxdelay)*HZ)/1000; + + switchback = (idev->mc_qrv + 1) * max_delay; + idev->mc_v1_seen = jiffies + switchback; + + /* cancel the interface change timer */ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + } else if (len >= 28) { + int srcs_offset = sizeof(struct mld2_query) - + sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, srcs_offset)) + return -EINVAL; + + mlh2 = (struct mld2_query *)skb_transport_header(skb); + max_delay = (MLDV2_MRC(ntohs(mlh2->mld2q_mrc))*HZ)/1000; + if (!max_delay) + max_delay = 1; + idev->mc_maxdelay = max_delay; + if (mlh2->mld2q_qrv) + idev->mc_qrv = mlh2->mld2q_qrv; + if (group_type == IPV6_ADDR_ANY) { /* general query */ + if (mlh2->mld2q_nsrcs) + return -EINVAL; /* no sources allowed */ + + mld_gq_start_timer(idev); + return 0; + } + /* mark sources to include, if group & source-specific */ + if (mlh2->mld2q_nsrcs != 0) { + if (!pskb_may_pull(skb, srcs_offset + + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) + return -EINVAL; + + mlh2 = (struct mld2_query *)skb_transport_header(skb); + mark = 1; + } + } else + return -EINVAL; + + read_lock_bh(&idev->lock); + if (group_type == IPV6_ADDR_ANY) { + for (ma = 
idev->mc_list; ma; ma=ma->next) { + spin_lock_bh(&ma->mca_lock); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + } + } else { + for (ma = idev->mc_list; ma; ma=ma->next) { + if (!ipv6_addr_equal(group, &ma->mca_addr)) + continue; + spin_lock_bh(&ma->mca_lock); + if (ma->mca_flags & MAF_TIMER_RUNNING) { + /* gsquery <- gsquery && mark */ + if (!mark) + ma->mca_flags &= ~MAF_GSQUERY; + } else { + /* gsquery <- mark */ + if (mark) + ma->mca_flags |= MAF_GSQUERY; + else + ma->mca_flags &= ~MAF_GSQUERY; + } + if (!(ma->mca_flags & MAF_GSQUERY) || + mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + break; + } + } + read_unlock_bh(&idev->lock); + + return 0; +} + +/* called with rcu_read_lock() */ +int igmp6_event_report(struct sk_buff *skb) +{ + struct ifmcaddr6 *ma; + struct inet6_dev *idev; + struct mld_msg *mld; + int addr_type; + + /* Our own report looped back. Ignore it. */ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + /* send our report if the MC router may not have heard this report */ + if (skb->pkt_type != PACKET_MULTICAST && + skb->pkt_type != PACKET_BROADCAST) + return 0; + + if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) + return -EINVAL; + + mld = (struct mld_msg *)icmp6_hdr(skb); + + /* Drop reports with not link local source */ + addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); + if (addr_type != IPV6_ADDR_ANY && + !(addr_type&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = __in6_dev_get(skb->dev); + if (idev == NULL) + return -ENODEV; + + /* + * Cancel the timer for this group + */ + + read_lock_bh(&idev->lock); + for (ma = idev->mc_list; ma; ma=ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { + spin_lock(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) + atomic_dec(&ma->mca_refcnt); + ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); + spin_unlock(&ma->mca_lock); + break; + } + } + 
read_unlock_bh(&idev->lock); + return 0; +} + +static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case MLD2_MODE_IS_INCLUDE: + case MLD2_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (!((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)) { + if (pmc->mca_sfmode == MCAST_INCLUDE) + return 1; + /* don't include if this source is excluded + * in all filters + */ + if (psf->sf_count[MCAST_INCLUDE]) + return type == MLD2_MODE_IS_INCLUDE; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + } + return 0; + case MLD2_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case MLD2_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case MLD2_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; + case MLD2_BLOCK_OLD_SOURCES: + if (pmc->mca_sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip6_sf_list *psf; + int scount = 0; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *mld_newpack(struct net_device *dev, int size) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; + struct sk_buff *skb; + struct mld2_report *pmr; + struct in6_addr addr_buf; + const struct in6_addr *saddr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int err; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, 
+ IPV6_TLV_PADN, 0 }; + + /* we assume size > sizeof(ra) here */ + size += hlen + tlen; + /* limit our allocations to order-0 page */ + size = min_t(int, size, SKB_MAX_ORDER(0, 0)); + skb = sock_alloc_send_skb(sk, size, 1, &err); + + if (!skb) + return NULL; + + skb_reserve(skb, hlen); + + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + /* <draft-ietf-magma-mld-source-05.txt>: + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + saddr = &in6addr_any; + } else + saddr = &addr_buf; + + ip6_nd_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); + skb_put(skb, sizeof(*pmr)); + pmr = (struct mld2_report *)skb_transport_header(skb); + pmr->mld2r_type = ICMPV6_MLD2_REPORT; + pmr->mld2r_resv1 = 0; + pmr->mld2r_cksum = 0; + pmr->mld2r_resv2 = 0; + pmr->mld2r_ngrec = 0; + return skb; +} + +static void mld_sendpack(struct sk_buff *skb) +{ + struct ipv6hdr *pip6 = ipv6_hdr(skb); + struct mld2_report *pmr = + (struct mld2_report *)skb_transport_header(skb); + int payload_len, mldlen; + struct inet6_dev *idev; + struct net *net = dev_net(skb->dev); + int err; + struct flowi6 fl6; + struct dst_entry *dst; + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + + payload_len = (skb->tail - skb->network_header) - sizeof(*pip6); + mldlen = skb->tail - skb->transport_header; + pip6->payload_len = htons(payload_len); + + pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, + csum_partial(skb_transport_header(skb), + mldlen, 0)); + + icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, NULL, &fl6); + + err = 0; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + } + 
skb_dst_set(skb, dst); + if (err) + goto err_out; + + payload_len = skb->len; + + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: + if (!err) { + ICMP6MSGOUT_INC_STATS_BH(net, idev, ICMPV6_MLD2_REPORT); + ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_OUTMCAST, payload_len); + } else + IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_OUTDISCARDS); + + rcu_read_unlock(); + return; + +err_out: + kfree_skb(skb); + goto out; +} + +static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct mld2_grec) + 16 * mld_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, struct mld2_grec **ppgr) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr; + + if (!skb) + skb = mld_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->mca_addr; /* structure copy */ + pmr = (struct mld2_report *)skb_transport_header(skb); + pmr->mld2r_ngrec = htons(ntohs(pmr->mld2r_ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr = NULL; + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, stotal, first, isquery, truncate; + + if (pmc->mca_flags & MAF_NOREPORT) + return skb; + + isquery = type == MLD2_MODE_IS_INCLUDE || + type == MLD2_MODE_IS_EXCLUDE; + truncate = type == MLD2_MODE_IS_EXCLUDE || + type == MLD2_CHANGE_TO_EXCLUDE; + + stotal = scount = 0; + + psf_list = sdeleted ? 
&pmc->mca_tomb : &pmc->mca_sources; + + if (!*psf_list) + goto empty_source; + + pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pmr && pmr->mld2r_ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + } + } + first = 1; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + struct in6_addr *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(*psrc) + + first*sizeof(struct mld2_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + if (!skb) + return NULL; + psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); + *psrc = psf->sf_addr; + scount++; stotal++; + if ((type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + +empty_source: + if (!stotal) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ + 
return skb; +} + +static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (pmc->mca_flags & MAF_NOREPORT) + continue; + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + } else { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + if (skb) + mld_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void mld_clear_zeros(struct ip6_sf_list **ppsf) +{ + struct ip6_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void mld_send_cr(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock_bh(&idev->lock); + spin_lock(&idev->mc_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->mca_crcount) { + if (pmc->mca_sfmode == MCAST_EXCLUDE) { + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + pmc->mca_crcount--; + if (pmc->mca_crcount == 0) { + mld_clear_zeros(&pmc->mca_tomb); + 
mld_clear_zeros(&pmc->mca_sources); + } + } + if (pmc->mca_crcount == 0 && !pmc->mca_tomb && + !pmc->mca_sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + idev->mc_tomb = pmc_next; + in6_dev_put(pmc->idev); + kfree(pmc); + } else + pmc_prev = pmc; + } + spin_unlock(&idev->mc_lock); + + /* change recs */ + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_ALLOW_NEW_SOURCES; + } else { + type = MLD2_ALLOW_NEW_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->mca_crcount) { + if (pmc->mca_sfmode == MCAST_EXCLUDE) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + pmc->mca_crcount--; + } + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (!skb) + return; + (void) mld_sendpack(skb); +} + +static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.igmp_sk; + struct inet6_dev *idev; + struct sk_buff *skb; + struct mld_msg *hdr; + const struct in6_addr *snd_addr, *saddr; + struct in6_addr addr_buf; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + struct flowi6 fl6; + struct dst_entry *dst; + + if (type == ICMPV6_MGM_REDUCTION) + snd_addr = &in6addr_linklocal_allrouters; + else + snd_addr = addr; + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; + + rcu_read_lock(); + IP6_UPD_PO_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUT, full_len); + rcu_read_unlock(); + + skb = 
sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + + if (skb == NULL) { + rcu_read_lock(); + IP6_INC_STATS(net, __in6_dev_get(dev), + IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); + return; + } + + skb_reserve(skb, hlen); + + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { + /* <draft-ietf-magma-mld-source-05.txt>: + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + saddr = &in6addr_any; + } else + saddr = &addr_buf; + + ip6_nd_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + hdr = (struct mld_msg *) skb_put(skb, sizeof(struct mld_msg)); + memset(hdr, 0, sizeof(struct mld_msg)); + hdr->mld_type = type; + hdr->mld_mca = *addr; + + hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, len, 0)); + + rcu_read_lock(); + idev = __in6_dev_get(skb->dev); + + icmpv6_flow_init(sk, &fl6, type, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->dev->ifindex); + dst = icmp6_dst_alloc(skb->dev, NULL, &fl6); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err_out; + } + + skb_dst_set(skb, dst); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +out: + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len); + } else + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + + rcu_read_unlock(); + return; + +err_out: + kfree_skb(skb); + goto out; +} + +static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, + const struct in6_addr *psfsrc) +{ + struct ip6_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return 
-ESRCH; + } + psf->sf_count[sfmode]--; + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { + struct inet6_dev *idev = pmc->idev; + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->mca_sources = psf->sf_next; + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && + !MLD_V1_SEEN(idev)) { + psf->sf_crcount = idev->mc_qrv; + psf->sf_next = pmc->mca_tomb; + pmc->mca_tomb = psf; + rv = 1; + } else + kfree(psf); + } + return rv; +} + +static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, + int sfmode, int sfcount, const struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int changerec = 0; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); + if (!delta) { + if (!pmc->mca_sfcount[sfmode]) { + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return -EINVAL; + } + pmc->mca_sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->mca_sfmode == MCAST_EXCLUDE && + pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && + pmc->mca_sfcount[MCAST_INCLUDE]) { + struct ip6_sf_list *psf; + + /* filter mode change */ + pmc->mca_sfmode = MCAST_INCLUDE; + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(pmc->idev); + } else if (sf_setstate(pmc) || changerec) + mld_ifc_event(pmc->idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static 
int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
+			  const struct in6_addr *psfsrc)
+{
+	struct ip6_sf_list *psf, *psf_prev;
+	/*
+	 * Look psfsrc up in pmc->mca_sources; when it is absent a zeroed entry
+	 * is appended after the last node (psf_prev). The entry's
+	 * sf_count[sfmode] is then incremented. Returns 0, or -ENOBUFS when
+	 * the new entry cannot be allocated.
+	 */
+	psf_prev = NULL;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
+			break;
+		psf_prev = psf;
+	}
+	if (!psf) {
+		psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+		if (!psf)
+			return -ENOBUFS;
+
+		psf->sf_addr = *psfsrc;
+		if (psf_prev) {
+			psf_prev->sf_next = psf;
+		} else
+			pmc->mca_sources = psf;
+	}
+	psf->sf_count[sfmode]++;
+	return 0;
+}
+/*
+ * Store in sf_oldin, for every source of pmc, the result of the same
+ * membership test that sf_setstate() later recomputes as new_in: with a
+ * non-zero EXCLUDE user count the source must be listed by all of those
+ * users and by no INCLUDE user; otherwise any INCLUDE user is enough.
+ */
+static void sf_markstate(struct ifmcaddr6 *pmc)
+{
+	struct ip6_sf_list *psf;
+	int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next)
+		if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+			psf->sf_oldin = mca_xcount ==
+				psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+/*
+ * Re-evaluate the membership test for every source and compare it with the
+ * sf_oldin value saved by sf_markstate():
+ *  - false -> true: any copy of the source on pmc->mca_tomb is freed and
+ *    sf_crcount is set to the device's mc_qrv;
+ *  - true -> false: sf_crcount is cleared and a copy of the source is put on
+ *    (or refreshed in) pmc->mca_tomb with sf_crcount = mc_qrv.
+ * Returns the number of transitions counted; a true -> false transition
+ * whose tomb copy cannot be allocated is skipped and not counted.
+ */
+static int sf_setstate(struct ifmcaddr6 *pmc)
+{
+	struct ip6_sf_list *psf, *dpsf;
+	int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+	int qrv = pmc->idev->mc_qrv;
+	int new_in, rv;
+
+	rv = 0;
+	for (psf=pmc->mca_sources; psf; psf=psf->sf_next) {
+		if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
+			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+		if (new_in) {
+			if (!psf->sf_oldin) {
+				struct ip6_sf_list *prev = NULL;
+
+				for (dpsf=pmc->mca_tomb; dpsf;
+				     dpsf=dpsf->sf_next) {
+					if (ipv6_addr_equal(&dpsf->sf_addr,
+					    &psf->sf_addr))
+						break;
+					prev = dpsf;
+				}
+				if (dpsf) {
+					if (prev)
+						prev->sf_next = dpsf->sf_next;
+					else
+						pmc->mca_tomb = dpsf->sf_next;
+					kfree(dpsf);
+				}
+				psf->sf_crcount = qrv;
+				rv++;
+			}
+		} else if (psf->sf_oldin) {
+			psf->sf_crcount = 0;
+			/*
+			 * add or update "delete" records if an active filter
+			 * is now inactive
+			 */
+			for (dpsf=pmc->mca_tomb; dpsf; dpsf=dpsf->sf_next)
+				if (ipv6_addr_equal(&dpsf->sf_addr,
+				    &psf->sf_addr))
+					break;
+			if (!dpsf) {
+				dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+				if (!dpsf)
+					continue;
+				*dpsf = *psf;
+				/* pmc->mca_lock held by callers */
+				dpsf->sf_next = pmc->mca_tomb;
+				pmc->mca_tomb = dpsf;
+			}
+			dpsf->sf_crcount = qrv;
+			rv++;
+		}
+	}
+	return rv;
+}
+
+/*
+ *	Add multicast source filter list to the interface list
+ *
+ *	Finds pmca on idev->mc_list and adds the sfcount addresses in psfsrc
+ *	to it for filter mode sfmode. When delta is zero the per-mode user
+ *	count mca_sfcount[sfmode] is incremented as well. If adding a source
+ *	fails, the count change and the sources already added by this call
+ *	are undone. A change of the EXCLUDE/INCLUDE mode, or any source state
+ *	change reported by sf_setstate(), triggers mld_ifc_event().
+ *
+ *	Returns 0, -ENODEV when idev is NULL, -ESRCH when pmca is not on the
+ *	list, or the error returned by ip6_mc_add1_src().
+ */
+static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
+			  int sfmode, int sfcount, const struct in6_addr *psfsrc,
+			  int delta)
+{
+	struct ifmcaddr6 *pmc;
+	int isexclude;
+	int i, err;
+
+	if (!idev)
+		return -ENODEV;
+	read_lock_bh(&idev->lock);
+	for (pmc=idev->mc_list; pmc; pmc=pmc->next) {
+		if (ipv6_addr_equal(pmca, &pmc->mca_addr))
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		read_unlock_bh(&idev->lock);
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->mca_lock);
+
+	sf_markstate(pmc);
+	isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
+	if (!delta)
+		pmc->mca_sfcount[sfmode]++;
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
+		if (err)
+			break;
+	}
+	if (err) {
+		int j;
+
+		if (!delta)
+			pmc->mca_sfcount[sfmode]--;
+		for (j=0; j<i; j++)
+			ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
+	} else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
+		struct ip6_sf_list *psf;
+
+		/* filter mode change */
+		if (pmc->mca_sfcount[MCAST_EXCLUDE])
+			pmc->mca_sfmode = MCAST_EXCLUDE;
+		else if (pmc->mca_sfcount[MCAST_INCLUDE])
+			pmc->mca_sfmode = MCAST_INCLUDE;
+		/* else no filters; keep old mode for reports */
+
+		pmc->mca_crcount = idev->mc_qrv;
+		idev->mc_ifc_count = pmc->mca_crcount;
+		for (psf=pmc->mca_sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		mld_ifc_event(idev);
+	} else if (sf_setstate(pmc))
+		mld_ifc_event(idev);
+	spin_unlock_bh(&pmc->mca_lock);
+	read_unlock_bh(&idev->lock);
+	return err;
+}
+/*
+ * Free every entry on pmc's tomb and source lists and reset the filter state
+ * to EXCLUDE mode with user counts INCLUDE = 0, EXCLUDE = 1.
+ */
+static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
+{
+	struct ip6_sf_list *psf, *nextpsf;
+
+	for (psf=pmc->mca_tomb; psf; psf=nextpsf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+	}
+	pmc->mca_tomb = NULL;
+	for (psf=pmc->mca_sources; psf; psf=nextpsf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+	}
+	pmc->mca_sources = NULL;
+	pmc->mca_sfmode = MCAST_EXCLUDE;
+	pmc->mca_sfcount[MCAST_INCLUDE] = 0;
+	pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+}
+
+/*
+ * Send an ICMPV6_MGM_REPORT for ma (nothing is done for MAF_NOREPORT groups)
+ * and (re)arm ma->mca_timer. The delay is random, below
+ * IGMP6_UNSOLICITED_IVAL, unless a timer was already pending, in which case
+ * its remaining time is reused. mca_refcnt is decremented when a pending
+ * timer is deleted and incremented when mod_timer() reports that the timer
+ * was not already pending.
+ */
+static void igmp6_join_group(struct ifmcaddr6 *ma)
+{
+	unsigned long delay;
+
+	if (ma->mca_flags & MAF_NOREPORT)
+		return;
+
+	igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+
+	delay = net_random() % IGMP6_UNSOLICITED_IVAL;
+
+	spin_lock_bh(&ma->mca_lock);
+	if (del_timer(&ma->mca_timer)) {
+		atomic_dec(&ma->mca_refcnt);
+		delay = ma->mca_timer.expires - jiffies;
+	}
+
+	if (!mod_timer(&ma->mca_timer, jiffies + delay))
+		atomic_inc(&ma->mca_refcnt);
+	ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
+	spin_unlock_bh(&ma->mca_lock);
+}
+/*
+ * Remove the source filter state of socket membership iml from idev via
+ * ip6_mc_del_src(). When iml carries a source list it is freed with
+ * sock_kfree_s() and iml->sflist is cleared. Returns ip6_mc_del_src()'s
+ * result.
+ */
+static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
+			    struct inet6_dev *idev)
+{
+	int err;
+
+	/* callers have the socket lock and a write lock on ipv6_sk_mc_lock,
+	 * so no other readers or writers of iml or its sflist
+	 */
+	if (!iml->sflist) {
+		/* any-source empty exclude case */
+		return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+	}
+	err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+		iml->sflist->sl_count, iml->sflist->sl_addr, 0);
+	sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
+	iml->sflist = NULL;
+	return err;
+}
+/*
+ * Announce that group ma is being left. If MLD_V1_SEEN() is true for the
+ * device, an ICMPV6_MGM_REDUCTION is sent, but only when MAF_LAST_REPORTER
+ * is set on ma; otherwise a delete record is queued with mld_add_delrec()
+ * and mld_ifc_event() is raised.
+ */
+static void igmp6_leave_group(struct ifmcaddr6 *ma)
+{
+	if (MLD_V1_SEEN(ma->idev)) {
+		if (ma->mca_flags & MAF_LAST_REPORTER)
+			igmp6_send(&ma->mca_addr, ma->idev->dev,
+				ICMPV6_MGM_REDUCTION);
+	} else {
+		mld_add_delrec(ma->idev, ma);
+		mld_ifc_event(ma->idev);
+	}
+}
+/*
+ * Callback of idev->mc_gq_timer (data is the inet6_dev): clear
+ * mc_gq_running, send a report covering the whole group list (pmc == NULL)
+ * and drop one device reference -- presumably the one taken when the timer
+ * was armed; confirm against the code that starts the timer.
+ */
+static void mld_gq_timer_expire(unsigned long data)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)data;
+
+	idev->mc_gq_running = 0;
+	mld_send_report(idev, NULL);
+	__in6_dev_put(idev);
+}
+/*
+ * Callback of idev->mc_ifc_timer (data is the inet6_dev): send the pending
+ * change records, decrement mc_ifc_count and restart the timer with
+ * mc_maxdelay while the count is still non-zero, then drop one device
+ * reference.
+ */
+static void mld_ifc_timer_expire(unsigned long data)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)data;
+
+	mld_send_cr(idev);
+	if (idev->mc_ifc_count) {
+		idev->mc_ifc_count--;
+		if (idev->mc_ifc_count)
+			mld_ifc_start_timer(idev, idev->mc_maxdelay);
+	}
+	__in6_dev_put(idev);
+}
+/*
+ * Note an interface-state change: does nothing when MLD_V1_SEEN(idev) is
+ * true, otherwise reloads mc_ifc_count from mc_qrv and starts the change
+ * timer with a delay argument of 1.
+ */
+static void mld_ifc_event(struct inet6_dev *idev)
+{
+	if (MLD_V1_SEEN(idev))
+		return;
+	idev->mc_ifc_count = idev->mc_qrv;
+	mld_ifc_start_timer(idev, 1);
+}
+
+/*
+ * Callback of ma->mca_timer (data is the ifmcaddr6): send a report for ma,
+ * using igmp6_send() when MLD_V1_SEEN() is true and mld_send_report()
+ * otherwise, then set MAF_LAST_REPORTER, clear MAF_TIMER_RUNNING and release
+ * a reference with ma_put().
+ */
+static void igmp6_timer_handler(unsigned long data)
+{
+	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+
+	if (MLD_V1_SEEN(ma->idev))
+		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+	else
+		mld_send_report(ma->idev, ma);
+
+	spin_lock(&ma->mca_lock);
+	ma->mca_flags |= MAF_LAST_REPORTER;
+	ma->mca_flags &= ~MAF_TIMER_RUNNING;
+	spin_unlock(&ma->mca_lock);
+	ma_put(ma);
+}
+
+/* Device changing type */
+
+void ipv6_mc_unmap(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Drop every group on the list: igmp6_group_dropped() per entry */
+
+	read_lock_bh(&idev->lock);
+	for (i = idev->mc_list; i; i = i->next)
+		igmp6_group_dropped(i);
+	read_unlock_bh(&idev->lock);
+}
+/* Counterpart of ipv6_mc_unmap(): simply re-adds the groups via ipv6_mc_up() */
+void ipv6_mc_remap(struct inet6_dev *idev)
+{
+	ipv6_mc_up(idev);
+}
+
+/*
+ * Device going down: stop both per-device timers (dropping a device reference
+ * for each timer that was still pending), drop every group on the list and
+ * finally clear the delete records with mld_clear_delrec().
+ */
+
+void ipv6_mc_down(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Withdraw multicast list */
+
+	read_lock_bh(&idev->lock);
+	idev->mc_ifc_count = 0;
+	if (del_timer(&idev->mc_ifc_timer))
+		__in6_dev_put(idev);
+	idev->mc_gq_running = 0;
+	if (del_timer(&idev->mc_gq_timer))
+		__in6_dev_put(idev);
+
+	for (i = idev->mc_list; i; i=i->next)
+		igmp6_group_dropped(i);
+	read_unlock_bh(&idev->lock);
+
+	mld_clear_delrec(idev);
+}
+
+
+/* Device going up: igmp6_group_added() is called for each listed group */
+
+void ipv6_mc_up(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Install multicast list, except for all-nodes (already installed) */
+
+	read_lock_bh(&idev->lock);
+	for (i = idev->mc_list; i; i=i->next)
+		igmp6_group_added(i);
+	read_unlock_bh(&idev->lock);
+}
+
+/* IPv6 device initialization.
*/
+/*
+ * Under idev->lock (write side): initialise mc_lock, set up the
+ * general-query and interface-change timers with idev as their argument,
+ * and reset the multicast state (no tomb list, zero counters, default QRV,
+ * mc_maxdelay = IGMP6_UNSOLICITED_IVAL, MLDv1 not seen).
+ */
+void ipv6_mc_init_dev(struct inet6_dev *idev)
+{
+	write_lock_bh(&idev->lock);
+	spin_lock_init(&idev->mc_lock);
+	idev->mc_gq_running = 0;
+	setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire,
+			(unsigned long)idev);
+	idev->mc_tomb = NULL;
+	idev->mc_ifc_count = 0;
+	setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire,
+			(unsigned long)idev);
+	idev->mc_qrv = MLD_QRV_DEFAULT;
+	idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL;
+	idev->mc_v1_seen = 0;
+	write_unlock_bh(&idev->lock);
+}
+
+/*
+ *	Device is about to be destroyed: clean up.
+ *
+ *	After ipv6_mc_down() and the explicit all-nodes / all-routers
+ *	decrements, every remaining entry is unlinked from mc_list one at a
+ *	time; idev->lock is released around igmp6_group_dropped() and
+ *	ma_put() for each entry.
+ */
+
+void ipv6_mc_destroy_dev(struct inet6_dev *idev)
+{
+	struct ifmcaddr6 *i;
+
+	/* Deactivate timers */
+	ipv6_mc_down(idev);
+
+	/* Delete all-nodes address. */
+	/* We cannot call ipv6_dev_mc_dec() directly, our caller in
+	 * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will
+	 * fail.
+	 */
+	__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes);
+
+	if (idev->cnf.forwarding)
+		__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
+
+	write_lock_bh(&idev->lock);
+	while ((i = idev->mc_list) != NULL) {
+		idev->mc_list = i->next;
+		write_unlock_bh(&idev->lock);
+
+		igmp6_group_dropped(i);
+		ma_put(i);
+
+		write_lock_bh(&idev->lock);
+	}
+	write_unlock_bh(&idev->lock);
+}
+
+#ifdef CONFIG_PROC_FS
+struct igmp6_mc_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+};
+
+#define igmp6_mc_seq_private(seq)	((struct igmp6_mc_iter_state *)(seq)->private)
+/*
+ * Start of the "igmp6" walk: return the first group of the first netdev
+ * that has an inet6_dev with a non-empty mc_list. On success that device's
+ * idev->lock stays read-locked and is remembered in state->idev; NULL is
+ * returned (with no lock held) when no device has any group.
+ */
+static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
+{
+	struct ifmcaddr6 *im = NULL;
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	state->idev = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct inet6_dev *idev;
+		idev = __in6_dev_get(state->dev);
+		if (!idev)
+			continue;
+		read_lock_bh(&idev->lock);
+		im = idev->mc_list;
+		if (im) {
+			state->idev = idev;
+			break;
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	return im;
+}
+
+static
struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im)
+{
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+	/*
+	 * Advance to the next group. When the current device's list is
+	 * exhausted its idev->lock is released and the following netdevs are
+	 * tried, read-locking each inet6_dev that is examined. Returns NULL
+	 * (state->idev cleared) after the last device.
+	 */
+	im = im->next;
+	while (!im) {
+		if (likely(state->idev != NULL))
+			read_unlock_bh(&state->idev->lock);
+
+		state->dev = next_net_device_rcu(state->dev);
+		if (!state->dev) {
+			state->idev = NULL;
+			break;
+		}
+		state->idev = __in6_dev_get(state->dev);
+		if (!state->idev)
+			continue;
+		read_lock_bh(&state->idev->lock);
+		im = state->idev->mc_list;
+	}
+	return im;
+}
+/* Return the entry pos steps after the first one, or NULL if there are fewer */
+static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ifmcaddr6 *im = igmp6_mc_get_first(seq);
+	if (im)
+		while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL)
+			--pos;
+	return pos ? NULL : im;
+}
+/* seq_file ->start: enters the RCU read side that ->stop leaves again */
+static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return igmp6_mc_get_idx(seq, *pos);
+}
+
+static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ifmcaddr6 *im = igmp6_mc_get_next(seq, v);
+
+	++*pos;
+	return im;
+}
+/* seq_file ->stop: release the idev lock still held (if any) and RCU */
+static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+	if (likely(state->idev != NULL)) {
+		read_unlock_bh(&state->idev->lock);
+		state->idev = NULL;
+	}
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+/*
+ * Print one line per group: ifindex, device name, group address, user count,
+ * flags in hex, and the time left on the group's timer (0 unless
+ * MAF_TIMER_RUNNING is set).
+ */
+static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
+{
+	struct ifmcaddr6 *im = (struct ifmcaddr6 *)v;
+	struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
+
+	seq_printf(seq,
+		   "%-4d %-15s %pi6 %5d %08X %ld\n",
+		   state->dev->ifindex, state->dev->name,
+		   &im->mca_addr,
+		   im->mca_users, im->mca_flags,
+		   (im->mca_flags&MAF_TIMER_RUNNING) ?
+		   jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+	return 0;
+}
+
+static const struct seq_operations igmp6_mc_seq_ops = {
+	.start	=	igmp6_mc_seq_start,
+	.next	=	igmp6_mc_seq_next,
+	.stop	=	igmp6_mc_seq_stop,
+	.show	=	igmp6_mc_seq_show,
+};
+
+static int igmp6_mc_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp6_mc_seq_ops,
+			    sizeof(struct igmp6_mc_iter_state));
+}
+
+static const struct file_operations igmp6_mc_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp6_mc_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+
+struct igmp6_mcf_iter_state {
+	struct seq_net_private p;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct ifmcaddr6 *im;
+};
+
+#define igmp6_mcf_seq_private(seq)	((struct igmp6_mcf_iter_state *)(seq)->private)
+/*
+ * Start of the "mcfilter6" walk. For each netdev only the first group on
+ * mc_list is examined here; the walk starts at the first device whose first
+ * group has a non-empty source list. On success idev->lock (read) and that
+ * group's mca_lock remain held and are recorded in state.
+ */
+static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
+{
+	struct ip6_sf_list *psf = NULL;
+	struct ifmcaddr6 *im = NULL;
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+	struct net *net = seq_file_net(seq);
+
+	state->idev = NULL;
+	state->im = NULL;
+	for_each_netdev_rcu(net, state->dev) {
+		struct inet6_dev *idev;
+		idev = __in6_dev_get(state->dev);
+		if (unlikely(idev == NULL))
+			continue;
+		read_lock_bh(&idev->lock);
+		im = idev->mc_list;
+		if (likely(im != NULL)) {
+			spin_lock_bh(&im->mca_lock);
+			psf = im->mca_sources;
+			if (likely(psf != NULL)) {
+				state->im = im;
+				state->idev = idev;
+				break;
+			}
+			spin_unlock_bh(&im->mca_lock);
+		}
+		read_unlock_bh(&idev->lock);
+	}
+	return psf;
+}
+/*
+ * Advance to the next source entry, moving on to the next group and then the
+ * next device as each list runs out; the mca_lock / idev->lock of the item
+ * being left is released and the one of the item being entered is taken.
+ */
+static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf)
+{
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+
+	psf = psf->sf_next;
+	while (!psf) {
+		spin_unlock_bh(&state->im->mca_lock);
+		state->im = state->im->next;
+		while (!state->im) {
+			if (likely(state->idev != NULL))
+				read_unlock_bh(&state->idev->lock);
+
+			state->dev = next_net_device_rcu(state->dev);
+			if (!state->dev) {
+				state->idev = NULL;
+				goto out;
+			}
+			state->idev = __in6_dev_get(state->dev);
+			if (!state->idev)
+				continue;
+			read_lock_bh(&state->idev->lock);
+			state->im = state->idev->mc_list;
+		}
+		if (!state->im)
+			break;
+		spin_lock_bh(&state->im->mca_lock);
+		psf = state->im->mca_sources;
+	}
+out:
+	return psf;
+}
+
+static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip6_sf_list *psf = igmp6_mcf_get_first(seq);
+	if (psf)
+		while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL)
+			--pos;
+	return pos ? NULL : psf;
+}
+/* Position 0 is the header line (SEQ_START_TOKEN); entries start at 1 */
+static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip6_sf_list *psf;
+	if (v == SEQ_START_TOKEN)
+		psf = igmp6_mcf_get_first(seq);
+	else
+		psf = igmp6_mcf_get_next(seq, v);
+	++*pos;
+	return psf;
+}
+/* Release whatever mca_lock / idev->lock the walk still holds, then RCU */
+static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
+{
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+	if (likely(state->im != NULL)) {
+		spin_unlock_bh(&state->im->mca_lock);
+		state->im = NULL;
+	}
+	if (likely(state->idev != NULL)) {
+		read_unlock_bh(&state->idev->lock);
+		state->idev = NULL;
+	}
+	state->dev = NULL;
+	rcu_read_unlock();
+}
+
+static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
+{
+	struct ip6_sf_list *psf = (struct ip6_sf_list *)v;
+	struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			   "%3s %6s "
+			   "%32s %32s %6s %6s\n", "Idx",
+			   "Device", "Multicast Address",
+			   "Source Address", "INC", "EXC");
+	} else {
+		seq_printf(seq,
+			   "%3d %6.6s %pi6 %pi6 %6lu %6lu\n",
+			   state->dev->ifindex, state->dev->name,
+			   &state->im->mca_addr,
+			   &psf->sf_addr,
+			   psf->sf_count[MCAST_INCLUDE],
+			   psf->sf_count[MCAST_EXCLUDE]);
+	}
+	return 0;
+}
+
+static const struct seq_operations igmp6_mcf_seq_ops = {
+	.start	=	igmp6_mcf_seq_start,
+	.next	=	igmp6_mcf_seq_next,
+	.stop	=	igmp6_mcf_seq_stop,
+	.show	=	igmp6_mcf_seq_show,
+};
+
+static int igmp6_mcf_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open_net(inode, file, &igmp6_mcf_seq_ops,
+			    sizeof(struct igmp6_mcf_iter_state));
+}
+
+static const struct file_operations igmp6_mcf_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp6_mcf_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_net,
+};
+/*
+ * Create the per-namespace "igmp6" and "mcfilter6" proc entries. Returns 0,
+ * or -ENOMEM if either cannot be created; "igmp6" is removed again when the
+ * second creation fails.
+ */
+static int __net_init igmp6_proc_init(struct net *net)
+{
+	int err;
+
+	err = -ENOMEM;
+	if (!proc_net_fops_create(net, "igmp6", S_IRUGO, &igmp6_mc_seq_fops))
+		goto out;
+	if (!proc_net_fops_create(net, "mcfilter6", S_IRUGO,
+				  &igmp6_mcf_seq_fops))
+		goto out_proc_net_igmp6;
+
+	err = 0;
+out:
+	return err;
+
+out_proc_net_igmp6:
+	proc_net_remove(net, "igmp6");
+	goto out;
+}
+
+static void __net_exit igmp6_proc_exit(struct net *net)
+{
+	proc_net_remove(net, "mcfilter6");
+	proc_net_remove(net, "igmp6");
+}
+#else
+static inline int igmp6_proc_init(struct net *net)
+{
+	return 0;
+}
+static inline void igmp6_proc_exit(struct net *net)
+{
+}
+#endif
+/*
+ * Per-namespace setup: create the raw ICMPv6 control socket stored in
+ * net->ipv6.igmp_sk, give it hop_limit 1, then register the proc entries.
+ * The socket is destroyed again if the proc setup fails.
+ */
+static int __net_init igmp6_net_init(struct net *net)
+{
+	int err;
+
+	err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
+				   SOCK_RAW, IPPROTO_ICMPV6, net);
+	if (err < 0) {
+		printk(KERN_ERR
+		       "Failed to initialize the IGMP6 control socket (err %d).\n",
+		       err);
+		goto out;
+	}
+
+	inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+
+	err = igmp6_proc_init(net);
+	if (err)
+		goto out_sock_create;
+out:
+	return err;
+
+out_sock_create:
+	inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+	goto out;
+}
+
+static void __net_exit igmp6_net_exit(struct net *net)
+{
+	inet_ctl_sock_destroy(net->ipv6.igmp_sk);
+	igmp6_proc_exit(net);
+}
+
+static struct pernet_operations igmp6_net_ops = {
+	.init = igmp6_net_init,
+	.exit = igmp6_net_exit,
+};
+
+int __init igmp6_init(void)
+{
+	return register_pernet_subsys(&igmp6_net_ops);
+}
+
+void igmp6_cleanup(void)
+{
+	unregister_pernet_subsys(&igmp6_net_ops);
+}
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
new file mode 100644
index 00000000..7e1e0fbf
--- /dev/null
+++ b/net/ipv6/mip6.c
@@ -0,0 +1,525 @@
+/*
+ * Copyright (C)2003-2006 Helsinki University of Technology
+ * Copyright (C)2003-2006 USAGI/WIDE Project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/*
+ * Authors:
+ *	Noriaki TAKAMIYA @USAGI
+ *	Masahide NAKAMURA @USAGI
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/time.h>
+#include <linux/ipv6.h>
+#include <linux/icmpv6.h>
+#include <net/sock.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+#include <net/mip6.h>
+/*
+ * Number of padding bytes (0..7) computed as (n - len + 16) & 7, i.e. the
+ * pad that brings len up to n modulo 8.
+ */
+static inline unsigned int calc_padlen(unsigned int len, unsigned int n)
+{
+	return (n - len + 16) & 0x7;
+}
+/*
+ * Write padlen bytes of option padding at data: a single IPV6_TLV_PAD0 byte
+ * for padlen == 1, or an IPV6_TLV_PADN option (type, length = padlen - 2,
+ * zero-filled payload) for larger values; nothing is written for 0. Returns
+ * the address just past the padding, or NULL when data is NULL.
+ */
+static inline void *mip6_padn(__u8 *data, __u8 padlen)
+{
+	if (!data)
+		return NULL;
+	if (padlen == 1) {
+		data[0] = IPV6_TLV_PAD0;
+	} else if (padlen > 1) {
+		data[0] = IPV6_TLV_PADN;
+		data[1] = padlen - 2;
+		if (padlen > 2)
+			memset(data+2, 0, data[1]);
+	}
+	return data + padlen;
+}
+
+static inline void mip6_param_prob(struct sk_buff *skb, u8 code, int pos)
+{
+	icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos);
+}
+/*
+ * Smallest ip6mh_hdrlen value mip6_mh_filter() accepts for a given Mobility
+ * Header type; types not listed yield 0.
+ */
+static int mip6_mh_len(int type)
+{
+	int len = 0;
+
+	switch (type) {
+	case IP6_MH_TYPE_BRR:
+		len = 0;
+		break;
+	case IP6_MH_TYPE_HOTI:
+	case IP6_MH_TYPE_COTI:
+	case IP6_MH_TYPE_BU:
+	case IP6_MH_TYPE_BACK:
+		len = 1;
+		break;
+	case IP6_MH_TYPE_HOT:
+	case IP6_MH_TYPE_COT:
+	case IP6_MH_TYPE_BERROR:
+		len = 2;
+		break;
+	}
+	return len;
+}
+/*
+ * Validate a received Mobility Header. Returns -1 when the header cannot be
+ * pulled into the linear area, when ip6mh_hdrlen is below the minimum for
+ * its type, or when ip6mh_proto is not IPPROTO_NONE; in the last two cases
+ * an ICMPv6 parameter problem pointing at the offending field is sent first.
+ * Returns 0 otherwise.
+ */
+static int mip6_mh_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct ip6_mh *mh;
+
+	if (!pskb_may_pull(skb, (skb_transport_offset(skb)) + 8) ||
+	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
+				 ((skb_transport_header(skb)[1] + 1) << 3))))
+		return -1;
+
+	mh = (struct ip6_mh *)skb_transport_header(skb);
+
+	if (mh->ip6mh_hdrlen < mip6_mh_len(mh->ip6mh_type)) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH message too short: %d vs >=%d\n",
+			       mh->ip6mh_hdrlen, mip6_mh_len(mh->ip6mh_type));
+		mip6_param_prob(skb, 0, ((&mh->ip6mh_hdrlen) -
+					 skb_network_header(skb)));
+		return -1;
+	}
+
+	if (mh->ip6mh_proto != IPPROTO_NONE) {
+		LIMIT_NETDEBUG(KERN_DEBUG "mip6: MH invalid payload proto = %d\n",
+			       mh->ip6mh_proto);
+		mip6_param_prob(skb, 0, ((&mh->ip6mh_proto) -
+					 skb_network_header(skb)));
+		return -1;
+	}
+
+	return 0;
+}
+/* Key (timestamp, interface, addresses) of the last report that was let through */
+struct mip6_report_rate_limiter {
+	spinlock_t lock;
+	struct timeval stamp;
+	int iif;
+	struct in6_addr src;
+	struct in6_addr dst;
+};
+
+static struct mip6_report_rate_limiter mip6_report_rl = {
+	.lock = __SPIN_LOCK_UNLOCKED(mip6_report_rl.lock)
+};
+/*
+ * xfrm input hook for the destination options header. Returns the header's
+ * nexthdr value, or -ENOENT when the state's care-of address is neither the
+ * unspecified address nor equal to the packet's source address.
+ */
+static int mip6_destopt_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct ipv6_destopt_hdr *destopt = (struct ipv6_destopt_hdr *)skb->data;
+	int err = destopt->nexthdr;
+
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->saddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/* Destination Option Header is inserted.
+ * IP Header's src address is replaced with Home Address Option in
+ * Destination Option Header.
+ */
+static int mip6_destopt_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct ipv6_destopt_hdr *dstopt;
+	struct ipv6_destopt_hao *hao;
+	u8 nexthdr;
+	int len;
+
+	skb_push(skb, -skb_network_offset(skb));
+	iph = ipv6_hdr(skb);
+
+	nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_DSTOPTS;
+
+	dstopt = (struct ipv6_destopt_hdr *)skb_transport_header(skb);
+	dstopt->nexthdr = nexthdr;
+
+	hao = mip6_padn((char *)(dstopt + 1),
+			calc_padlen(sizeof(*dstopt), 6));
+
+	hao->type = IPV6_TLV_HAO;
+	BUILD_BUG_ON(sizeof(*hao) != 18);
+	hao->length = sizeof(*hao) - 2;
+
+	len = ((char *)hao - (char *)dstopt) + sizeof(*hao);
+	/* the old source goes into the HAO; x->coaddr becomes the source */
+	memcpy(&hao->addr, &iph->saddr, sizeof(hao->addr));
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->saddr, x->coaddr, sizeof(iph->saddr));
+	spin_unlock_bh(&x->lock);
+
+	WARN_ON(len != x->props.header_len);
+	dstopt->hdrlen = (x->props.header_len >> 3) - 1;
+
+	return 0;
+}
+/*
+ * Returns 1 and records (stamp, iif, src, dst) when the tuple differs from
+ * the one stored by the previous allowed call; returns 0 for an exact
+ * repeat, so only one report is let through per identical tuple.
+ */
+static inline int mip6_report_rl_allow(struct timeval *stamp,
+				       const struct in6_addr *dst,
+				       const struct in6_addr *src, int iif)
+{
+	int allow = 0;
+
+	spin_lock_bh(&mip6_report_rl.lock);
+	if (mip6_report_rl.stamp.tv_sec != stamp->tv_sec ||
+	    mip6_report_rl.stamp.tv_usec != stamp->tv_usec ||
+	    mip6_report_rl.iif != iif ||
+	    !ipv6_addr_equal(&mip6_report_rl.src, src) ||
+	    !ipv6_addr_equal(&mip6_report_rl.dst, dst)) {
+		mip6_report_rl.stamp.tv_sec = stamp->tv_sec;
+		mip6_report_rl.stamp.tv_usec = stamp->tv_usec;
+		mip6_report_rl.iif = iif;
+		mip6_report_rl.src = *src;
+		mip6_report_rl.dst = *dst;
+		allow = 1;
+	}
+	spin_unlock_bh(&mip6_report_rl.lock);
+	return allow;
+}
+/*
+ * xfrm reject hook: builds a selector from the packet's addresses and the
+ * flow's protocol/ports/oif and passes it to km_report(), together with the
+ * Home Address option's address when one is found. Nothing is reported (and
+ * 0 is returned) for Mobility Header flows with a type up to
+ * IP6_MH_TYPE_MAX or when mip6_report_rl_allow() refuses the packet.
+ */
+static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
+			       const struct flowi *fl)
+{
+	struct net *net = xs_net(x);
+	struct inet6_skb_parm *opt = (struct inet6_skb_parm *)skb->cb;
+	const struct flowi6 *fl6 = &fl->u.ip6;
+	struct ipv6_destopt_hao *hao = NULL;
+	struct xfrm_selector sel;
+	int offset;
+	struct timeval stamp;
+	int err = 0;
+
+	if (unlikely(fl6->flowi6_proto == IPPROTO_MH &&
+		     fl6->fl6_mh_type <= IP6_MH_TYPE_MAX))
+		goto out;
+
+	if (likely(opt->dsthao)) {
+		offset = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO);
+		if (likely(offset >= 0))
+			hao = (struct ipv6_destopt_hao *)
+					(skb_network_header(skb) + offset);
+	}
+
+	skb_get_timestamp(skb, &stamp);
+
+	if (!mip6_report_rl_allow(&stamp, &ipv6_hdr(skb)->daddr,
+				  hao ? &hao->addr : &ipv6_hdr(skb)->saddr,
+				  opt->iif))
+		goto out;
+
+	memset(&sel, 0, sizeof(sel));
+	memcpy(&sel.daddr, (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+	       sizeof(sel.daddr));
+	sel.prefixlen_d = 128;
+	memcpy(&sel.saddr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr,
+	       sizeof(sel.saddr));
+	sel.prefixlen_s = 128;
+	sel.family = AF_INET6;
+	sel.proto = fl6->flowi6_proto;
+	sel.dport = xfrm_flowi_dport(fl, &fl6->uli);
+	if (sel.dport)
+		sel.dport_mask = htons(~0);
+	sel.sport = xfrm_flowi_sport(fl, &fl6->uli);
+	if (sel.sport)
+		sel.sport_mask = htons(~0);
+	sel.ifindex = fl6->flowi6_oif;
+
+	err = km_report(net, IPPROTO_DSTOPTS, &sel,
+			(hao ? (xfrm_address_t *)&hao->addr : NULL));
+
+ out:
+	return err;
+}
+/*
+ * Walk the extension header chain and return the offset (from the network
+ * header) at which the destination options header belongs; *nexthdr is left
+ * pointing at the nexthdr byte of the preceding header. The walk stops at
+ * the first header that is not hop-by-hop, routing or destination options,
+ * at a destination options header that already carries a HAO, or at one
+ * that follows a routing header.
+ */
+static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
+			       u8 **nexthdr)
+{
+	u16 offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr =
+				   (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+	const unsigned char *nh = skb_network_header(skb);
+	unsigned int packet_len = skb->tail - skb->network_header;
+	int found_rhdr = 0;
+
+	*nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			/*
+			 * HAO MUST NOT appear more than once.
+			 * XXX: It is better to try to find by the end of
+			 * XXX: packet if HAO exists.
+			 */
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
+				LIMIT_NETDEBUG(KERN_WARNING "mip6: hao exists already, override\n");
+				return offset;
+			}
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+	}
+
+	return offset;
+}
+/*
+ * xfrm init_state hook: rejects states with a non-zero SPI or a mode other
+ * than XFRM_MODE_ROUTEOPTIMIZATION (-EINVAL) and sets header_len to the size
+ * of the options header plus padding plus the HAO (expected to be 24).
+ */
+static int mip6_destopt_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __func__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	x->props.header_len = sizeof(struct ipv6_destopt_hdr) +
+		calc_padlen(sizeof(struct ipv6_destopt_hdr), 6) +
+		sizeof(struct ipv6_destopt_hao);
+	WARN_ON(x->props.header_len != 24);
+
+	return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for
+ * destination options header unlike IPsec protocols.
+ */
+static void mip6_destopt_destroy(struct xfrm_state *x)
+{
+}
+/* xfrm type hooking the handlers above to IPPROTO_DSTOPTS */
+static const struct xfrm_type mip6_destopt_type =
+{
+	.description	= "MIP6DESTOPT",
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_DSTOPTS,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
+	.init_state	= mip6_destopt_init_state,
+	.destructor	= mip6_destopt_destroy,
+	.input		= mip6_destopt_input,
+	.output		= mip6_destopt_output,
+	.reject		= mip6_destopt_reject,
+	.hdr_offset	= mip6_destopt_offset,
+};
+/*
+ * xfrm input hook for the type 2 routing header. Returns the header's
+ * nexthdr value, or -ENOENT when the state's care-of address is neither the
+ * unspecified address nor equal to the packet's destination address.
+ */
+static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct rt2_hdr *rt2 = (struct rt2_hdr *)skb->data;
+	int err = rt2->rt_hdr.nexthdr;
+
+	spin_lock(&x->lock);
+	if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) &&
+	    !ipv6_addr_any((struct in6_addr *)x->coaddr))
+		err = -ENOENT;
+	spin_unlock(&x->lock);
+
+	return err;
+}
+
+/* Routing Header type 2 is inserted.
+ * IP Header's dst address is replaced with Routing Header's Home Address.
+ */
+static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ipv6hdr *iph;
+	struct rt2_hdr *rt2;
+	u8 nexthdr;
+
+	skb_push(skb, -skb_network_offset(skb));
+	iph = ipv6_hdr(skb);
+
+	nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_ROUTING;
+
+	rt2 = (struct rt2_hdr *)skb_transport_header(skb);
+	rt2->rt_hdr.nexthdr = nexthdr;
+	rt2->rt_hdr.hdrlen = (x->props.header_len >> 3) - 1;
+	rt2->rt_hdr.type = IPV6_SRCRT_TYPE_2;
+	rt2->rt_hdr.segments_left = 1;
+	memset(&rt2->reserved, 0, sizeof(rt2->reserved));
+
+	WARN_ON(rt2->rt_hdr.hdrlen != 2);
+	/* the old destination goes into the header; x->coaddr replaces it */
+	memcpy(&rt2->addr, &iph->daddr, sizeof(rt2->addr));
+	spin_lock_bh(&x->lock);
+	memcpy(&iph->daddr, x->coaddr, sizeof(iph->daddr));
+	spin_unlock_bh(&x->lock);
+
+	return 0;
+}
+/*
+ * Walk the extension header chain and return the offset (from the network
+ * header) at which the type 2 routing header belongs; *nexthdr is left
+ * pointing at the nexthdr byte of the preceding header. The walk stops at a
+ * routing header whose type is non-zero, at a destination options header
+ * that carries a HAO or follows a routing header, and at any header other
+ * than hop-by-hop, routing or destination options.
+ */
+static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
+			     u8 **nexthdr)
+{
+	u16 offset = sizeof(struct ipv6hdr);
+	struct ipv6_opt_hdr *exthdr =
+				   (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
+	const unsigned char *nh = skb_network_header(skb);
+	unsigned int packet_len = skb->tail - skb->network_header;
+	int found_rhdr = 0;
+
+	*nexthdr = &ipv6_hdr(skb)->nexthdr;
+
+	while (offset + 1 <= packet_len) {
+
+		switch (**nexthdr) {
+		case NEXTHDR_HOP:
+			break;
+		case NEXTHDR_ROUTING:
+			if (offset + 3 <= packet_len) {
+				struct ipv6_rt_hdr *rt;
+				rt = (struct ipv6_rt_hdr *)(nh + offset);
+				if (rt->type != 0)
+					return offset;
+			}
+			found_rhdr = 1;
+			break;
+		case NEXTHDR_DEST:
+			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
+				return offset;
+
+			if (found_rhdr)
+				return offset;
+
+			break;
+		default:
+			return offset;
+		}
+
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
+		exthdr = (struct ipv6_opt_hdr *)(nh + offset);
+	}
+
+	return offset;
+}
+/*
+ * xfrm init_state hook: rejects states with a non-zero SPI or a mode other
+ * than XFRM_MODE_ROUTEOPTIMIZATION (-EINVAL) and sets header_len to
+ * sizeof(struct rt2_hdr).
+ */
+static int mip6_rthdr_init_state(struct xfrm_state *x)
+{
+	if (x->id.spi) {
+		printk(KERN_INFO "%s: spi is not 0: %u\n", __func__,
+		       x->id.spi);
+		return -EINVAL;
+	}
+	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
+		printk(KERN_INFO "%s: state's mode is not %u: %u\n",
+		       __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		return -EINVAL;
+	}
+
+	x->props.header_len = sizeof(struct rt2_hdr);
+
+	return 0;
+}
+
+/*
+ * Do nothing about destroying since it has no specific operation for routing
+ * header type 2 unlike IPsec protocols.
+ */
+static void mip6_rthdr_destroy(struct xfrm_state *x)
+{
+}
+/* xfrm type hooking the handlers above to IPPROTO_ROUTING (no reject hook) */
+static const struct xfrm_type mip6_rthdr_type =
+{
+	.description	= "MIP6RT",
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_ROUTING,
+	.flags		= XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
+	.init_state	= mip6_rthdr_init_state,
+	.destructor	= mip6_rthdr_destroy,
+	.input		= mip6_rthdr_input,
+	.output		= mip6_rthdr_output,
+	.hdr_offset	= mip6_rthdr_offset,
+};
+/*
+ * Module init: register the destopt and rthdr xfrm types and the raw-socket
+ * MH filter, in that order. A failure unregisters whatever was registered
+ * before it and makes the function return -EAGAIN.
+ */
+static int __init mip6_init(void)
+{
+	printk(KERN_INFO "Mobile IPv6\n");
+
+	if (xfrm_register_type(&mip6_destopt_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(destopt)\n", __func__);
+		goto mip6_destopt_xfrm_fail;
+	}
+	if (xfrm_register_type(&mip6_rthdr_type, AF_INET6) < 0) {
+		printk(KERN_INFO "%s: can't add xfrm type(rthdr)\n", __func__);
+		goto mip6_rthdr_xfrm_fail;
+	}
+	if (rawv6_mh_filter_register(mip6_mh_filter) < 0) {
+		printk(KERN_INFO "%s: can't add rawv6 mh filter\n", __func__);
+		goto mip6_rawv6_mh_fail;
+	}
+
+
+	return 0;
+
+ mip6_rawv6_mh_fail:
+	xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ mip6_rthdr_xfrm_fail:
+	xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
+ mip6_destopt_xfrm_fail:
+	return -EAGAIN;
+}
+/* Module exit: undo mip6_init() in reverse order, logging any failure */
+static void __exit mip6_fini(void)
+{
+	if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
+		printk(KERN_INFO "%s: can't remove rawv6 mh filter\n", __func__);
+	if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(rthdr)\n", __func__);
+	if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
+		printk(KERN_INFO "%s: can't remove xfrm type(destopt)\n", __func__);
+}
+
+module_init(mip6_init);
+module_exit(mip6_fini); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c new file mode 100644 index 00000000..176b4693 --- /dev/null +++ b/net/ipv6/ndisc.c @@ -0,0 +1,1892 @@ +/* + * Neighbour Discovery for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Mike Shaver <shaver@ingenia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Pierre Ynard : export userland ND options + * through netlink (RDNSS support) + * Lars Fenneberg : fixed MTU setting on receipt + * of an RA. + * Janos Farkas : kmalloc failure checks + * Alexey Kuznetsov : state machine reworked + * and moved to net/core. + * Pekka Savola : RFC2461 validation + * YOSHIFUJI Hideaki @USAGI : Verify ND options properly + */ + +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) +#define ND_NOPRINTK(x...) 
do { ; } while(0) +#define ND_PRINTK0 ND_PRINTK +#define ND_PRINTK1 ND_NOPRINTK +#define ND_PRINTK2 ND_NOPRINTK +#define ND_PRINTK3 ND_NOPRINTK +#if ND_DEBUG >= 1 +#undef ND_PRINTK1 +#define ND_PRINTK1 ND_PRINTK +#endif +#if ND_DEBUG >= 2 +#undef ND_PRINTK2 +#define ND_PRINTK2 ND_PRINTK +#endif +#if ND_DEBUG >= 3 +#undef ND_PRINTK3 +#define ND_PRINTK3 ND_PRINTK +#endif + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/slab.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <linux/if_addr.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/jhash.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/icmp.h> + +#include <net/netlink.h> +#include <linux/rtnetlink.h> + +#include <net/flow.h> +#include <net/ip6_checksum.h> +#include <net/inet_common.h> +#include <linux/proc_fs.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd); +static int ndisc_constructor(struct neighbour *neigh); +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); +static int pndisc_constructor(struct pneigh_entry *n); +static void pndisc_destructor(struct pneigh_entry *n); +static void pndisc_redo(struct sk_buff *skb); + +static const struct neigh_ops ndisc_generic_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = 
neigh_connected_output, +}; + +static const struct neigh_ops ndisc_hh_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, +}; + + +static const struct neigh_ops ndisc_direct_ops = { + .family = AF_INET6, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, +}; + +struct neigh_table nd_tbl = { + .family = AF_INET6, + .key_len = sizeof(struct in6_addr), + .hash = ndisc_hash, + .constructor = ndisc_constructor, + .pconstructor = pndisc_constructor, + .pdestructor = pndisc_destructor, + .proxy_redo = pndisc_redo, + .id = "ndisc_cache", + .parms = { + .tbl = &nd_tbl, + .base_reachable_time = ND_REACHABLE_TIME, + .retrans_time = ND_RETRANS_TIMER, + .gc_staletime = 60 * HZ, + .reachable_time = ND_REACHABLE_TIME, + .delay_probe_time = 5 * HZ, + .queue_len_bytes = 64*1024, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[__ND_OPT_ARRAY_MAX]; +#ifdef CONFIG_IPV6_ROUTE_INFO + struct nd_opt_hdr *nd_opts_ri; + struct nd_opt_hdr *nd_opts_ri_end; +#endif + struct nd_opt_hdr *nd_useropts; + struct nd_opt_hdr *nd_useropts_end; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) + +/* + * Return the padding between the option length and the start of the + * link addr. 
Currently only IP-over-InfiniBand needs this, although + * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may + * also need a pad of 2. + */ +static int ndisc_addr_option_pad(unsigned short type) +{ + switch (type) { + case ARPHRD_INFINIBAND: return 2; + default: return 0; + } +} + +static inline int ndisc_opt_addr_space(struct net_device *dev) +{ + return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); +} + +static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, + unsigned short addr_type) +{ + int space = NDISC_OPT_SPACE(data_len); + int pad = ndisc_addr_option_pad(addr_type); + + opt[0] = type; + opt[1] = space>>3; + + memset(opt + 2, 0, pad); + opt += pad; + space -= pad; + + memcpy(opt+2, data, data_len); + data_len += 2; + opt += data_len; + if ((space -= data_len) > 0) + memset(opt, 0, space); + return opt + space; +} + +static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + int type; + if (!cur || !end || cur >= end) + return NULL; + type = cur->nd_opt_type; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && cur->nd_opt_type != type); + return cur <= end && cur->nd_opt_type == type ? cur : NULL; +} + +static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) +{ + return opt->nd_opt_type == ND_OPT_RDNSS; +} + +static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + if (!cur || !end || cur >= end) + return NULL; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && !ndisc_is_useropt(cur)); + return cur <= end && ndisc_is_useropt(cur) ? 
cur : NULL; +} + +static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) +{ + struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; + + if (!nd_opt || opt_len < 0 || !ndopts) + return NULL; + memset(ndopts, 0, sizeof(*ndopts)); + while (opt_len) { + int l; + if (opt_len < sizeof(struct nd_opt_hdr)) + return NULL; + l = nd_opt->nd_opt_len << 3; + if (opt_len < l || l == 0) + return NULL; + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + case ND_OPT_MTU: + case ND_OPT_REDIRECT_HDR: + if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { + ND_PRINTK2(KERN_WARNING + "%s(): duplicated ND6 option found: type=%d\n", + __func__, + nd_opt->nd_opt_type); + } else { + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + } + break; + case ND_OPT_PREFIX_INFO: + ndopts->nd_opts_pi_end = nd_opt; + if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + break; +#ifdef CONFIG_IPV6_ROUTE_INFO + case ND_OPT_ROUTE_INFO: + ndopts->nd_opts_ri_end = nd_opt; + if (!ndopts->nd_opts_ri) + ndopts->nd_opts_ri = nd_opt; + break; +#endif + default: + if (ndisc_is_useropt(nd_opt)) { + ndopts->nd_useropts_end = nd_opt; + if (!ndopts->nd_useropts) + ndopts->nd_useropts = nd_opt; + } else { + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the + * protocol. 
+ */ + ND_PRINTK2(KERN_NOTICE + "%s(): ignored unsupported option; type=%d, len=%d\n", + __func__, + nd_opt->nd_opt_type, nd_opt->nd_opt_len); + } + } + opt_len -= l; + nd_opt = ((void *)nd_opt) + l; + } + return ndopts; +} + +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + u8 *lladdr = (u8 *)(p + 1); + int lladdrlen = p->nd_opt_len << 3; + int prepad = ndisc_addr_option_pad(dev->type); + if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) + return NULL; + return lladdr + prepad; +} + +int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ + case ARPHRD_FDDI: + ipv6_eth_mc_map(addr, buf); + return 0; + case ARPHRD_IEEE802_TR: + ipv6_tr_mc_map(addr,buf); + return 0; + case ARPHRD_ARCNET: + ipv6_arcnet_mc_map(addr, buf); + return 0; + case ARPHRD_INFINIBAND: + ipv6_ib_mc_map(addr, dev->broadcast, buf); + return 0; + case ARPHRD_IPGRE: + return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); + default: + if (dir) { + memcpy(buf, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + +EXPORT_SYMBOL(ndisc_mc_map); + +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 *hash_rnd) +{ + return ndisc_hashfn(pkey, dev, hash_rnd); +} + +static int ndisc_constructor(struct neighbour *neigh) +{ + struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct net_device *dev = neigh->dev; + struct inet6_dev *in6_dev; + struct neigh_parms *parms; + int is_multicast = ipv6_addr_is_multicast(addr); + + in6_dev = in6_dev_get(dev); + if (in6_dev == NULL) { + return -EINVAL; + } + + parms = in6_dev->nd_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + + neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; + if (!dev->header_ops) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &ndisc_direct_ops; + neigh->output = neigh_direct_output; + } else { + if (is_multicast) { + neigh->nud_state = NUD_NOARP; + ndisc_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags&IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } else if (dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->header_ops->cache) + neigh->ops = &ndisc_hh_ops; + else + neigh->ops = &ndisc_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + in6_dev_put(in6_dev); + return 0; +} + +static int pndisc_constructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + return 0; +} + +static void pndisc_destructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +} + +struct sk_buff *ndisc_build_skb(struct net_device *dev, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, + const struct in6_addr *target, + int llinfo) +{ + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct sk_buff *skb; + struct icmp6hdr *hdr; + int hlen = LL_RESERVED_SPACE(dev); + int tlen = dev->needed_tailroom; + int len; + int err; + u8 *opt; + + if (!dev->addr_len) + llinfo = 0; + + len = sizeof(struct 
icmp6hdr) + (target ? sizeof(*target) : 0); + if (llinfo) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + hlen + tlen), + 1, &err); + if (!skb) { + ND_PRINTK0(KERN_ERR + "ICMPv6 ND: %s() failed to allocate an skb, err=%d.\n", + __func__, err); + return NULL; + } + + skb_reserve(skb, hlen); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + skb->transport_header = skb->tail; + skb_put(skb, len); + + hdr = (struct icmp6hdr *)skb_transport_header(skb); + memcpy(hdr, icmp6h, sizeof(*hdr)); + + opt = skb_transport_header(skb) + sizeof(struct icmp6hdr); + if (target) { + *(struct in6_addr *)opt = *target; + opt += sizeof(*target); + } + + if (llinfo) + ndisc_fill_addr_option(opt, llinfo, dev->dev_addr, + dev->addr_len, dev->type); + + hdr->icmp6_cksum = csum_ipv6_magic(saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial(hdr, + len, 0)); + + return skb; +} + +EXPORT_SYMBOL(ndisc_build_skb); + +void ndisc_send_skb(struct sk_buff *skb, + struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h) +{ + struct flowi6 fl6; + struct dst_entry *dst; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + struct inet6_dev *idev; + int err; + u8 type; + + type = icmp6h->icmp6_type; + + icmpv6_flow_init(sk, &fl6, type, saddr, daddr, dev->ifindex); + dst = icmp6_dst_alloc(dev, neigh, &fl6); + if (IS_ERR(dst)) { + kfree_skb(skb); + return; + } + + skb_dst_set(skb, dst); + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev, + dst_output); + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, type); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + } + + rcu_read_unlock(); +} + +EXPORT_SYMBOL(ndisc_send_skb); + +/* + * Send a Neighbour Discover packet + */ +static void 
__ndisc_send(struct net_device *dev, + struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + struct icmp6hdr *icmp6h, const struct in6_addr *target, + int llinfo) +{ + struct sk_buff *skb; + + skb = ndisc_build_skb(dev, daddr, saddr, icmp6h, target, llinfo); + if (!skb) + return; + + ndisc_send_skb(skb, dev, neigh, daddr, saddr, icmp6h); +} + +static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct in6_addr tmpaddr; + struct inet6_ifaddr *ifp; + const struct in6_addr *src_addr; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, + }; + + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); + if (ifp) { + src_addr = solicited_addr; + if (ifp->flags & IFA_F_OPTIMISTIC) + override = 0; + inc_opt |= ifp->idev->cnf.force_tllao; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, + inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, + &tmpaddr)) + return; + src_addr = &tmpaddr; + } + + icmp6h.icmp6_router = router; + icmp6h.icmp6_solicited = solicited; + icmp6h.icmp6_override = override; + + __ndisc_send(dev, neigh, daddr, src_addr, + &icmp6h, solicited_addr, + inc_opt ? 
ND_OPT_TARGET_LL_ADDR : 0); +} + +static void ndisc_send_unsol_na(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + struct in6_addr mcaddr; + + idev = in6_dev_get(dev); + if (!idev) + return; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + addrconf_addr_solict_mult(&ifa->addr, &mcaddr); + ndisc_send_na(dev, NULL, &mcaddr, &ifa->addr, + /*router=*/ !!idev->cnf.forwarding, + /*solicited=*/ false, /*override=*/ true, + /*inc_opt=*/ true); + } + read_unlock_bh(&idev->lock); + + in6_dev_put(idev); +} + +void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, + const struct in6_addr *solicit, + const struct in6_addr *daddr, const struct in6_addr *saddr) +{ + struct in6_addr addr_buf; + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, + }; + + if (saddr == NULL) { + if (ipv6_get_lladdr(dev, &addr_buf, + (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))) + return; + saddr = &addr_buf; + } + + __ndisc_send(dev, neigh, daddr, saddr, + &icmp6h, solicit, + !ipv6_addr_any(saddr) ? ND_OPT_SOURCE_LL_ADDR : 0); +} + +void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr) +{ + struct icmp6hdr icmp6h = { + .icmp6_type = NDISC_ROUTER_SOLICITATION, + }; + int send_sllao = dev->addr_len; + +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + /* + * According to section 2.2 of RFC 4429, we must not + * send router solicitations with a sllao from + * optimistic addresses, but we may send the solicitation + * if we don't include the sllao. So here we check + * if our address is optimistic, and if so, we + * suppress the inclusion of the sllao. + */ + if (send_sllao) { + struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, + dev, 1); + if (ifp) { + if (ifp->flags & IFA_F_OPTIMISTIC) { + send_sllao = 0; + } + in6_ifa_put(ifp); + } else { + send_sllao = 0; + } + } +#endif + __ndisc_send(dev, NULL, daddr, saddr, + &icmp6h, NULL, + send_sllao ? 
ND_OPT_SOURCE_LL_ADDR : 0); +} + + +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + dst_link_failure(skb); + kfree_skb(skb); +} + +/* Called with locked neigh: either read or both */ + +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + struct in6_addr *saddr = NULL; + struct in6_addr mcaddr; + struct net_device *dev = neigh->dev; + struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; + int probes = atomic_read(&neigh->probes); + + if (skb && ipv6_chk_addr(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, 1)) + saddr = &ipv6_hdr(skb)->saddr; + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state & NUD_VALID)) { + ND_PRINTK1(KERN_DEBUG "%s(): trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); + } + ndisc_send_ns(dev, neigh, target, target, saddr); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + } else { + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + } +} + +static int pndisc_is_router(const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + int ret = -1; + + read_lock_bh(&nd_tbl.lock); + n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + if (n) + ret = !!(n->flags & NTF_ROUTER); + read_unlock_bh(&nd_tbl.lock); + + return ret; +} + +static void ndisc_recv_ns(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev = NULL; + struct neighbour *neigh; + int dad = ipv6_addr_any(saddr); + 
int inc; + int is_router = -1; + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: multicast target address"); + return; + } + + /* + * RFC2461 7.1.1: + * DAD has to be destined for solicited node multicast address. + */ + if (dad && + !(daddr->s6_addr32[0] == htonl(0xff020000) && + daddr->s6_addr32[1] == htonl(0x00000000) && + daddr->s6_addr32[2] == htonl(0x00000001) && + daddr->s6_addr [12] == 0xff )) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND options\n"); + return; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid link-layer address length\n"); + return; + } + + /* RFC2461 7.1.1: + * If the IP source address is the unspecified address, + * there MUST NOT be source link-layer address option + * in the message. + */ + if (dad) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + return; + } + } + + inc = ipv6_addr_is_multicast(daddr); + + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { + + if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) { + if (dad) { + if (dev->type == ARPHRD_IEEE802_TR) { + const unsigned char *sadr; + sadr = skb_mac_header(skb); + if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && + sadr[9] == dev->dev_addr[1] && + sadr[10] == dev->dev_addr[2] && + sadr[11] == dev->dev_addr[3] && + sadr[12] == dev->dev_addr[4] && + sadr[13] == dev->dev_addr[5]) { + /* looped-back to us */ + goto out; + } + } + + /* + * We are colliding with another node + * who is doing DAD + * so fail our DAD process + */ + addrconf_dad_failure(ifp); + return; + } else { + /* + * This is not a dad solicitation. + * If we are an optimistic node, + * we should respond. 
+ * Otherwise, we should ignore it. + */ + if (!(ifp->flags & IFA_F_OPTIMISTIC)) + goto out; + } + } + + idev = ifp->idev; + } else { + struct net *net = dev_net(dev); + + idev = in6_dev_get(dev); + if (!idev) { + /* XXX: count this drop? */ + return; + } + + if (ipv6_chk_acast_addr(net, dev, &msg->target) || + (idev->cnf.forwarding && + (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) && + (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { + if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && + skb->pkt_type != PACKET_HOST && + inc != 0 && + idev->nd_parms->proxy_delay != 0) { + /* + * for anycast or proxy, + * sender should delay its response + * by a random time between 0 and + * MAX_ANYCAST_DELAY_TIME seconds. + * (RFC2461) -- yoshfuji + */ + struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + if (n) + pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + goto out; + } + } else + goto out; + } + + if (is_router < 0) + is_router = !!idev->cnf.forwarding; + + if (dad) { + ndisc_send_na(dev, NULL, &in6addr_linklocal_allnodes, &msg->target, + is_router, 0, (ifp != NULL), 1); + goto out; + } + + if (inc) + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + else + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + + /* + * update / create cache entry + * for the source address + */ + neigh = __neigh_lookup(&nd_tbl, saddr, dev, + !inc || lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE); + if (neigh || !dev->header_ops) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + is_router, + 1, (ifp != NULL && inc), inc); + if (neigh) + neigh_release(neigh); + } + +out: + if (ifp) + in6_ifa_put(ifp); + else + in6_dev_put(idev); +} + +static void ndisc_recv_na(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + u8 *lladdr = NULL; + 
u32 ndoptlen = skb->tail - (skb->transport_header + + offsetof(struct nd_msg, opt)); + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct neighbour *neigh; + + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: packet too short\n"); + return; + } + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: target address is multicast.\n"); + return; + } + + if (ipv6_addr_is_multicast(daddr) && + msg->icmph.icmp6_solicited) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: solicited NA is multicasted.\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND option\n"); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: invalid link-layer address length\n"); + return; + } + } + ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1); + if (ifp) { + if (skb->pkt_type != PACKET_LOOPBACK + && (ifp->flags & IFA_F_TENTATIVE)) { + addrconf_dad_failure(ifp); + return; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. It could be misconfiguration, or + an smart proxy agent tries to help us :-) + + We should not print the error if NA has been + received from loopback - it is just our own + unsolicited advertisement. 
+ */ + if (skb->pkt_type != PACKET_LOOPBACK) + ND_PRINTK1(KERN_WARNING + "ICMPv6 NA: someone advertises our address %pI6 on %s!\n", + &ifp->addr, ifp->idev->dev->name); + in6_ifa_put(ifp); + return; + } + neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + + if (neigh) { + u8 old_flags = neigh->flags; + struct net *net = dev_net(dev); + + if (neigh->nud_state & NUD_FAILED) + goto out; + + /* + * Don't update the neighbor cache entry on a proxy NA from + * ourselves because either the proxied node is off link or it + * has already sent a NA to us. + */ + if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && + net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp && + pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + /* XXX: idev->cnf.prixy_ndp */ + goto out; + } + + neigh_update(neigh, lladdr, + msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + (msg->icmph.icmp6_router ? 
NEIGH_UPDATE_F_ISROUTER : 0)); + + if ((old_flags & ~neigh->flags) & NTF_ROUTER) { + /* + * Change: router to host + */ + struct rt6_info *rt; + rt = rt6_get_dflt_router(saddr, dev); + if (rt) + ip6_del_rt(rt); + } + +out: + neigh_release(neigh); + } +} + +static void ndisc_recv_rs(struct sk_buff *skb) +{ + struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb); + unsigned long ndoptlen = skb->len - sizeof(*rs_msg); + struct neighbour *neigh; + struct inet6_dev *idev; + const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; + struct ndisc_options ndopts; + u8 *lladdr = NULL; + + if (skb->len < sizeof(*rs_msg)) + return; + + idev = __in6_dev_get(skb->dev); + if (!idev) { + if (net_ratelimit()) + ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + return; + } + + /* Don't accept RS if we're not in router mode */ + if (!idev->cnf.forwarding) + goto out; + + /* + * Don't update NCE if src = ::; + * this implies that the source node has no ip address assigned yet. + */ + if (ipv6_addr_any(saddr)) + goto out; + + /* Parse ND options */ + if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (net_ratelimit()) + ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + goto out; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) + goto out; + } + + neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + neigh_release(neigh); + } +out: + return; +} + +static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt) +{ + struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra); + struct sk_buff *skb; + struct nlmsghdr *nlh; + struct nduseroptmsg *ndmsg; + struct net *net = dev_net(ra->dev); + int err; + int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg) + + (opt->nd_opt_len << 3)); + size_t msg_size = 
base_size + nla_total_size(sizeof(struct in6_addr)); + + skb = nlmsg_new(msg_size, GFP_ATOMIC); + if (skb == NULL) { + err = -ENOBUFS; + goto errout; + } + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0); + if (nlh == NULL) { + goto nla_put_failure; + } + + ndmsg = nlmsg_data(nlh); + ndmsg->nduseropt_family = AF_INET6; + ndmsg->nduseropt_ifindex = ra->dev->ifindex; + ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type; + ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code; + ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3; + + memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3); + + NLA_PUT(skb, NDUSEROPT_SRCADDR, sizeof(struct in6_addr), + &ipv6_hdr(ra)->saddr); + nlmsg_end(skb, nlh); + + rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC); + return; + +nla_put_failure: + nlmsg_free(skb); + err = -EMSGSIZE; +errout: + rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); +} + +static inline int accept_ra(struct inet6_dev *in6_dev) +{ + /* + * If forwarding is enabled, RA are not accepted unless the special + * hybrid mode (accept_ra=2) is enabled. 
+ */ + if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2) + return 0; + + return in6_dev->cnf.accept_ra; +} + +static void ndisc_router_discovery(struct sk_buff *skb) +{ + struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); + struct neighbour *neigh = NULL; + struct inet6_dev *in6_dev; + struct rt6_info *rt = NULL; + int lifetime; + struct ndisc_options ndopts; + int optlen; + unsigned int pref = 0; + + __u8 * opt = (__u8 *)(ra_msg + 1); + + optlen = (skb->tail - skb->transport_header) - sizeof(struct ra_msg); + + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: source address is not link-local.\n"); + return; + } + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: packet too short\n"); + return; + } + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: from host or unauthorized router\n"); + return; + } +#endif + + /* + * set the RA_RECV flag in the interface + */ + + in6_dev = __in6_dev_get(skb->dev); + if (in6_dev == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: can't find inet6 device for %s.\n", + skb->dev->name); + return; + } + + if (!ndisc_parse_options(opt, optlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMP6 RA: invalid ND options\n"); + return; + } + + if (!accept_ra(in6_dev)) + goto skip_linkparms; + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific parameters from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto skip_linkparms; +#endif + + if (in6_dev->if_flags & IF_RS_SENT) { + /* + * flag that an RA was received after an RS was sent + * out on this interface. + */ + in6_dev->if_flags |= IF_RA_RCVD; + } + + /* + * Remember the managed/otherconf flags from most recently + * received RA message (RFC 2462) -- yoshfuji + */ + in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | + IF_RA_OTHERCONF)) | + (ra_msg->icmph.icmp6_addrconf_managed ? 
+ IF_RA_MANAGED : 0) | + (ra_msg->icmph.icmp6_addrconf_other ? + IF_RA_OTHERCONF : 0); + + if (!in6_dev->cnf.accept_ra_defrtr) + goto skip_defrtr; + + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_defrtr; + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + +#ifdef CONFIG_IPV6_ROUTER_PREF + pref = ra_msg->icmph.icmp6_router_pref; + /* 10b is handled as if it were 00b (medium) */ + if (pref == ICMPV6_ROUTER_PREF_INVALID || + !in6_dev->cnf.accept_ra_rtr_pref) + pref = ICMPV6_ROUTER_PREF_MEDIUM; +#endif + + rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + + if (rt) { + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + if (!neigh) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() got default router without neighbour.\n", + __func__); + dst_release(&rt->dst); + return; + } + } + if (rt && lifetime == 0) { + ip6_del_rt(rt); + rt = NULL; + } + + if (rt == NULL && lifetime) { + ND_PRINTK3(KERN_DEBUG + "ICMPv6 RA: adding default router.\n"); + + rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + if (rt == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() failed to add default route.\n", + __func__); + return; + } + + neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + if (neigh == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() got default router without neighbour.\n", + __func__); + dst_release(&rt->dst); + return; + } + neigh->flags |= NTF_ROUTER; + } else if (rt) { + rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + } + + if (rt) + rt6_set_expires(rt, jiffies + (HZ * lifetime)); + if (ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); + } + +skip_defrtr: + + /* + * Update Reachable Time and Retrans Timer + */ + + if (in6_dev->nd_parms) { + unsigned long rtime = ntohl(ra_msg->retrans_timer); + + if (rtime && rtime/1000 < 
MAX_SCHEDULE_TIMEOUT/HZ) { + rtime = (rtime*HZ)/1000; + if (rtime < HZ/10) + rtime = HZ/10; + in6_dev->nd_parms->retrans_time = rtime; + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + + rtime = ntohl(ra_msg->reachable_time); + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { + rtime = (rtime*HZ)/1000; + + if (rtime < HZ/10) + rtime = HZ/10; + + if (rtime != in6_dev->nd_parms->base_reachable_time) { + in6_dev->nd_parms->base_reachable_time = rtime; + in6_dev->nd_parms->gc_staletime = 3 * rtime; + in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + } + } + +skip_linkparms: + + /* + * Process options. + */ + + if (!neigh) + neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr, + skb->dev, 1); + if (neigh) { + u8 *lladdr = NULL; + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid link-layer address length\n"); + goto out; + } + } + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER); + } + + if (!accept_ra(in6_dev)) + goto out; + +#ifdef CONFIG_IPV6_ROUTE_INFO + if (ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, NULL, 0)) + goto skip_routeinfo; + + if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_ri; + p; + p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) { + struct route_info *ri = (struct route_info *)p; +#ifdef CONFIG_IPV6_NDISC_NODETYPE + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT && + ri->prefix_len == 0) + continue; +#endif + if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) + continue; + rt6_route_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3, + &ipv6_hdr(skb)->saddr); + } + } + +skip_routeinfo: +#endif + +#ifdef 
CONFIG_IPV6_NDISC_NODETYPE + /* skip link-specific ndopts from interior routers */ + if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) + goto out; +#endif + + if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_pi; + p; + p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { + addrconf_prefix_rcv(skb->dev, (u8 *)p, + (p->nd_opt_len) << 3, + ndopts.nd_opts_src_lladdr != NULL); + } + } + + if (ndopts.nd_opts_mtu) { + __be32 n; + u32 mtu; + + memcpy(&n, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + mtu = ntohl(n); + + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid mtu: %d\n", + mtu); + } else if (in6_dev->cnf.mtu6 != mtu) { + in6_dev->cnf.mtu6 = mtu; + + if (rt) + dst_metric_set(&rt->dst, RTAX_MTU, mtu); + + rt6_mtu_change(skb->dev, mtu); + } + } + + if (ndopts.nd_useropts) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_useropts; + p; + p = ndisc_next_useropt(p, ndopts.nd_useropts_end)) { + ndisc_ra_useropt(skb, p); + } + } + + if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid RA options"); + } +out: + if (rt) + dst_release(&rt->dst); + if (neigh) + neigh_release(neigh); +} + +static void ndisc_redirect_rcv(struct sk_buff *skb) +{ + struct inet6_dev *in6_dev; + struct icmp6hdr *icmph; + const struct in6_addr *dest; + const struct in6_addr *target; /* new first hop to destination */ + struct neighbour *neigh; + int on_link = 0; + struct ndisc_options ndopts; + int optlen; + u8 *lladdr = NULL; + +#ifdef CONFIG_IPV6_NDISC_NODETYPE + switch (skb->ndisc_nodetype) { + case NDISC_NODETYPE_HOST: + case NDISC_NODETYPE_NODEFAULT: + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: from host or unauthorized router\n"); + return; + } +#endif + + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: source address is not link-local.\n"); + return; + } + + optlen = 
skb->tail - skb->transport_header; + optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: packet too short\n"); + return; + } + + icmph = icmp6_hdr(skb); + target = (const struct in6_addr *) (icmph + 1); + dest = target + 1; + + if (ipv6_addr_is_multicast(dest)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination address is multicast.\n"); + return; + } + + if (ipv6_addr_equal(dest, target)) { + on_link = 1; + } else if (ipv6_addr_type(target) != + (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local unicast.\n"); + return; + } + + in6_dev = __in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) + return; + + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ + + if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid ND options\n"); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid link-layer address length\n"); + return; + } + } + + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + rt6_redirect(dest, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr, neigh, lladdr, + on_link); + neigh_release(neigh); + } +} + +void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) +{ + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct sock *sk = net->ipv6.ndisc_sk; + int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct sk_buff *buff; + struct icmp6hdr *icmph; + struct in6_addr saddr_buf; + struct in6_addr *addrp; + struct rt6_info *rt; + struct dst_entry *dst; + struct 
inet6_dev *idev; + struct flowi6 fl6; + u8 *opt; + int hlen, tlen; + int rd_len; + int err; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + + if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: no link-local address on %s\n", + dev->name); + return; + } + + if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && + ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local unicast.\n"); + return; + } + + icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, + &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_flags & RTF_GATEWAY) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination is not a neighbour.\n"); + goto release; + } + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + if (!inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ)) + goto release; + + if (dev->addr_len) { + struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); + if (!neigh) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: no neigh for target address\n"); + goto release; + } + + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) { + memcpy(ha_buf, neigh->ha, dev->addr_len); + read_unlock_bh(&neigh->lock); + ha = ha_buf; + len += ndisc_opt_addr_space(dev); + } else + read_unlock_bh(&neigh->lock); + + neigh_release(neigh); + } + + rd_len = min_t(unsigned int, + IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + rd_len &= ~0x7; + len += rd_len; + + hlen = LL_RESERVED_SPACE(dev); + tlen = dev->needed_tailroom; + buff = sock_alloc_send_skb(sk, + (MAX_HEADER + sizeof(struct ipv6hdr) + + len + hlen + tlen), + 1, &err); + if (buff == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 Redirect: %s() failed to allocate an skb, err=%d.\n", 
+ __func__, err); + goto release; + } + + skb_reserve(buff, hlen); + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &ipv6_hdr(skb)->saddr, + IPPROTO_ICMPV6, len); + + skb_set_transport_header(buff, skb_tail_pointer(buff) - buff->data); + skb_put(buff, len); + icmph = icmp6_hdr(buff); + + memset(icmph, 0, sizeof(struct icmp6hdr)); + icmph->icmp6_type = NDISC_REDIRECT; + + /* + * copy target and destination addresses + */ + + addrp = (struct in6_addr *)(icmph + 1); + *addrp = *target; + addrp++; + *addrp = ipv6_hdr(skb)->daddr; + + opt = (u8*) (addrp + 1); + + /* + * include target_address option + */ + + if (ha) + opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, + dev->addr_len, dev->type); + + /* + * build redirect option and copy skb over to the new packet. + */ + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, ipv6_hdr(skb), rd_len - 8); + + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &ipv6_hdr(skb)->saddr, + len, IPPROTO_ICMPV6, + csum_partial(icmph, len, 0)); + + skb_dst_set(buff, dst); + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, buff, NULL, dst->dev, + dst_output); + if (!err) { + ICMP6MSGOUT_INC_STATS(net, idev, NDISC_REDIRECT); + ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); + } + + rcu_read_unlock(); + return; + +release: + dst_release(dst); +} + +static void pndisc_redo(struct sk_buff *skb) +{ + ndisc_recv_ns(skb); + kfree_skb(skb); +} + +int ndisc_rcv(struct sk_buff *skb) +{ + struct nd_msg *msg; + + if (!pskb_may_pull(skb, skb->len)) + return 0; + + msg = (struct nd_msg *)skb_transport_header(skb); + + __skb_push(skb, skb->data - skb_transport_header(skb)); + + if (ipv6_hdr(skb)->hop_limit != 255) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid hop-limit: %d\n", + ipv6_hdr(skb)->hop_limit); + return 0; + } + + if (msg->icmph.icmp6_code != 0) { + 
ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); + return 0; + } + + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + + switch (msg->icmph.icmp6_type) { + case NDISC_NEIGHBOUR_SOLICITATION: + ndisc_recv_ns(skb); + break; + + case NDISC_NEIGHBOUR_ADVERTISEMENT: + ndisc_recv_na(skb); + break; + + case NDISC_ROUTER_SOLICITATION: + ndisc_recv_rs(skb); + break; + + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; + + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + } + + return 0; +} + +static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&nd_tbl, dev); + fib6_run_gc(~0UL, net); + break; + case NETDEV_DOWN: + neigh_ifdown(&nd_tbl, dev); + fib6_run_gc(~0UL, net); + break; + case NETDEV_NOTIFY_PEERS: + ndisc_send_unsol_na(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ndisc_netdev_notifier = { + .notifier_call = ndisc_netdev_event, +}; + +#ifdef CONFIG_SYSCTL +static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, + const char *func, const char *dev_name) +{ + static char warncomm[TASK_COMM_LEN]; + static int warned; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING + "process `%s' is using deprecated sysctl (%s) " + "net.ipv6.neigh.%s.%s; " + "Use net.ipv6.neigh.%s.%s_ms " + "instead.\n", + warncomm, func, + dev_name, ctl->procname, + dev_name, ctl->procname); + warned++; + } +} + +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if ((strcmp(ctl->procname, "retrans_time") == 0) || + (strcmp(ctl->procname, "base_reachable_time") == 0)) + 
ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); + + if (strcmp(ctl->procname, "retrans_time") == 0) + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + + else if (strcmp(ctl->procname, "base_reachable_time") == 0) + ret = proc_dointvec_jiffies(ctl, write, + buffer, lenp, ppos); + + else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || + (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) + ret = proc_dointvec_ms_jiffies(ctl, write, + buffer, lenp, ppos); + else + ret = -1; + + if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->data == &idev->nd_parms->base_reachable_time) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + return ret; +} + + +#endif + +static int __net_init ndisc_net_init(struct net *net) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = inet_ctl_sock_create(&sk, PF_INET6, + SOCK_RAW, IPPROTO_ICMPV6, net); + if (err < 0) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", + err); + return err; + } + + net->ipv6.ndisc_sk = sk; + + np = inet6_sk(sk); + np->hop_limit = 255; + /* Do not loopback ndisc messages */ + np->mc_loop = 0; + + return 0; +} + +static void __net_exit ndisc_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.ndisc_sk); +} + +static struct pernet_operations ndisc_net_ops = { + .init = ndisc_net_init, + .exit = ndisc_net_exit, +}; + +int __init ndisc_init(void) +{ + int err; + + err = register_pernet_subsys(&ndisc_net_ops); + if (err) + return err; + /* + * Initialize the neighbour table + */ + neigh_table_init(&nd_tbl); + +#ifdef CONFIG_SYSCTL + err = neigh_sysctl_register(NULL, &nd_tbl.parms, "ipv6", + &ndisc_ifinfo_sysctl_change); + if (err) + goto out_unregister_pernet; +#endif + err = register_netdevice_notifier(&ndisc_netdev_notifier); + if (err) + 
goto out_unregister_sysctl; +out: + return err; + +out_unregister_sysctl: +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +out_unregister_pernet: +#endif + unregister_pernet_subsys(&ndisc_net_ops); + goto out; +} + +void ndisc_cleanup(void) +{ + unregister_netdevice_notifier(&ndisc_netdev_notifier); +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +#endif + neigh_table_clear(&nd_tbl); + unregister_pernet_subsys(&ndisc_net_ops); +} diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c new file mode 100644 index 00000000..db31561c --- /dev/null +++ b/net/ipv6/netfilter.c @@ -0,0 +1,196 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/dst.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/xfrm.h> +#include <net/ip6_checksum.h> +#include <net/netfilter/nf_queue.h> + +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct net *net = dev_net(skb_dst(skb)->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .flowi6_mark = skb->mark, + .daddr = iph->daddr, + .saddr = iph->saddr, + }; + + dst = ip6_route_output(net, skb->sk, &fl6); + if (dst->error) { + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n"); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. 
*/ + skb_dst_drop(skb); + + skb_dst_set(skb, dst); + +#ifdef CONFIG_XFRM + if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && + xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { + skb_dst_set(skb, NULL); + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), skb->sk, 0); + if (IS_ERR(dst)) + return -1; + skb_dst_set(skb, dst); + } +#endif + + return 0; +} +EXPORT_SYMBOL(ip6_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip6_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; + u_int32_t mark; +}; + +static void nf_ip6_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + rt_info->mark = skb->mark; + } +} + +static int nf_ip6_reroute(struct sk_buff *skb, + const struct nf_queue_entry *entry) +{ + struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); + + if (entry->hook == NF_INET_LOCAL_OUT) { + const struct ipv6hdr *iph = ipv6_hdr(skb); + if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) || + !ipv6_addr_equal(&iph->saddr, &rt_info->saddr) || + skb->mark != rt_info->mark) + return ip6_route_me_harder(skb); + } + return 0; +} + +static int nf_ip6_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict) +{ + static const struct ipv6_pinfo fake_pinfo; + static const struct inet_sock fake_sk = { + /* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */ + .sk.sk_bound_dev_if = 1, + .pinet6 = (struct ipv6_pinfo *) &fake_pinfo, + }; + const void *sk = strict ? 
&fake_sk : NULL; + struct dst_entry *result; + int err; + + result = ip6_route_output(net, sk, &fl->u.ip6); + err = result->error; + if (err) + dst_release(result); + else + *dst = result; + return err; +} + +__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) + break; + if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, protocol, + csum_sub(skb->csum, + skb_checksum(skb, 0, + dataoff, 0)))) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + /* fall through */ + case CHECKSUM_NONE: + skb->csum = ~csum_unfold( + csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, + skb_checksum(skb, 0, + dataoff, 0)))); + csum = __skb_checksum_complete(skb); + } + return csum; +} +EXPORT_SYMBOL(nf_ip6_checksum); + +static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + __wsum hsum; + __sum16 csum = 0; + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (len == skb->len - dataoff) + return nf_ip6_checksum(skb, hook, dataoff, protocol); + /* fall through */ + case CHECKSUM_NONE: + hsum = skb_checksum(skb, 0, dataoff, 0); + skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, + &ip6h->daddr, + skb->len - dataoff, + protocol, + csum_sub(0, hsum))); + skb->ip_summed = CHECKSUM_NONE; + return __skb_checksum_complete_head(skb, dataoff + len); + } + return csum; +}; + +static const struct nf_afinfo nf_ip6_afinfo = { + .family = AF_INET6, + .checksum = nf_ip6_checksum, + .checksum_partial = nf_ip6_checksum_partial, + .route = nf_ip6_route, + .saveroute = nf_ip6_saveroute, + .reroute = nf_ip6_reroute, + .route_key_size = sizeof(struct 
ip6_rt_info), +}; + +int __init ipv6_netfilter_init(void) +{ + return nf_register_afinfo(&nf_ip6_afinfo); +} + +/* This can be called from inet6_init() on errors, so it cannot + * be marked __exit. -DaveM + */ +void ipv6_netfilter_fini(void) +{ + nf_unregister_afinfo(&nf_ip6_afinfo); +} diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig new file mode 100644 index 00000000..acd7c9e1 --- /dev/null +++ b/net/ipv6/netfilter/Kconfig @@ -0,0 +1,224 @@ +# +# IP netfilter configuration +# + +menu "IPv6: Netfilter Configuration" + depends on INET && IPV6 && NETFILTER + +config NF_DEFRAG_IPV6 + tristate + default n + +config NF_CONNTRACK_IPV6 + tristate "IPv6 connection tracking support" + depends on INET && IPV6 && NF_CONNTRACK + default m if NETFILTER_ADVANCED=n + select NF_DEFRAG_IPV6 + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is IPv6 support on Layer 3 independent connection tracking. + Layer 3 independent connection tracking is experimental scheme + which generalize ip_conntrack to support other layer 3 protocols. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_QUEUE + tristate "IP6 Userspace queueing via NETLINK (OBSOLETE)" + depends on INET && IPV6 && NETFILTER + depends on NETFILTER_ADVANCED + ---help--- + + This option adds a queue handler to the kernel for IPv6 + packets which enables users to receive the filtered packets + with QUEUE target using libipq. + + This option enables the old IPv6-only "ip6_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + + (C) Fernando Anton 2001 + IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + Universidad Carlos III de Madrid + Universidad Politecnica de Alcala de Henares + email: <fanton@it.uc3m.es>. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP6_NF_IPTABLES + tristate "IP6 tables support (required for filtering)" + depends on INET && IPV6 + select NETFILTER_XTABLES + default m if NETFILTER_ADVANCED=n + help + ip6tables is a general, extensible packet identification framework. + Currently only the packet filtering and packet mangling subsystem + for IPv6 use this, but connection tracking is going to follow. + Say 'Y' or 'M' here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +if IP6_NF_IPTABLES + +# The simple matches. +config IP6_NF_MATCH_AH + tristate '"ah" match support' + depends on NETFILTER_ADVANCED + help + This module allows one to match AH packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_EUI64 + tristate '"eui64" address check' + depends on NETFILTER_ADVANCED + help + This module performs checking on the IPv6 source address + Compares the last 64 bits with the EUI64 (delivered + from the MAC address) address + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_FRAG + tristate '"frag" Fragmentation header match support' + depends on NETFILTER_ADVANCED + help + frag matching allows you to match packets based on the fragmentation + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_OPTS + tristate '"hbh" hop-by-hop and "dst" opts header match support' + depends on NETFILTER_ADVANCED + help + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_HL + tristate '"hl" hoplimit match support' + depends on NETFILTER_ADVANCED + select NETFILTER_XT_MATCH_HL + ---help--- + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_MATCH_HL. 
+ +config IP6_NF_MATCH_IPV6HEADER + tristate '"ipv6header" IPv6 Extension Headers Match' + default m if NETFILTER_ADVANCED=n + help + This module allows one to match packets based upon + the ipv6 extension headers. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MH + tristate '"mh" match support' + depends on NETFILTER_ADVANCED + help + This module allows one to match MH packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_RPFILTER + tristate '"rpfilter" reverse path filter match support' + depends on NETFILTER_ADVANCED + ---help--- + This option allows you to match packets whose replies would + go out via the interface the packet came in. + + To compile it as a module, choose M here. If unsure, say N. + The module will be called ip6t_rpfilter. + +config IP6_NF_MATCH_RT + tristate '"rt" Routing header match support' + depends on NETFILTER_ADVANCED + help + rt matching allows you to match packets based on the routing + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +# The targets +config IP6_NF_TARGET_HL + tristate '"HL" hoplimit target support' + depends on NETFILTER_ADVANCED && IP6_NF_MANGLE + select NETFILTER_XT_TARGET_HL + ---help--- + This is a backwards-compatible option for the user's convenience + (e.g. when running oldconfig). It selects + CONFIG_NETFILTER_XT_TARGET_HL. + +config IP6_NF_FILTER + tristate "Packet filtering" + default m if NETFILTER_ADVANCED=n + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP6_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP6_NF_FILTER + default m if NETFILTER_ADVANCED=n + help + The REJECT target allows a filtering rule to specify that an ICMPv6 + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_REJECT_SKERR + bool "Force socket error when rejecting with icmp*" + depends on IP6_NF_TARGET_REJECT + default n + help + This option enables turning a "--reject-with icmp*" into a matching + socket error also. + The REJECT target normally allows sending an ICMP message. But it + leaves the local socket unaware of any ingress rejects. + + If unsure, say N. + +config IP6_NF_MANGLE + tristate "Packet mangling" + default m if NETFILTER_ADVANCED=n + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_RAW + tristate 'raw table support (required for TRACE)' + help + This option adds a `raw' table to ip6tables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. + +# security table for MAC policy +config IP6_NF_SECURITY + tristate "Security table" + depends on SECURITY + depends on NETFILTER_ADVANCED + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. 
+ +endif # IP6_NF_IPTABLES + +endmenu + diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile new file mode 100644 index 00000000..d4dfd0a2 --- /dev/null +++ b/net/ipv6/netfilter/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the netfilter modules on top of IPv6. +# + +# Link order matters here. +obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o +obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o +obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o +obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o +obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o + +# objects for l3 independent conntrack +nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o + +# l3 independent conntrack +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o + +# defrag +nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o +obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o + +# matches +obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o +obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o +obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o +obj-$(CONFIG_IP6_NF_MATCH_MH) += ip6t_mh.o +obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o +obj-$(CONFIG_IP6_NF_MATCH_RPFILTER) += ip6t_rpfilter.o +obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o + +# targets +obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c new file mode 100644 index 00000000..a34c9e4c --- /dev/null +++ b/net/ipv6/netfilter/ip6_queue.c @@ -0,0 +1,641 @@ +/* + * This is a module which is used for queueing IPv6 packets and + * communicating with userspace via netlink. + * + * (C) 2001 Fernando Anton, this code is GPL. + * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain + * Universidad Politecnica de Alcala de Henares - Alcala de H. 
(Madrid) - Spain + * email: fanton@it.uc3m.es + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ipv6.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/netfilter/nf_queue.h> +#include <linux/netfilter_ipv4/ip_queue.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip6_queue" +#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" + +typedef int (*ipq_cmpfn)(struct nf_queue_entry *, unsigned long); + +static unsigned char copy_mode __read_mostly = IPQ_COPY_NONE; +static unsigned int queue_maxlen __read_mostly = IPQ_QMAX_DEFAULT; +static DEFINE_SPINLOCK(queue_lock); +static int peer_pid __read_mostly; +static unsigned int copy_range __read_mostly; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl __read_mostly; +static LIST_HEAD(queue_list); +static DEFINE_MUTEX(ipqnl_mutex); + +static inline void +__ipq_enqueue_entry(struct nf_queue_entry *entry) +{ + list_add_tail(&entry->list, &queue_list); + queue_total++; +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + if (range > 0xFFFF) + range = 0xFFFF; + 
copy_range = range; + copy_mode = mode; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static void __ipq_flush(ipq_cmpfn cmpfn, unsigned long data); + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NULL, 0); +} + +static struct nf_queue_entry * +ipq_find_dequeue_entry(unsigned long id) +{ + struct nf_queue_entry *entry = NULL, *i; + + spin_lock_bh(&queue_lock); + + list_for_each_entry(i, &queue_list, list) { + if ((unsigned long)i == id) { + entry = i; + break; + } + } + + if (entry) { + list_del(&entry->list); + queue_total--; + } + + spin_unlock_bh(&queue_lock); + return entry; +} + +static void +__ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + struct nf_queue_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &queue_list, list) { + if (!cmpfn || cmpfn(entry, data)) { + list_del(&entry->list); + queue_total--; + nf_reinject(entry, NF_DROP); + } + } +} + +static void +ipq_flush(ipq_cmpfn cmpfn, unsigned long data) +{ + spin_lock_bh(&queue_lock); + __ipq_flush(cmpfn, data); + spin_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) +{ + sk_buff_data_t old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + struct timeval tv; + + switch (ACCESS_ONCE(copy_mode)) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + break; + + case IPQ_COPY_PACKET: + if (entry->skb->ip_summed == CHECKSUM_PARTIAL && + (*errp = skb_checksum_help(entry->skb))) + return NULL; + + data_len = ACCESS_ONCE(copy_range); + if (data_len == 0 || data_len > entry->skb->len) + data_len = entry->skb->len; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + return NULL; + } + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail = 
skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + tv = ktime_to_timeval(entry->skb->tstamp); + pmsg->timestamp_sec = tv.tv_sec; + pmsg->timestamp_usec = tv.tv_usec; + pmsg->mark = entry->skb->mark; + pmsg->hook = entry->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->indev) + strcpy(pmsg->indev_name, entry->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->outdev) + strcpy(pmsg->outdev_name, entry->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->indev && entry->skb->dev && + entry->skb->mac_header != entry->skb->network_header) { + pmsg->hw_type = entry->skb->dev->type; + pmsg->hw_addrlen = dev_parse_header(entry->skb, pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip6_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) +{ + int status = -EINVAL; + struct sk_buff *nskb; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + return status; + + spin_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip6_queue: fill at %d entries, " + "dropping packet(s). 
Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + spin_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct nf_queue_entry *e) +{ + int diff; + struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; + struct sk_buff *nskb; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) { + if (pskb_trim(e->skb, v->data_len)) + return -ENOMEM; + } else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), + diff, GFP_ATOMIC); + if (!nskb) { + printk(KERN_WARNING "ip6_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + kfree_skb(e->skb); + e->skb = nskb; + } + skb_put(e->skb, diff); + } + if (!skb_make_writable(e->skb, v->data_len)) + return -ENOMEM; + skb_copy_to_linear_data(e->skb, v->payload, v->data_len); + e->skb->ip_summed = CHECKSUM_NONE; + + return 0; +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct nf_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT || vmsg->value == NF_STOLEN) + return -EINVAL; + + entry = ipq_find_dequeue_entry(vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv6(vmsg, entry) < 0) + verdict = NF_DROP; + + nf_reinject(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + spin_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, 
range); + spin_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) +{ + if (entry->indev) + if (entry->indev->ifindex == ifindex) + return 1; + + if (entry->outdev) + if (entry->outdev->ifindex == ifindex) + return 1; +#ifdef CONFIG_BRIDGE_NETFILTER + if (entry->skb->nf_bridge) { + if (entry->skb->nf_bridge->physindev && + entry->skb->nf_bridge->physindev->ifindex == ifindex) + return 1; + if (entry->skb->nf_bridge->physoutdev && + entry->skb->nf_bridge->physoutdev->ifindex == ifindex) + return 1; + } +#endif + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + ipq_flush(dev_cmp, ifindex); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +__ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags; + unsigned int nlmsglen, skblen; + struct nlmsghdr *nlh; + bool enable_timestamp = false; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = nlmsg_hdr(skb); + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (!capable(CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + spin_lock_bh(&queue_lock); + + if 
(peer_pid) { + if (peer_pid != pid) { + spin_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + enable_timestamp = true; + peer_pid = pid; + } + + spin_unlock_bh(&queue_lock); + if (enable_timestamp) + net_enable_timestamp(); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + nlmsglen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); +} + +static void +ipq_rcv_skb(struct sk_buff *skb) +{ + mutex_lock(&ipqnl_mutex); + __ipq_rcv_skb(skb); + mutex_unlock(&ipqnl_mutex); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net)) + return NOTIFY_DONE; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && n->protocol == NETLINK_IP6_FW) { + spin_lock_bh(&queue_lock); + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) + __ipq_reset(); + spin_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; +#endif + +#ifdef CONFIG_PROC_FS +static int ip6_queue_show(struct seq_file *m, void *v) +{ + spin_lock_bh(&queue_lock); + + seq_printf(m, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. 
length : %u\n" + "Queue dropped : %u\n" + "Netfilter dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + spin_unlock_bh(&queue_lock); + return 0; +} + +static int ip6_queue_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip6_queue_show, NULL); +} + +static const struct file_operations ip6_queue_proc_fops = { + .open = ip6_queue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; +#endif + +static const struct nf_queue_handler nfqh = { + .name = "ip6_queue", + .outfn = &ipq_enqueue_packet, +}; + +static int __init ip6_queue_init(void) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc __maybe_unused; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(&init_net, NETLINK_IP6_FW, 0, + ipq_rcv_skb, NULL, THIS_MODULE); + if (ipqnl == NULL) { + printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + +#ifdef CONFIG_PROC_FS + proc = proc_create(IPQ_PROC_FS_NAME, 0, init_net.proc_net, + &ip6_queue_proc_fops); + if (!proc) { + printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } +#endif + register_netdevice_notifier(&ipq_dev_notifier); +#ifdef CONFIG_SYSCTL + ipq_sysctl_header = register_sysctl_paths(net_ipv6_ctl_path, ipq_table); +#endif + status = nf_register_queue_handler(NFPROTO_IPV6, &nfqh); + if (status < 0) { + printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup_sysctl: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); + +cleanup_ipqnl: __maybe_unused + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + +cleanup_netlink_notifier: + 
netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static void __exit ip6_queue_fini(void) +{ + nf_unregister_queue_handlers(&nfqh); + + ipq_flush(NULL, 0); + +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ipq_sysctl_header); +#endif + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); + + netlink_kernel_release(ipqnl); + mutex_lock(&ipqnl_mutex); + mutex_unlock(&ipqnl_mutex); + + netlink_unregister_notifier(&ipq_nl_notifier); +} + +MODULE_DESCRIPTION("IPv6 packet queue handler"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_IP6_FW); + +module_init(ip6_queue_init); +module_exit(ip6_queue_fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c new file mode 100644 index 00000000..e641f8fa --- /dev/null +++ b/net/ipv6/netfilter/ip6_tables.c @@ -0,0 +1,2361 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/capability.h> +#include <linux/in.h> +#include <linux/skbuff.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <linux/poison.h> +#include <linux/icmpv6.h> +#include <net/ipv6.h> +#include <net/compat.h> +#include <asm/uaccess.h> +#include <linux/mutex.h> +#include <linux/proc_fs.h> +#include <linux/err.h> +#include <linux/cpumask.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> +#include "../../netfilter/xt_repldata.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("IPv6 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) pr_info(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) pr_info(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) WARN_ON(!(x)) +#else +#define IP_NF_ASSERT(x) +#endif + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +void *ip6t_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ip6t, IP6T); +} +EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); + +/* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore + only need to read-lock in the softirq; doing a write_lock_bh() in user + context stops packets coming through and allows user context to read + the counters or update the rules. + + Hence the start of any table is given by get_table() below. */ + +/* Returns whether matches rule or not. 
*/ +/* Performance critical - called for every packet */ +static inline bool +ip6_packet_match(const struct sk_buff *skb, + const char *indev, + const char *outdev, + const struct ip6t_ip6 *ip6info, + unsigned int *protoff, + int *fragoff, bool *hotdrop) +{ + unsigned long ret; + const struct ipv6hdr *ipv6 = ipv6_hdr(skb); + +#define FWINV(bool, invflg) ((bool) ^ !!(ip6info->invflags & (invflg))) + + if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, + &ip6info->src), IP6T_INV_SRCIP) || + FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, + &ip6info->dst), IP6T_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); +/* + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + ipinfo->smsk.s_addr, ipinfo->src.s_addr, + ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, + ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + return false; + } + + ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); + + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ip6info->iniface, + ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":""); + return false; + } + + ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); + + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ip6info->outiface, + ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + return false; + } + +/* ... might want to do something with class and flowlabel here ... */ + + /* look for the desired protocol header */ + if((ip6info->flags & IP6T_F_PROTO)) { + int protohdr; + unsigned short _frag_off; + + protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off); + if (protohdr < 0) { + if (_frag_off == 0) + *hotdrop = true; + return false; + } + *fragoff = _frag_off; + + dprintf("Packet protocol %hi ?= %s%hi.\n", + protohdr, + ip6info->invflags & IP6T_INV_PROTO ? 
"!":"", + ip6info->proto); + + if (ip6info->proto == protohdr) { + if(ip6info->invflags & IP6T_INV_PROTO) { + return false; + } + return true; + } + + /* We need match for the '-p all', too! */ + if ((ip6info->proto != 0) && + !(ip6info->invflags & IP6T_INV_PROTO)) + return false; + } + return true; +} + +/* should be ip6 safe */ +static bool +ip6_checkentry(const struct ip6t_ip6 *ipv6) +{ + if (ipv6->flags & ~IP6T_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ipv6->flags & ~IP6T_F_MASK); + return false; + } + if (ipv6->invflags & ~IP6T_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ipv6->invflags & ~IP6T_INV_MASK); + return false; + } + return true; +} + +static unsigned int +ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (net_ratelimit()) + pr_info("error: `%s'\n", (const char *)par->targinfo); + + return NF_DROP; +} + +static inline struct ip6t_entry * +get_entry(const void *base, unsigned int offset) +{ + return (struct ip6t_entry *)(base + offset); +} + +/* All zeroes == unconditional rule. */ +/* Mildly perf critical (only if packet tracing is on) */ +static inline bool unconditional(const struct ip6t_ip6 *ipv6) +{ + static const struct ip6t_ip6 uncond; + + return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; +} + +static inline const struct xt_entry_target * +ip6t_get_target_c(const struct ip6t_entry *e) +{ + return ip6t_get_target((struct ip6t_entry *)e); +} + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) +/* This cries for unification! 
*/ +static const char *const hooknames[] = { + [NF_INET_PRE_ROUTING] = "PREROUTING", + [NF_INET_LOCAL_IN] = "INPUT", + [NF_INET_FORWARD] = "FORWARD", + [NF_INET_LOCAL_OUT] = "OUTPUT", + [NF_INET_POST_ROUTING] = "POSTROUTING", +}; + +enum nf_ip_trace_comments { + NF_IP6_TRACE_COMMENT_RULE, + NF_IP6_TRACE_COMMENT_RETURN, + NF_IP6_TRACE_COMMENT_POLICY, +}; + +static const char *const comments[] = { + [NF_IP6_TRACE_COMMENT_RULE] = "rule", + [NF_IP6_TRACE_COMMENT_RETURN] = "return", + [NF_IP6_TRACE_COMMENT_POLICY] = "policy", +}; + +static struct nf_loginfo trace_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 4, + .logflags = NF_LOG_MASK, + }, + }, +}; + +/* Mildly perf critical (only if packet tracing is on) */ +static inline int +get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, + const char *hookname, const char **chainname, + const char **comment, unsigned int *rulenum) +{ + const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); + + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { + /* Head of user chain: ERROR target with chainname */ + *chainname = t->target.data; + (*rulenum) = 0; + } else if (s == e) { + (*rulenum)++; + + if (s->target_offset == sizeof(struct ip6t_entry) && + strcmp(t->target.u.kernel.target->name, + XT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ipv6)) { + /* Tail of chains: STANDARD target (return/policy) */ + *comment = *chainname == hookname + ? 
comments[NF_IP6_TRACE_COMMENT_POLICY] + : comments[NF_IP6_TRACE_COMMENT_RETURN]; + } + return 1; + } else + (*rulenum)++; + + return 0; +} + +static void trace_packet(const struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + const char *tablename, + const struct xt_table_info *private, + const struct ip6t_entry *e) +{ + const void *table_base; + const struct ip6t_entry *root; + const char *hookname, *chainname, *comment; + const struct ip6t_entry *iter; + unsigned int rulenum = 0; + + table_base = private->entries[smp_processor_id()]; + root = get_entry(table_base, private->hook_entry[hook]); + + hookname = chainname = hooknames[hook]; + comment = comments[NF_IP6_TRACE_COMMENT_RULE]; + + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; + + nf_log_packet(AF_INET6, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); +} +#endif + +static inline __pure struct ip6t_entry * +ip6t_next_entry(const struct ip6t_entry *entry) +{ + return (void *)entry + entry->next_offset; +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ip6t_do_table(struct sk_buff *skb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct xt_table *table) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + const void *table_base; + struct ip6t_entry *e, **jumpstack; + unsigned int *stackptr, origptr, cpu; + const struct xt_table_info *private; + struct xt_action_param acpar; + unsigned int addend; + + /* Initialization */ + indev = in ? in->name : nulldevname; + outdev = out ? 
out->name : nulldevname; + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + acpar.hotdrop = false; + acpar.in = in; + acpar.out = out; + acpar.family = NFPROTO_IPV6; + acpar.hooknum = hook; + + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + + local_bh_disable(); + addend = xt_write_recseq_begin(); + private = table->private; + cpu = smp_processor_id(); + table_base = private->entries[cpu]; + jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; + stackptr = per_cpu_ptr(private->stackptr, cpu); + origptr = *stackptr; + + e = get_entry(table_base, private->hook_entry[hook]); + + do { + const struct xt_entry_target *t; + const struct xt_entry_match *ematch; + + IP_NF_ASSERT(e); + if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, + &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { + no_match: + e = ip6t_next_entry(e); + continue; + } + + xt_ematch_foreach(ematch, e) { + acpar.match = ematch->u.kernel.match; + acpar.matchinfo = ematch->data; + if (!acpar.match->match(skb, &acpar)) + goto no_match; + } + + ADD_COUNTER(e->counters, skb->len, 1); + + t = ip6t_get_target_c(e); + IP_NF_ASSERT(t->u.kernel.target); + +#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ + defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) + /* The packet is traced: log it */ + if (unlikely(skb->nf_trace)) + trace_packet(skb, hook, in, out, + table->name, private, e); +#endif + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct xt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? 
*/ + if (v != XT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + if (*stackptr <= origptr) + e = get_entry(table_base, + private->underflow[hook]); + else + e = ip6t_next_entry(jumpstack[--*stackptr]); + continue; + } + if (table_base + v != ip6t_next_entry(e) && + !(e->ipv6.flags & IP6T_F_GOTO)) { + if (*stackptr >= private->stacksize) { + verdict = NF_DROP; + break; + } + jumpstack[(*stackptr)++] = e; + } + + e = get_entry(table_base, v); + continue; + } + + acpar.target = t->u.kernel.target; + acpar.targinfo = t->data; + + verdict = t->u.kernel.target->target(skb, &acpar); + if (verdict == XT_CONTINUE) + e = ip6t_next_entry(e); + else + /* Verdict */ + break; + } while (!acpar.hotdrop); + + *stackptr = origptr; + + xt_write_recseq_end(addend); + local_bh_enable(); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (acpar.hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(const struct xt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + const struct xt_standard_target *t + = (void *)ip6t_get_target_c(e); + int visited = e->comefrom & (1 << hook); + + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { + pr_err("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); + + /* Unconditional return/END. 
*/ + if ((e->target_offset == sizeof(struct ip6t_entry) && + (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < 0 && + unconditional(&e->ipv6)) || visited) { + unsigned int oldpos, size; + + if ((strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0) && + t->verdict < -NF_MAX_VERDICT - 1) { + duprintf("mark_source_chains: bad " + "negative verdict (%i)\n", + t->verdict); + return 0; + } + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<<NF_INET_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_INET_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* The saved back pointer equals our own position: we're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ip6t_entry *) + (entry0 + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one: step to the entry after e and store pos in it as its back pointer. */ + size = e->next_offset; + e = (struct ip6t_entry *) + (entry0 + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + XT_STANDARD_TARGET) == 0 && + newpos >= 0) { + if (newpos > newinfo->size - + sizeof(struct ip6t_entry)) { + duprintf("mark_source_chains: " + "bad verdict (%i)\n", + newpos); + return 0; + } + /* This is a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... 
this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ip6t_entry *) + (entry0 + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +/* Call the match's destroy hook, if it has one, then module_put() the module providing the match. */ +static void cleanup_match(struct xt_entry_match *m, struct net *net) +{ + struct xt_mtdtor_param par; + + par.net = net; + par.match = m->u.kernel.match; + par.matchinfo = m->data; + par.family = NFPROTO_IPV6; + if (par.match->destroy != NULL) + par.match->destroy(&par); + module_put(par.match->me); +} + +/* Structural validation of one rule: only known flag bits may be set, and both the target header and the target's declared size must end at or before next_offset. Returns 0 or -EINVAL. */ +static int +check_entry(const struct ip6t_entry *e, const char *name) +{ + const struct xt_entry_target *t; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + if (e->target_offset + sizeof(struct xt_entry_target) > + e->next_offset) + return -EINVAL; + + t = ip6t_get_target_c(e); + if (e->target_offset + t->u.target_size > e->next_offset) + return -EINVAL; + + return 0; +} + +/* Fill in the match-specific fields of *par and run xt_check_match() on one match of a rule. Returns 0 on success, or the negative value returned by xt_check_match(). */ +static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + const struct ip6t_ip6 *ipv6 = par->entryinfo; + int ret; + + par->match = m->u.kernel.match; + par->matchinfo = m->data; + + ret = xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); + if (ret < 0) { + /* duprintf() expands to nothing unless DEBUG_IP_FIREWALL_USER is defined, but its arguments must still compile when it is: par is a pointer. */ + duprintf("ip_tables: check failed for `%s'.\n", + par->match->name); + return ret; + } + return 0; +} + +/* Look up the match named in m->u.user, store it in m->u.kernel.match and validate it with check_match(); on validation failure the match's module is module_put() again. */ +static int +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) +{ + struct xt_match *match; + int ret; + + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("find_check_match: `%s' not found\n", m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + + ret = check_match(m, par); + if (ret) + goto err; + + return 0; +err: + module_put(m->u.kernel.match->me); + return ret; +} + +static int check_target(struct ip6t_entry *e, struct net *net, const char *name) +{ + 
struct xt_entry_target *t = ip6t_get_target(e); + struct xt_tgchk_param par = { + .net = net, + .table = name, + .entryinfo = e, + .target = t->u.kernel.target, + .targinfo = t->data, + .hook_mask = e->comefrom, + .family = NFPROTO_IPV6, + }; + int ret; + + t = ip6t_get_target(e); + ret = xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); + if (ret < 0) { + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + return ret; + } + return 0; +} + +static int +find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, + unsigned int size) +{ + struct xt_entry_target *t; + struct xt_target *target; + int ret; + unsigned int j; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; + + ret = check_entry(e, name); + if (ret) + return ret; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } + + t = ip6t_get_target(e); + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("find_check_entry: `%s' not found\n", t->u.user.name); + ret = PTR_ERR(target); + goto cleanup_matches; + } + t->u.kernel.target = target; + + ret = check_target(e, net, name); + if (ret) + goto err; + return 0; + err: + module_put(t->u.kernel.target->me); + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static bool check_underflow(const struct ip6t_entry *e) +{ + const struct xt_entry_target *t; + unsigned int verdict; + + if (!unconditional(&e->ipv6)) + return false; + t = ip6t_get_target_c(e); + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) + return false; + verdict = ((struct xt_standard_target *)t)->verdict; + verdict = -verdict - 
1; + return verdict == NF_DROP || verdict == NF_ACCEPT; +} + +/* Validate one entry of the table blob and record hook offsets. + Rejects (-EINVAL) an entry that is misaligned, whose fixed header does + not end before limit, whose next_offset would run past limit, or whose + next_offset is smaller than an entry header plus a target header. + For each hook set in valid_hooks, copies hook_entries[h] / underflows[h] + into newinfo when they equal this entry's offset from base; an underflow + entry must additionally pass check_underflow(). Finally zeroes the + entry's counters and comefrom. Returns 0 on success. */ +static int +check_entry_size_and_hooks(struct ip6t_entry *e, + struct xt_table_info *newinfo, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int valid_hooks) +{ + unsigned int h; + + /* The header bound is tested first, so e->next_offset is only read + once the fixed header is known to lie inside the blob. */ + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || + (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if (!(valid_hooks & (1 << h))) + continue; + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) { + if (!check_underflow(e)) { + pr_err("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); + return -EINVAL; + } + newinfo->underflow[h] = underflows[h]; + } + } + + /* Clear counters and comefrom */ + e->counters = ((struct xt_counters) { 0, 0 }); + e->comefrom = 0; + return 0; +} + +static void cleanup_entry(struct ip6t_entry *e, struct net *net) +{ + struct xt_tgdtor_param par; + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + /* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); + t = ip6t_get_target(e); + + par.net = net; + par.target = t->u.kernel.target; + par.targinfo = t->data; + par.family = NFPROTO_IPV6; + if (par.target->destroy != NULL) + par.target->destroy(&par); + module_put(par.target->me); +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ip6t_replace *repl) +{ + struct 
ip6t_entry *iter; + unsigned int i; + int ret = 0; + + newinfo->size = repl->size; + newinfo->number = repl->num_entries; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + + if (i != repl->num_entries) { + duprintf("translate_table: %u not %u entries\n", + i, repl->num_entries); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(repl->valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, repl->hook_entry[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, repl->underflow[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } + + if (ret != 0) { + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) { + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); + } + + return ret; +} + +static void +get_counters(const struct 
xt_table_info *t, + struct xt_counters counters[]) +{ + struct ip6t_entry *iter; + unsigned int cpu; + unsigned int i; + + for_each_possible_cpu(cpu) { + seqcount_t *s = &per_cpu(xt_recseq, cpu); + + i = 0; + xt_entry_foreach(iter, t->entries[cpu], t->size) { + u64 bcnt, pcnt; + unsigned int start; + + do { + start = read_seqcount_begin(s); + bcnt = iter->counters.bcnt; + pcnt = iter->counters.pcnt; + } while (read_seqcount_retry(s, start)); + + ADD_COUNTER(counters[i], bcnt, pcnt); + ++i; + } + } +} + +static struct xt_counters *alloc_counters(const struct xt_table *table) +{ + unsigned int countersize; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct xt_counters) * private->number; + counters = vzalloc(countersize); + + if (counters == NULL) + return ERR_PTR(-ENOMEM); + + get_counters(private, counters); + + return counters; +} + +static int +copy_entries_to_user(unsigned int total_size, + const struct xt_table *table, + void __user *userptr) +{ + unsigned int off, num; + const struct ip6t_entry *e; + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + int ret = 0; + const void *loc_cpu_entry; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + const struct xt_entry_match *m; + const struct xt_entry_target *t; + + e = (struct ip6t_entry *)(loc_cpu_entry + off); + if (copy_to_user(userptr + off + + offsetof(struct ip6t_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ip6t_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct xt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ip6t_get_target_c(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct xt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +#ifdef CONFIG_COMPAT +static void compat_standard_from_user(void *dst, const void *src) +{ + int v = *(compat_int_t *)src; + + if (v > 0) + v += xt_compat_calc_jump(AF_INET6, v); + memcpy(dst, &v, sizeof(v)); +} + +static int compat_standard_to_user(void __user *dst, const void *src) +{ + compat_int_t cv = *(int *)src; + + if (cv > 0) + cv -= xt_compat_calc_jump(AF_INET6, cv); + return copy_to_user(dst, &cv, sizeof(cv)) ? 
-EFAULT : 0; +} + +static int compat_calc_entry(const struct ip6t_entry *e, + const struct xt_table_info *info, + const void *base, struct xt_table_info *newinfo) +{ + const struct xt_entry_match *ematch; + const struct xt_entry_target *t; + unsigned int entry_offset; + int off, i, ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - base; + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ip6t_get_target_c(e); + off += xt_compat_target_offset(t->u.kernel.target); + newinfo->size -= off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + return ret; + + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + if (info->hook_entry[i] && + (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) + newinfo->hook_entry[i] -= off; + if (info->underflow[i] && + (e < (struct ip6t_entry *)(base + info->underflow[i]))) + newinfo->underflow[i] -= off; + } + return 0; +} + +static int compat_table_info(const struct xt_table_info *info, + struct xt_table_info *newinfo) +{ + struct ip6t_entry *iter; + void *loc_cpu_entry; + int ret; + + if (!newinfo || !info) + return -EINVAL; + + /* we dont care about newinfo->entries[] */ + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + newinfo->initial_entries = 0; + loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET6, info->number); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; +} +#endif + +static int get_info(struct net *net, void __user *user, + const int *len, int compat) +{ + char name[XT_TABLE_MAXNAMELEN]; + struct xt_table *t; + int ret; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %zu\n", *len, + sizeof(struct ip6t_getinfo)); + return -EINVAL; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) + return -EFAULT; + + 
name[XT_TABLE_MAXNAMELEN-1] = '\0'; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_lock(AF_INET6); +#endif + t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), + "ip6table_%s", name); + if (t && !IS_ERR(t)) { + struct ip6t_getinfo info; + const struct xt_table_info *private = t->private; +#ifdef CONFIG_COMPAT + struct xt_table_info tmp; + + if (compat) { + ret = compat_table_info(private, &tmp); + xt_compat_flush_offsets(AF_INET6); + private = &tmp; + } +#endif + memset(&info, 0, sizeof(info)); + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, private->underflow, + sizeof(info.underflow)); + info.num_entries = private->number; + info.size = private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + xt_table_unlock(t); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; +#ifdef CONFIG_COMPAT + if (compat) + xt_compat_unlock(AF_INET6); +#endif + return ret; +} + +static int +get_entries(struct net *net, struct ip6t_get_entries __user *uptr, + const int *len) +{ + int ret; + struct ip6t_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + t = xt_find_table_lock(net, AF_INET6, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", private->number); + if (get.size == private->size) + ret = copy_entries_to_user(private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + module_put(t->me); + xt_table_unlock(t); + } else 
+ ret = t ? PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +__do_replace(struct net *net, const char *name, unsigned int valid_hooks, + struct xt_table_info *newinfo, unsigned int num_counters, + void __user *counters_ptr) +{ + int ret; + struct xt_table *t; + struct xt_table_info *oldinfo; + struct xt_counters *counters; + const void *loc_cpu_old_entry; + struct ip6t_entry *iter; + + ret = 0; + counters = vzalloc(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; + goto out; + } + + t = try_then_request_module(xt_find_table_lock(net, AF_INET6, name), + "ip6table_%s", name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! */ + if (valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters, and synchronize with replace */ + get_counters(oldinfo, counters); + + /* Decrease module usage counts and free resource */ + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + + xt_free_table_info(oldinfo); + if (copy_to_user(counters_ptr, counters, + sizeof(struct xt_counters) * num_counters) != 0) + ret = -EFAULT; + vfree(counters); + xt_table_unlock(t); + return ret; + + put_module: + module_put(t->me); + xt_table_unlock(t); + 
free_newinfo_counters_untrans: + vfree(counters); + out: + return ret; +} + +static int +do_replace(struct net *net, const void __user *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ip6t_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); + if (ret != 0) + goto free_newinfo; + + duprintf("ip_tables: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, tmp.counters); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +do_add_counters(struct net *net, const void __user *user, unsigned int len, + int compat) +{ + unsigned int i, curcpu; + struct xt_counters_info tmp; + struct xt_counters *paddc; + unsigned int num_counters; + char *name; + int size; + void *ptmp; + struct xt_table *t; + const struct xt_table_info *private; + int ret = 0; + const void *loc_cpu_entry; + struct ip6t_entry *iter; + unsigned int addend; +#ifdef CONFIG_COMPAT + struct compat_xt_counters_info compat_tmp; + + if (compat) { + ptmp = &compat_tmp; + size = sizeof(struct compat_xt_counters_info); + } else +#endif + { + ptmp = &tmp; + size = sizeof(struct xt_counters_info); + } + + if (copy_from_user(ptmp, user, size) != 0) + return -EFAULT; + 
+#ifdef CONFIG_COMPAT + if (compat) { + num_counters = compat_tmp.num_counters; + name = compat_tmp.name; + } else +#endif + { + num_counters = tmp.num_counters; + name = tmp.name; + } + + if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + + paddc = vmalloc(len - size); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user + size, len - size) != 0) { + ret = -EFAULT; + goto free; + } + + t = xt_find_table_lock(net, AF_INET6, name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + + local_bh_disable(); + private = t->private; + if (private->number != num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + /* Choose the copy that is on our node */ + curcpu = smp_processor_id(); + addend = xt_write_recseq_begin(); + loc_cpu_entry = private->entries[curcpu]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } + xt_write_recseq_end(addend); + + unlock_up_free: + local_bh_enable(); + xt_table_unlock(t); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +#ifdef CONFIG_COMPAT +struct compat_ip6t_replace { + char name[XT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_INET_NUMHOOKS]; + u32 underflow[NF_INET_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; /* struct xt_counters * */ + struct compat_ip6t_entry entries[0]; +}; + +static int +compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, + unsigned int *size, struct xt_counters *counters, + unsigned int i) +{ + struct xt_entry_target *t; + struct compat_ip6t_entry __user *ce; + u_int16_t target_offset, next_offset; + compat_uint_t origsize; + const struct xt_entry_match *ematch; + int ret = 0; + + origsize = *size; + ce = (struct compat_ip6t_entry __user *)*dstptr; + if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + 
sizeof(counters[i])) != 0) + return -EFAULT; + + *dstptr += sizeof(struct compat_ip6t_entry); + *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + target_offset = e->target_offset - (origsize - *size); + t = ip6t_get_target(e); + ret = xt_compat_target_to_user(t, dstptr, size); + if (ret) + return ret; + next_offset = e->next_offset - (origsize - *size); + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; + return 0; +} + +static int +compat_find_calc_match(struct xt_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + int *size) +{ + struct xt_match *match; + + match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, + m->u.user.revision); + if (IS_ERR(match)) { + duprintf("compat_check_calc_match: `%s' not found\n", + m->u.user.name); + return PTR_ERR(match); + } + m->u.kernel.match = match; + *size += xt_compat_match_offset(match); + return 0; +} + +static void compat_release_entry(struct compat_ip6t_entry *e) +{ + struct xt_entry_target *t; + struct xt_entry_match *ematch; + + /* Cleanup all matches */ + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); + t = compat_ip6t_get_target(e); + module_put(t->u.kernel.target->me); +} + +static int +check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, + struct xt_table_info *newinfo, + unsigned int *size, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + const char *name) +{ + struct xt_entry_match *ematch; + struct xt_entry_target *t; + struct xt_target *target; + unsigned int entry_offset; + unsigned int j; + int ret, off, h; + + duprintf("check_compat_entry_size_and_hooks %p\n", e); + if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 
|| + (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) { + duprintf("Bad offset %p, limit = %p\n", e, limit); + return -EINVAL; + } + + if (e->next_offset < sizeof(struct compat_ip6t_entry) + + sizeof(struct compat_xt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* For purposes of check_entry casting the compat entry is fine */ + ret = check_entry((struct ip6t_entry *)e, name); + if (ret) + return ret; + + off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + entry_offset = (void *)e - (void *)base; + j = 0; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ipv6, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } + + t = compat_ip6t_get_target(e); + target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, + t->u.user.revision); + if (IS_ERR(target)) { + duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", + t->u.user.name); + ret = PTR_ERR(target); + goto release_matches; + } + t->u.kernel.target = target; + + off += xt_compat_target_offset(target); + *size += off; + ret = xt_compat_add_offset(AF_INET6, entry_offset, off); + if (ret) + goto out; + + /* Check hooks & underflows */ + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* Clear counters and comefrom */ + memset(&e->counters, 0, sizeof(e->counters)); + e->comefrom = 0; + return 0; + +out: + module_put(t->u.kernel.target->me); +release_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } + return ret; +} + +static int +compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, + unsigned int *size, const char *name, + struct xt_table_info *newinfo, unsigned char *base) +{ + struct 
xt_entry_target *t; + struct ip6t_entry *de; + unsigned int origsize; + int ret, h; + struct xt_entry_match *ematch; + + ret = 0; + origsize = *size; + de = (struct ip6t_entry *)*dstptr; + memcpy(de, e, sizeof(struct ip6t_entry)); + memcpy(&de->counters, &e->counters, sizeof(e->counters)); + + *dstptr += sizeof(struct ip6t_entry); + *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); + + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } + de->target_offset = e->target_offset - (origsize - *size); + t = compat_ip6t_get_target(e); + xt_compat_target_from_user(t, dstptr, size); + + de->next_offset = e->next_offset - (origsize - *size); + for (h = 0; h < NF_INET_NUMHOOKS; h++) { + if ((unsigned char *)de - base < newinfo->hook_entry[h]) + newinfo->hook_entry[h] -= origsize - *size; + if ((unsigned char *)de - base < newinfo->underflow[h]) + newinfo->underflow[h] -= origsize - *size; + } + return ret; +} + +static int compat_check_entry(struct ip6t_entry *e, struct net *net, + const char *name) +{ + unsigned int j; + int ret = 0; + struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; + + j = 0; + mtpar.net = net; + mtpar.table = name; + mtpar.entryinfo = &e->ipv6; + mtpar.hook_mask = e->comefrom; + mtpar.family = NFPROTO_IPV6; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } + + ret = check_target(e, net, name); + if (ret) + goto cleanup_matches; + return 0; + + cleanup_matches: + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } + return ret; +} + +static int +translate_compat_table(struct net *net, + const char *name, + unsigned int valid_hooks, + struct xt_table_info **pinfo, + void **pentry0, + unsigned int total_size, + unsigned int number, + unsigned int *hook_entries, + unsigned int *underflows) +{ + unsigned int i, j; + struct xt_table_info *newinfo, 
*info; + void *pos, *entry0, *entry1; + struct compat_ip6t_entry *iter0; + struct ip6t_entry *iter1; + unsigned int size; + int ret = 0; + + info = *pinfo; + entry0 = *pentry0; + size = total_size; + info->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + info->hook_entry[i] = 0xFFFFFFFF; + info->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_compat_table: size %u\n", info->size); + j = 0; + xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); + /* Walk through entries, checking offsets. */ + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } + + ret = -EINVAL; + if (j != number) { + duprintf("translate_compat_table: %u not %u entries\n", + j, number); + goto out_unlock; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (info->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + goto out_unlock; + } + if (info->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + goto out_unlock; + } + } + + ret = -ENOMEM; + newinfo = xt_alloc_table_info(size); + if (!newinfo) + goto out_unlock; + + newinfo->number = number; + for (i = 0; i < NF_INET_NUMHOOKS; i++) { + newinfo->hook_entry[i] = info->hook_entry[i]; + newinfo->underflow[i] = info->underflow[i]; + } + entry1 = newinfo->entries[raw_smp_processor_id()]; + pos = entry1; + size = total_size; + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + if (ret) + goto free_newinfo; + + ret = -ELOOP; + if 
(!mark_source_chains(newinfo, valid_hooks, entry1)) + goto free_newinfo; + + i = 0; + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; + } + if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; + j -= i; + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } + xt_free_table_info(newinfo); + return ret; + } + + /* And one copy for every other CPU */ + for_each_possible_cpu(i) + if (newinfo->entries[i] && newinfo->entries[i] != entry1) + memcpy(newinfo->entries[i], entry1, newinfo->size); + + *pinfo = newinfo; + *pentry0 = entry1; + xt_free_table_info(info); + return 0; + +free_newinfo: + xt_free_table_info(newinfo); +out: + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } + return ret; +out_unlock: + xt_compat_flush_offsets(AF_INET6); + xt_compat_unlock(AF_INET6); + goto out; +} + +static int +compat_do_replace(struct net *net, void __user *user, unsigned int len) +{ + int ret; + struct compat_ip6t_replace tmp; + struct xt_table_info *newinfo; + void *loc_cpu_entry; + struct ip6t_entry *iter; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* overflow check */ + if (tmp.size >= INT_MAX / num_possible_cpus()) + return -ENOMEM; + if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) + return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; + + newinfo = xt_alloc_table_info(tmp.size); + if (!newinfo) + return -ENOMEM; + + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = 
newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, + &newinfo, &loc_cpu_entry, tmp.size, + tmp.num_entries, tmp.hook_entry, + tmp.underflow); + if (ret != 0) + goto free_newinfo; + + duprintf("compat_do_replace: Translated table\n"); + + ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, + tmp.num_counters, compat_ptr(tmp.counters)); + if (ret) + goto free_newinfo_untrans; + return 0; + + free_newinfo_untrans: + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +} + +static int +compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = compat_do_replace(sock_net(sk), user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 1); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +struct compat_ip6t_get_entries { + char name[XT_TABLE_MAXNAMELEN]; + compat_uint_t size; + struct compat_ip6t_entry entrytable[0]; +}; + +static int +compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, + void __user *userptr) +{ + struct xt_counters *counters; + const struct xt_table_info *private = table->private; + void __user *pos; + unsigned int size; + int ret = 0; + const void *loc_cpu_entry; + unsigned int i = 0; + struct ip6t_entry *iter; + + counters = alloc_counters(table); + if (IS_ERR(counters)) + return PTR_ERR(counters); + + /* choose the copy that is on our node/cpu, ... 
+ * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + pos = userptr; + size = total_size; + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } + + vfree(counters); + return ret; +} + +static int +compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, + int *len) +{ + int ret; + struct compat_ip6t_get_entries get; + struct xt_table *t; + + if (*len < sizeof(get)) { + duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + return -EINVAL; + } + + if (copy_from_user(&get, uptr, sizeof(get)) != 0) + return -EFAULT; + + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { + duprintf("compat_get_entries: %u != %zu\n", + *len, sizeof(get) + get.size); + return -EINVAL; + } + + xt_compat_lock(AF_INET6); + t = xt_find_table_lock(net, AF_INET6, get.name); + if (t && !IS_ERR(t)) { + const struct xt_table_info *private = t->private; + struct xt_table_info info; + duprintf("t->private->number = %u\n", private->number); + ret = compat_table_info(private, &info); + if (!ret && get.size == info.size) { + ret = compat_copy_entries_to_user(private->size, + t, uptr->entrytable); + } else if (!ret) { + duprintf("compat_get_entries: I've got %u not %u!\n", + private->size, get.size); + ret = -EAGAIN; + } + xt_compat_flush_offsets(AF_INET6); + module_put(t->me); + xt_table_unlock(t); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; + + xt_compat_unlock(AF_INET6); + return ret; +} + +static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); + +static int +compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 1); + break; + case IP6T_SO_GET_ENTRIES: + ret = compat_get_entries(sock_net(sk), user, len); + break; + default: + ret = do_ip6t_get_ctl(sk, cmd, user, len); + } + return ret; +} +#endif + +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(sock_net(sk), user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(sock_net(sk), user, len, 0); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: + ret = get_info(sock_net(sk), user, len, 0); + break; + + case IP6T_SO_GET_ENTRIES: + ret = get_entries(sock_net(sk), user, len); + break; + + case IP6T_SO_GET_REVISION_MATCH: + case IP6T_SO_GET_REVISION_TARGET: { + struct xt_get_revision rev; + int target; + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + rev.name[sizeof(rev.name)-1] = 0; + + if (cmd == IP6T_SO_GET_REVISION_TARGET) + target = 1; + else + target = 0; + + try_then_request_module(xt_find_revision(AF_INET6, rev.name, + rev.revision, + target, &ret), + "ip6t_%s", rev.name); + break; + } + + default: + duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; 
+} + +struct xt_table *ip6t_register_table(struct net *net, + const struct xt_table *table, + const struct ip6t_replace *repl) +{ + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap = {0}; + void *loc_cpu_entry; + struct xt_table *new_table; + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) { + ret = -ENOMEM; + goto out; + } + + /* choose the copy on our node/cpu, but dont care about preemption */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); + + ret = translate_table(net, newinfo, loc_cpu_entry, repl); + if (ret != 0) + goto out_free; + + new_table = xt_register_table(net, table, &bootstrap, newinfo); + if (IS_ERR(new_table)) { + ret = PTR_ERR(new_table); + goto out_free; + } + return new_table; + +out_free: + xt_free_table_info(newinfo); +out: + return ERR_PTR(ret); +} + +void ip6t_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ip6t_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline bool +icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + bool invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static bool +icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct icmp6hdr *ic; + struct icmp6hdr _icmph; + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must not be a fragment. 
*/ + if (par->fragoff != 0) + return false; + + ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + par->hotdrop = true; + return false; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags&IP6T_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. */ +static int icmp6_checkentry(const struct xt_mtchk_param *par) +{ + const struct ip6t_icmp *icmpinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; +} + +/* The built-in targets: standard (NULL) and error. */ +static struct xt_target ip6t_builtin_tg[] __read_mostly = { + { + .name = XT_STANDARD_TARGET, + .targetsize = sizeof(int), + .family = NFPROTO_IPV6, +#ifdef CONFIG_COMPAT + .compatsize = sizeof(compat_int_t), + .compat_from_user = compat_standard_from_user, + .compat_to_user = compat_standard_to_user, +#endif + }, + { + .name = XT_ERROR_TARGET, + .target = ip6t_error, + .targetsize = XT_FUNCTION_MAXNAMELEN, + .family = NFPROTO_IPV6, + }, +}; + +static struct nf_sockopt_ops ip6t_sockopts = { + .pf = PF_INET6, + .set_optmin = IP6T_BASE_CTL, + .set_optmax = IP6T_SO_SET_MAX+1, + .set = do_ip6t_set_ctl, +#ifdef CONFIG_COMPAT + .compat_set = compat_do_ip6t_set_ctl, +#endif + .get_optmin = IP6T_BASE_CTL, + .get_optmax = IP6T_SO_GET_MAX+1, + .get = do_ip6t_get_ctl, +#ifdef CONFIG_COMPAT + .compat_get = compat_do_ip6t_get_ctl, +#endif + .owner = THIS_MODULE, +}; + +static struct xt_match ip6t_builtin_mt[] __read_mostly = { + { + .name = "icmp6", + .match = icmp6_match, + .matchsize = sizeof(struct ip6t_icmp), + .checkentry = icmp6_checkentry, + .proto = IPPROTO_ICMPV6, + .family = NFPROTO_IPV6, + }, +}; + +static int __net_init 
ip6_tables_net_init(struct net *net) +{ + return xt_proto_init(net, NFPROTO_IPV6); +} + +static void __net_exit ip6_tables_net_exit(struct net *net) +{ + xt_proto_fini(net, NFPROTO_IPV6); +} + +static struct pernet_operations ip6_tables_net_ops = { + .init = ip6_tables_net_init, + .exit = ip6_tables_net_exit, +}; + +static int __init ip6_tables_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6_tables_net_ops); + if (ret < 0) + goto err1; + + /* No one else will be downing sem now, so we won't sleep */ + ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); + if (ret < 0) + goto err2; + ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + if (ret < 0) + goto err4; + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) + goto err5; + + pr_info("(C) 2000-2006 Netfilter Core Team\n"); + return 0; + +err5: + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); +err4: + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); +err2: + unregister_pernet_subsys(&ip6_tables_net_ops); +err1: + return ret; +} + +static void __exit ip6_tables_fini(void) +{ + nf_unregister_sockopt(&ip6t_sockopts); + + xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); + xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); + unregister_pernet_subsys(&ip6_tables_net_ops); +} + +/* + * find the offset to specified header or the protocol number of last header + * if target < 0. "last header" is transport protocol header, ESP, or + * "No next header". + * + * If target header is found, its offset is set in *offset and return protocol + * number. Otherwise, return -ENOENT or -EBADMSG. + * + * If the first fragment doesn't contain the final protocol header or + * NEXTHDR_NONE it is considered invalid. + * + * Note that non-1st fragment is special case that "the protocol number + * of last header" is "next header" field in Fragment header. 
In this case, + * *offset is meaningless. If fragoff is not NULL, the fragment offset is + * stored in *fragoff; if it is NULL, return -EINVAL. + */ +int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, + int target, unsigned short *fragoff) +{ + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + unsigned int len = skb->len - start; + + if (fragoff) + *fragoff = 0; + + while (nexthdr != target) { + struct ipv6_opt_hdr _hdr, *hp; + unsigned int hdrlen; + + if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { + if (target < 0) + break; + return -ENOENT; + } + + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + return -EBADMSG; + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off; + __be16 *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -EBADMSG; + + _frag_off = ntohs(*fp) & ~0x7; + if (_frag_off) { + if (target < 0 && + ((!ipv6_ext_hdr(hp->nexthdr)) || + hp->nexthdr == NEXTHDR_NONE)) { + if (fragoff) { + *fragoff = _frag_off; + return hp->nexthdr; + } else { + return -EINVAL; + } + } + return -ENOENT; + } + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen + 2) << 2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *offset = start; + return nexthdr; +} + +EXPORT_SYMBOL(ip6t_register_table); +EXPORT_SYMBOL(ip6t_unregister_table); +EXPORT_SYMBOL(ip6t_do_table); +EXPORT_SYMBOL(ipv6_find_hdr); + +module_init(ip6_tables_init); +module_exit(ip6_tables_fini); diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c new file mode 100644 index 00000000..09155e34 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -0,0 +1,272 @@ +/* + * IP6 tables REJECT target module + * Linux INET6 implementation + * + * Copyright (C)2003 USAGI/WIDE Project + * + * 
Authors: + * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> + * + * Based on net/ipv4/netfilter/ipt_REJECT.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <linux/netdevice.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include <net/icmp.h> +#include <net/ip6_checksum.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/flow.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_REJECT.h> + +MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); +MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); +MODULE_LICENSE("GPL"); + +/* Send RST reply */ +static void send_reset(struct net *net, struct sk_buff *oldskb) +{ + struct sk_buff *nskb; + struct tcphdr otcph, *tcph; + unsigned int otcplen, hh_len; + int tcphoff, needs_ack; + const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); + struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; + struct dst_entry *dst = NULL; + u8 proto; + __be16 frag_off; + struct flowi6 fl6; + + if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || + (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { + pr_debug("addr is not unicast.\n"); + return; + } + + proto = oip6h->nexthdr; + tcphoff = ipv6_skip_exthdr(oldskb, ((u8*)(oip6h+1) - oldskb->data), &proto, &frag_off); + + if ((tcphoff < 0) || (tcphoff > oldskb->len)) { + pr_debug("Cannot get TCP header.\n"); + return; + } + + otcplen = oldskb->len - tcphoff; + + /* IP header checks: fragment, too short. 
*/ + if (proto != IPPROTO_TCP || otcplen < sizeof(struct tcphdr)) { + pr_debug("proto(%d) != IPPROTO_TCP, " + "or too short. otcplen = %d\n", + proto, otcplen); + return; + } + + if (skb_copy_bits(oldskb, tcphoff, &otcph, sizeof(struct tcphdr))) + BUG(); + + /* No RST for RST. */ + if (otcph.rst) { + pr_debug("RST is set\n"); + return; + } + + /* Check checksum. */ + if (csum_ipv6_magic(&oip6h->saddr, &oip6h->daddr, otcplen, IPPROTO_TCP, + skb_checksum(oldskb, tcphoff, otcplen, 0))) { + pr_debug("TCP checksum is invalid\n"); + return; + } + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.saddr = oip6h->daddr; + fl6.daddr = oip6h->saddr; + fl6.fl6_sport = otcph.dest; + fl6.fl6_dport = otcph.source; + security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst == NULL || dst->error) { + dst_release(dst); + return; + } + dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); + if (IS_ERR(dst)) + return; + + hh_len = (dst->dev->hard_header_len + 15)&~15; + nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr) + + sizeof(struct tcphdr) + dst->trailer_len, + GFP_ATOMIC); + + if (!nskb) { + if (net_ratelimit()) + pr_debug("cannot alloc skb\n"); + dst_release(dst); + return; + } + + skb_dst_set(nskb, dst); + + skb_reserve(nskb, hh_len + dst->header_len); + + skb_put(nskb, sizeof(struct ipv6hdr)); + skb_reset_network_header(nskb); + ip6h = ipv6_hdr(nskb); + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); + ip6h->hop_limit = ip6_dst_hoplimit(dst); + ip6h->nexthdr = IPPROTO_TCP; + ip6h->saddr = oip6h->daddr; + ip6h->daddr = oip6h->saddr; + + tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + tcph->source = otcph.dest; + tcph->dest = otcph.source; + + if (otcph.ack) { + needs_ack = 0; + tcph->seq = otcph.ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = 
htonl(ntohl(otcph.seq) + otcph.syn + otcph.fin + + otcplen - (otcph.doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + tcph->window = 0; + tcph->urg_ptr = 0; + tcph->check = 0; + + /* Adjust TCP checksum */ + tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, + &ipv6_hdr(nskb)->daddr, + sizeof(struct tcphdr), IPPROTO_TCP, + csum_partial(tcph, + sizeof(struct tcphdr), 0)); + + nf_ct_attach(nskb, oldskb); + + ip6_local_out(nskb); +} + +static inline void +send_unreach(struct net *net, struct sk_buff *skb_in, unsigned char code, + unsigned int hooknum) +{ + if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) + skb_in->dev = net->loopback_dev; + + icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); +#ifdef CONFIG_IP6_NF_TARGET_REJECT_SKERR + if (skb_in->sk) { + icmpv6_err_convert(ICMPV6_DEST_UNREACH, code, + &skb_in->sk->sk_err); + skb_in->sk->sk_error_report(skb_in->sk); + pr_debug("ip6t_REJECT: sk_err=%d for skb=%p sk=%p\n", + skb_in->sk->sk_err, skb_in, skb_in->sk); + } +#endif +} + +static unsigned int +reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ip6t_reject_info *reject = par->targinfo; + struct net *net = dev_net((par->in != NULL) ? 
par->in : par->out); + + pr_debug("%s: medium point\n", __func__); + switch (reject->with) { + case IP6T_ICMP6_NO_ROUTE: + send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum); + break; + case IP6T_ICMP6_ADM_PROHIBITED: + send_unreach(net, skb, ICMPV6_ADM_PROHIBITED, par->hooknum); + break; + case IP6T_ICMP6_NOT_NEIGHBOUR: + send_unreach(net, skb, ICMPV6_NOT_NEIGHBOUR, par->hooknum); + break; + case IP6T_ICMP6_ADDR_UNREACH: + send_unreach(net, skb, ICMPV6_ADDR_UNREACH, par->hooknum); + break; + case IP6T_ICMP6_PORT_UNREACH: + send_unreach(net, skb, ICMPV6_PORT_UNREACH, par->hooknum); + break; + case IP6T_ICMP6_ECHOREPLY: + /* Do nothing */ + break; + case IP6T_TCP_RESET: + send_reset(net, skb); + break; + default: + if (net_ratelimit()) + pr_info("case %u not handled yet\n", reject->with); + break; + } + + return NF_DROP; +} + +static int reject_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_reject_info *rejinfo = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; + + if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { + pr_info("ECHOREPLY is not supported.\n"); + return -EINVAL; + } else if (rejinfo->with == IP6T_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ipv6.proto != IPPROTO_TCP || + (e->ipv6.invflags & XT_INV_PROTO)) { + pr_info("TCP_RESET illegal for non-tcp\n"); + return -EINVAL; + } + } + return 0; +} + +static struct xt_target reject_tg6_reg __read_mostly = { + .name = "REJECT", + .family = NFPROTO_IPV6, + .target = reject_tg6, + .targetsize = sizeof(struct ip6t_reject_info), + .table = "filter", + .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT), + .checkentry = reject_tg6_check, + .me = THIS_MODULE +}; + +static int __init reject_tg6_init(void) +{ + return xt_register_target(&reject_tg6_reg); +} + +static void __exit reject_tg6_exit(void) +{ + xt_unregister_target(&reject_tg6_reg); +} + +module_init(reject_tg6_init); +module_exit(reject_tg6_exit); diff --git 
a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c new file mode 100644 index 00000000..89cccc5a --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -0,0 +1,121 @@ +/* Kernel module to match AH parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_ah.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 IPsec-AH match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline bool +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) +{ + bool r; + + pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, spi, max); + r = (spi >= min && spi <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip_auth_hdr _ah; + const struct ip_auth_hdr *ah; + const struct ip6t_ah *ahinfo = par->matchinfo; + unsigned int ptr; + unsigned int hdrlen = 0; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_AUTH, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); + if (ah == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = (ah->hdrlen + 2) << 2; + + pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen); + pr_debug("RES %04X ", ah->reserved); + pr_debug("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi)); + + pr_debug("IPv6 AH spi %02X ", + spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI))); + pr_debug("len %02X %04X %02X ", + ahinfo->hdrlen, hdrlen, + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN))); + pr_debug("res %02X %04X %02X\n", + ahinfo->hdrres, ah->reserved, + !(ahinfo->hdrres && ah->reserved)); + + return (ah != NULL) && + spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI)) && + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN)) && + !(ahinfo->hdrres && ah->reserved); +} + +static int ah_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ah *ahinfo = par->matchinfo; + + if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { + pr_debug("unknown flags %X\n", ahinfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match ah_mt6_reg __read_mostly = { + .name = "ah", + .family = NFPROTO_IPV6, + .match = ah_mt6, + .matchsize = sizeof(struct ip6t_ah), + .checkentry = ah_mt6_check, + .me = THIS_MODULE, +}; + +static int __init ah_mt6_init(void) +{ + return xt_register_match(&ah_mt6_reg); +} + +static void __exit ah_mt6_exit(void) +{ 
+ xt_unregister_match(&ah_mt6_reg); +} + +module_init(ah_mt6_init); +module_exit(ah_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c new file mode 100644 index 00000000..aab07069 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -0,0 +1,74 @@ +/* Kernel module to match EUI64 address parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/if_ether.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_DESCRIPTION("Xtables: IPv6 EUI64 address match"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +static bool +eui64_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + unsigned char eui64[8]; + + if (!(skb_mac_header(skb) >= skb->head && + skb_mac_header(skb) + ETH_HLEN <= skb->data) && + par->fragoff != 0) { + par->hotdrop = true; + return false; + } + + memset(eui64, 0, sizeof(eui64)); + + if (eth_hdr(skb)->h_proto == htons(ETH_P_IPV6)) { + if (ipv6_hdr(skb)->version == 0x6) { + memcpy(eui64, eth_hdr(skb)->h_source, 3); + memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3); + eui64[3] = 0xff; + eui64[4] = 0xfe; + eui64[0] ^= 0x02; + + if (!memcmp(ipv6_hdr(skb)->saddr.s6_addr + 8, eui64, + sizeof(eui64))) + return true; + } + } + + return false; +} + +static struct xt_match eui64_mt6_reg __read_mostly = { + .name = "eui64", + .family = NFPROTO_IPV6, + .match = eui64_mt6, + .matchsize = sizeof(int), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, +}; + +static int __init eui64_mt6_init(void) +{ + return xt_register_match(&eui64_mt6_reg); +} + +static void __exit 
eui64_mt6_exit(void) +{ + xt_unregister_match(&eui64_mt6_reg); +} + +module_init(eui64_mt6_init); +module_exit(eui64_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c new file mode 100644 index 00000000..eda898fd --- /dev/null +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -0,0 +1,136 @@ +/* Kernel module to match FRAG parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_frag.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 fragment match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline bool +id_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) +{ + bool r; + pr_debug("id_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', + min, id, max); + r = (id >= min && id <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool +frag_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct frag_hdr _frag; + const struct frag_hdr *fh; + const struct ip6t_frag *fraginfo = par->matchinfo; + unsigned int ptr; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); + if (fh == NULL) { + par->hotdrop = true; + return false; + } + + pr_debug("INFO %04X ", fh->frag_off); + pr_debug("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); + pr_debug("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); + pr_debug("MF %04X ", fh->frag_off & htons(IP6_MF)); + pr_debug("ID %u %08X\n", ntohl(fh->identification), + ntohl(fh->identification)); + + pr_debug("IPv6 FRAG id %02X ", + id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))); + pr_debug("res %02X %02X%04X %02X ", + fraginfo->flags & IP6T_FRAG_RES, fh->reserved, + ntohs(fh->frag_off) & 0x6, + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); + pr_debug("first %02X %02X %02X ", + fraginfo->flags & IP6T_FRAG_FST, + ntohs(fh->frag_off) & ~0x7, + !((fraginfo->flags & IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7))); + pr_debug("mf %02X %02X %02X ", + fraginfo->flags & IP6T_FRAG_MF, + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_MF) && + !((ntohs(fh->frag_off) & IP6_MF)))); + pr_debug("last %02X %02X %02X\n", + fraginfo->flags & IP6T_FRAG_NMF, + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF))); + + return (fh != NULL) && + id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) && + !((fraginfo->flags & IP6T_FRAG_RES) && + (fh->reserved || (ntohs(fh->frag_off) & 0x6))) && + !((fraginfo->flags & 
IP6T_FRAG_FST) && + (ntohs(fh->frag_off) & ~0x7)) && + !((fraginfo->flags & IP6T_FRAG_MF) && + !(ntohs(fh->frag_off) & IP6_MF)) && + !((fraginfo->flags & IP6T_FRAG_NMF) && + (ntohs(fh->frag_off) & IP6_MF)); +} + +static int frag_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_frag *fraginfo = par->matchinfo; + + if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { + pr_debug("unknown flags %X\n", fraginfo->invflags); + return -EINVAL; + } + return 0; +} + +static struct xt_match frag_mt6_reg __read_mostly = { + .name = "frag", + .family = NFPROTO_IPV6, + .match = frag_mt6, + .matchsize = sizeof(struct ip6t_frag), + .checkentry = frag_mt6_check, + .me = THIS_MODULE, +}; + +static int __init frag_mt6_init(void) +{ + return xt_register_match(&frag_mt6_reg); +} + +static void __exit frag_mt6_exit(void) +{ + xt_unregister_match(&frag_mt6_reg); +} + +module_init(frag_mt6_init); +module_exit(frag_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c new file mode 100644 index 00000000..59df051e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -0,0 +1,215 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_opts.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); +MODULE_ALIAS("ip6t_dst"); + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static struct xt_match hbh_mt6_reg[] __read_mostly; + +static bool +hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6_opt_hdr _optsh; + const struct ipv6_opt_hdr *oh; + const struct ip6t_opts *optinfo = par->matchinfo; + unsigned int temp; + unsigned int ptr; + unsigned int hdrlen = 0; + bool ret = false; + u8 _opttype; + u8 _optlen; + const u_int8_t *tp = NULL; + const u_int8_t *lp = NULL; + unsigned int optlen; + int err; + + err = ipv6_find_hdr(skb, &ptr, + (par->match == &hbh_mt6_reg[0]) ? 
+ NEXTHDR_HOP : NEXTHDR_DEST, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + if (oh == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = ipv6_optlen(oh); + if (skb->len - ptr < hdrlen) { + /* Packet smaller than it's length field */ + return false; + } + + pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + pr_debug("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if (!(optinfo->flags & IP6T_OPTS_OPTS)) { + return ret; + } else { + pr_debug("Strict "); + pr_debug("#%d ", optinfo->optsnr); + for (temp = 0; temp < optinfo->optsnr; temp++) { + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) { + pr_debug("Tbad %02X %02X\n", *tp, + (optinfo->opts[temp] & 0xFF00) >> 8); + return false; + } else { + pr_debug("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? 
*/ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + pr_debug("Lbad %02X %04X\n", *lp, + spec_len); + return false; + } + pr_debug("Lok "); + optlen = *lp + 2; + } else { + pr_debug("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + pr_debug("len%04X\n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + temp < optinfo->optsnr - 1) { + pr_debug("new pointer is too large!\n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else + return false; + } + + return false; +} + +static int hbh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_opts *optsinfo = par->matchinfo; + + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + pr_debug("unknown flags %X\n", optsinfo->invflags); + return -EINVAL; + } + + if (optsinfo->flags & IP6T_OPTS_NSTRICT) { + pr_debug("Not strict - not implemented"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match hbh_mt6_reg[] __read_mostly = { + { + /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ + .name = "hbh", + .family = NFPROTO_IPV6, + .match = hbh_mt6, + .matchsize = sizeof(struct ip6t_opts), + .checkentry = hbh_mt6_check, + .me = THIS_MODULE, + }, + { + .name = "dst", + .family = NFPROTO_IPV6, + .match = hbh_mt6, + .matchsize = sizeof(struct ip6t_opts), + .checkentry = hbh_mt6_check, + .me = THIS_MODULE, + }, +}; + +static int __init hbh_mt6_init(void) +{ + return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); +} + +static void __exit hbh_mt6_exit(void) +{ + xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); +} + +module_init(hbh_mt6_init); +module_exit(hbh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c new file mode 100644 index 00000000..54bd9790 --- /dev/null +++ 
b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -0,0 +1,154 @@ +/* ipv6header match - matches IPv6 packets based + on whether they contain certain headers */ + +/* Original idea: Brad Chapman + * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_ipv6header.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 header types match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +static bool +ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct ip6t_ipv6header_info *info = par->matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + + /* type of the 1st exthdr */ + nexthdr = ipv6_hdr(skb)->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + const struct ipv6_opt_hdr *hp; + struct ipv6_opt_hdr _hdr; + int hdrlen; + + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + temp |= MASK_NONE; + break; + } + /* Is there enough space for the next ext header? 
*/ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return false; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + /* Header could not be read: hot-drop the packet instead of + * crashing the kernel with BUG_ON() on packet-derived state. */ + if (hp == NULL) { + par->hotdrop = true; + return false; + } + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen + 2) << 2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + temp |= MASK_HOPOPTS; + break; + case NEXTHDR_ROUTING: + temp |= MASK_ROUTING; + break; + case NEXTHDR_FRAGMENT: + temp |= MASK_FRAGMENT; + break; + case NEXTHDR_AUTH: + temp |= MASK_AH; + break; + case NEXTHDR_DEST: + temp |= MASK_DSTOPTS; + break; + default: + return false; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) + break; + } + + if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP) + temp |= MASK_PROTO; + + if (info->modeflag) + return !((temp ^ info->matchflags ^ info->invflags) + & info->matchflags); + else { + if (info->invflags) + return temp != info->matchflags; + else + return temp == info->matchflags; + } +} + +static int ipv6header_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_ipv6header_info *info = par->matchinfo; + + /* invflags is 0 or 0xff in hard mode */ + if ((!info->modeflag) && info->invflags != 0x00 && + info->invflags != 0xFF) + return -EINVAL; + + return 0; +} + +static struct xt_match ipv6header_mt6_reg __read_mostly = { + .name = "ipv6header", + .family = NFPROTO_IPV6, + .match = ipv6header_mt6, + .matchsize = sizeof(struct ip6t_ipv6header_info), + .checkentry = ipv6header_mt6_check, + .destroy = NULL, + .me = THIS_MODULE, +}; + +static int __init ipv6header_mt6_init(void) +{ + return xt_register_match(&ipv6header_mt6_reg); +} + +static void __exit ipv6header_mt6_exit(void) +{ + xt_unregister_match(&ipv6header_mt6_reg); +} + +module_init(ipv6header_mt6_init); 
+module_exit(ipv6header_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c new file mode 100644 index 00000000..0c90c66b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mh.c @@ -0,0 +1,94 @@ +/* + * Copyright (C)2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Author: + * Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com> + * + * Based on net/netfilter/xt_tcpudp.c + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/types.h> +#include <linux/module.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/mip6.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6t_mh.h> + +MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); +MODULE_LICENSE("GPL"); + +/* Returns 1 if the type is matched by the range, 0 otherwise */ +static inline bool +type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) +{ + return (type >= min && type <= max) ^ invert; +} + +static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ip6_mh _mh; + const struct ip6_mh *mh; + const struct ip6t_mh *mhinfo = par->matchinfo; + + /* Must not be a fragment. */ + if (par->fragoff != 0) + return false; + + mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); + if (mh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. 
*/ + pr_debug("Dropping evil MH tinygram.\n"); + par->hotdrop = true; + return false; + } + + if (mh->ip6mh_proto != IPPROTO_NONE) { + pr_debug("Dropping invalid MH Payload Proto: %u\n", + mh->ip6mh_proto); + par->hotdrop = true; + return false; + } + + return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type, + !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); +} + +static int mh_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_mh *mhinfo = par->matchinfo; + + /* Must specify no unknown invflags */ + return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; +} + +static struct xt_match mh_mt6_reg __read_mostly = { + .name = "mh", + .family = NFPROTO_IPV6, + .checkentry = mh_mt6_check, + .match = mh_mt6, + .matchsize = sizeof(struct ip6t_mh), + .proto = IPPROTO_MH, + .me = THIS_MODULE, +}; + +static int __init mh_mt6_init(void) +{ + return xt_register_match(&mh_mt6_reg); +} + +static void __exit mh_mt6_exit(void) +{ + xt_unregister_match(&mh_mt6_reg); +} + +module_init(mh_mt6_init); +module_exit(mh_mt6_exit); diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c new file mode 100644 index 00000000..5d1d8b04 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2011 Florian Westphal <fw@strlen.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/route.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +#include <linux/netfilter/xt_rpfilter.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_DESCRIPTION("Xtables: IPv6 reverse path filter match"); + +static bool rpfilter_addr_unicast(const struct in6_addr *addr) +{ + int addr_type = ipv6_addr_type(addr); + return addr_type & IPV6_ADDR_UNICAST; +} + +static bool rpfilter_lookup_reverse6(const struct sk_buff *skb, + const struct net_device *dev, u8 flags) +{ + struct rt6_info *rt; + struct ipv6hdr *iph = ipv6_hdr(skb); + bool ret = false; + struct flowi6 fl6 = { + .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowi6_proto = iph->nexthdr, + .daddr = iph->saddr, + }; + int lookup_flags; + + if (rpfilter_addr_unicast(&iph->daddr)) { + memcpy(&fl6.saddr, &iph->daddr, sizeof(struct in6_addr)); + lookup_flags = RT6_LOOKUP_F_HAS_SADDR; + } else { + lookup_flags = 0; + } + + fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; + if ((flags & XT_RPFILTER_LOOSE) == 0) { + fl6.flowi6_oif = dev->ifindex; + lookup_flags |= RT6_LOOKUP_F_IFACE; + } + + rt = (void *) ip6_route_lookup(dev_net(dev), &fl6, lookup_flags); + if (rt->dst.error) + goto out; + + if (rt->rt6i_flags & (RTF_REJECT|RTF_ANYCAST)) + goto out; + + if (rt->rt6i_flags & RTF_LOCAL) { + ret = flags & XT_RPFILTER_ACCEPT_LOCAL; + goto out; + } + + if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) + ret = true; + out: + dst_release(&rt->dst); + return ret; +} + +static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + int saddrtype; + struct ipv6hdr *iph; + bool invert = info->flags & XT_RPFILTER_INVERT; + + if (par->in->flags & IFF_LOOPBACK) + return true ^ invert; + + iph = ipv6_hdr(skb); + saddrtype = ipv6_addr_type(&iph->saddr); + if (unlikely(saddrtype == IPV6_ADDR_ANY)) + return true ^ invert; /* not routable: forward path will drop it */ + + return rpfilter_lookup_reverse6(skb, par->in, info->flags) ^ invert; +} + +static int rpfilter_check(const struct xt_mtchk_param *par) +{ + const struct xt_rpfilter_info *info = par->matchinfo; + unsigned int options = ~XT_RPFILTER_OPTION_MASK; + + if (info->flags & options) { + pr_info("unknown options encountered"); + return -EINVAL; + } + + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "raw") != 0) { + pr_info("match only valid in the \'raw\' " + "or \'mangle\' tables, not \'%s\'.\n", par->table); + return -EINVAL; + } + + return 0; +} + +static struct xt_match rpfilter_mt_reg __read_mostly = { + .name = "rpfilter", + .family = NFPROTO_IPV6, + .checkentry = rpfilter_check, + .match = rpfilter_mt, + .matchsize = sizeof(struct xt_rpfilter_info), + .hooks = (1 << NF_INET_PRE_ROUTING), + .me = THIS_MODULE +}; + +static int __init rpfilter_mt_init(void) +{ + return xt_register_match(&rpfilter_mt_reg); +} + +static void __exit rpfilter_mt_exit(void) +{ + 
xt_unregister_match(&rpfilter_mt_reg); +} + +module_init(rpfilter_mt_init); +module_exit(rpfilter_mt_exit); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c new file mode 100644 index 00000000..d8488c50 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -0,0 +1,225 @@ +/* Kernel module to match ROUTING parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_rt.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline bool +segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) +{ + bool r; + pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", + invert ? '!' : ' ', min, id, max); + r = (id >= min && id <= max) ^ invert; + pr_debug(" result %s\n", r ? 
"PASS" : "FAILED"); + return r; +} + +static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6_rt_hdr _route; + const struct ipv6_rt_hdr *rh; + const struct ip6t_rt *rtinfo = par->matchinfo; + unsigned int temp; + unsigned int ptr; + unsigned int hdrlen = 0; + bool ret = false; + struct in6_addr _addr; + const struct in6_addr *ap; + int err; + + err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL); + if (err < 0) { + if (err != -ENOENT) + par->hotdrop = true; + return false; + } + + rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); + if (rh == NULL) { + par->hotdrop = true; + return false; + } + + hdrlen = ipv6_optlen(rh); + if (skb->len - ptr < hdrlen) { + /* Pcket smaller than its length field */ + return false; + } + + pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); + pr_debug("TYPE %04X ", rh->type); + pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); + + pr_debug("IPv6 RT segsleft %02X ", + segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))); + pr_debug("type %02X %02X %02X ", + rtinfo->rt_type, rh->type, + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); + pr_debug("len %02X %04X %02X ", + rtinfo->hdrlen, hdrlen, + !(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))); + pr_debug("res %02X %02X %02X ", + rtinfo->flags & IP6T_RT_RES, + ((const struct rt0_hdr *)rh)->reserved, + !((rtinfo->flags & IP6T_RT_RES) && + (((const struct rt0_hdr *)rh)->reserved))); + + ret = (rh != NULL) && + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))) && + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + 
!!(rtinfo->invflags & IP6T_RT_INV_TYP))); + + if (ret && (rtinfo->flags & IP6T_RT_RES)) { + const u_int32_t *rp; + u_int32_t _reserved; + rp = skb_header_pointer(skb, + ptr + offsetof(struct rt0_hdr, + reserved), + sizeof(_reserved), + &_reserved); + + ret = (*rp == 0); + } + + pr_debug("#%d ", rtinfo->addrnr); + if (!(rtinfo->flags & IP6T_RT_FST)) { + return ret; + } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { + pr_debug("Not strict "); + if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { + pr_debug("There isn't enough space\n"); + return false; + } else { + unsigned int i = 0; + + pr_debug("#%d ", rtinfo->addrnr); + for (temp = 0; + temp < (unsigned int)((hdrlen - 8) / 16); + temp++) { + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + + BUG_ON(ap == NULL); + + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { + pr_debug("i=%d temp=%d;\n", i, temp); + i++; + } + if (i == rtinfo->addrnr) + break; + } + pr_debug("i=%d #%d\n", i, rtinfo->addrnr); + if (i == rtinfo->addrnr) + return ret; + else + return false; + } + } else { + pr_debug("Strict "); + if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { + pr_debug("There isn't enough space\n"); + return false; + } else { + pr_debug("#%d ", rtinfo->addrnr); + for (temp = 0; temp < rtinfo->addrnr; temp++) { + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + BUG_ON(ap == NULL); + + if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) + break; + } + pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr); + if (temp == rtinfo->addrnr && + temp == (unsigned int)((hdrlen - 8) / 16)) + return ret; + else + return false; + } + } + + return false; +} + +static int rt_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_rt *rtinfo = par->matchinfo; + + if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { + pr_debug("unknown flags %X\n", rtinfo->invflags); + return -EINVAL; + } + if 
((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && + (!(rtinfo->flags & IP6T_RT_TYP) || + (rtinfo->rt_type != 0) || + (rtinfo->invflags & IP6T_RT_INV_TYP))) { + pr_debug("`--rt-type 0' required before `--rt-0-*'"); + return -EINVAL; + } + + return 0; +} + +static struct xt_match rt_mt6_reg __read_mostly = { + .name = "rt", + .family = NFPROTO_IPV6, + .match = rt_mt6, + .matchsize = sizeof(struct ip6t_rt), + .checkentry = rt_mt6_check, + .me = THIS_MODULE, +}; + +static int __init rt_mt6_init(void) +{ + return xt_register_match(&rt_mt6_reg); +} + +static void __exit rt_mt6_exit(void) +{ + xt_unregister_match(&rt_mt6_reg); +} + +module_init(rt_mt6_init); +module_exit(rt_mt6_exit); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c new file mode 100644 index 00000000..325e59a0 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -0,0 +1,108 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_FILTER, +}; + +/* The work comes in here from netfilter.c. 
*/ +static unsigned int +ip6table_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_filter); +} + +static struct nf_hook_ops *filter_ops __read_mostly; + +/* Default to forward because I got too much mail already. */ +static bool forward = true; +module_param(forward, bool, 0000); + +static int __net_init ip6table_filter_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ip6t_standard *)repl->entries)[1].target.verdict = + forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + + net->ipv6.ip6table_filter = + ip6t_register_table(net, &packet_filter, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_filter)) + return PTR_ERR(net->ipv6.ip6table_filter); + return 0; +} + +static void __net_exit ip6table_filter_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_filter); +} + +static struct pernet_operations ip6table_filter_net_ops = { + .init = ip6table_filter_net_init, + .exit = ip6table_filter_net_exit, +}; + +static int __init ip6table_filter_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_filter_net_ops); + return ret; +} + +static void __exit ip6table_filter_fini(void) +{ + xt_hook_unlink(&packet_filter, filter_ops); + unregister_pernet_subsys(&ip6table_filter_net_ops); +} + +module_init(ip6table_filter_init); +module_exit(ip6table_filter_fini); diff --git 
a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c new file mode 100644 index 00000000..00d19173 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -0,0 +1,145 @@ +/* + * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 + * + * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ + (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) | \ + (1 << NF_INET_POST_ROUTING)) + +static const struct xt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_MANGLE, +}; + +static unsigned int +ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out) +{ + unsigned int ret; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u_int32_t flowlabel, mark; + +#if 0 + /* root is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) { + if (net_ratelimit()) + pr_warning("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + /* save source/dest address, mark, hoplimit, flowlabel, priority, */ + memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); + memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); + mark = skb->mark; + hop_limit = ipv6_hdr(skb)->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u_int32_t *)ipv6_hdr(skb)); + + ret = ip6t_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, + dev_net(out)->ipv6.ip6table_mangle); + + if (ret != NF_DROP && ret != NF_STOLEN && + (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) || + memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) || + skb->mark != mark || + ipv6_hdr(skb)->hop_limit != hop_limit || + flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) + return ip6_route_me_harder(skb) == 0 ? ret : NF_DROP; + + return ret; +} + +/* The work comes in here from netfilter.c. 
*/ +static unsigned int +ip6table_mangle_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (hook == NF_INET_LOCAL_OUT) + return ip6t_mangle_out(skb, out); + if (hook == NF_INET_POST_ROUTING) + return ip6t_do_table(skb, hook, in, out, + dev_net(out)->ipv6.ip6table_mangle); + /* INPUT/FORWARD */ + return ip6t_do_table(skb, hook, in, out, + dev_net(in)->ipv6.ip6table_mangle); +} + +static struct nf_hook_ops *mangle_ops __read_mostly; +static int __net_init ip6table_mangle_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_mangle = + ip6t_register_table(net, &packet_mangler, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_mangle)) + return PTR_ERR(net->ipv6.ip6table_mangle); + return 0; +} + +static void __net_exit ip6table_mangle_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_mangle); +} + +static struct pernet_operations ip6table_mangle_net_ops = { + .init = ip6table_mangle_net_init, + .exit = ip6table_mangle_net_exit, +}; + +static int __init ip6table_mangle_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_mangle_net_ops); + return ret; +} + +static void __exit ip6table_mangle_fini(void) +{ + xt_hook_unlink(&packet_mangler, mangle_ops); + unregister_pernet_subsys(&ip6table_mangle_net_ops); +} + +module_init(ip6table_mangle_init); +module_exit(ip6table_mangle_fini); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c new file mode 100644 index 00000000..5b9926a0 --- /dev/null +++ 
b/net/ipv6/netfilter/ip6table_raw.c @@ -0,0 +1,88 @@ +/* + * IPv6 raw table, a port of the IPv4 raw table to IPv6 + * + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> + +#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) + +static const struct xt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_RAW, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6table_raw_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_raw); +} + +static struct nf_hook_ops *rawtable_ops __read_mostly; + +static int __net_init ip6table_raw_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_raw = + ip6t_register_table(net, &packet_raw, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_raw)) + return PTR_ERR(net->ipv6.ip6table_raw); + return 0; +} + +static void __net_exit ip6table_raw_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_raw); +} + +static struct pernet_operations ip6table_raw_net_ops = { + .init = ip6table_raw_net_init, + .exit = ip6table_raw_net_exit, +}; + +static int __init ip6table_raw_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + return ret; + + /* Register hooks */ + rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); + goto cleanup_table; + } + + return ret; + + cleanup_table: + unregister_pernet_subsys(&ip6table_raw_net_ops); + 
return ret; +} + +static void __exit ip6table_raw_fini(void) +{ + xt_hook_unlink(&packet_raw, rawtable_ops); + unregister_pernet_subsys(&ip6table_raw_net_ops); +} + +module_init(ip6table_raw_init); +module_exit(ip6table_raw_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c new file mode 100644 index 00000000..91aa2b4d --- /dev/null +++ b/net/ipv6/netfilter/ip6table_security.c @@ -0,0 +1,105 @@ +/* + * "security" table for IPv6 + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/slab.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); +MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static const struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .me = THIS_MODULE, + .af = NFPROTO_IPV6, + .priority = NF_IP6_PRI_SECURITY, +}; + +static unsigned int +ip6table_security_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + const struct net *net = dev_net((in != NULL) ? 
in : out); + + return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_security); +} + +static struct nf_hook_ops *sectbl_ops __read_mostly; + +static int __net_init ip6table_security_net_init(struct net *net) +{ + struct ip6t_replace *repl; + + repl = ip6t_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv6.ip6table_security = + ip6t_register_table(net, &security_table, repl); + kfree(repl); + if (IS_ERR(net->ipv6.ip6table_security)) + return PTR_ERR(net->ipv6.ip6table_security); + + return 0; +} + +static void __net_exit ip6table_security_net_exit(struct net *net) +{ + ip6t_unregister_table(net, net->ipv6.ip6table_security); +} + +static struct pernet_operations ip6table_security_net_ops = { + .init = ip6table_security_net_init, + .exit = ip6table_security_net_exit, +}; + +static int __init ip6table_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); + goto cleanup_table; + } + + return ret; + +cleanup_table: + unregister_pernet_subsys(&ip6table_security_net_ops); + return ret; +} + +static void __exit ip6table_security_fini(void) +{ + xt_hook_unlink(&security_table, sectbl_ops); + unregister_pernet_subsys(&ip6table_security_net_ops); +} + +module_init(ip6table_security_init); +module_exit(ip6table_security_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c new file mode 100644 index 00000000..4111050a --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -0,0 +1,398 @@ +/* + * Copyright (C)2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + */ + +#include <linux/types.h> +#include <linux/ipv6.h> +#include <linux/in6.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmp.h> +#include <net/ipv6.h> +#include <net/inet_frag.h> + +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/netfilter/nf_log.h> + +static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, + struct nf_conntrack_tuple *tuple) +{ + const u_int32_t *ap; + u_int32_t _addrs[8]; + + ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr), + sizeof(_addrs), _addrs); + if (ap == NULL) + return false; + + memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); + + return true; +} + +static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6)); + memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6)); + + return true; +} + +static int ipv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "src=%pI6 dst=%pI6 ", + tuple->src.u3.ip6, tuple->dst.u3.ip6); +} + +/* + * Based on ipv6_skip_exthdr() in net/ipv6/exthdr.c + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. 
+ * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * if success, *nexthdr is updated by type/protocol of this header. + * + * NOTES: - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns -1. + * - if packet is fragmented, return pointer of the fragment header. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + */ + +static int nf_ct_ipv6_skip_exthdr(const struct sk_buff *skb, int start, + u8 *nexthdrp, int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + break; + if (nexthdr == NEXTHDR_FRAGMENT) + break; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, + unsigned int *dataoff, u_int8_t *protonum) +{ + unsigned int extoff = nhoff + sizeof(struct ipv6hdr); + unsigned char pnum; + int protoff; + + if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), + &pnum, sizeof(pnum)) != 0) { + pr_debug("ip6_conntrack_core: can't get nexthdr\n"); + return -NF_ACCEPT; + } + protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, skb->len - extoff); + /* + * (protoff == skb->len) mean that the packet doesn't have no data + * except of IPv6 & ext headers. but it's tracked anyway. 
- YK + */ + if ((protoff < 0) || (protoff > skb->len)) { + pr_debug("ip6_conntrack_core: can't find proto in pkt\n"); + return -NF_ACCEPT; + } + + *dataoff = protoff; + *protonum = pnum; + return NF_ACCEPT; +} + +static unsigned int ipv6_confirm(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct nf_conn *ct; + const struct nf_conn_help *help; + const struct nf_conntrack_helper *helper; + enum ip_conntrack_info ctinfo; + unsigned int ret, protoff; + unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; + unsigned char pnum = ipv6_hdr(skb)->nexthdr; + + + /* This is where we call the helper: as the packet goes out. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + goto out; + + help = nfct_help(ct); + if (!help) + goto out; + /* rcu_read_lock()ed by nf_hook_slow */ + helper = rcu_dereference(help->helper); + if (!helper) + goto out; + + protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum, + skb->len - extoff); + if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + + ret = helper->help(skb, protoff, ct, ctinfo); + if (ret != NF_ACCEPT) { + nf_log_packet(NFPROTO_IPV6, hooknum, skb, in, out, NULL, + "nf_ct_%s: dropping packet", helper->name); + return ret; + } +out: + /* We've seen it coming out the other side: confirm it */ + return nf_conntrack_confirm(skb); +} + +static unsigned int __ipv6_conntrack_in(struct net *net, + unsigned int hooknum, + struct sk_buff *skb, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm = skb->nfct_reasm; + + /* This packet is fragmented and has reassembled packet. */ + if (reasm) { + /* Reassembled packet isn't parsed yet ? 
*/ + if (!reasm->nfct) { + unsigned int ret; + + ret = nf_conntrack_in(net, PF_INET6, hooknum, reasm); + if (ret != NF_ACCEPT) + return ret; + } + nf_conntrack_get(reasm->nfct); + skb->nfct = reasm->nfct; + skb->nfctinfo = reasm->nfctinfo; + return NF_ACCEPT; + } + + return nf_conntrack_in(net, PF_INET6, hooknum, skb); +} + +static unsigned int ipv6_conntrack_in(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn); +} + +static unsigned int ipv6_conntrack_local(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if (skb->len < sizeof(struct ipv6hdr)) { + if (net_ratelimit()) + pr_notice("ipv6_conntrack_local: packet too short\n"); + return NF_ACCEPT; + } + return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn); +} + +static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { + { + .hook = ipv6_conntrack_in, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_conntrack_local, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK, + }, + { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_LAST, + }, + { + .hook = ipv6_confirm, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP6_PRI_LAST-1, + }, +}; + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> + +static int ipv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) +{ + NLA_PUT(skb, CTA_IP_V6_SRC, 
sizeof(u_int32_t) * 4, + &tuple->src.u3.ip6); + NLA_PUT(skb, CTA_IP_V6_DST, sizeof(u_int32_t) * 4, + &tuple->dst.u3.ip6); + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = { + [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 }, + [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 }, +}; + +static int ipv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t) +{ + if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) + return -EINVAL; + + memcpy(&t->src.u3.ip6, nla_data(tb[CTA_IP_V6_SRC]), + sizeof(u_int32_t) * 4); + memcpy(&t->dst.u3.ip6, nla_data(tb[CTA_IP_V6_DST]), + sizeof(u_int32_t) * 4); + + return 0; +} + +static int ipv6_nlattr_tuple_size(void) +{ + return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1); +} +#endif + +struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { + .l3proto = PF_INET6, + .name = "ipv6", + .pkt_to_tuple = ipv6_pkt_to_tuple, + .invert_tuple = ipv6_invert_tuple, + .print_tuple = ipv6_print_tuple, + .get_l4proto = ipv6_get_l4proto, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = ipv6_tuple_to_nlattr, + .nlattr_tuple_size = ipv6_nlattr_tuple_size, + .nlattr_to_tuple = ipv6_nlattr_to_tuple, + .nla_policy = ipv6_nla_policy, +#endif + .me = THIS_MODULE, +}; + +MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>"); + +static int __init nf_conntrack_l3proto_ipv6_init(void) +{ + int ret = 0; + + need_conntrack(); + nf_defrag_ipv6_enable(); + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register tcp.\n"); + return ret; + } + + ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register udp.\n"); + goto cleanup_tcp; + } + + ret = 
nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register icmpv6.\n"); + goto cleanup_udp; + } + + ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register ipv6\n"); + goto cleanup_icmpv6; + } + + ret = nf_register_hooks(ipv6_conntrack_ops, + ARRAY_SIZE(ipv6_conntrack_ops)); + if (ret < 0) { + pr_err("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); + goto cleanup_ipv6; + } + return ret; + + cleanup_ipv6: + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + cleanup_icmpv6: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + cleanup_udp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); + cleanup_tcp: + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); + return ret; +} + +static void __exit nf_conntrack_l3proto_ipv6_fini(void) +{ + synchronize_net(); + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); +} + +module_init(nf_conntrack_l3proto_ipv6_init); +module_exit(nf_conntrack_l3proto_ipv6_fini); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c new file mode 100644 index 00000000..92cc9f29 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -0,0 +1,362 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + */ + +#include <linux/types.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> +#include <linux/ipv6.h> +#include <net/ipv6.h> +#include <net/ip6_checksum.h> +#include <linux/seq_file.h> +#include <linux/netfilter_ipv6.h> +#include <net/netfilter/nf_conntrack_tuple.h> +#include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h> +#include <net/netfilter/nf_log.h> + +static unsigned int nf_ct_icmpv6_timeout __read_mostly = 30*HZ; + +static bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +{ + const struct icmp6hdr *hp; + struct icmp6hdr _hdr; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + tuple->dst.u.icmp.type = hp->icmp6_type; + tuple->src.u.icmp.id = hp->icmp6_identifier; + tuple->dst.u.icmp.code = hp->icmp6_code; + + return true; +} + +/* Add 1; spaces filled with 0. 
*/ +static const u_int8_t invmap[] = { + [ICMPV6_ECHO_REQUEST - 128] = ICMPV6_ECHO_REPLY + 1, + [ICMPV6_ECHO_REPLY - 128] = ICMPV6_ECHO_REQUEST + 1, + [ICMPV6_NI_QUERY - 128] = ICMPV6_NI_REPLY + 1, + [ICMPV6_NI_REPLY - 128] = ICMPV6_NI_QUERY +1 +}; + +static const u_int8_t noct_valid_new[] = { + [ICMPV6_MGM_QUERY - 130] = 1, + [ICMPV6_MGM_REPORT -130] = 1, + [ICMPV6_MGM_REDUCTION - 130] = 1, + [NDISC_ROUTER_SOLICITATION - 130] = 1, + [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, + [NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, + [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, + [ICMPV6_MLD2_REPORT - 130] = 1 +}; + +static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) +{ + int type = orig->dst.u.icmp.type - 128; + if (type < 0 || type >= sizeof(invmap) || !invmap[type]) + return false; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return true; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmpv6_print_tuple(struct seq_file *s, + const struct nf_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +static unsigned int *icmpv6_get_timeouts(struct net *net) +{ + return &nf_ct_icmpv6_timeout; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int icmpv6_packet(struct nf_conn *ct, + const struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + u_int8_t pf, + unsigned int hooknum, + unsigned int *timeout) +{ + /* Do not immediately delete the connection after the first + successful reply to avoid excessive conntrackd traffic + and also to handle correctly ICMP echo reply duplicates. */ + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
*/ +static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, unsigned int *timeouts) +{ + static const u_int8_t valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: invalid new with type %d ", + type + 128); + return false; + } + return true; +} + +static int +icmpv6_error_message(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int icmp6off, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct nf_conntrack_tuple intuple, origtuple; + const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_l4proto *inproto; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + + NF_CT_ASSERT(skb->nfct == NULL); + + /* Are they talking about one of our connections? */ + if (!nf_ct_get_tuplepr(skb, + skb_network_offset(skb) + + sizeof(struct ipv6hdr) + + sizeof(struct icmp6hdr), + PF_INET6, &origtuple)) { + pr_debug("icmpv6_error: Can't get tuple\n"); + return -NF_ACCEPT; + } + + /* rcu_read_lock()ed by nf_hook_slow */ + inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
*/ + if (!nf_ct_invert_tuple(&intuple, &origtuple, + &nf_conntrack_l3proto_ipv6, inproto)) { + pr_debug("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = nf_conntrack_find_get(net, zone, &intuple); + if (!h) { + pr_debug("icmpv6_error: no match\n"); + return -NF_ACCEPT; + } else { + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return NF_ACCEPT; +} + +static int +icmpv6_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) +{ + const struct icmp6hdr *icmp6h; + struct icmp6hdr _ih; + int type; + + icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); + if (icmp6h == NULL) { + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: short packet "); + return -NF_ACCEPT; + } + + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && + nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { + if (LOG_INVALID(net, IPPROTO_ICMPV6)) + nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, + "nf_ct_icmpv6: ICMPv6 checksum failed "); + return -NF_ACCEPT; + } + + type = icmp6h->icmp6_type - 130; + if (type >= 0 && type < sizeof(noct_valid_new) && + noct_valid_new[type]) { + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); + return NF_ACCEPT; + } + + /* is not error message ? 
*/ + if (icmp6h->icmp6_type >= 128) + return NF_ACCEPT; + + return icmpv6_error_message(net, tmpl, skb, dataoff, ctinfo, hooknum); +} + +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_conntrack.h> +static int icmpv6_tuple_to_nlattr(struct sk_buff *skb, + const struct nf_conntrack_tuple *t) +{ + NLA_PUT_BE16(skb, CTA_PROTO_ICMPV6_ID, t->src.u.icmp.id); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_TYPE, t->dst.u.icmp.type); + NLA_PUT_U8(skb, CTA_PROTO_ICMPV6_CODE, t->dst.u.icmp.code); + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMPV6_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMPV6_ID] = { .type = NLA_U16 }, +}; + +static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMPV6_TYPE] || + !tb[CTA_PROTO_ICMPV6_CODE] || + !tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + + return 0; +} + +static int icmpv6_nlattr_tuple_size(void) +{ + return nla_policy_len(icmpv6_nla_policy, CTA_PROTO_MAX + 1); +} +#endif + +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/nfnetlink_cttimeout.h> + +static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[], void *data) +{ + unsigned int *timeout = data; + + if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) { + *timeout = + ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ; + } else { + /* Set default ICMPv6 timeout. 
*/ + *timeout = nf_ct_icmpv6_timeout; + } + return 0; +} + +static int +icmpv6_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) +{ + const unsigned int *timeout = data; + + NLA_PUT_BE32(skb, CTA_TIMEOUT_ICMPV6_TIMEOUT, htonl(*timeout / HZ)); + + return 0; + +nla_put_failure: + return -ENOSPC; +} + +static const struct nla_policy +icmpv6_timeout_nla_policy[CTA_TIMEOUT_ICMPV6_MAX+1] = { + [CTA_TIMEOUT_ICMPV6_TIMEOUT] = { .type = NLA_U32 }, +}; +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *icmpv6_sysctl_header; +static struct ctl_table icmpv6_sysctl_table[] = { + { + .procname = "nf_conntrack_icmpv6_timeout", + .data = &nf_ct_icmpv6_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; +#endif /* CONFIG_SYSCTL */ + +struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = +{ + .l3proto = PF_INET6, + .l4proto = IPPROTO_ICMPV6, + .name = "icmpv6", + .pkt_to_tuple = icmpv6_pkt_to_tuple, + .invert_tuple = icmpv6_invert_tuple, + .print_tuple = icmpv6_print_tuple, + .packet = icmpv6_packet, + .get_timeouts = icmpv6_get_timeouts, + .new = icmpv6_new, + .error = icmpv6_error, +#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) + .tuple_to_nlattr = icmpv6_tuple_to_nlattr, + .nlattr_tuple_size = icmpv6_nlattr_tuple_size, + .nlattr_to_tuple = icmpv6_nlattr_to_tuple, + .nla_policy = icmpv6_nla_policy, +#endif +#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) + .ctnl_timeout = { + .nlattr_to_obj = icmpv6_timeout_nlattr_to_obj, + .obj_to_nlattr = icmpv6_timeout_obj_to_nlattr, + .nlattr_max = CTA_TIMEOUT_ICMP_MAX, + .obj_size = sizeof(unsigned int), + .nla_policy = icmpv6_timeout_nla_policy, + }, +#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ +#ifdef CONFIG_SYSCTL + .ctl_table_header = &icmpv6_sysctl_header, + .ctl_table = icmpv6_sysctl_table, +#endif +}; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c new file mode 100644 index 00000000..38f00b02 --- /dev/null +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -0,0 +1,650 @@ +/* + * IPv6 fragment reassembly for connection tracking + * + * Copyright (C)2004 USAGI/WIDE Project + * + * Author: + * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> + * + * Based on: net/ipv6/reassembly.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/net.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> +#include <linux/slab.h> + +#include <net/sock.h> +#include <net/snmp.h> +#include <net/inet_frag.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> +#include <linux/sysctl.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + + +struct nf_ct_frag6_skb_cb +{ + struct inet6_skb_parm h; + int offset; + struct sk_buff *orig; +}; + +#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) + +struct nf_ct_frag6_queue +{ + struct inet_frag_queue q; + + __be32 id; /* fragment id */ + u32 user; + struct in6_addr saddr; + struct in6_addr daddr; + + unsigned int csum; + __u16 nhoffset; +}; + +static struct inet_frags nf_frags; +static struct netns_frags nf_init_frags; + +#ifdef CONFIG_SYSCTL +static struct ctl_table 
nf_ct_frag6_sysctl_table[] = { + { + .procname = "nf_conntrack_frag6_timeout", + .data = &nf_init_frags.timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_frag6_low_thresh", + .data = &nf_init_frags.low_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "nf_conntrack_frag6_high_thresh", + .data = &nf_init_frags.high_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *nf_ct_frag6_sysctl_header; +#endif + +static unsigned int nf_hashfn(struct inet_frag_queue *q) +{ + const struct nf_ct_frag6_queue *nq; + + nq = container_of(q, struct nf_ct_frag6_queue, q); + return inet6_hash_frag(nq->id, &nq->saddr, &nq->daddr, nf_frags.rnd); +} + +static void nf_skb_free(struct sk_buff *skb) +{ + if (NFCT_FRAG6_CB(skb)->orig) + kfree_skb(NFCT_FRAG6_CB(skb)->orig); +} + +/* Destruction primitives. */ + +static __inline__ void fq_put(struct nf_ct_frag6_queue *fq) +{ + inet_frag_put(&fq->q, &nf_frags); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) +{ + inet_frag_kill(&fq->q, &nf_frags); +} + +static void nf_ct_frag6_evictor(void) +{ + local_bh_disable(); + inet_frag_evictor(&nf_init_frags, &nf_frags); + local_bh_enable(); +} + +static void nf_ct_frag6_expire(unsigned long data) +{ + struct nf_ct_frag6_queue *fq; + + fq = container_of((struct inet_frag_queue *)data, + struct nf_ct_frag6_queue, q); + + spin_lock(&fq->q.lock); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto out; + + fq_kill(fq); + +out: + spin_unlock(&fq->q.lock); + fq_put(fq); +} + +/* Creation primitives. 
*/ + +static __inline__ struct nf_ct_frag6_queue * +fq_find(__be32 id, u32 user, struct in6_addr *src, struct in6_addr *dst) +{ + struct inet_frag_queue *q; + struct ip6_create_arg arg; + unsigned int hash; + + arg.id = id; + arg.user = user; + arg.src = src; + arg.dst = dst; + + read_lock_bh(&nf_frags.lock); + hash = inet6_hash_frag(id, src, dst, nf_frags.rnd); + + q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + local_bh_enable(); + if (q == NULL) + goto oom; + + return container_of(q, struct nf_ct_frag6_queue, q); + +oom: + return NULL; +} + + +static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + const struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->q.last_in & INET_FRAG_COMPLETE) { + pr_debug("Already completed\n"); + goto err; + } + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + pr_debug("offset is too large.\n"); + return -1; + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); + skb->csum = csum_sub(skb->csum, + csum_partial(nh, (u8 *)(fhdr + 1) - nh, + 0)); + } + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->q.len || + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) { + pr_debug("already received last fragment\n"); + goto err; + } + fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + pr_debug("end of fragment not rounded to 8 bytes.\n"); + return -1; + } + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. 
*/ + if (fq->q.last_in & INET_FRAG_LAST_IN) { + pr_debug("last packet already reached.\n"); + goto err; + } + fq->q.len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { + pr_debug("queue: message is too short.\n"); + goto err; + } + if (pskb_trim_rcsum(skb, end - offset)) { + pr_debug("Can't trim\n"); + goto err; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = fq->q.fragments_tail; + if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } + prev = NULL; + for (next = fq->q.fragments; next != NULL; next = next->next) { + if (NFCT_FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + +found: + /* RFC5722, Section 4: + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments, including those not yet received) MUST be silently + * discarded. + */ + + /* Check for overlap with preceding fragment. */ + if (prev && + (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; + + /* Look for overlap with succeeding segment. */ + if (next && NFCT_FRAG6_CB(next)->offset < end) + goto discard_fq; + + NFCT_FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (!next) + fq->q.fragments_tail = skb; + if (prev) + prev->next = skb; + else + fq->q.fragments = skb; + + skb->dev = NULL; + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; + atomic_add(skb->truesize, &nf_init_frags.mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. 
+ */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->q.last_in |= INET_FRAG_FIRST_IN; + } + write_lock(&nf_frags.lock); + list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); + write_unlock(&nf_frags.lock); + return 0; + +discard_fq: + fq_kill(fq); +err: + return -1; +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static struct sk_buff * +nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) +{ + struct sk_buff *fp, *op, *head = fq->q.fragments; + int payload_len; + + fq_kill(fq); + + WARN_ON(head == NULL); + WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = ((head->data - skb_network_header(head)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) { + pr_debug("payload len is too large.\n"); + goto out_oversize; + } + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) { + pr_debug("skb is cloned but can't expand head"); + goto out_oom; + } + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. 
*/ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (clone == NULL) + goto out_oom; + + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; + atomic_add(clone->truesize, &nf_init_frags.mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + } + atomic_sub(head->truesize, &nf_init_frags.mem); + + head->next = NULL; + head->dev = dev; + head->tstamp = fq->q.stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); + + /* Yes, and fold redundant checksum back. 
8) */ + if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); + + fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; + + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ + fp = skb_shinfo(head)->frag_list; + if (fp && NFCT_FRAG6_CB(fp)->orig == NULL) + /* at above code, head skb is divided into two skbs. */ + fp = fp->next; + + op = NFCT_FRAG6_CB(head)->orig; + for (; fp; fp = fp->next) { + struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig; + + op->next = orig; + op = orig; + NFCT_FRAG6_CB(fp)->orig = NULL; + } + + return head; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "nf_ct_frag6_reasm: no memory for reassembly\n"); +out_fail: + return NULL; +} + +/* + * find the header just before Fragment Header. + * + * if success return 0 and set ... + * (*prevhdrp): the value of "Next Header Field" in the header + * just before Fragment Header. + * (*prevhoff): the offset of "Next Header Field" in the header + * just before Fragment Header. + * (*fhoff) : the offset of Fragment Header. 
+ *
+ * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c
+ *
+ */
+static int
+find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
+{
+	u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+	const int netoff = skb_network_offset(skb);
+	/*
+	 * Must be an int: inside the loop it is assigned from 'start' and it
+	 * is finally stored through the int *prevhoff.  A u8 would silently
+	 * truncate any offset above 255.
+	 */
+	int prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr);
+	int start = netoff + sizeof(struct ipv6hdr);
+	int len = skb->len - start;
+	u8 prevhdr = NEXTHDR_IPV6;
+
+	/* Walk the extension header chain until the fragment header. */
+	while (nexthdr != NEXTHDR_FRAGMENT) {
+		struct ipv6_opt_hdr hdr;
+		int hdrlen;
+
+		if (!ipv6_ext_hdr(nexthdr)) {
+			return -1;
+		}
+		if (nexthdr == NEXTHDR_NONE) {
+			pr_debug("next header is none\n");
+			return -1;
+		}
+		if (len < (int)sizeof(struct ipv6_opt_hdr)) {
+			pr_debug("too short\n");
+			return -1;
+		}
+		/* The length check above guarantees the copy is in range. */
+		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+			BUG();
+		/* AH length is computed differently from other headers. */
+		if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hdr.hdrlen+2)<<2;
+		else
+			hdrlen = ipv6_optlen(&hdr);
+
+		prevhdr = nexthdr;
+		prev_nhoff = start;
+
+		nexthdr = hdr.nexthdr;
+		len -= hdrlen;
+		start += hdrlen;
+	}
+
+	/* The last header skipped claimed more bytes than the skb holds. */
+	if (len < 0)
+		return -1;
+
+	*prevhdrp = prevhdr;
+	*prevhoff = prev_nhoff;
+	*fhoff = start;
+
+	return 0;
+}
+
+/*
+ * Feed one packet to the reassembly queues.
+ *
+ * Returns @skb itself when the packet carries no fragment header or when
+ * any step fails, the value of nf_ct_frag6_reasm() once the first and
+ * last fragments are in and all bytes have arrived, and NULL when the
+ * clone was queued but the datagram is not complete yet (or reassembly
+ * failed).
+ */
+struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
+{
+	struct sk_buff *clone;
+	struct net_device *dev = skb->dev;
+	struct frag_hdr *fhdr;
+	struct nf_ct_frag6_queue *fq;
+	struct ipv6hdr *hdr;
+	int fhoff, nhoff;
+	u8 prevhdr;
+	struct sk_buff *ret_skb = NULL;
+
+	/* Jumbo payload inhibits frag. header */
+	if (ipv6_hdr(skb)->payload_len == 0) {
+		pr_debug("payload len = 0\n");
+		return skb;
+	}
+
+	if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
+		return skb;
+
+	/* The clone is what gets queued; it remembers the original skb. */
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (clone == NULL) {
+		pr_debug("Can't clone skb\n");
+		return skb;
+	}
+
+	NFCT_FRAG6_CB(clone)->orig = skb;
+
+	if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
+		pr_debug("message is too short.\n");
+		goto ret_orig;
+	}
+
+	skb_set_transport_header(clone, fhoff);
+	hdr = ipv6_hdr(clone);
+	fhdr = (struct frag_hdr *)skb_transport_header(clone);
+
+	/* Make room before adding to the queues if over the high mark. */
+	if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh)
+		nf_ct_frag6_evictor();
+
+	fq = fq_find(fhdr->identification, user, &hdr->saddr, &hdr->daddr);
+	if (fq == NULL) {
+		pr_debug("Can't find and can't create new queue\n");
+		goto ret_orig;
+	}
+
+	spin_lock_bh(&fq->q.lock);
+
+	if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
+		spin_unlock_bh(&fq->q.lock);
+		pr_debug("Can't insert skb to queue\n");
+		fq_put(fq);
+		goto ret_orig;
+	}
+
+	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    fq->q.meat == fq->q.len) {
+		ret_skb = nf_ct_frag6_reasm(fq, dev);
+		if (ret_skb == NULL)
+			pr_debug("Can't reassemble fragmented packets\n");
+	}
+	spin_unlock_bh(&fq->q.lock);
+
+	fq_put(fq);
+	return ret_skb;
+
+ret_orig:
+	kfree_skb(clone);
+	return skb;
+}
+
+/*
+ * Walk the list of original skbs hanging off the reassembled @skb, point
+ * each one's nfct_reasm at @skb, unlink it and re-inject it into the hook
+ * chain just after the defrag priority.
+ */
+void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
+			struct net_device *in, struct net_device *out,
+			int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *s, *s2;
+
+	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
+		nf_conntrack_put_reasm(s->nfct_reasm);
+		nf_conntrack_get_reasm(skb);
+		s->nfct_reasm = skb;
+
+		/* Save the successor before the link is cleared. */
+		s2 = s->next;
+		s->next = NULL;
+
+		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn,
+			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+		s = s2;
+	}
+	nf_conntrack_put_reasm(skb);
+}
+
+/*
+ * Fill in the nf_frags callbacks and thresholds, initialise the fragment
+ * queues and, with CONFIG_SYSCTL, register the sysctl table.
+ * Returns 0 on success or -ENOMEM if the sysctl registration fails.
+ */
+int nf_ct_frag6_init(void)
+{
+	nf_frags.hashfn = nf_hashfn;
+	nf_frags.constructor = ip6_frag_init;
+	nf_frags.destructor = NULL;
+	nf_frags.skb_free = nf_skb_free;
+	nf_frags.qsize = sizeof(struct nf_ct_frag6_queue);
+	nf_frags.match = ip6_frag_match;
+	nf_frags.frag_expire = nf_ct_frag6_expire;
+	nf_frags.secret_interval = 10 * 60 * HZ;
+	nf_init_frags.timeout = IPV6_FRAG_TIMEOUT;
+	nf_init_frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+	nf_init_frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+	inet_frags_init_net(&nf_init_frags);
+	inet_frags_init(&nf_frags);
+
+#ifdef CONFIG_SYSCTL
+	nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
+							  nf_ct_frag6_sysctl_table);
+	if (!nf_ct_frag6_sysctl_header) {
+		inet_frags_fini(&nf_frags);
+		return -ENOMEM;
+	}
+#endif
+
+	return 0;
+}
+
+/*
+ * Undo nf_ct_frag6_init(): drop the sysctl table, shut the fragment
+ * queues down, then run the evictor with the low threshold forced to 0.
+ */
+void nf_ct_frag6_cleanup(void)
+{
+#ifdef CONFIG_SYSCTL
+	unregister_sysctl_table(nf_ct_frag6_sysctl_header);
+	nf_ct_frag6_sysctl_header = NULL;
+#endif
+	inet_frags_fini(&nf_frags);
+
+	nf_init_frags.low_thresh = 0;
+	nf_ct_frag6_evictor();
+}
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
new file mode 100644
index 00000000..cdd6d045
--- /dev/null
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -0,0 +1,137 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+/* Pick the defrag "user" id for this packet: a base value chosen from the
+ * bridge prerouting flag or the hook number, plus the conntrack zone
+ * (NF_CT_DEFAULT_ZONE when the skb carries no conntrack entry).
+ */
+static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
+						struct sk_buff *skb)
+{
+	u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	if (skb->nfct)
+		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge &&
+	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+		return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+	if (hooknum == NF_INET_PRE_ROUTING)
+		return IP6_DEFRAG_CONNTRACK_IN + zone;
+	else
+		return IP6_DEFRAG_CONNTRACK_OUT + zone;
+
+}
+
+/* Netfilter hook: hand the packet to nf_ct_frag6_gather() and translate
+ * its result into a verdict.  NULL means the packet was taken over
+ * (NF_STOLEN), the same skb back means nothing was done (NF_ACCEPT), and
+ * any other skb is a reassembled packet that is passed to
+ * nf_ct_frag6_output() before returning NF_STOLEN.
+ */
+static unsigned int ipv6_defrag(unsigned int hooknum,
+				struct sk_buff *skb,
+				const struct net_device *in,
+				const struct net_device *out,
+				int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff *reasm;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+	/* Previously seen (loopback)?	*/
+	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
+		return NF_ACCEPT;
+#endif
+
+	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+	/* queued */
+	if (reasm == NULL)
+		return NF_STOLEN;
+
+	/* error occurred or not fragmented */
+	if (reasm == skb)
+		return NF_ACCEPT;
+
+	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+			   (struct net_device *)out, okfn);
+
+	return NF_STOLEN;
+}
+
+/* The same hook function is registered at PRE_ROUTING and LOCAL_OUT. */
+static struct nf_hook_ops ipv6_defrag_ops[] = {
+	{
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+	{
+		.hook		= ipv6_defrag,
+		.owner		= THIS_MODULE,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_CONNTRACK_DEFRAG,
+	},
+};
+
+/* Module init: set up the frag6 queues, then register the hooks.  If the
+ * hook registration fails the frag6 state is torn down again.
+ */
+static int __init nf_defrag_init(void)
+{
+	int ret = 0;
+
+	ret = nf_ct_frag6_init();
+	if (ret < 0) {
+		pr_err("nf_defrag_ipv6: can't initialize frag6.\n");
+		return ret;
+	}
+	ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+	if (ret < 0) {
+		pr_err("nf_defrag_ipv6: can't register hooks\n");
+		goto cleanup_frag6;
+	}
+	return ret;
+
+cleanup_frag6:
+	nf_ct_frag6_cleanup();
+	return ret;
+
+}
+
+/* Module exit: unregister the hooks first, then release the frag6 state. */
+static void __exit nf_defrag_fini(void)
+{
+	nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops));
+	nf_ct_frag6_cleanup();
+}
+
+/* Intentionally empty exported symbol.
+ * NOTE(review): presumably exists so other modules can reference it and
+ * thereby pull this module in - confirm against the callers.
+ */
+void nf_defrag_ipv6_enable(void)
+{
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+
+module_init(nf_defrag_init);
+module_exit(nf_defrag_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
new file mode 100755
index 00000000..f46e315b
--- /dev/null
+++ b/net/ipv6/ping.c
@@ -0,0 +1,222 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		"Ping" sockets
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Based on ipv4/ping.c code.
+ *
+ * Authors:	Lorenzo Colitti (IPv6 support)
+ *		Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6),
+ *		Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32)
+ *
+ */
+
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <net/ping.h>
+#include <linux/module.h>
+
+/* Protocol operations for IPv6 ping sockets. */
+struct proto pingv6_prot = {
+	.name =		"PINGv6",
+	.owner =	THIS_MODULE,
+	.init =		ping_init_sock,
+	.close =	ping_close,
+	.connect =	ip6_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.setsockopt =	ipv6_setsockopt,
+	.getsockopt =	ipv6_getsockopt,
+	.sendmsg =	ping_v6_sendmsg,
+	.recvmsg =	ping_recvmsg,
+	.bind =		ping_bind,
+	.backlog_rcv =	ping_queue_rcv_skb,
+	.hash =		ping_hash,
+	.unhash =	ping_unhash,
+	.get_port =	ping_get_port,
+	.obj_size =	sizeof(struct raw6_sock),
+};
+EXPORT_SYMBOL_GPL(pingv6_prot);
+
+/* Socket switch entry: SOCK_DGRAM + IPPROTO_ICMPV6 maps to pingv6_prot. */
+static struct inet_protosw pingv6_protosw = {
+	.type =      SOCK_DGRAM,
+	.protocol =  IPPROTO_ICMPV6,
+	.prot =      &pingv6_prot,
+	.ops =       &inet6_dgram_ops,
+	.no_check =  UDP_CSUM_DEFAULT,
+	.flags =     INET_PROTOSW_REUSE,
+};
+
+
+/* Compatibility glue so we can support IPv6 when it's compiled as a module */
+int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	return -EAFNOSUPPORT;
+}
+int dummy_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
+				 struct sk_buff *skb)
+{
+	return -EAFNOSUPPORT;
+}
+int dummy_icmpv6_err_convert(u8 type, u8 code, int *err)
+{
+	return -EAFNOSUPPORT;
+}
+void dummy_ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
+			    __be16 port, u32 info, u8 *payload) {}
+int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
+			struct net_device *dev, int strict)
+{
+	return 0;
+}
+
+/* Point the pingv6_ops table at the real IPv6 helpers and register the
+ * ping protosw.  Returns the result of inet6_register_protosw().
+ */
+int __init pingv6_init(void)
+{
+	pingv6_ops.ipv6_recv_error = ipv6_recv_error;
+	pingv6_ops.datagram_recv_ctl = datagram_recv_ctl;
+	pingv6_ops.icmpv6_err_convert = icmpv6_err_convert;
+	pingv6_ops.ipv6_icmp_error = ipv6_icmp_error;
+	pingv6_ops.ipv6_chk_addr = ipv6_chk_addr;
+	return inet6_register_protosw(&pingv6_protosw);
+}
+
+/* This never gets called because it's not possible to unload the ipv6 module,
+ * but just in case.
+ */
+void pingv6_exit(void)
+{
+	pingv6_ops.ipv6_recv_error = dummy_ipv6_recv_error;
+	pingv6_ops.datagram_recv_ctl = dummy_datagram_recv_ctl;
+	pingv6_ops.icmpv6_err_convert = dummy_icmpv6_err_convert;
+	pingv6_ops.ipv6_icmp_error = dummy_ipv6_icmp_error;
+	pingv6_ops.ipv6_chk_addr = dummy_ipv6_chk_addr;
+	inet6_unregister_protosw(&pingv6_protosw);
+}
+
+/*
+ * Send one ICMPv6 message on a ping socket.
+ *
+ * The destination comes from msg->msg_name when present, otherwise from
+ * the connected address.  Returns @len on success or a negative errno.
+ */
+int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		    size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct icmp6hdr user_icmph;
+	int addr_type;
+	struct in6_addr *daddr;
+	int iif = 0;
+	struct flowi6 fl6;
+	int err;
+	int hlimit;
+	struct dst_entry *dst;
+	struct rt6_info *rt;
+	struct pingfakehdr pfh;
+
+	pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
+
+	err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
+				  sizeof(user_icmph));
+	if (err)
+		return err;
+
+	if (msg->msg_name) {
+		struct sockaddr_in6 *u = (struct sockaddr_in6 *) msg->msg_name;
+		if (msg->msg_namelen < sizeof(struct sockaddr_in6) ||
+		    u->sin6_family != AF_INET6) {
+			return -EINVAL;
+		}
+		/* A bound socket may only send through its own device. */
+		if (sk->sk_bound_dev_if &&
+		    sk->sk_bound_dev_if != u->sin6_scope_id) {
+			return -EINVAL;
+		}
+		daddr = &(u->sin6_addr);
+		iif = u->sin6_scope_id;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = &np->daddr;
+	}
+
+	if (!iif)
+		iif = sk->sk_bound_dev_if;
+
+	addr_type = ipv6_addr_type(daddr);
+	if (__ipv6_addr_needs_scope_id(addr_type) && !iif)
+		return -EINVAL;
+	if (addr_type & IPV6_ADDR_MAPPED)
+		return -EINVAL;
+
+	/* TODO: use ip6_datagram_send_ctl to get options from cmsg */
+
+	memset(&fl6, 0, sizeof(fl6));
+
+	fl6.flowi6_proto = IPPROTO_ICMPV6;
+	fl6.saddr = np->saddr;
+	fl6.daddr = *daddr;
+	fl6.fl6_icmp_type = user_icmph.icmp6_type;
+	fl6.fl6_icmp_code = user_icmph.icmp6_code;
+	security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+		fl6.flowi6_oif = np->mcast_oif;
+	else if (!fl6.flowi6_oif)
+		fl6.flowi6_oif = np->ucast_oif;
+
+	dst = ip6_sk_dst_lookup_flow(sk, &fl6,  daddr, 1);
+	if (IS_ERR(dst))
+		return PTR_ERR(dst);
+	rt = (struct rt6_info *) dst;
+
+	/* From here on every exit must go through dst_err_release. */
+	np = inet6_sk(sk);
+	if (!np) {
+		err = -EBADF;
+		goto dst_err_release;
+	}
+
+	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
+		fl6.flowi6_oif = np->mcast_oif;
+	else if (!fl6.flowi6_oif)
+		fl6.flowi6_oif = np->ucast_oif;
+
+	pfh.icmph.type = user_icmph.icmp6_type;
+	pfh.icmph.code = user_icmph.icmp6_code;
+	pfh.icmph.checksum = 0;
+	pfh.icmph.un.echo.id = inet->inet_sport;
+	pfh.icmph.un.echo.sequence = user_icmph.icmp6_sequence;
+	pfh.iov = msg->msg_iov;
+	pfh.wcheck = 0;
+	pfh.family = AF_INET6;
+
+	if (ipv6_addr_is_multicast(&fl6.daddr))
+		hlimit = np->mcast_hops;
+	else
+		hlimit = np->hop_limit;
+	/* A negative socket setting falls back to the route's hop limit. */
+	if (hlimit < 0)
+		hlimit = ip6_dst_hoplimit(dst);
+
+	lock_sock(sk);
+	err = ip6_append_data(sk, ping_getfrag, &pfh, len,
+			      0, hlimit,
+			      np->tclass, NULL, &fl6, rt,
+			      MSG_DONTWAIT, np->dontfrag);
+
+	if (err) {
+		ICMP6_INC_STATS_BH(sock_net(sk), rt->rt6i_idev,
+				   ICMP6_MIB_OUTERRORS);
+		ip6_flush_pending_frames(sk);
+	} else {
+		err = icmpv6_push_pending_frames(sk, &fl6,
+						 (struct icmp6hdr *) &pfh.icmph,
+						 len);
+	}
+	release_sock(sk);
+
+dst_err_release:
+	/*
+	 * Drop the reference obtained from ip6_sk_dst_lookup_flow().
+	 * NOTE(review): presumably ip6_append_data() holds its own reference
+	 * on the route while data is corked - confirm in ip6_output.c.
+	 */
+	dst_release(dst);
+
+	if (err)
+		return err;
+
+	return len;
+}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
new file mode 100644
index 00000000..da2e92d0
--- /dev/null
+++ b/net/ipv6/proc.c
@@ -0,0 +1,338 @@
+/*
+ * INET		An
implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		This file implements the various access functions for the
+ *		PROC file system.  This is very similar to the IPv4 version,
+ *		except it reports the sockets in the INET6 address family.
+ *
+ * Authors:	David S. Miller (davem@caip.rutgers.edu)
+ *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/socket.h>
+#include <linux/net.h>
+#include <linux/ipv6.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stddef.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/transp_v6.h>
+#include <net/ipv6.h>
+
+/* Print the per-protocol in-use socket counts and the fragment queue
+ * usage for the namespace stored in seq->private.
+ */
+static int sockstat6_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+
+	seq_printf(seq, "TCP6: inuse %d\n",
+		       sock_prot_inuse_get(net, &tcpv6_prot));
+	seq_printf(seq, "UDP6: inuse %d\n",
+		       sock_prot_inuse_get(net, &udpv6_prot));
+	seq_printf(seq, "UDPLITE6: inuse %d\n",
+			sock_prot_inuse_get(net, &udplitev6_prot));
+	seq_printf(seq, "RAW6: inuse %d\n",
+		       sock_prot_inuse_get(net, &rawv6_prot));
+	seq_printf(seq, "FRAG6: inuse %d memory %d\n",
+		       ip6_frag_nqueues(net), ip6_frag_mem(net));
+	return 0;
+}
+
+static int sockstat6_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, sockstat6_seq_show);
+}
+
+static const struct file_operations sockstat6_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = sockstat6_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+static const struct snmp_mib snmp6_ipstats_list[] = {
+/* ipv6 mib according to RFC 2465 */
+	SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INPKTS),
+	SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+	SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS),
+	SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES),
+	SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+	SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+	SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
+	SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS),
+	SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS),
+	SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+	SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS),
+	SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+	SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+	SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+	SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS),
+	SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS),
+	SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS),
+	SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS),
+	SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS),
+	SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
+	SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
+	SNMP_MIB_ITEM("Ip6InOctets", IPSTATS_MIB_INOCTETS),
+	SNMP_MIB_ITEM("Ip6OutOctets", IPSTATS_MIB_OUTOCTETS),
+	SNMP_MIB_ITEM("Ip6InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
+	SNMP_MIB_ITEM("Ip6OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp6_icmp6_list[] = {
+/* icmpv6 mib according to RFC 2466 */
+	SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS),
+	SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS),
+	SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
+	SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+static const char *const icmp6type2name[256] = {
+	[ICMPV6_DEST_UNREACH] = "DestUnreachs",
+	[ICMPV6_PKT_TOOBIG] = "PktTooBigs",
+	[ICMPV6_TIME_EXCEED] = "TimeExcds",
+	[ICMPV6_PARAMPROB] = "ParmProblems",
+	[ICMPV6_ECHO_REQUEST] = "Echos",
+	[ICMPV6_ECHO_REPLY] = "EchoReplies",
+	[ICMPV6_MGM_QUERY] = "GroupMembQueries",
+	[ICMPV6_MGM_REPORT] = "GroupMembResponses",
+	[ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
+	[ICMPV6_MLD2_REPORT] = "MLDv2Reports",
+	[NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
+	[NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
+	[NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
+	[NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
+	[NDISC_REDIRECT] = "Redirects",
+};
+
+
+static const struct snmp_mib snmp6_udp6_list[] = {
+	SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+static const struct snmp_mib snmp6_udplite6_list[] = {
+	SNMP_MIB_ITEM("UdpLite6InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("UdpLite6NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("UdpLite6InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("UdpLite6OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
+	SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
+	SNMP_MIB_SENTINEL
+};
+
+/* Print the per-type ICMPv6 counters twice: first by name for the types
+ * listed in icmp6type2name, then by number for every non-zero counter.
+ * Bit 0x100 of the index selects "Out" instead of "In".
+ */
+static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
+{
+	char name[32];
+	int i;
+
+	/* print by name -- deprecated items */
+	for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+		int icmptype;
+		const char *p;
+
+		icmptype = i & 0xff;
+		p = icmp6type2name[icmptype];
+		if (!p)	/* don't print un-named types here */
+			continue;
+		snprintf(name, sizeof(name), "Icmp6%s%s",
+			i & 0x100 ? "Out" : "In", p);
+		seq_printf(seq, "%-32s\t%lu\n", name,
+			   atomic_long_read(smib + i));
+	}
+
+	/* print by number (nonzero only) - ICMPMsgStat format */
+	for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+		unsigned long val;
+
+		val = atomic_long_read(smib + i);
+		if (!val)
+			continue;
+		snprintf(name, sizeof(name), "Icmp6%sType%u",
+			i & 0x100 ?  "Out" : "In", i & 0xff);
+		seq_printf(seq, "%-32s\t%lu\n", name, val);
+	}
+}
+
+/* can be called either with percpu mib (pcpumib != NULL),
+ * or shared one (smib != NULL)
+ */
+static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **pcpumib,
+				atomic_long_t *smib,
+				const struct snmp_mib *itemlist)
+{
+	int i;
+	unsigned long val;
+
+	for (i = 0; itemlist[i].name; i++) {
+		val = pcpumib ?
+			snmp_fold_field(pcpumib, itemlist[i].entry) :
+			atomic_long_read(smib + itemlist[i].entry);
+		seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val);
+	}
+}
+
+/* 64-bit variant of snmp6_seq_show_item() for per-cpu mibs only. */
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib,
+				  const struct snmp_mib *itemlist, size_t syncpoff)
+{
+	int i;
+
+	for (i = 0; itemlist[i].name; i++)
+		seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
+			   snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+}
+
+/* Show the namespace-wide IPv6, ICMPv6, UDP and UDP-Lite counters. */
+static int snmp6_seq_show(struct seq_file *seq, void *v)
+{
+	struct net *net = (struct net *)seq->private;
+
+	snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics,
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics,
+			    NULL, snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udp_stats_in6,
+			    NULL, snmp6_udp6_list);
+	snmp6_seq_show_item(seq, (void __percpu **)net->mib.udplite_stats_in6,
+			    NULL, snmp6_udplite6_list);
+	return 0;
+}
+
+static int snmp6_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open_net(inode, file, snmp6_seq_show);
+}
+
+static const struct file_operations snmp6_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = snmp6_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release_net,
+};
+
+/* Show the counters of the single device stored in seq->private. */
+static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
+{
+	struct inet6_dev *idev = (struct inet6_dev *)seq->private;
+
+	seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+	snmp6_seq_show_item64(seq, (void __percpu **)idev->stats.ipv6,
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+	snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
+			    snmp6_icmp6_list);
+	snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
+	return 0;
+}
+
+static int snmp6_dev_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, snmp6_dev_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations snmp6_dev_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = snmp6_dev_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+/* Create the per-device proc entry, named after the device, under the
+ * namespace's dev_snmp6 directory.
+ * Returns 0, -EINVAL for a missing device, -ENOENT when the directory
+ * does not exist, or -ENOMEM when the entry cannot be created.
+ */
+int snmp6_register_dev(struct inet6_dev *idev)
+{
+	struct proc_dir_entry *p;
+	struct net *net;
+
+	if (!idev || !idev->dev)
+		return -EINVAL;
+
+	net = dev_net(idev->dev);
+	if (!net->mib.proc_net_devsnmp6)
+		return -ENOENT;
+
+	p = proc_create_data(idev->dev->name, S_IRUGO,
+			     net->mib.proc_net_devsnmp6,
+			     &snmp6_dev_seq_fops, idev);
+	if (!p)
+		return -ENOMEM;
+
+	idev->stats.proc_dir_entry = p;
+	return 0;
+}
+
+/* Remove the entry created by snmp6_register_dev().
+ * Returns 0, -ENOENT when the directory does not exist, or -EINVAL when
+ * the device has no entry registered.
+ */
+int snmp6_unregister_dev(struct inet6_dev *idev)
+{
+	struct net *net = dev_net(idev->dev);
+	if (!net->mib.proc_net_devsnmp6)
+		return -ENOENT;
+	if (!idev->stats.proc_dir_entry)
+		return -EINVAL;
+	remove_proc_entry(idev->stats.proc_dir_entry->name,
+			  net->mib.proc_net_devsnmp6);
+	idev->stats.proc_dir_entry = NULL;
+	return 0;
+}
+
+/* Per-namespace init: create "sockstat6", "snmp6" and the "dev_snmp6"
+ * directory, in that order.  On failure every entry created so far is
+ * removed again and -ENOMEM is returned.
+ */
+static int __net_init ipv6_proc_init_net(struct net *net)
+{
+	if (!proc_net_fops_create(net, "sockstat6", S_IRUGO,
+			&sockstat6_seq_fops))
+		return -ENOMEM;
+
+	if (!proc_net_fops_create(net, "snmp6", S_IRUGO, &snmp6_seq_fops))
+		goto proc_snmp6_fail;
+
+	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+	if (!net->mib.proc_net_devsnmp6)
+		goto proc_dev_snmp6_fail;
+	return 0;
+
+	/* Unwind in reverse order of creation; labels fall through. */
+proc_dev_snmp6_fail:
+	proc_net_remove(net, "snmp6");
+proc_snmp6_fail:
+	proc_net_remove(net, "sockstat6");
+	return -ENOMEM;
+}
+
+/* Per-namespace exit: remove everything ipv6_proc_init_net() created. */
+static void __net_exit ipv6_proc_exit_net(struct net *net)
+{
+	proc_net_remove(net, "sockstat6");
+	proc_net_remove(net, "dev_snmp6");
+	proc_net_remove(net, "snmp6");
+}
+
+static struct pernet_operations ipv6_proc_ops = {
+	.init = ipv6_proc_init_net,
+	.exit = ipv6_proc_exit_net,
+};
+
+int __init ipv6_misc_proc_init(void)
+{
+	return register_pernet_subsys(&ipv6_proc_ops);
+}
+
+void ipv6_misc_proc_exit(void)
+{
+	unregister_pernet_subsys(&ipv6_proc_ops);
+}
+
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
new file mode 100644
index 00000000..9a7978fd
--- /dev/null
+++ b/net/ipv6/protocol.c
@@ -0,0 +1,54 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PF_INET6 protocol dispatch tables.
+ *
+ * Authors:	Pedro Roque	<roque@di.fc.ul.pt>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ *      Changes:
+ *
+ *      Vince Laviano (vince@cs.stanford.edu)       16 May 2001
+ *      - Removed unused variable 'inet6_protocol_base'
+ *      - Modified inet6_del_protocol() to correctly maintain copy bit.
+ */
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+
+/* Handler table, indexed by protocol number masked to the table size. */
+const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly;
+
+/*
+ *	Install a protocol handler in its slot.  The slot is only written
+ *	when it is currently NULL; returns 0 on success and -1 when the
+ *	slot is already taken.
+ */
+int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+	int hash = protocol & (MAX_INET_PROTOS - 1);
+
+	return !cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+			NULL, prot) ? 0 : -1;
+}
+EXPORT_SYMBOL(inet6_add_protocol);
+
+/*
+ *	Remove a protocol from the hash tables.
+ *
+ *	The slot is cleared only if it still holds @prot; returns 0 in that
+ *	case and -1 otherwise.  synchronize_net() is called either way
+ *	before returning.
+ */
+
+int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol)
+{
+	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
+
+	ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[hash],
+		       prot, NULL) == prot) ? 0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(inet6_del_protocol);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
new file mode 100644
index 00000000..5bddea77
--- /dev/null
+++ b/net/ipv6/raw.c
@@ -0,0 +1,1368 @@
+/*
+ *	RAW sockets for IPv6
+ *	Linux INET6 implementation
+ *
+ *	Authors:
+ *	Pedro Roque		<roque@di.fc.ul.pt>
+ *
+ *	Adapted from linux/net/ipv4/raw.c
+ *
+ *	Fixes:
+ *	Hideaki YOSHIFUJI	:	sin6_scope_id support
+ *	YOSHIFUJI,H.@USAGI	:	raw checksum (RFC2292(bis) compliance)
+ *	Kazunori MIYAZAWA @USAGI:	change process style to use ip6_append_data
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/slab.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/skbuff.h>
+#include <linux/compat.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/sock.h>
+#include <net/snmp.h>
+
+#include <net/ipv6.h>
+#include <net/ndisc.h>
+#include <net/protocol.h>
+#include <net/ip6_route.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#include <net/transp_v6.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+#include <net/tcp_states.h>
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+#include <net/mip6.h>
+#endif
+#include <linux/mroute6.h>
+
+#include <net/raw.h>
+#include <net/rawv6.h>
+#include <net/xfrm.h>
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/export.h>
+
+static struct raw_hashinfo raw_v6_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
+};
+
+/*
+ *	Starting at @sk, find the next socket in the chain that matches the
+ *	protocol number, namespace, remote address, bound device and local
+ *	address (or a multicast group it accepts).  Returns NULL if none.
+ */
+static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+		unsigned short num, const struct in6_addr *loc_addr,
+		const struct in6_addr *rmt_addr, int dif)
+{
+	struct hlist_node *node;
+	int is_multicast = ipv6_addr_is_multicast(loc_addr);
+
+	sk_for_each_from(sk, node)
+		if (inet_sk(sk)->inet_num == num) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+
+			if (!ipv6_addr_any(&np->daddr) &&
+			    !ipv6_addr_equal(&np->daddr, rmt_addr))
+				continue;
+
+			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+				continue;
+
+			if (!ipv6_addr_any(&np->rcv_saddr)) {
+				if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+					goto found;
+				if (is_multicast &&
+				    inet6_mc_check(sk, loc_addr, rmt_addr))
+					goto found;
+				continue;
+			}
+			goto found;
+		}
+	sk = NULL;
+found:
+	return sk;
+}
+
+/*
+ *	0 - deliver
+ *	1 - block
+ *
+ *	A packet too short to hold an ICMPv6 header is delivered.
+ */
+static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct icmp6hdr *icmph;
+	struct raw6_sock *rp = raw6_sk(sk);
+
+	if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) {
+		__u32 *data = &rp->filter.data[0];
+		int bit_nr;
+
+		icmph = (struct icmp6hdr *) skb->data;
+		bit_nr = icmph->icmp6_type;
+
+		/* 1U: shifting a signed 1 into bit 31 is undefined. */
+		return (data[bit_nr >> 5] & (1U << (bit_nr & 31))) != 0;
+	}
+	return 0;
+}
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
+
+static mh_filter_t __rcu *mh_filter __read_mostly;
+
+/* Publish @filter as the MH filter.  Always returns 0. */
+int rawv6_mh_filter_register(mh_filter_t filter)
+{
+	rcu_assign_pointer(mh_filter, filter);
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_register);
+
+/* Clear the MH filter and wait for readers; @filter itself is unused. */
+int rawv6_mh_filter_unregister(mh_filter_t filter)
+{
+	RCU_INIT_POINTER(mh_filter, NULL);
+	synchronize_rcu();
+	return 0;
+}
+EXPORT_SYMBOL(rawv6_mh_filter_unregister);
+
+#endif
+
+/*
+ *	demultiplex raw sockets.
+ *	(should consider queueing the skb in the sock receive_queue
+ *	without calling rawv6.c)
+ *
+ *	Caller owns SKB so we must make clones.
+ *
+ *	Returns 1 if at least one matching socket was found, else 0.
+ */
+static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
+{
+	const struct in6_addr *saddr;
+	const struct in6_addr *daddr;
+	struct sock *sk;
+	int delivered = 0;
+	__u8 hash;
+	struct net *net;
+
+	saddr = &ipv6_hdr(skb)->saddr;
+	/* The destination address is taken as the in6_addr after saddr. */
+	daddr = saddr + 1;
+
+	hash = nexthdr & (MAX_INET_PROTOS - 1);
+
+	read_lock(&raw_v6_hashinfo.lock);
+	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
+
+	if (sk == NULL)
+		goto out;
+
+	net = dev_net(skb->dev);
+	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
+
+	while (sk) {
+		int filtered;
+
+		delivered = 1;
+		switch (nexthdr) {
+		case IPPROTO_ICMPV6:
+			filtered = icmpv6_filter(sk, skb);
+			break;
+
+#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
+		case IPPROTO_MH:
+		{
+			/* XXX: To validate MH only once for each packet,
+			 * this is placed here. It should be after checking
+			 * xfrm policy, however it doesn't. The checking xfrm
+			 * policy is placed in rawv6_rcv() because it is
+			 * required for each socket.
+			 */
+			mh_filter_t *filter;
+
+			filter = rcu_dereference(mh_filter);
+			filtered = filter ? (*filter)(sk, skb) : 0;
+			break;
+		}
+#endif
+		default:
+			filtered = 0;
+			break;
+		}
+
+		/* A negative filter result stops delivery altogether. */
+		if (filtered < 0)
+			break;
+		if (filtered == 0) {
+			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+			/* Not releasing hash table! */
+			if (clone) {
+				nf_reset(clone);
+				rawv6_rcv(sk, clone);
+			}
+		}
+		sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
+				     IP6CB(skb)->iif);
+	}
+out:
+	read_unlock(&raw_v6_hashinfo.lock);
+	return delivered;
+}
+
+/*
+ *	Returns non-zero when the hash chain for @nexthdr is not empty and
+ *	ipv6_raw_deliver() found a matching socket.
+ */
+int raw6_local_deliver(struct sk_buff *skb, int nexthdr)
+{
+	struct sock *raw_sk;
+
+	raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (MAX_INET_PROTOS - 1)]);
+	if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
+		raw_sk = NULL;
+
+	return raw_sk != NULL;
+}
+
+/* This cleans up af_inet6 a bit. -DaveM */
+static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr;
+	__be32 v4addr = 0;
+	int addr_type;
+	int err;
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+	addr_type = ipv6_addr_type(&addr->sin6_addr);
+
+	/* Raw sockets are IPv6 only */
+	if (addr_type == IPV6_ADDR_MAPPED)
+		return -EADDRNOTAVAIL;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (sk->sk_state != TCP_CLOSE)
+		goto out;
+
+	rcu_read_lock();
+	/* Check if the address belongs to the host. */
+	if (addr_type != IPV6_ADDR_ANY) {
+		struct net_device *dev = NULL;
+
+		if (addr_type & IPV6_ADDR_LINKLOCAL) {
+			if (addr_len >= sizeof(struct sockaddr_in6) &&
+			    addr->sin6_scope_id) {
+				/* Override any existing binding, if another
+				 * one is supplied by user.
+				 */
+				sk->sk_bound_dev_if = addr->sin6_scope_id;
+			}
+
+			/* Binding to link-local address requires an interface */
+			if (!sk->sk_bound_dev_if)
+				goto out_unlock;
+
+			err = -ENODEV;
+			dev = dev_get_by_index_rcu(sock_net(sk),
+						   sk->sk_bound_dev_if);
+			if (!dev)
+				goto out_unlock;
+		}
+
+		/* ipv4 addr of the socket is invalid.  Only the
+		 * unspecified and mapped address have a v4 equivalent.
+		 */
+		v4addr = LOOPBACK4_IPV6;
+		if (!(addr_type & IPV6_ADDR_MULTICAST))	{
+			err = -EADDRNOTAVAIL;
+			if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
+					   dev, 0)) {
+				goto out_unlock;
+			}
+		}
+	}
+
+	inet->inet_rcv_saddr = inet->inet_saddr = v4addr;
+	np->rcv_saddr = addr->sin6_addr;
+	if (!(addr_type & IPV6_ADDR_MULTICAST))
+		np->saddr = addr->sin6_addr;
+	err = 0;
+out_unlock:
+	rcu_read_unlock();
+out:
+	release_sock(sk);
+	return err;
+}
+
+/* Report one ICMPv6 error to a single raw socket. */
+static void rawv6_err(struct sock *sk, struct sk_buff *skb,
+	       struct inet6_skb_parm *opt,
+	       u8 type, u8 code, int offset, __be32 info)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	int err;
+	int harderr;
+
+	/* Report error on raw socket, if:
+	   1. User requested recverr.
+	   2. Socket is connected (otherwise the error indication
+	      is useless without recverr and error is hard.
+	 */
+	if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+		return;
+
+	harderr = icmpv6_err_convert(type, code, &err);
+	/* Packet-too-big is only a hard error with IPV6_PMTUDISC_DO. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+
+	if (np->recverr) {
+		u8 *payload = skb->data;
+		if (!inet->hdrincl)
+			payload += offset;
+		ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
+	}
+
+	if (np->recverr || harderr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	}
+}
+
+/* Call rawv6_err() on every raw socket matching the header at skb->data. */
+void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
+		u8 type, u8 code, int inner_offset, __be32 info)
+{
+	struct sock *sk;
+	int hash;
+	const struct in6_addr *saddr, *daddr;
+	struct net *net;
+
+	hash = nexthdr & (RAW_HTABLE_SIZE - 1);
+
+	read_lock(&raw_v6_hashinfo.lock);
+	sk = sk_head(&raw_v6_hashinfo.ht[hash]);
+	if (sk != NULL) {
+		/* Note: ipv6_hdr(skb) != skb->data */
+		const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
+		saddr = &ip6h->saddr;
+		daddr = &ip6h->daddr;
+		net = dev_net(skb->dev);
+
+		while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
+						IP6CB(skb)->iif))) {
+			rawv6_err(sk, skb, NULL, type, code,
+					inner_offset, info);
+			sk = sk_next(sk);
+		}
+	}
+	read_unlock(&raw_v6_hashinfo.lock);
+}
+
+/*
+ *	Queue @skb on @sk, verifying the checksum first when the socket
+ *	asked for it or has a filter attached.  The skb is freed on every
+ *	failure.  Returns 0 or NET_RX_DROP.
+ */
+static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
+	    skb_checksum_complete(skb)) {
+		atomic_inc(&sk->sk_drops);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	/* Charge it to the socket. */
+	skb_dst_drop(skb);
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	return 0;
+}
+
+/*
+ *	This is next to useless...
+ *	if we demultiplex in network layer we don't need the extra call
+ *	just to queue the skb...
+ *	maybe we could have the network decide upon a hint if it
+ *	should call raw_rcv for demultiplexing
+ */
+int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct raw6_sock *rp = raw6_sk(sk);
+
+	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		atomic_inc(&sk->sk_drops);
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	if (!rp->checksum)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		skb_postpull_rcsum(skb, skb_network_header(skb),
+				   skb_network_header_len(skb));
+		if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+				     &ipv6_hdr(skb)->daddr,
+				     skb->len, inet->inet_num, skb->csum))
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	if (!skb_csum_unnecessary(skb))
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+							 &ipv6_hdr(skb)->daddr,
+							 skb->len,
+							 inet->inet_num, 0));
+
+	if (inet->hdrincl) {
+		if (skb_checksum_complete(skb)) {
+			atomic_inc(&sk->sk_drops);
+			kfree_skb(skb);
+			return NET_RX_DROP;
+		}
+	}
+
+	/* The queueing result is deliberately not propagated. */
+	rawv6_rcv_skb(sk, skb);
+	return 0;
+}
+
+
+/*
+ *	This should be easy, if there is something there
+ *	we return it, otherwise we block.
+ */ + +static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + struct sk_buff *skb; + size_t copied; + int err; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (addr_len) + *addr_len=sizeof(*sin6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb_csum_unnecessary(skb)) { + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if (__skb_checksum_complete(skb)) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + /* Copy the address. */ + if (sin6) { + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_addr = ipv6_hdr(skb)->saddr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + sock_recv_ts_and_drops(msg, sk, skb); + + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + skb_kill_datagram(sk, skb, flags); + + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (flags&MSG_DONTWAIT) ? 
-EAGAIN : -EHOSTUNREACH; + goto out; +} + +static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct raw6_sock *rp) +{ + struct sk_buff *skb; + int err = 0; + int offset; + int len; + int total_len; + __wsum tmp_csum; + __sum16 csum; + + if (!rp->checksum) + goto send; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + offset = rp->offset; + total_len = inet_sk(sk)->cork.base.length; + if (offset >= total_len - 1) { + err = -EINVAL; + ip6_flush_pending_frames(sk); + goto out; + } + + /* should be check HW csum miyazawa */ + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + tmp_csum = skb->csum; + } else { + struct sk_buff *csum_skb = NULL; + tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + + if (csum_skb) + continue; + + len = skb->len - skb_transport_offset(skb); + if (offset >= len) { + offset -= len; + continue; + } + + csum_skb = skb; + } + + skb = csum_skb; + } + + offset += skb_transport_offset(skb); + if (skb_copy_bits(skb, offset, &csum, 2)) + BUG(); + + /* in case cksum was not initialized */ + if (unlikely(csum)) + tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); + + csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + total_len, fl6->flowi6_proto, tmp_csum); + + if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) + csum = CSUM_MANGLED_0; + + if (skb_store_bits(skb, offset, &csum, 2)) + BUG(); + +send: + err = ip6_push_pending_frames(sk); +out: + return err; +} + +static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, + struct flowi6 *fl6, struct dst_entry **dstp, + unsigned int flags) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + int err; + struct rt6_info *rt = (struct rt6_info *)*dstp; + int hlen = LL_RESERVED_SPACE(rt->dst.dev); + int tlen = rt->dst.dev->needed_tailroom; + + if (length > rt->dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl6, 
rt->dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + skb = sock_alloc_send_skb(sk, + length + hlen + tlen + 15, + flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hlen); + + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_dst_set(skb, &rt->dst); + *dstp = NULL; + + skb_put(skb, length); + skb_reset_network_header(skb); + iph = ipv6_hdr(skb); + + skb->ip_summed = CHECKSUM_NONE; + + skb->transport_header = skb->network_header; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + IP6_UPD_PO_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); + err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL, + rt->dst.dev, dst_output); + if (err > 0) + err = net_xmit_errno(err); + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + if (err == -ENOBUFS && !np->recverr) + err = 0; + return err; +} + +static int rawv6_probe_proto_opt(struct flowi6 *fl6, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + u8 len = 0; + int probed = 0; + int i; + + if (!msg->msg_iov) + return 0; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl6->flowi6_proto) { + case IPPROTO_ICMPV6: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + if (get_user(fl6->fl6_icmp_type, type) || + get_user(fl6->fl6_icmp_code, code)) + return -EFAULT; + probed = 1; + } + break; + case IPPROTO_MH: + if (iov->iov_base && iov->iov_len < 1) + break; + /* check if type field is readable or not. 
*/ + if (iov->iov_len > 2 - len) { + u8 __user *p = iov->iov_base; + if (get_user(fl6->fl6_mh_type, &p[2 - len])) + return -EFAULT; + probed = 1; + } else + len += iov->iov_len; + + break; + default: + probed = 1; + break; + } + if (probed) + break; + } + return 0; +} + +static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct dst_entry *dst = NULL; + struct flowi6 fl6; + int addr_len = msg->msg_namelen; + int hlimit = -1; + int tclass = -1; + int dontfrag = -1; + u16 proto; + int err; + + /* Rough check on arithmetic overflow, + better check is made in ip6_append_data(). + */ + if (len > INT_MAX) + return -EMSGSIZE; + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Get and verify the address. + */ + memset(&fl6, 0, sizeof(fl6)); + + fl6.flowi6_mark = sk->sk_mark; + + if (sin6) { + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + /* port is the proto value [0..255] carried in nexthdr */ + proto = ntohs(sin6->sin6_port); + + if (!proto) + proto = inet->inet_num; + else if (proto != inet->inet_num) + return -EINVAL; + + if (proto > 255) + return -EINVAL; + + daddr = &sin6->sin6_addr; + if (np->sndflow) { + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. 
+ */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + proto = inet->inet_num; + daddr = &np->daddr; + fl6.flowlabel = np->flow_label; + } + + if (fl6.flowi6_oif == 0) + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(struct ipv6_txoptions); + + err = datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); + + fl6.flowi6_proto = proto; + err = rawv6_probe_proto_opt(&fl6, msg); + if (err) + goto out; + + if (!ipv6_addr_any(daddr)) + fl6.daddr = *daddr; + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; + + final_p = fl6_update_dst(&fl6, opt, &final); + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) + fl6.flowi6_oif = np->mcast_oif; + else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto out; + } + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) 
+ hlimit = ip6_dst_hoplimit(dst); + } + + if (tclass < 0) + tclass = np->tclass; + + if (dontfrag < 0) + dontfrag = np->dontfrag; + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; + +back_from_confirm: + if (inet->hdrincl) + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl6, &dst, msg->msg_flags); + else { + lock_sock(sk); + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, + len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info*)dst, + msg->msg_flags, dontfrag); + + if (err) + ip6_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = rawv6_push_pending_frames(sk, &fl6, rp); + release_sock(sk); + } +done: + dst_release(dst); +out: + fl6_sock_release(flowlabel); + return err<0?err:len; +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + } + + return 0; +} + +static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + + switch (optname) { + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + } + + return 0; +} + + +static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val; + + if (get_user(val, (int __user *)optval)) + return 
-EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && + level == IPPROTO_IPV6) { + /* + * RFC3542 tells that IPV6_CHECKSUM socket + * option in the IPPROTO_IPV6 level is not + * allowed on ICMPv6 sockets. + * If you want to set it, use IPPROTO_RAW + * level IPV6_CHECKSUM socket option + * (Linux extension). + */ + return -EINVAL; + } + + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. */ + if (val > 0 && (val&1)) + return -EINVAL; + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } + + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + switch (level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, optlen); + } + + return do_rawv6_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + switch (level) { + case SOL_RAW: + break; + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return compat_ipv6_setsockopt(sk, level, optname, + optval, optlen); + } + return do_rawv6_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val, len; + + if (get_user(len,optlen)) + return -EFAULT; + + switch 
(optname) { + case IPV6_CHECKSUM: + /* + * We allow getsockopt() for IPPROTO_IPV6-level + * IPV6_CHECKSUM socket option on ICMPv6 sockets + * since RFC3542 is silent about it. + */ + if (rp->checksum == 0) + val = -1; + else + val = rp->offset; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, sizeof(int), len); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + switch (level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, optlen); + } + + return do_rawv6_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + switch (level) { + case SOL_RAW: + break; + case SOL_ICMPV6: + if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return compat_ipv6_getsockopt(sk, level, optname, + optval, optlen); + } + return do_rawv6_getsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: { + int amount = sk_wmem_alloc_get(sk); + + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->tail - skb->transport_header; + 
spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_ioctl(sk, cmd, (void __user *)arg); +#else + return -ENOIOCTLCMD; +#endif + } +} + +#ifdef CONFIG_COMPAT +static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IPV6_MROUTE + return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif + +static void rawv6_close(struct sock *sk, long timeout) +{ + if (inet_sk(sk)->inet_num == IPPROTO_RAW) + ip6_ra_control(sk, -1); + ip6mr_sk_done(sk); + sk_common_release(sk); +} + +static void raw6_destroy(struct sock *sk) +{ + lock_sock(sk); + ip6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); +} + +static int rawv6_init_sk(struct sock *sk) +{ + struct raw6_sock *rp = raw6_sk(sk); + + switch (inet_sk(sk)->inet_num) { + case IPPROTO_ICMPV6: + rp->checksum = 1; + rp->offset = 2; + break; + case IPPROTO_MH: + rp->checksum = 1; + rp->offset = 4; + break; + default: + break; + } + return 0; +} + +struct proto rawv6_prot = { + .name = "RAWv6", + .owner = THIS_MODULE, + .close = rawv6_close, + .destroy = raw6_destroy, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = rawv6_ioctl, + .init = rawv6_init_sk, + .setsockopt = rawv6_setsockopt, + .getsockopt = rawv6_getsockopt, + .sendmsg = rawv6_sendmsg, + .recvmsg = rawv6_recvmsg, + .bind = rawv6_bind, + .backlog_rcv = rawv6_rcv_skb, + .hash = raw_hash_sk, + .unhash = raw_unhash_sk, + .obj_size = sizeof(struct raw6_sock), + .h.raw_hash = &raw_v6_hashinfo, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_rawv6_setsockopt, + .compat_getsockopt = compat_rawv6_getsockopt, + .compat_ioctl = compat_rawv6_ioctl, +#endif +}; + +#ifdef CONFIG_PROC_FS +static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int 
i) +{ + struct ipv6_pinfo *np = inet6_sk(sp); + const struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = 0; + srcp = inet_sk(sp)->inet_num; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); +} + +static int raw6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode ref pointer drops\n"); + else + raw6_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); + return 0; +} + +static const struct seq_operations raw6_seq_ops = { + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, + .show = raw6_seq_show, +}; + +static int raw6_seq_open(struct inode *inode, struct file *file) +{ + return raw_seq_open(inode, file, &raw_v6_hashinfo, &raw6_seq_ops); +} + +static const struct file_operations raw6_seq_fops = { + .owner = THIS_MODULE, + .open = raw6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static int __net_init raw6_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "raw6", S_IRUGO, &raw6_seq_fops)) + return -ENOMEM; + + return 0; +} + +static void __net_exit raw6_exit_net(struct net *net) +{ + proc_net_remove(net, "raw6"); +} + +static struct pernet_operations raw6_net_ops = { + .init = raw6_init_net, + .exit = raw6_exit_net, +}; + +int __init raw6_proc_init(void) +{ + return register_pernet_subsys(&raw6_net_ops); +} + +void raw6_proc_exit(void) +{ + 
unregister_pernet_subsys(&raw6_net_ops); +} +#endif /* CONFIG_PROC_FS */ + +/* Same as inet6_dgram_ops, sans udp_poll. */ +static const struct proto_ops inet6_sockraw_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = datagram_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_sock_common_setsockopt, + .compat_getsockopt = compat_sock_common_getsockopt, +#endif +}; + +static struct inet_protosw rawv6_protosw = { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &rawv6_prot, + .ops = &inet6_sockraw_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + +int __init rawv6_init(void) +{ + int ret; + + ret = inet6_register_protosw(&rawv6_protosw); + if (ret) + goto out; +out: + return ret; +} + +void rawv6_exit(void) +{ + inet6_unregister_protosw(&rawv6_protosw); +} diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c new file mode 100644 index 00000000..9447bd69 --- /dev/null +++ b/net/ipv6/reassembly.c @@ -0,0 +1,769 @@ +/* + * IPv6 fragment reassembly + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on: net/ipv4/ip_fragment.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
/*
 * One IPv6 reassembly queue.  The generic part lives in q; the rest is the
 * IPv6 lookup key plus per-datagram state.
 */
struct frag_queue
{
	struct inet_frag_queue	q;		/* generic fragment queue; must be embedded (container_of is used on it) */

	__be32			id;		/* fragment id */
	u32			user;		/* reassembly user, part of the lookup key with id/saddr/daddr */
	struct in6_addr		saddr;		/* source address, copied from the create argument */
	struct in6_addr		daddr;		/* destination address, copied from the create argument */

	int			iif;		/* ifindex of the device the most recently queued fragment arrived on */
	unsigned int		csum;		/* NOTE(review): not used in the code visible here -- confirm purpose */
	__u16			nhoffset;	/* nhoff value recorded when the offset-0 fragment is queued */
};
+ */ +unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, + const struct in6_addr *daddr, u32 rnd) +{ + u32 c; + + c = jhash_3words((__force u32)saddr->s6_addr32[0], + (__force u32)saddr->s6_addr32[1], + (__force u32)saddr->s6_addr32[2], + rnd); + + c = jhash_3words((__force u32)saddr->s6_addr32[3], + (__force u32)daddr->s6_addr32[0], + (__force u32)daddr->s6_addr32[1], + c); + + c = jhash_3words((__force u32)daddr->s6_addr32[2], + (__force u32)daddr->s6_addr32[3], + (__force u32)id, + c); + + return c & (INETFRAGS_HASHSZ - 1); +} +EXPORT_SYMBOL_GPL(inet6_hash_frag); + +static unsigned int ip6_hashfn(struct inet_frag_queue *q) +{ + struct frag_queue *fq; + + fq = container_of(q, struct frag_queue, q); + return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr, ip6_frags.rnd); +} + +int ip6_frag_match(struct inet_frag_queue *q, void *a) +{ + struct frag_queue *fq; + struct ip6_create_arg *arg = a; + + fq = container_of(q, struct frag_queue, q); + return (fq->id == arg->id && fq->user == arg->user && + ipv6_addr_equal(&fq->saddr, arg->src) && + ipv6_addr_equal(&fq->daddr, arg->dst)); +} +EXPORT_SYMBOL(ip6_frag_match); + +void ip6_frag_init(struct inet_frag_queue *q, void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + struct ip6_create_arg *arg = a; + + fq->id = arg->id; + fq->user = arg->user; + fq->saddr = *arg->src; + fq->daddr = *arg->dst; +} +EXPORT_SYMBOL(ip6_frag_init); + +/* Destruction primitives. */ + +static __inline__ void fq_put(struct frag_queue *fq) +{ + inet_frag_put(&fq->q, &ip6_frags); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. 
+ */ +static __inline__ void fq_kill(struct frag_queue *fq) +{ + inet_frag_kill(&fq->q, &ip6_frags); +} + +static void ip6_evictor(struct net *net, struct inet6_dev *idev) +{ + int evicted; + + evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags); + if (evicted) + IP6_ADD_STATS_BH(net, idev, IPSTATS_MIB_REASMFAILS, evicted); +} + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq; + struct net_device *dev = NULL; + struct net *net; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + + spin_lock(&fq->q.lock); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto out; + + fq_kill(fq); + + net = container_of(fq->q.net, struct net, ipv6.frags); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out_rcu_unlock; + + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + + /* Don't send error if the first segment did not arrive. */ + if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments) + goto out_rcu_unlock; + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. 
+ */ + fq->q.fragments->dev = dev; + icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); +out: + spin_unlock(&fq->q.lock); + fq_put(fq); +} + +static __inline__ struct frag_queue * +fq_find(struct net *net, __be32 id, const struct in6_addr *src, const struct in6_addr *dst) +{ + struct inet_frag_queue *q; + struct ip6_create_arg arg; + unsigned int hash; + + arg.id = id; + arg.user = IP6_DEFRAG_LOCAL_DELIVER; + arg.src = src; + arg.dst = dst; + + read_lock(&ip6_frags.lock); + hash = inet6_hash_frag(id, src, dst, ip6_frags.rnd); + + q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + if (q == NULL) + return NULL; + + return container_of(q, struct frag_queue, q); +} + +static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + struct net_device *dev; + int offset, end; + struct net *net = dev_net(skb_dst(skb)->dev); + + if (fq->q.last_in & INET_FRAG_COMPLETE) + goto err; + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - + ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((u8 *)&fhdr->frag_off - + skb_network_header(skb))); + return -1; + } + + if (skb->ip_summed == CHECKSUM_COMPLETE) { + const unsigned char *nh = skb_network_header(skb); + skb->csum = csum_sub(skb->csum, + csum_partial(nh, (u8 *)(fhdr + 1) - nh, + 0)); + } + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. 
+ */ + if (end < fq->q.len || + ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) + goto err; + fq->q.last_in |= INET_FRAG_LAST_IN; + fq->q.len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, payload_len)); + return -1; + } + if (end > fq->q.len) { + /* Some bits beyond end -> corruption. */ + if (fq->q.last_in & INET_FRAG_LAST_IN) + goto err; + fq->q.len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) + goto err; + + if (pskb_trim_rcsum(skb, end - offset)) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = fq->q.fragments_tail; + if (!prev || FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } + prev = NULL; + for(next = fq->q.fragments; next != NULL; next = next->next) { + if (FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + +found: + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + */ + + /* Check for overlap with preceding fragment. */ + if (prev && + (FRAG6_CB(prev)->offset + prev->len) > offset) + goto discard_fq; + + /* Look for overlap with succeeding segment. */ + if (next && FRAG6_CB(next)->offset < end) + goto discard_fq; + + FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. 
*/ + skb->next = next; + if (!next) + fq->q.fragments_tail = skb; + if (prev) + prev->next = skb; + else + fq->q.fragments = skb; + + /* Remember the incoming interface index on the queue and detach the device from the queued skb. */ dev = skb->dev; + if (dev) { + fq->iif = dev->ifindex; + skb->dev = NULL; + } + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; + atomic_add(skb->truesize, &fq->q.net->mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->q.last_in |= INET_FRAG_FIRST_IN; + } + + /* First and last fragments seen and all bytes accounted for: reassemble now. */ if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) + return ip6_frag_reasm(fq, prev, dev); + + /* Still incomplete: move the queue to the tail of the per-namespace LRU list. */ write_lock(&ip6_frags.lock); + list_move_tail(&fq->q.lru_list, &fq->q.net->lru_list); + write_unlock(&ip6_frags.lock); + return -1; + +discard_fq: + fq_kill(fq); +err: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -1; +} + +/* + * Reassemble the datagram from the fragments queued on fq. + * Returns 1 on success and -1 on any failure (oversized + * payload or out of memory). + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + struct net_device *dev) +{ + struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct sk_buff *fp, *head = fq->q.fragments; + int payload_len; + unsigned int nhoff; + + fq_kill(fq); + + /* Make the one we just received the head. 
*/ + if (prev) { + head = prev->next; + fp = skb_clone(head, GFP_ATOMIC); + + if (!fp) + goto out_oom; + + /* fp, a clone of the just-received skb, takes over its slot in the chain after prev. */ fp->next = head->next; + if (!fp->next) + fq->q.fragments_tail = fp; + prev->next = fp; + + /* head is morphed from the old first fragment and linked in its place; the old first skb is then freed. */ skb_morph(head, fq->q.fragments); + head->next = fq->q.fragments->next; + + kfree_skb(fq->q.fragments); + fq->q.fragments = head; + } + + WARN_ON(head == NULL); + WARN_ON(FRAG6_CB(head)->offset != 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = ((head->data - skb_network_header(head)) - + sizeof(struct ipv6hdr) + fq->q.len - + sizeof(struct frag_hdr)); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_oom; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_oom; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + /* plen: total size of head's page frags; the rest of data_len is moved to clone. */ for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &fq->q.net->mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. 
*/ + nhoff = fq->nhoffset; + /* Copy the byte at the transport header (the fragment header's first byte, presumably its next-header field - confirm) into the next-header slot recorded at nhoff. */ skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac_header += sizeof(struct frag_hdr); + head->network_header += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); + + /* Fold each remaining fragment's length, checksum state and truesize into head. */ for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + } + /* The reassembled skb no longer counts against the fragment memory accounting. */ atomic_sub(head->truesize, &fq->q.net->mem); + + head->next = NULL; + head->dev = dev; + head->tstamp = fq->q.stamp; + ipv6_hdr(head)->payload_len = htons(payload_len); + IP6CB(head)->nhoff = nhoff; + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_partial(skb_network_header(head), + skb_network_header_len(head), + head->csum); + + rcu_read_lock(); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + rcu_read_unlock(); + fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; + return 1; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); +out_fail: + rcu_read_lock(); + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + rcu_read_unlock(); + return -1; +} + +/* ipv6_frag_rcv: receive handler for packets carrying a fragment header. Returns 1 for a packet that turns out not to be fragmented, the result of ip6_frag_queue() when a reassembly queue is found, and -1 on header errors or when no queue is available. */ static int ipv6_frag_rcv(struct sk_buff *skb) +{ + struct frag_hdr *fhdr; + struct frag_queue *fq; + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = dev_net(skb_dst(skb)->dev); + + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); + + /* Jumbo payload inhibits 
frag. header */ + if (hdr->payload_len==0) + goto fail_hdr; + + if (!pskb_may_pull(skb, (skb_transport_offset(skb) + + sizeof(struct frag_hdr)))) + goto fail_hdr; + + /* Re-read the header pointers after pskb_may_pull(). NOTE(review): presumably because the pull may reallocate the skb head - confirm. */ hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); + + /* NOTE(review): 0xFFF9 presumably covers the offset bits (0xFFF8) plus the more-fragments flag; both being zero means offset 0 and no further fragments - confirm against IP6_MF. */ if (!(fhdr->frag_off & htons(0xFFF9))) { + /* It is not a fragmented frame */ + skb->transport_header += sizeof(struct frag_hdr); + IP6_INC_STATS_BH(net, + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); + + IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); + return 1; + } + + /* Run the evictor when fragment memory exceeds the high threshold. */ if (atomic_read(&net->ipv6.frags.mem) > net->ipv6.frags.high_thresh) + ip6_evictor(net, ip6_dst_idev(skb_dst(skb))); + + fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq != NULL) { + int ret; + + spin_lock(&fq->q.lock); + + ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); + + spin_unlock(&fq->q.lock); + fq_put(fq); + return ret; + } + + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -1; + +fail_hdr: + IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); + return -1; +} + +/* Protocol descriptor registered for IPPROTO_FRAGMENT by ipv6_frag_init(). */ static const struct inet6_protocol frag_protocol = +{ + .handler = ipv6_frag_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +#ifdef CONFIG_SYSCTL +/* Per-namespace fragment sysctls. The .data pointers here target init_net; ip6_frags_ns_sysctl_register() re-points entries 0..2 in the copy it makes for other namespaces, so the entry order matters. */ static struct ctl_table ip6_frags_ns_ctl_table[] = { + { + .procname = "ip6frag_high_thresh", + .data = &init_net.ipv6.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip6frag_low_thresh", + .data = &init_net.ipv6.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip6frag_time", + .data = &init_net.ipv6.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +/* Global (not per-namespace) fragment sysctl, registered by ip6_frags_sysctl_register(). */ static struct ctl_table ip6_frags_ctl_table[] = { + { + .procname = "ip6frag_secret_interval", + .data = 
&ip6_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +/* Register the per-namespace fragment sysctls. init_net uses the static table directly; any other namespace gets a kmemdup()'d copy whose first three .data pointers are re-pointed at that namespace's values. Returns 0 on success or -ENOMEM. */ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip6_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ip6_frags_ns_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv6.frags.high_thresh; + table[1].data = &net->ipv6.frags.low_thresh; + table[2].data = &net->ipv6.frags.timeout; + } + + hdr = register_net_sysctl_table(net, net_ipv6_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv6.sysctl.frags_hdr = hdr; + return 0; + +err_reg: + /* Only the duplicated table is heap-allocated; the static one must not be freed. */ if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +/* Unregister the per-namespace sysctl table and free the copy made for non-init namespaces. The table pointer is read before unregistering the header it is stored in. */ static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv6.sysctl.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static struct ctl_table_header *ip6_ctl_header; + +/* Register the global ip6_frags_ctl_table; returns 0 or -ENOMEM. */ static int ip6_frags_sysctl_register(void) +{ + ip6_ctl_header = register_net_sysctl_rotable(net_ipv6_ctl_path, + ip6_frags_ctl_table); + return ip6_ctl_header == NULL ? 
-ENOMEM : 0; +} + +static void ip6_frags_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_ctl_header); +} +#else +/* Without CONFIG_SYSCTL the sysctl hooks are no-ops that report success. */ static inline int ip6_frags_ns_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void ip6_frags_ns_sysctl_unregister(struct net *net) +{ +} + +static inline int ip6_frags_sysctl_register(void) +{ + return 0; +} + +static inline void ip6_frags_sysctl_unregister(void) +{ +} +#endif + +/* Per-namespace init: load the default thresholds and timeout, initialise the namespace's fragment state, then register its sysctls (whose result is returned). */ static int __net_init ipv6_frags_init_net(struct net *net) +{ + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); +} + +/* Per-namespace exit: unregister the sysctls, then release the namespace's fragment state. */ static void __net_exit ipv6_frags_exit_net(struct net *net) +{ + ip6_frags_ns_sysctl_unregister(net); + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); +} + +static struct pernet_operations ip6_frags_ops = { + .init = ipv6_frags_init_net, + .exit = ipv6_frags_exit_net, +}; + +/* ipv6_frag_init: register the IPPROTO_FRAGMENT handler, the global sysctl and the pernet operations, then fill in the ip6_frags callbacks and call inet_frags_init(). If a step fails, the registrations made before it are undone and the error is returned. NOTE(review): the pernet ops are registered before ip6_frags is filled in and initialised - confirm that ordering is intended. */ int __init ipv6_frag_init(void) +{ + int ret; + + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto out; + + ret = ip6_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&ip6_frags_ops); + if (ret) + goto err_pernet; + + ip6_frags.hashfn = ip6_hashfn; + ip6_frags.constructor = ip6_frag_init; + ip6_frags.destructor = NULL; + ip6_frags.skb_free = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.match = ip6_frag_match; + ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.secret_interval = 10 * 60 * HZ; + inet_frags_init(&ip6_frags); +out: + return ret; + +err_pernet: + ip6_frags_sysctl_unregister(); +err_sysctl: + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + goto out; +} + +/* ipv6_frag_exit: tear down everything ipv6_frag_init() set up. */ void ipv6_frag_exit(void) +{ + inet_frags_fini(&ip6_frags); + ip6_frags_sysctl_unregister(); + unregister_pernet_subsys(&ip6_frags_ops); + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); +} diff 
--git a/net/ipv6/route.c b/net/ipv6/route.c new file mode 100644 index 00000000..c4920ca8 --- /dev/null +++ b/net/ipv6/route.c @@ -0,0 +1,3104 @@ +/* + * Linux INET6 implementation + * FIB front-end. + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * YOSHIFUJI Hideaki @USAGI + * reworked default router selection. + * - respect outgoing interface + * - select from (probably) reachable routers (i.e. + * routers in REACHABLE, STALE, DELAY or PROBE states). + * - always select the same router if it is (probably) + * reachable. otherwise, round-robin the list. + * Ville Nuorvala + * Fixed routing subtrees. + */ + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/times.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/mroute6.h> +#include <linux/init.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <net/net_namespace.h> +#include <net/snmp.h> +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/tcp.h> +#include <linux/rtnetlink.h> +#include <net/dst.h> +#include <net/xfrm.h> +#include <net/netevent.h> +#include <net/netlink.h> + +#include <asm/uaccess.h> + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest); +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static unsigned 
int ip6_default_advmss(const struct dst_entry *dst); +static unsigned int ip6_mtu(const struct dst_entry *dst); +static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_dst_destroy(struct dst_entry *); +static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +static int ip6_dst_gc(struct dst_ops *ops); + +static int ip6_pkt_discard(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sk_buff *skb); +static void ip6_link_failure(struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); + +#ifdef CONFIG_IPV6_ROUTE_INFO +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned pref); +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex); +#endif + +/* ipv6_cow_metrics: cow_metrics hook of ip6_dst_ops_template. For DST_HOST routes it tries to switch dst->_metrics to the metrics array of the route's inet_peer (binding a peer first if none is attached), copying the old values in when inet_metrics_new(peer) is true. Returns the metrics pointer now in effect, or NULL when the route is not a host route, no peer could be obtained, or a concurrently installed value is marked read-only. */ static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!(rt->dst.flags & DST_HOST)) + return NULL; + + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + + peer = rt->rt6i_peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + /* Lost the race: somebody else replaced _metrics; use what they installed unless it is read-only. */ if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} + +/* Neighbour key selection: the route's gateway unless ipv6_addr_any() reports it as unset, in which case the caller's daddr is used. */ static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr) +{ + struct in6_addr *p = &rt->rt6i_gateway; + + if (!ipv6_addr_any(p)) + return (const void *) p; + return daddr; +} + +/* neigh_lookup hook: find the neighbour entry for the chosen address on dst->dev, creating one when the lookup finds nothing. */ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + struct neighbour *n; + + 
daddr = choose_neigh_daddr(rt, daddr); + n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr); + if (n) + return n; + return neigh_create(&nd_tbl, daddr, dst->dev); +} + +/* rt6_bind_neighbour: look up, or create, the neighbour entry for rt's gateway on dev and attach it to rt->dst. Returns 0 on success, or the error encoded in neigh_create()'s result. */ static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev) +{ + struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway); + if (!n) { + n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + } + dst_set_neighbour(&rt->dst, n); + + return 0; +} + +/* dst_ops used for regular IPv6 routes; the hooks are the ip6_* functions of this file. */ static struct dst_ops ip6_dst_ops_template = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .gc = ip6_dst_gc, + .gc_thresh = 1024, + .check = ip6_dst_check, + .default_advmss = ip6_default_advmss, + .mtu = ip6_mtu, + .cow_metrics = ipv6_cow_metrics, + .destroy = ip6_dst_destroy, + .ifdown = ip6_dst_ifdown, + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, + .local_out = __ip6_local_out, + .neigh_lookup = ip6_neigh_lookup, +}; + +/* MTU of a blackhole dst: the raw RTAX_MTU metric when it is non-zero, otherwise the device MTU. */ static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) +{ + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + return mtu ? 
: dst->dev->mtu; +} + +/* Blackhole routes ignore PMTU updates. */ static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) +{ +} + +/* Blackhole routes never hand out writable metrics. */ static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, + unsigned long old) +{ + return NULL; +} + +/* dst_ops for the routes built by ip6_blackhole_route(). */ static struct dst_ops ip6_dst_blackhole_ops = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .destroy = ip6_dst_destroy, + .check = ip6_dst_check, + .mtu = ip6_blackhole_mtu, + .default_advmss = ip6_default_advmss, + .update_pmtu = ip6_rt_blackhole_update_pmtu, + .cow_metrics = ip6_rt_blackhole_cow_metrics, + .neigh_lookup = ip6_neigh_lookup, +}; + +/* Metrics template in which only the hop-limit slot is preset (to 255). */ static const u32 ip6_template_metrics[RTAX_MAX] = { + [RTAX_HOPLIMIT - 1] = 255, +}; + +/* Template for the null (reject) route: packets go to ip6_pkt_discard / ip6_pkt_discard_out and the dst error is -ENETUNREACH. */ static struct rt6_info ip6_null_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -ENETUNREACH, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static int ip6_pkt_prohibit(struct sk_buff *skb); +static int ip6_pkt_prohibit_out(struct sk_buff *skb); + +/* Template for the prohibit route: same shape as the null entry, but with the ip6_pkt_prohibit handlers and error -EACCES. */ static struct rt6_info ip6_prohibit_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EACCES, + .input = ip6_pkt_prohibit, + .output = ip6_pkt_prohibit_out, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +/* Template for the blackhole route: both directions use dst_discard and the error is -EINVAL. */ static struct rt6_info ip6_blk_hole_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = -1, + .error = -EINVAL, + .input = dst_discard, + .output = dst_discard, + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_protocol = RTPROT_KERNEL, + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +#endif + +/* allocate dst with ip6_dst_ops */ +static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops, + 
struct net_device *dev, + int flags) +{ + struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags); + + /* Zero everything in the rt6_info from rt6i_table onwards; the size used assumes rt6i_table directly follows the embedded dst_entry - TODO confirm against the struct layout. */ if (rt) + memset(&rt->rt6i_table, 0, + sizeof(*rt) - sizeof(struct dst_entry)); + + return rt; +} + +/* destroy hook: release what the route holds - generic metrics for non-host routes, the inet6_dev reference, the dst.from reference (only when RTF_EXPIRES is not set) and the inet_peer. */ static void ip6_dst_destroy(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + struct inet_peer *peer = rt->rt6i_peer; + + if (!(rt->dst.flags & DST_HOST)) + dst_destroy_metrics_generic(dst); + + if (idev) { + rt->rt6i_idev = NULL; + in6_dev_put(idev); + } + + if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from) + dst_release(dst->from); + + if (peer) { + rt->rt6i_peer = NULL; + inet_putpeer(peer); + } +} + +static atomic_t __rt6_peer_genid = ATOMIC_INIT(0); + +static u32 rt6_peer_genid(void) +{ + return atomic_read(&__rt6_peer_genid); +} + +/* rt6_bind_peer: obtain an inet_peer for the route's destination address and try to install it. If another peer was installed concurrently (the cmpxchg fails), ours is dropped; otherwise - including when no peer was obtained - the current peer generation id is recorded on the route. */ void rt6_bind_peer(struct rt6_info *rt, int create) +{ + struct inet_peer *peer; + + peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create); + if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL) + inet_putpeer(peer); + else + rt->rt6i_peer_genid = rt6_peer_genid(); +} + +/* ifdown hook: when the route's inet6_dev belongs to the device going down (and that device is not loopback), re-home the route onto the namespace's loopback inet6_dev and drop the old reference. */ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = + dev_net(dev)->loopback_dev; + + if (dev != loopback_dev && idev && idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } + } +} + +/* rt6_check_expired: non-zero if the route has RTF_EXPIRES and is past dst.expires, or - for a route without RTF_EXPIRES that has dst.from set - if that originating route has RTF_EXPIRES and is past its expiry; 0 otherwise. */ static __inline__ int rt6_check_expired(const struct rt6_info *rt) +{ + struct rt6_info *ort = NULL; + + if (rt->rt6i_flags & RTF_EXPIRES) { + if (time_after(jiffies, rt->dst.expires)) + return 1; + } else if (rt->dst.from) { + ort = (struct rt6_info *) rt->dst.from; + return (ort->rt6i_flags & RTF_EXPIRES) && + time_after(jiffies, ort->dst.expires); + } + return 0; +} + +/* Non-zero for multicast, link-local and loopback destinations. */ static inline int rt6_need_strict(const struct in6_addr *daddr) +{ + return 
ipv6_addr_type(daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); +} + +/* + * Route lookup. Any table->tb6_lock is implied. + */ + +/* rt6_device_match: walk the sibling list starting at rt and return the entry whose device matches oif or, when no oif is given, whose device passes ipv6_chk_addr() for saddr. With no oif and an unspecified saddr, rt is returned unchanged. With an oif that matches nothing, a remembered loopback candidate is returned if there is one, else ip6_null_entry under RT6_LOOKUP_F_IFACE, else rt. */ static inline struct rt6_info *rt6_device_match(struct net *net, + struct rt6_info *rt, + const struct in6_addr *saddr, + int oif, + int flags) +{ + struct rt6_info *local = NULL; + struct rt6_info *sprt; + + if (!oif && ipv6_addr_any(saddr)) + goto out; + + for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + struct net_device *dev = sprt->dst.dev; + + if (oif) { + if (dev->ifindex == oif) + return sprt; + /* Loopback-device routes are remembered in 'local' as a fallback candidate. */ if (dev->flags & IFF_LOOPBACK) { + if (!sprt->rt6i_idev || + sprt->rt6i_idev->dev->ifindex != oif) { + if (flags & RT6_LOOKUP_F_IFACE && oif) + continue; + if (local && (!oif || + local->rt6i_idev->dev->ifindex == oif)) + continue; + } + local = sprt; + } + } else { + if (ipv6_chk_addr(net, saddr, dev, + flags & RT6_LOOKUP_F_IFACE)) + return sprt; + } + } + + if (oif) { + if (local) + return local; + + if (flags & RT6_LOOKUP_F_IFACE) + return net->ipv6.ip6_null_entry; + } +out: + return rt; +} + +#ifdef CONFIG_IPV6_ROUTER_PREF +/* rt6_probe: if rt's neighbour entry is not NUD_VALID and more than rtr_probe_interval has passed since it was last updated, send a neighbour solicitation towards it. A NULL rt, or a route without a neighbour, is a no-op. */ static void rt6_probe(struct rt6_info *rt) +{ + struct neighbour *neigh; + /* + * Okay, this does not seem to be appropriate + * for now, however, we need to check if it + * is really so; aka Router Reachability Probing. + * + * Router Reachability Probe MUST be rate-limited + * to no more than one per minute. + */ + rcu_read_lock(); + neigh = rt ? 
dst_get_neighbour_noref(&rt->dst) : NULL; + if (!neigh || (neigh->nud_state & NUD_VALID)) + goto out; + /* Re-check the state under the neighbour lock before probing. */ read_lock_bh(&neigh->lock); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { + struct in6_addr mcaddr; + struct in6_addr *target; + + neigh->updated = jiffies; + read_unlock_bh(&neigh->lock); + + target = (struct in6_addr *)&neigh->primary_key; + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL); + } else { + read_unlock_bh(&neigh->lock); + } +out: + rcu_read_unlock(); +} +#else +static inline void rt6_probe(struct rt6_info *rt) +{ +} +#endif + +/* + * Default Router Selection (RFC 2461 6.3.6) + */ +/* Device score: 2 when no oif was requested or rt's device is oif, 1 when rt sits on a loopback device whose inet6_dev belongs to oif, otherwise 0. */ static inline int rt6_check_dev(struct rt6_info *rt, int oif) +{ + struct net_device *dev = rt->dst.dev; + if (!oif || dev->ifindex == oif) + return 2; + if ((dev->flags & IFF_LOOPBACK) && + rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) + return 1; + return 0; +} + +/* Neighbour score: 1 for routes flagged RTF_NONEXTHOP or lacking RTF_GATEWAY; otherwise 2 if the neighbour is NUD_VALID, 0 if there is no neighbour (or, with CONFIG_IPV6_ROUTER_PREF, it is NUD_FAILED), else 1. */ static inline int rt6_check_neigh(struct rt6_info *rt) +{ + struct neighbour *neigh; + int m; + + rcu_read_lock(); + neigh = dst_get_neighbour_noref(&rt->dst); + if (rt->rt6i_flags & RTF_NONEXTHOP || + !(rt->rt6i_flags & RTF_GATEWAY)) + m = 1; + else if (neigh) { + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) + m = 2; +#ifdef CONFIG_IPV6_ROUTER_PREF + else if (neigh->nud_state & NUD_FAILED) + m = 0; +#endif + else + m = 1; + read_unlock_bh(&neigh->lock); + } else + m = 0; + rcu_read_unlock(); + return m; +} + +/* rt6_score_route: the returned score is the device score, with the decoded router preference OR-ed in above bit 1 under CONFIG_IPV6_ROUTER_PREF. Returns -1 when the route is unusable under the strict flags: no device match with RT6_LOOKUP_F_IFACE, or a zero neighbour score with RT6_LOOKUP_F_REACHABLE. The neighbour score only gates; it is not part of the result. */ static int rt6_score_route(struct rt6_info *rt, int oif, + int strict) +{ + int m, n; + + m = rt6_check_dev(rt, oif); + if (!m && (strict & RT6_LOOKUP_F_IFACE)) + return -1; +#ifdef CONFIG_IPV6_ROUTER_PREF + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; +#endif + n = rt6_check_neigh(rt); + if (!n && (strict & RT6_LOOKUP_F_REACHABLE)) + return -1; + return m; +} + +/* find_match: score rt and return it as the new best match when it beats *mpri (updating *mpri); otherwise return match unchanged. */ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int 
strict, + int *mpri, struct rt6_info *match) +{ + int m; + + /* Expired or unusable routes leave the current best untouched. */ if (rt6_check_expired(rt)) + goto out; + + m = rt6_score_route(rt, oif, strict); + if (m < 0) + goto out; + + /* Under RT6_LOOKUP_F_REACHABLE the loser of each comparison is probed: the previous best when rt wins, rt itself otherwise. */ if (m > *mpri) { + if (strict & RT6_LOOKUP_F_REACHABLE) + rt6_probe(match); + *mpri = m; + match = rt; + } else if (strict & RT6_LOOKUP_F_REACHABLE) { + rt6_probe(rt); + } + +out: + return match; +} + +/* find_rr_leaf: evaluate every route of the given metric, first from rr_head to the end of its equal-metric run, then from fn->leaf up to (not including) rr_head, and return the best-scoring one, or NULL if none qualifies. */ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *rr_head, + u32 metric, int oif, int strict) +{ + struct rt6_info *rt, *match; + int mpri = -1; + + match = NULL; + for (rt = rr_head; rt && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); + for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; + rt = rt->dst.rt6_next) + match = find_match(rt, oif, strict, &mpri, match); + + return match; +} + +/* rt6_select: choose a route from fib6 node fn, starting at the node's round-robin pointer (initialised to fn->leaf when unset). When nothing matches under RT6_LOOKUP_F_REACHABLE, the round-robin pointer is advanced to the next route of the same metric, wrapping to fn->leaf. Returns the match or the namespace's ip6_null_entry. */ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +{ + struct rt6_info *match, *rt0; + struct net *net; + + rt0 = fn->rr_ptr; + if (!rt0) + fn->rr_ptr = rt0 = fn->leaf; + + match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict); + + if (!match && + (strict & RT6_LOOKUP_F_REACHABLE)) { + struct rt6_info *next = rt0->dst.rt6_next; + + /* no entries matched; do round-robin */ + if (!next || next->rt6i_metric != rt0->rt6i_metric) + next = fn->leaf; + + if (next != rt0) + fn->rr_ptr = next; + } + + net = dev_net(rt0->dst.dev); + return match ? 
match : net->ipv6.ip6_null_entry; +} + +#ifdef CONFIG_IPV6_ROUTE_INFO +/* rt6_route_rcv: process a route information option (struct route_info) received on dev from gwaddr. After validating the option it deletes the matching route when the lifetime is 0, adds one when none exists and the lifetime is non-zero, or updates the preference flags of an existing one, and then sets or clears the expiry. Returns 0, or -EINVAL for a malformed option. */ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, + const struct in6_addr *gwaddr) +{ + struct net *net = dev_net(dev); + struct route_info *rinfo = (struct route_info *) opt; + struct in6_addr prefix_buf, *prefix; + unsigned int pref; + unsigned long lifetime; + struct rt6_info *rt; + + if (len < sizeof(struct route_info)) { + return -EINVAL; + } + + /* Sanity check for prefix_len and length */ + if (rinfo->length > 3) { + return -EINVAL; + } else if (rinfo->prefix_len > 128) { + return -EINVAL; + } else if (rinfo->prefix_len > 64) { + if (rinfo->length < 2) { + return -EINVAL; + } + } else if (rinfo->prefix_len > 0) { + if (rinfo->length < 1) { + return -EINVAL; + } + } + + pref = rinfo->route_pref; + if (pref == ICMPV6_ROUTER_PREF_INVALID) + return -EINVAL; + + lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); + + if (rinfo->length == 3) + prefix = (struct in6_addr *)rinfo->prefix; + else { + /* length < 3: presumably the option carries a shortened prefix, so a full-size address is built in prefix_buf from prefix_len bits (the original author considered ipv6_addr_prefix safe here) */ + ipv6_addr_prefix(&prefix_buf, + (struct in6_addr *)rinfo->prefix, + rinfo->prefix_len); + prefix = &prefix_buf; + } + + rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, + dev->ifindex); + + if (rt && !lifetime) { + ip6_del_rt(rt); + rt = NULL; + } + + if (!rt && lifetime) + rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex, + pref); + else if (rt) + rt->rt6i_flags = RTF_ROUTEINFO | + (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + + if (rt) { + if (!addrconf_finite_timeout(lifetime)) + rt6_clean_expires(rt); + else + rt6_set_expires(rt, jiffies + HZ * lifetime); + + dst_release(&rt->dst); + } + return 0; +} +#endif + +/* BACKTRACK: when the lookup yielded ip6_null_entry, climb from fn towards the root (descending into the parent's source subtree when it has one that is not fn) until a node flagged RTN_RTINFO is found, then jump to the caller's 'restart' label; jumps to 'out' on reaching a node flagged RTN_TL_ROOT. Relies on rt, fn and both labels existing in the calling function. */ #define BACKTRACK(__net, saddr) \ +do { \ + if (rt == __net->ipv6.ip6_null_entry) { \ + struct fib6_node *pn; \ + while (1) { \ + if (fn->fn_flags & RTN_TL_ROOT) \ + goto out; \ + pn = fn->parent; \ + if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \ + fn = fib6_lookup(FIB6_SUBTREE(pn), 
NULL, saddr); \ + else \ + fn = pn; \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ + } \ +} while (0) + +/* ip6_pol_route_lookup: lookup callback handed to fib6_rule_lookup(). Under the table read lock it finds the node for the flow, picks a route with rt6_device_match() (backtracking when that yields the null entry) and marks it used with dst_use() before returning it. */ static struct rt6_info *ip6_pol_route_lookup(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, int flags) +{ + struct fib6_node *fn; + struct rt6_info *rt; + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + rt = fn->leaf; + rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); + BACKTRACK(net, &fl6->saddr); +out: + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); + return rt; + +} + +struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6, + int flags) +{ + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); +} +EXPORT_SYMBOL_GPL(ip6_route_lookup); + +/* rt6_lookup: look up the route for daddr, optionally constrained by saddr and oif; a non-zero 'strict' requests RT6_LOOKUP_F_IFACE. Returns the route, or NULL (after releasing the dst) when the lookup result carries an error. */ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, + const struct in6_addr *saddr, int oif, int strict) +{ + struct flowi6 fl6 = { + .flowi6_oif = oif, + .daddr = *daddr, + }; + struct dst_entry *dst; + int flags = strict ? RT6_LOOKUP_F_IFACE : 0; + + if (saddr) { + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + flags |= RT6_LOOKUP_F_HAS_SADDR; + } + + dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + if (dst->error == 0) + return (struct rt6_info *) dst; + + dst_release(dst); + + return NULL; +} + +EXPORT_SYMBOL(rt6_lookup); + +/* ip6_ins_rt must be called with table->tb6_lock NOT held; the + write lock is taken in __ip6_ins_rt. It takes a new route entry; if + the addition fails for any reason the route is freed. In any case, if + the caller does not hold a reference to it, it may be destroyed. 
+ */ + +static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info) +{ + int err; + struct fib6_table *table; + + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); + err = fib6_add(&table->tb6_root, rt, info); + write_unlock_bh(&table->tb6_lock); + + return err; +} + +/* ip6_ins_rt: insert rt with an nl_info that names only the route's network namespace. */ int ip6_ins_rt(struct rt6_info *rt) +{ + struct nl_info info = { + .nl_net = dev_net(rt->dst.dev), + }; + return __ip6_ins_rt(rt, &info); +} + +/* rt6_alloc_cow: copy ort for daddr as an RTF_CACHE entry and bind a neighbour to it. Routes without RTF_GATEWAY get daddr as their gateway (and RTF_ANYCAST when daddr equals the address of a non-/128 prefix). If binding the neighbour fails outside softirq context, one forced dst GC pass is run and the bind is retried once. Returns the new route, or NULL when the copy or the neighbour binding fails. */ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_info *rt; + + /* + * Clone the route. + */ + + rt = ip6_rt_copy(ort, daddr); + + if (rt) { + int attempts = !in_softirq(); + + if (!(rt->rt6i_flags & RTF_GATEWAY)) { + if (ort->rt6i_dst.plen != 128 && + ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + rt->rt6i_flags |= RTF_ANYCAST; + rt->rt6i_gateway = *daddr; + } + + rt->rt6i_flags |= RTF_CACHE; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { + rt->rt6i_src.addr = *saddr; + rt->rt6i_src.plen = 128; + } +#endif + + retry: + if (rt6_bind_neighbour(rt, rt->dst.dev)) { + struct net *net = dev_net(rt->dst.dev); + int saved_rt_min_interval = + net->ipv6.sysctl.ip6_rt_gc_min_interval; + int saved_rt_elasticity = + net->ipv6.sysctl.ip6_rt_gc_elasticity; + + /* Temporarily set the GC tunables to their most aggressive values, run one GC pass, then restore them. */ if (attempts-- > 0) { + net->ipv6.sysctl.ip6_rt_gc_elasticity = 1; + net->ipv6.sysctl.ip6_rt_gc_min_interval = 0; + + ip6_dst_gc(&net->ipv6.ip6_dst_ops); + + net->ipv6.sysctl.ip6_rt_gc_elasticity = + saved_rt_elasticity; + net->ipv6.sysctl.ip6_rt_gc_min_interval = + saved_rt_min_interval; + goto retry; + } + + if (net_ratelimit()) + printk(KERN_WARNING + "ipv6: Neighbour table overflow.\n"); + dst_free(&rt->dst); + return NULL; + } + } + + return rt; +} + +/* rt6_alloc_clone: copy ort for daddr as an RTF_CACHE entry that reuses (clones a reference to) ort's neighbour. */ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, + const struct in6_addr *daddr) +{ + struct rt6_info *rt = ip6_rt_copy(ort, daddr); + + if (rt) { + rt->rt6i_flags |= RTF_CACHE; + dst_set_neighbour(&rt->dst, 
neigh_clone(dst_get_neighbour_noref_raw(&ort->dst))); + } + return rt; +} + +/* ip6_pol_route: core lookup shared by the input and output paths. It selects a route for fl6 in table; a result that is neither the null entry nor already RTF_CACHE is copied into a cache entry (rt6_alloc_cow() when it has no neighbour and is not RTF_NONEXTHOP, rt6_alloc_clone() for other non-host routes) and inserted. If the copy or the insert fails, the lookup is repeated; 'attempts' bounds the number of tries. A first pass made with RT6_LOOKUP_F_REACHABLE (used when forwarding is off) that ends at 'out' is repeated once without that flag. The returned route has had dst_hold() called on it, and its lastuse/__use are updated. */ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, + struct flowi6 *fl6, int flags) +{ + struct fib6_node *fn; + struct rt6_info *rt, *nrt; + int strict = 0; + int attempts = 3; + int err; + int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE; + + strict |= flags & RT6_LOOKUP_F_IFACE; + +relookup: + read_lock_bh(&table->tb6_lock); + +restart_2: + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + +restart: + rt = rt6_select(fn, oif, strict | reachable); + + BACKTRACK(net, &fl6->saddr); + if (rt == net->ipv6.ip6_null_entry || + rt->rt6i_flags & RTF_CACHE) + goto out; + + /* Hold rt across the unlocked section in which the cache entry is built. */ dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + + if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); + else if (!(rt->dst.flags & DST_HOST)) + nrt = rt6_alloc_clone(rt, &fl6->daddr); + else + goto out2; + + /* From here on rt is the new cache entry, or the null entry if allocation failed. */ dst_release(&rt->dst); + rt = nrt ? : net->ipv6.ip6_null_entry; + + dst_hold(&rt->dst); + if (nrt) { + err = ip6_ins_rt(nrt); + if (!err) + goto out2; + } + + if (--attempts <= 0) + goto out2; + + /* + * Race condition! In the gap, when table->tb6_lock was + * released someone could insert this route. Relookup. 
+ */ + dst_release(&rt->dst); + goto relookup; + +out: + /* A result obtained with the reachability requirement is not returned directly: the lookup is redone once without it. */ if (reachable) { + reachable = 0; + goto restart_2; + } + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); +out2: + rt->dst.lastuse = jiffies; + rt->dst.__use++; + + return rt; +} + +/* Input-side lookup callback: ip6_pol_route() keyed on the flow's incoming interface. */ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) +{ + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); +} + +/* Input lookup through the policy rules; RT6_LOOKUP_F_IFACE is added for destinations that rt6_need_strict() flags, except on ARPHRD_PIMREG devices. */ static struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) +{ + if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) + flags |= RT6_LOOKUP_F_IFACE; + + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); +} + +/* ip6_route_input: build a flow key from the packet's IPv6 header, receiving device and mark, then attach the looked-up route to skb. */ void ip6_route_input(struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = iph->daddr, + .saddr = iph->saddr, + .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, + }; + + skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); +} + +/* Output-side lookup callback: ip6_pol_route() keyed on the flow's outgoing interface. */ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, + struct flowi6 *fl6, int flags) +{ + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); +} + +/* ip6_route_output: output route lookup for fl6. RT6_LOOKUP_F_IFACE is set for sockets bound to a device and for destinations flagged by rt6_need_strict(). RT6_LOOKUP_F_HAS_SADDR is set when the flow has a source address; otherwise the socket's source-address preferences, if a socket is given, are turned into lookup flags. */ struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, + struct flowi6 *fl6) +{ + int flags = 0; + + if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) + flags |= RT6_LOOKUP_F_IFACE; + + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + else if (sk) + flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); + + return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); +} + +EXPORT_SYMBOL(ip6_route_output); + +/* ip6_blackhole_route: build a dst that discards traffic in both directions while copying metrics, idev, gateway, flags and keys from dst_orig. The reference on dst_orig is always released. Returns the new dst or ERR_PTR(-ENOMEM). */ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) +{ + struct rt6_info *rt, *ort = (struct 
rt6_info *) dst_orig; + struct dst_entry *new = NULL; + + rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0); + if (rt) { + memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry)); + + new = &rt->dst; + + new->__use = 1; + new->input = dst_discard; + new->output = dst_discard; + + if (dst_metrics_read_only(&ort->dst)) + new->_metrics = ort->dst._metrics; + else + dst_copy_metrics(new, &ort->dst); + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_clean_expires(rt); + rt->rt6i_metric = 0; + + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + + dst_free(new); + } + + dst_release(dst_orig); + return new ? new : ERR_PTR(-ENOMEM); +} + +/* + * Destination cache support functions + */ + +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rt6_info *rt; + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { + if (rt->rt6i_peer_genid != rt6_peer_genid()) { + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 0); + rt->rt6i_peer_genid = rt6_peer_genid(); + } + return dst; + } + return NULL; +} + +static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + + if (rt) { + if (rt->rt6i_flags & RTF_CACHE) { + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + dst = NULL; + } + } else { + dst_release(dst); + dst = NULL; + } + } + return dst; +} + +static void ip6_link_failure(struct sk_buff *skb) +{ + struct rt6_info *rt; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + + rt = (struct rt6_info *) skb_dst(skb); + if (rt) { + if (rt->rt6i_flags & RTF_CACHE) + rt6_update_expires(rt, 0); + else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + rt->rt6i_node->fn_sernum = -1; + } +} + 
+static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info*)dst; + + if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + if (mtu < IPV6_MIN_MTU) { + u32 features = dst_metric(dst, RTAX_FEATURES); + mtu = IPV6_MIN_MTU; + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(dst, RTAX_FEATURES, features); + } + dst_metric_set(dst, RTAX_MTU, mtu); + } +} + +static unsigned int ip6_default_advmss(const struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + unsigned int mtu = dst_mtu(dst); + struct net *net = dev_net(dev); + + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) + mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. + * IPV6_MAXPLEN is also valid and means: "any MSS, + * rely only on pmtu discovery" + */ + if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) + mtu = IPV6_MAXPLEN; + return mtu; +} + +static unsigned int ip6_mtu(const struct dst_entry *dst) +{ + struct inet6_dev *idev; + unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); + + if (mtu) + return mtu; + + mtu = IPV6_MIN_MTU; + + rcu_read_lock(); + idev = __in6_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + + return mtu; +} + +static struct dst_entry *icmp6_dst_gc_list; +static DEFINE_SPINLOCK(icmp6_dst_lock); + +struct dst_entry *icmp6_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + struct flowi6 *fl6) +{ + struct dst_entry *dst; + struct rt6_info *rt; + struct inet6_dev *idev = in6_dev_get(dev); + struct net *net = dev_net(dev); + + if (unlikely(!idev)) + return ERR_PTR(-ENODEV); + + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); + if (unlikely(!rt)) { + in6_dev_put(idev); + dst = ERR_PTR(-ENOMEM); + goto out; + } + + if (neigh) + neigh_hold(neigh); + else { + neigh = ip6_neigh_lookup(&rt->dst, 
&fl6->daddr); + if (IS_ERR(neigh)) { + in6_dev_put(idev); + dst_free(&rt->dst); + return ERR_CAST(neigh); + } + } + + rt->dst.flags |= DST_HOST; + rt->dst.output = ip6_output; + dst_set_neighbour(&rt->dst, neigh); + atomic_set(&rt->dst.__refcnt, 1); + rt->rt6i_dst.addr = fl6->daddr; + rt->rt6i_dst.plen = 128; + rt->rt6i_idev = idev; + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255); + + spin_lock_bh(&icmp6_dst_lock); + rt->dst.next = icmp6_dst_gc_list; + icmp6_dst_gc_list = &rt->dst; + spin_unlock_bh(&icmp6_dst_lock); + + fib6_force_start_gc(net); + + dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); + +out: + return dst; +} + +int icmp6_dst_gc(void) +{ + struct dst_entry *dst, **pprev; + int more = 0; + + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; + + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + } else { + pprev = &dst->next; + ++more; + } + } + + spin_unlock_bh(&icmp6_dst_lock); + + return more; +} + +static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg), + void *arg) +{ + struct dst_entry *dst, **pprev; + + spin_lock_bh(&icmp6_dst_lock); + pprev = &icmp6_dst_gc_list; + while ((dst = *pprev) != NULL) { + struct rt6_info *rt = (struct rt6_info *) dst; + if (func(rt, arg)) { + *pprev = dst->next; + dst_free(dst); + } else { + pprev = &dst->next; + } + } + spin_unlock_bh(&icmp6_dst_lock); +} + +static int ip6_dst_gc(struct dst_ops *ops) +{ + unsigned long now = jiffies; + struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); + int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; + int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; + int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; + int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; + unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int entries; + + entries = dst_entries_get_fast(ops); + if (time_after(rt_last_gc + rt_min_interval, now) && + entries <= 
rt_max_size) + goto out; + + net->ipv6.ip6_rt_gc_expire++; + fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); + net->ipv6.ip6_rt_last_gc = now; + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) + net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; +out: + net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + return entries > rt_max_size; +} + +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! + */ + +int ip6_dst_hoplimit(struct dst_entry *dst) +{ + int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + if (hoplimit == 0) { + struct net_device *dev = dst->dev; + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + if (idev) + hoplimit = idev->cnf.hop_limit; + else + hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; + rcu_read_unlock(); + } + return hoplimit; +} +EXPORT_SYMBOL(ip6_dst_hoplimit); + +/* + * + */ + +int ip6_route_add(struct fib6_config *cfg) +{ + int err; + struct net *net = cfg->fc_nlinfo.nl_net; + struct rt6_info *rt = NULL; + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + struct fib6_table *table; + int addr_type; + + if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len) + return -EINVAL; +#endif + if (cfg->fc_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(net, cfg->fc_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (cfg->fc_metric == 0) + cfg->fc_metric = IP6_RT_PRIO_USER; + + err = -ENOBUFS; + if (cfg->fc_nlinfo.nlh && + !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { + table = fib6_get_table(net, cfg->fc_table); + if (!table) { + printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n"); + table = fib6_new_table(net, cfg->fc_table); + } + } else { + table = fib6_new_table(net, cfg->fc_table); + } + + if (!table) + 
goto out; + + rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT); + + if (!rt) { + err = -ENOMEM; + goto out; + } + + rt->dst.obsolete = -1; + + if (cfg->fc_flags & RTF_EXPIRES) + rt6_set_expires(rt, jiffies + + clock_t_to_jiffies(cfg->fc_expires)); + else + rt6_clean_expires(rt); + + if (cfg->fc_protocol == RTPROT_UNSPEC) + cfg->fc_protocol = RTPROT_BOOT; + rt->rt6i_protocol = cfg->fc_protocol; + + addr_type = ipv6_addr_type(&cfg->fc_dst); + + if (addr_type & IPV6_ADDR_MULTICAST) + rt->dst.input = ip6_mc_input; + else if (cfg->fc_flags & RTF_LOCAL) + rt->dst.input = ip6_input; + else + rt->dst.input = ip6_forward; + + rt->dst.output = ip6_output; + + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->rt6i_dst.plen = cfg->fc_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->dst.flags |= DST_HOST; + + if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) { + u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!metrics) { + err = -ENOMEM; + goto out; + } + dst_init_metrics(&rt->dst, metrics, 0); + } +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); + rt->rt6i_src.plen = cfg->fc_src_len; +#endif + + rt->rt6i_metric = cfg->fc_metric; + + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((cfg->fc_flags & RTF_REJECT) || + (dev && (dev->flags & IFF_LOOPBACK) && + !(addr_type & IPV6_ADDR_LOOPBACK) && + !(cfg->fc_flags & RTF_LOCAL))) { + /* hold loopback dev/idev if we haven't done so. 
*/ + if (dev != net->loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = net->loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + rt->dst.error = -ENETUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + goto install_route; + } + + if (cfg->fc_flags & RTF_GATEWAY) { + const struct in6_addr *gw_addr; + int gwa_type; + + gw_addr = &cfg->fc_gateway; + rt->rt6i_gateway = *gw_addr; + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly inhibits using not link-local + addresses as nexthop address. + Otherwise, router will not able to send redirects. + It is very good, but in some (rare!) circumstances + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. --ANK + */ + err = -EINVAL; + if (!(gwa_type & IPV6_ADDR_UNICAST)) + goto out; + + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + + err = -EHOSTUNREACH; + if (!grt) + goto out; + if (dev) { + if (dev != grt->dst.dev) { + dst_release(&grt->dst); + goto out; + } + } else { + dev = grt->dst.dev; + idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + if (!(grt->rt6i_flags & RTF_GATEWAY)) + err = 0; + dst_release(&grt->dst); + + if (err) + goto out; + } + err = -EINVAL; + if (!dev || (dev->flags & IFF_LOOPBACK)) + goto out; + } + + err = -ENODEV; + if (!dev) + goto out; + + if (!ipv6_addr_any(&cfg->fc_prefsrc)) { + if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { + err = -EINVAL; + goto out; + } + rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; + rt->rt6i_prefsrc.plen = 128; + } else + rt->rt6i_prefsrc.plen = 0; + + if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) { + err = rt6_bind_neighbour(rt, dev); + if (err) + goto out; + } + + rt->rt6i_flags = cfg->fc_flags; + +install_route: + if (cfg->fc_mx) { + struct nlattr *nla; 
+ int remaining; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + + if (type) { + if (type > RTAX_MAX) { + err = -EINVAL; + goto out; + } + + dst_metric_set(&rt->dst, type, nla_get_u32(nla)); + } + } + } + + rt->dst.dev = dev; + rt->rt6i_idev = idev; + rt->rt6i_table = table; + + cfg->fc_nlinfo.nl_net = dev_net(dev); + + return __ip6_ins_rt(rt, &cfg->fc_nlinfo); + +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free(&rt->dst); + return err; +} + +static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +{ + int err; + struct fib6_table *table; + struct net *net = dev_net(rt->dst.dev); + + if (rt == net->ipv6.ip6_null_entry) + return -ENOENT; + + table = rt->rt6i_table; + write_lock_bh(&table->tb6_lock); + + err = fib6_del(rt, info); + dst_release(&rt->dst); + + write_unlock_bh(&table->tb6_lock); + + return err; +} + +int ip6_del_rt(struct rt6_info *rt) +{ + struct nl_info info = { + .nl_net = dev_net(rt->dst.dev), + }; + return __ip6_del_rt(rt, &info); +} + +static int ip6_route_del(struct fib6_config *cfg) +{ + struct fib6_table *table; + struct fib6_node *fn; + struct rt6_info *rt; + int err = -ESRCH; + + table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); + if (!table) + return err; + + read_lock_bh(&table->tb6_lock); + + fn = fib6_locate(&table->tb6_root, + &cfg->fc_dst, cfg->fc_dst_len, + &cfg->fc_src, cfg->fc_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if (cfg->fc_ifindex && + (!rt->dst.dev || + rt->dst.dev->ifindex != cfg->fc_ifindex)) + continue; + if (cfg->fc_flags & RTF_GATEWAY && + !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + continue; + if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) + continue; + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + } + read_unlock_bh(&table->tb6_lock); + + return err; +} + +/* + * Handle redirects + */ +struct 
ip6rd_flowi { + struct flowi6 fl6; + struct in6_addr gateway; +}; + +static struct rt6_info *__ip6_route_redirect(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + int flags) +{ + struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; + struct rt6_info *rt; + struct fib6_node *fn; + + /* + * Get the "current" route for this destination and + * check if the redirect has come from the appropriate router. + * + * RFC 2461 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way the routes are chosen, this notion + * is a bit fuzzy and one might need to check all possible + * routes. + */ + + read_lock_bh(&table->tb6_lock); + fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); +restart: + for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + /* + * Current route is on-link; redirect is always invalid. + * + * Seems, previous statement is not true. It could + * be node, which looks for us as on-link (f.e. proxy ndisc) + * But then router serving it might decide, that we should + * know truth 8)8) --ANK (980726). 
+ */ + if (rt6_check_expired(rt)) + continue; + if (!(rt->rt6i_flags & RTF_GATEWAY)) + continue; + if (fl6->flowi6_oif != rt->dst.dev->ifindex) + continue; + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + continue; + break; + } + + if (!rt) + rt = net->ipv6.ip6_null_entry; + BACKTRACK(net, &fl6->saddr); +out: + dst_hold(&rt->dst); + + read_unlock_bh(&table->tb6_lock); + + return rt; +}; + +static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest, + const struct in6_addr *src, + const struct in6_addr *gateway, + struct net_device *dev) +{ + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct net *net = dev_net(dev); + struct ip6rd_flowi rdfl = { + .fl6 = { + .flowi6_oif = dev->ifindex, + .daddr = *dest, + .saddr = *src, + }, + }; + + rdfl.gateway = *gateway; + + if (rt6_need_strict(dest)) + flags |= RT6_LOOKUP_F_IFACE; + + return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6, + flags, __ip6_route_redirect); +} + +void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src, + const struct in6_addr *saddr, + struct neighbour *neigh, u8 *lladdr, int on_link) +{ + struct rt6_info *rt, *nrt = NULL; + struct netevent_redirect netevent; + struct net *net = dev_net(neigh->dev); + + rt = ip6_route_redirect(dest, src, saddr, neigh->dev); + + if (rt == net->ipv6.ip6_null_entry) { + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + "for redirect target\n"); + goto out; + } + + /* + * We have finally decided to accept it. + */ + + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER)) + ); + + /* + * Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); + + /* Duplicate redirect: silently ignore. 
*/ + if (neigh == dst_get_neighbour_noref_raw(&rt->dst)) + goto out; + + nrt = ip6_rt_copy(rt, dest); + if (!nrt) + goto out; + + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; + + nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; + dst_set_neighbour(&nrt->dst, neigh_clone(neigh)); + + if (ip6_ins_rt(nrt)) + goto out; + + netevent.old = &rt->dst; + netevent.new = &nrt->dst; + call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); + + if (rt->rt6i_flags & RTF_CACHE) { + ip6_del_rt(rt); + return; + } + +out: + dst_release(&rt->dst); +} + +/* + * Handle ICMP "packet too big" messages + * i.e. Path MTU discovery + */ + +static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr, + struct net *net, u32 pmtu, int ifindex) +{ + struct rt6_info *rt, *nrt; + int allfrag = 0; +again: + rt = rt6_lookup(net, daddr, saddr, ifindex, 0); + if (!rt) + return; + + if (rt6_check_expired(rt)) { + ip6_del_rt(rt); + goto again; + } + + if (pmtu >= dst_mtu(&rt->dst)) + goto out; + + if (pmtu < IPV6_MIN_MTU) { + /* + * According to RFC2460, PMTU is set to the IPv6 Minimum Link + * MTU (1280) and a fragment header should always be included + * after a node receiving Too Big message reporting PMTU is + * less than the IPv6 Minimum Link MTU. + */ + pmtu = IPV6_MIN_MTU; + allfrag = 1; + } + + /* New mtu received -> path was valid. + They are sent only in response to data packets, + so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->dst); + + /* Host route. If it is static, it would be better + not to override it, but add new one, so that + when cache entry will expire old pmtu + would return automatically. 
+ */ + if (rt->rt6i_flags & RTF_CACHE) { + dst_metric_set(&rt->dst, RTAX_MTU, pmtu); + if (allfrag) { + u32 features = dst_metric(&rt->dst, RTAX_FEATURES); + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(&rt->dst, RTAX_FEATURES, features); + } + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt->rt6i_flags |= RTF_MODIFIED; + goto out; + } + + /* Network route. + Two cases are possible: + 1. It is connected route. Action: COW + 2. It is gatewayed route or NONEXTHOP route. Action: clone it. + */ + if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP)) + nrt = rt6_alloc_cow(rt, daddr, saddr); + else + nrt = rt6_alloc_clone(rt, daddr); + + if (nrt) { + dst_metric_set(&nrt->dst, RTAX_MTU, pmtu); + if (allfrag) { + u32 features = dst_metric(&nrt->dst, RTAX_FEATURES); + features |= RTAX_FEATURE_ALLFRAG; + dst_metric_set(&nrt->dst, RTAX_FEATURES, features); + } + + /* According to RFC 1981, detecting PMTU increase shouldn't be + * happened within 5 mins, the recommended timer is 10 mins. + * Here this route expiration time is set to ip6_rt_mtu_expires + * which is 10 mins. After 10 mins the decreased pmtu is expired + * and detecting PMTU increase will be automatically happened. + */ + rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC; + ip6_ins_rt(nrt); + } +out: + dst_release(&rt->dst); +} + +void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr, + struct net_device *dev, u32 pmtu) +{ + struct net *net = dev_net(dev); + + /* + * RFC 1981 states that a node "MUST reduce the size of the packets it + * is sending along the path" that caused the Packet Too Big message. + * Since it's not possible in the general case to determine which + * interface was used to send the original packet, we update the MTU + * on the interface that will be used to send future packets. 
We also + * update the MTU on the interface that received the Packet Too Big in + * case the original packet was forced out that interface with + * SO_BINDTODEVICE or similar. This is the next best thing to the + * correct behaviour, which would be to update the MTU on all + * interfaces. + */ + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0); + rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex); +} + +/* + * Misc support functions + */ + +static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, + const struct in6_addr *dest) +{ + struct net *net = dev_net(ort->dst.dev); + struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, + ort->dst.dev, 0); + + if (rt) { + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->dst.flags |= DST_HOST; + + rt->rt6i_dst.addr = *dest; + rt->rt6i_dst.plen = 128; + dst_copy_metrics(&rt->dst, &ort->dst); + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) == + (RTF_DEFAULT | RTF_ADDRCONF)) + rt6_set_from(rt, ort); + else + rt6_clean_expires(rt); + rt->rt6i_metric = 0; + +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); + rt->rt6i_table = ort->rt6i_table; + } + return rt; +} + +#ifdef CONFIG_IPV6_ROUTE_INFO +static struct rt6_info *rt6_get_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex) +{ + struct fib6_node *fn; + struct rt6_info *rt = NULL; + struct fib6_table *table; + + table = fib6_get_table(net, RT6_TABLE_INFO); + if (!table) + return NULL; + + write_lock_bh(&table->tb6_lock); + fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0); + if (!fn) + goto out; + + for (rt = 
fn->leaf; rt; rt = rt->dst.rt6_next) { + if (rt->dst.dev->ifindex != ifindex) + continue; + if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + continue; + if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + continue; + dst_hold(&rt->dst); + break; + } +out: + write_unlock_bh(&table->tb6_lock); + return rt; +} + +static struct rt6_info *rt6_add_route_info(struct net *net, + const struct in6_addr *prefix, int prefixlen, + const struct in6_addr *gwaddr, int ifindex, + unsigned pref) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_INFO, + .fc_metric = IP6_RT_PRIO_USER, + .fc_ifindex = ifindex, + .fc_dst_len = prefixlen, + .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | + RTF_UP | RTF_PREF(pref), + .fc_nlinfo.pid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = net, + }; + + cfg.fc_dst = *prefix; + cfg.fc_gateway = *gwaddr; + + /* We should treat it as a default route if prefix length is 0. */ + if (!prefixlen) + cfg.fc_flags |= RTF_DEFAULT; + + ip6_route_add(&cfg); + + return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex); +} +#endif + +struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +{ + struct rt6_info *rt; + struct fib6_table *table; + + table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT); + if (!table) + return NULL; + + write_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) { + if (dev == rt->dst.dev && + ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && + ipv6_addr_equal(&rt->rt6i_gateway, addr)) + break; + } + if (rt) + dst_hold(&rt->dst); + write_unlock_bh(&table->tb6_lock); + return rt; +} + +struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, + struct net_device *dev, + unsigned int pref) +{ + struct fib6_config cfg = { + .fc_table = RT6_TABLE_DFLT, + .fc_metric = IP6_RT_PRIO_USER, + .fc_ifindex = dev->ifindex, + .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT 
| + RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_nlinfo.pid = 0, + .fc_nlinfo.nlh = NULL, + .fc_nlinfo.nl_net = dev_net(dev), + }; + + cfg.fc_gateway = *gwaddr; + + ip6_route_add(&cfg); + + return rt6_get_dflt_router(gwaddr, dev); +} + +void rt6_purge_dflt_routers(struct net *net) +{ + struct rt6_info *rt; + struct fib6_table *table; + + /* NOTE: Keep consistent with rt6_get_dflt_router */ + table = fib6_get_table(net, RT6_TABLE_DFLT); + if (!table) + return; + +restart: + read_lock_bh(&table->tb6_lock); + for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { + dst_hold(&rt->dst); + read_unlock_bh(&table->tb6_lock); + ip6_del_rt(rt); + goto restart; + } + } + read_unlock_bh(&table->tb6_lock); +} + +static void rtmsg_to_fib6_config(struct net *net, + struct in6_rtmsg *rtmsg, + struct fib6_config *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->fc_table = RT6_TABLE_MAIN; + cfg->fc_ifindex = rtmsg->rtmsg_ifindex; + cfg->fc_metric = rtmsg->rtmsg_metric; + cfg->fc_expires = rtmsg->rtmsg_info; + cfg->fc_dst_len = rtmsg->rtmsg_dst_len; + cfg->fc_src_len = rtmsg->rtmsg_src_len; + cfg->fc_flags = rtmsg->rtmsg_flags; + + cfg->fc_nlinfo.nl_net = net; + + cfg->fc_dst = rtmsg->rtmsg_dst; + cfg->fc_src = rtmsg->rtmsg_src; + cfg->fc_gateway = rtmsg->rtmsg_gateway; +} + +int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct fib6_config cfg; + struct in6_rtmsg rtmsg; + int err; + + switch(cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = copy_from_user(&rtmsg, arg, + sizeof(struct in6_rtmsg)); + if (err) + return -EFAULT; + + rtmsg_to_fib6_config(net, &rtmsg, &cfg); + + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&cfg); + break; + case SIOCDELRT: + err = ip6_route_del(&cfg); + break; + default: + err = -EINVAL; + } + rtnl_unlock(); + + return err; + } + + return -EINVAL; +} + 
+/* + * Drop the packet on the floor + */ + +static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) +{ + int type; + struct dst_entry *dst = skb_dst(skb); + switch (ipstats_mib_noroutes) { + case IPSTATS_MIB_INNOROUTES: + type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); + if (type == IPV6_ADDR_ANY) { + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INADDRERRORS); + break; + } + /* FALLTHROUGH */ + case IPSTATS_MIB_OUTNOROUTES: + IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + ipstats_mib_noroutes); + break; + } + icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); + kfree_skb(skb); + return 0; +} + +static int ip6_pkt_discard(struct sk_buff *skb) +{ + return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); +} + +static int ip6_pkt_discard_out(struct sk_buff *skb) +{ + skb->dev = skb_dst(skb)->dev; + return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); +} + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + +static int ip6_pkt_prohibit(struct sk_buff *skb) +{ + return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); +} + +static int ip6_pkt_prohibit_out(struct sk_buff *skb) +{ + skb->dev = skb_dst(skb)->dev; + return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); +} + +#endif + +/* + * Allocate a dst for local (unicast / anycast) address. 
+ */ + +struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + bool anycast) +{ + struct net *net = dev_net(idev->dev); + struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, + net->loopback_dev, 0); + int err; + + if (!rt) { + if (net_ratelimit()) + pr_warning("IPv6: Maximum number of routes reached," + " consider increasing route/max_size.\n"); + return ERR_PTR(-ENOMEM); + } + + in6_dev_hold(idev); + + rt->dst.flags |= DST_HOST; + rt->dst.input = ip6_input; + rt->dst.output = ip6_output; + rt->rt6i_idev = idev; + rt->dst.obsolete = -1; + + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + if (anycast) + rt->rt6i_flags |= RTF_ANYCAST; + else + rt->rt6i_flags |= RTF_LOCAL; + err = rt6_bind_neighbour(rt, rt->dst.dev); + if (err) { + dst_free(&rt->dst); + return ERR_PTR(err); + } + + rt->rt6i_dst.addr = *addr; + rt->rt6i_dst.plen = 128; + rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); + + atomic_set(&rt->dst.__refcnt, 1); + + return rt; +} + +int ip6_route_get_saddr(struct net *net, + struct rt6_info *rt, + const struct in6_addr *daddr, + unsigned int prefs, + struct in6_addr *saddr) +{ + struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt); + int err = 0; + if (rt->rt6i_prefsrc.plen) + *saddr = rt->rt6i_prefsrc.addr; + else + err = ipv6_dev_get_saddr(net, idev ? 
idev->dev : NULL, + daddr, prefs, saddr); + return err; +} + +/* remove deleted ip from prefsrc entries */ +struct arg_dev_net_ip { + struct net_device *dev; + struct net *net; + struct in6_addr *addr; +}; + +static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +{ + struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; + struct net *net = ((struct arg_dev_net_ip *)arg)->net; + struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; + + if (((void *)rt->dst.dev == dev || !dev) && + rt != net->ipv6.ip6_null_entry && + ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + /* remove prefsrc entry */ + rt->rt6i_prefsrc.plen = 0; + } + return 0; +} + +void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) +{ + struct net *net = dev_net(ifp->idev->dev); + struct arg_dev_net_ip adni = { + .dev = ifp->idev->dev, + .net = net, + .addr = &ifp->addr, + }; + fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni); +} + +struct arg_dev_net { + struct net_device *dev; + struct net *net; +}; + +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + const struct arg_dev_net *adn = arg; + const struct net_device *dev = adn->dev; + + if ((rt->dst.dev == dev || !dev) && + rt != adn->net->ipv6.ip6_null_entry) + return -1; + + return 0; +} + +void rt6_ifdown(struct net *net, struct net_device *dev) +{ + struct arg_dev_net adn = { + .dev = dev, + .net = net, + }; + + fib6_clean_all(net, fib6_ifdown, 0, &adn); + icmp6_clean_all(fib6_ifdown, &adn); +} + +struct rt6_mtu_change_arg +{ + struct net_device *dev; + unsigned mtu; +}; + +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + struct inet6_dev *idev; + + /* In IPv6 pmtu discovery is not optional, + so that RTAX_MTU lock cannot disable it. + We still use this lock to block changes + caused by addrconf/ndisc. 
+ */ + + idev = __in6_dev_get(arg->dev); + if (!idev) + return 0; + + /* For administrative MTU increase, there is no way to discover + IPv6 PMTU increase, so PMTU increase should be updated here. + Since RFC 1981 doesn't include administrative MTU increase + update PMTU increase is a MUST. (i.e. jumbo frame) + */ + /* + If new MTU is less than route PMTU, this new MTU will be the + lowest MTU in the path, update the route PMTU to reflect PMTU + decreases; if new MTU is greater than route PMTU, and the + old MTU is the lowest MTU in the path, update the route PMTU + to reflect the increase. In this case if the other nodes' MTU + also have the lowest MTU, TOO BIG MESSAGE will be lead to + PMTU discouvery. + */ + if (rt->dst.dev == arg->dev && + !dst_metric_locked(&rt->dst, RTAX_MTU) && + (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6))) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } + return 0; +} + +void rt6_mtu_change(struct net_device *dev, unsigned mtu) +{ + struct rt6_mtu_change_arg arg = { + .dev = dev, + .mtu = mtu, + }; + + fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg); +} + +static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, + [RTA_OIF] = { .type = NLA_U32 }, + [RTA_IIF] = { .type = NLA_U32 }, + [RTA_PRIORITY] = { .type = NLA_U32 }, + [RTA_METRICS] = { .type = NLA_NESTED }, +}; + +static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, + struct fib6_config *cfg) +{ + struct rtmsg *rtm; + struct nlattr *tb[RTA_MAX+1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + rtm = nlmsg_data(nlh); + memset(cfg, 0, sizeof(*cfg)); + + cfg->fc_table = rtm->rtm_table; + cfg->fc_dst_len = rtm->rtm_dst_len; + cfg->fc_src_len = rtm->rtm_src_len; + cfg->fc_flags = RTF_UP; + cfg->fc_protocol = rtm->rtm_protocol; + + if 
(rtm->rtm_type == RTN_UNREACHABLE) + cfg->fc_flags |= RTF_REJECT; + + if (rtm->rtm_type == RTN_LOCAL) + cfg->fc_flags |= RTF_LOCAL; + + cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; + cfg->fc_nlinfo.nlh = nlh; + cfg->fc_nlinfo.nl_net = sock_net(skb->sk); + + if (tb[RTA_GATEWAY]) { + nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16); + cfg->fc_flags |= RTF_GATEWAY; + } + + if (tb[RTA_DST]) { + int plen = (rtm->rtm_dst_len + 7) >> 3; + + if (nla_len(tb[RTA_DST]) < plen) + goto errout; + + nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); + } + + if (tb[RTA_SRC]) { + int plen = (rtm->rtm_src_len + 7) >> 3; + + if (nla_len(tb[RTA_SRC]) < plen) + goto errout; + + nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); + } + + if (tb[RTA_PREFSRC]) + nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16); + + if (tb[RTA_OIF]) + cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_PRIORITY]) + cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); + + if (tb[RTA_METRICS]) { + cfg->fc_mx = nla_data(tb[RTA_METRICS]); + cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); + } + + if (tb[RTA_TABLE]) + cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); + + err = 0; +errout: + return err; +} + +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib6_config cfg; + int err; + + err = rtm_to_fib6_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return ip6_route_del(&cfg); +} + +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib6_config cfg; + int err; + + err = rtm_to_fib6_config(skb, nlh, &cfg); + if (err < 0) + return err; + + return ip6_route_add(&cfg); +} + +static inline size_t rt6_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct rtmsg)) + + nla_total_size(16) /* RTA_SRC */ + + nla_total_size(16) /* RTA_DST */ + + nla_total_size(16) /* RTA_GATEWAY */ + + nla_total_size(16) /* RTA_PREFSRC */ + + nla_total_size(4) /* RTA_TABLE */ + + nla_total_size(4) /* RTA_IIF */ + + nla_total_size(4) /* RTA_OIF */ + + 
nla_total_size(4) /* RTA_PRIORITY */ + + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + + nla_total_size(sizeof(struct rta_cacheinfo)); +} + +static int rt6_fill_node(struct net *net, + struct sk_buff *skb, struct rt6_info *rt, + struct in6_addr *dst, struct in6_addr *src, + int iif, int type, u32 pid, u32 seq, + int prefix, int nowait, unsigned int flags) +{ + const struct inet_peer *peer; + struct rtmsg *rtm; + struct nlmsghdr *nlh; + long expires; + u32 table; + struct neighbour *n; + u32 ts, tsage; + + if (prefix) { /* user wants prefix routes only */ + if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags); + if (!nlh) + return -EMSGSIZE; + + rtm = nlmsg_data(nlh); + rtm->rtm_family = AF_INET6; + rtm->rtm_dst_len = rt->rt6i_dst.plen; + rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_tos = 0; + if (rt->rt6i_table) + table = rt->rt6i_table->tb6_id; + else + table = RT6_TABLE_UNSPEC; + rtm->rtm_table = table; + NLA_PUT_U32(skb, RTA_TABLE, table); + if (rt->rt6i_flags & RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_flags & RTF_LOCAL) + rtm->rtm_type = RTN_LOCAL; + else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = rt->rt6i_protocol; + if (rt->rt6i_flags & RTF_DYNAMIC) + rtm->rtm_protocol = RTPROT_REDIRECT; + else if (rt->rt6i_flags & RTF_ADDRCONF) + rtm->rtm_protocol = RTPROT_KERNEL; + else if (rt->rt6i_flags & RTF_DEFAULT) + rtm->rtm_protocol = RTPROT_RA; + + if (rt->rt6i_flags & RTF_CACHE) + rtm->rtm_flags |= RTM_F_CLONED; + + if (dst) { + NLA_PUT(skb, RTA_DST, 16, dst); + rtm->rtm_dst_len = 128; + } else if (rtm->rtm_dst_len) + NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES + if (src) { + NLA_PUT(skb, RTA_SRC, 16, src); + rtm->rtm_src_len 
= 128; + } else if (rtm->rtm_src_len) + NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif + if (iif) { +#ifdef CONFIG_IPV6_MROUTE + if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + int err = ip6mr_get_route(net, skb, rtm, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nla_put_failure; + } else { + if (err == -EMSGSIZE) + goto nla_put_failure; + } + } + } else +#endif + NLA_PUT_U32(skb, RTA_IIF, iif); + } else if (dst) { + struct in6_addr saddr_buf; + if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0) + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + + if (rt->rt6i_prefsrc.plen) { + struct in6_addr saddr_buf; + saddr_buf = rt->rt6i_prefsrc.addr; + NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + + if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + goto nla_put_failure; + + rcu_read_lock(); + n = dst_get_neighbour_noref(&rt->dst); + if (n) { + if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } + } + rcu_read_unlock(); + + if (rt->dst.dev) + NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); + + NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); + + if (!(rt->rt6i_flags & RTF_EXPIRES)) + expires = 0; + else if (rt->dst.expires - jiffies < INT_MAX) + expires = rt->dst.expires - jiffies; + else + expires = INT_MAX; + + peer = rt->rt6i_peer; + ts = tsage = 0; + if (peer && peer->tcp_ts_stamp) { + ts = peer->tcp_ts; + tsage = get_seconds() - peer->tcp_ts_stamp; + } + + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage, + expires, rt->dst.error) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +int rt6_dump_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + int prefix; + + if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { + struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); + prefix = (rtm->rtm_flags & 
RTM_F_PREFIX) != 0; + } else + prefix = 0; + + return rt6_fill_node(arg->net, + arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, + prefix, 0, NLM_F_MULTI); +} + +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[RTA_MAX+1]; + struct rt6_info *rt; + struct sk_buff *skb; + struct rtmsg *rtm; + struct flowi6 fl6; + int err, iif = 0, oif = 0; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + if (err < 0) + goto errout; + + err = -EINVAL; + memset(&fl6, 0, sizeof(fl6)); + + if (tb[RTA_SRC]) { + if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) + goto errout; + + fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); + } + + if (tb[RTA_DST]) { + if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) + goto errout; + + fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); + } + + if (tb[RTA_IIF]) + iif = nla_get_u32(tb[RTA_IIF]); + + if (tb[RTA_OIF]) + oif = nla_get_u32(tb[RTA_OIF]); + + if (iif) { + struct net_device *dev; + int flags = 0; + + dev = __dev_get_by_index(net, iif); + if (!dev) { + err = -ENODEV; + goto errout; + } + + fl6.flowi6_iif = iif; + + if (!ipv6_addr_any(&fl6.saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6, + flags); + } else { + fl6.flowi6_oif = oif; + + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. 
+ */ + skb_reset_mac_header(skb); + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); + + skb_dst_set(skb, &rt->dst); + + err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, 0, 0); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); +errout: + return err; +} + +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +{ + struct sk_buff *skb; + struct net *net = info->nl_net; + u32 seq; + int err; + + err = -ENOBUFS; + seq = info->nlh ? info->nlh->nlmsg_seq : 0; + + skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, + event, info->pid, seq, 0, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + +static int ip6_route_dev_notify(struct notifier_block *this, + unsigned long event, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct net *net = dev_net(dev); + + if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { + net->ipv6.ip6_null_entry->dst.dev = dev; + net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry->dst.dev = dev; + net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); + net->ipv6.ip6_blk_hole_entry->dst.dev = dev; + net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); +#endif + } + + return NOTIFY_OK; +} + +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS + +struct rt6_proc_arg +{ + char *buffer; + int offset; + int length; + int skip; + int len; +}; + +static int rt6_info_route(struct rt6_info *rt, void *p_arg) +{ + struct seq_file *m = p_arg; + struct neighbour *n; + 
+ seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); +#else + seq_puts(m, "00000000000000000000000000000000 00 "); +#endif + rcu_read_lock(); + n = dst_get_neighbour_noref(&rt->dst); + if (n) { + seq_printf(m, "%pi6", n->primary_key); + } else { + seq_puts(m, "00000000000000000000000000000000"); + } + rcu_read_unlock(); + seq_printf(m, " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, + rt->dst.dev ? rt->dst.dev->name : ""); + return 0; +} + +static int ipv6_route_show(struct seq_file *m, void *v) +{ + struct net *net = (struct net *)m->private; + fib6_clean_all_ro(net, rt6_info_route, 0, m); + return 0; +} + +static int ipv6_route_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ipv6_route_show); +} + +static const struct file_operations ipv6_route_proc_fops = { + .owner = THIS_MODULE, + .open = ipv6_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int rt6_stats_seq_show(struct seq_file *seq, void *v) +{ + struct net *net = (struct net *)seq->private; + seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", + net->ipv6.rt6_stats->fib_nodes, + net->ipv6.rt6_stats->fib_route_nodes, + net->ipv6.rt6_stats->fib_rt_alloc, + net->ipv6.rt6_stats->fib_rt_entries, + net->ipv6.rt6_stats->fib_rt_cache, + dst_entries_get_slow(&net->ipv6.ip6_dst_ops), + net->ipv6.rt6_stats->fib_discarded_routes); + + return 0; +} + +static int rt6_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, rt6_stats_seq_show); +} + +static const struct file_operations rt6_stats_seq_fops = { + .owner = THIS_MODULE, + .open = rt6_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SYSCTL + +static +int 
ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net; + int delay; + if (!write) + return -EINVAL; + + net = (struct net *)ctl->extra1; + delay = net->ipv6.sysctl.flush_delay; + proc_dointvec(ctl, write, buffer, lenp, ppos); + fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); + return 0; +} + +ctl_table ipv6_route_table_template[] = { + { + .procname = "flush", + .data = &init_net.ipv6.sysctl.flush_delay, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = ipv6_sysctl_rtcache_flush + }, + { + .procname = "gc_thresh", + .data = &ip6_dst_ops_template.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_size", + .data = &init_net.ipv6.sysctl.ip6_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_min_interval", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_timeout", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_interval", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "gc_elasticity", + .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mtu_expires", + .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "min_adv_mss", + .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "gc_min_interval_ms", + .data = 
&init_net.ipv6.sysctl.ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { } +}; + +struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(ipv6_route_table_template, + sizeof(ipv6_route_table_template), + GFP_KERNEL); + + if (table) { + table[0].data = &net->ipv6.sysctl.flush_delay; + table[0].extra1 = net; + table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; + table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; + table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; + table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; + table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; + table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; + table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; + table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + } + + return table; +} +#endif + +static int __net_init ip6_route_net_init(struct net *net) +{ + int ret = -ENOMEM; + + memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, + sizeof(net->ipv6.ip6_dst_ops)); + + if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) + goto out_ip6_dst_ops; + + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, + sizeof(*net->ipv6.ip6_null_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_null_entry; + net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_null_entry->dst, + ip6_template_metrics, true); + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, + sizeof(*net->ipv6.ip6_prohibit_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_prohibit_entry) + goto out_ip6_null_entry; + net->ipv6.ip6_prohibit_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_prohibit_entry; + 
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, + ip6_template_metrics, true); + + net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, + sizeof(*net->ipv6.ip6_blk_hole_entry), + GFP_KERNEL); + if (!net->ipv6.ip6_blk_hole_entry) + goto out_ip6_prohibit_entry; + net->ipv6.ip6_blk_hole_entry->dst.path = + (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; + net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; + dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, + ip6_template_metrics, true); +#endif + + net->ipv6.sysctl.flush_delay = 0; + net->ipv6.sysctl.ip6_rt_max_size = 4096; + net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; + net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; + net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; + net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; + net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; + net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + + net->ipv6.ip6_rt_gc_expire = 30*HZ; + + ret = 0; +out: + return ret; + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES +out_ip6_prohibit_entry: + kfree(net->ipv6.ip6_prohibit_entry); +out_ip6_null_entry: + kfree(net->ipv6.ip6_null_entry); +#endif +out_ip6_dst_entries: + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +out_ip6_dst_ops: + goto out; +} + +static void __net_exit ip6_route_net_exit(struct net *net) +{ + kfree(net->ipv6.ip6_null_entry); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + kfree(net->ipv6.ip6_prohibit_entry); + kfree(net->ipv6.ip6_blk_hole_entry); +#endif + dst_entries_destroy(&net->ipv6.ip6_dst_ops); +} + +static int __net_init ip6_route_net_init_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops); + proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops); +#endif + return 0; +} + +static void __net_exit ip6_route_net_exit_late(struct net *net) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove(net, "ipv6_route"); + 
proc_net_remove(net, "rt6_stats"); +#endif +} + +static struct pernet_operations ip6_route_net_ops = { + .init = ip6_route_net_init, + .exit = ip6_route_net_exit, +}; + +static struct pernet_operations ip6_route_net_late_ops = { + .init = ip6_route_net_init_late, + .exit = ip6_route_net_exit_late, +}; + +static struct notifier_block ip6_route_dev_notifier = { + .notifier_call = ip6_route_dev_notify, + .priority = 0, +}; + +int __init ip6_route_init(void) +{ + int ret; + + ret = -ENOMEM; + ip6_dst_ops_template.kmem_cachep = + kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip6_dst_ops_template.kmem_cachep) + goto out; + + ret = dst_entries_init(&ip6_dst_blackhole_ops); + if (ret) + goto out_kmem_cache; + + ret = register_pernet_subsys(&ip6_route_net_ops); + if (ret) + goto out_dst_entries; + + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; + + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif + ret = fib6_init(); + if (ret) + goto out_register_subsys; + + ret = xfrm6_init(); + if (ret) + goto out_fib6_init; + + ret = fib6_rules_init(); + if (ret) + goto xfrm6_init; + + ret = register_pernet_subsys(&ip6_route_net_late_ops); + if (ret) + goto fib6_rules_init; + + ret = -ENOBUFS; + if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_DELROUTE, 
inet6_rtm_delroute, NULL, NULL) || + __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) + goto out_register_late_subsys; + + ret = register_netdevice_notifier(&ip6_route_dev_notifier); + if (ret) + goto out_register_late_subsys; + +out: + return ret; + +out_register_late_subsys: + unregister_pernet_subsys(&ip6_route_net_late_ops); +fib6_rules_init: + fib6_rules_cleanup(); +xfrm6_init: + xfrm6_fini(); +out_fib6_init: + fib6_gc_cleanup(); +out_register_subsys: + unregister_pernet_subsys(&ip6_route_net_ops); +out_dst_entries: + dst_entries_destroy(&ip6_dst_blackhole_ops); +out_kmem_cache: + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); + goto out; +} + +void ip6_route_cleanup(void) +{ + unregister_netdevice_notifier(&ip6_route_dev_notifier); + unregister_pernet_subsys(&ip6_route_net_late_ops); + fib6_rules_cleanup(); + xfrm6_fini(); + fib6_gc_cleanup(); + unregister_pernet_subsys(&ip6_route_net_ops); + dst_entries_destroy(&ip6_dst_blackhole_ops); + kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); +} diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c new file mode 100644 index 00000000..c4ffd174 --- /dev/null +++ b/net/ipv6/sit.c @@ -0,0 +1,1306 @@ +/* + * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Roger Venning <r.venning@telstra.com>: 6to4 support + * Nate Thompson <nate@thebog.net>: 6to4 support + * Fred Templin <fred.l.templin@boeing.com>: isatap support + */ + +#include <linux/module.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmp.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> +#include <linux/if_ether.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/ipip.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/dsfield.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + +#define HASH_SIZE 16 +#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) + +static int ipip6_tunnel_init(struct net_device *dev); +static void ipip6_tunnel_setup(struct net_device *dev); +static void ipip6_dev_free(struct net_device *dev); + +static int sit_net_id __read_mostly; +struct sit_net { + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; + + struct net_device *fb_tunnel_dev; +}; + +/* + * Locking : hash tables are protected by RCU and RTNL + */ + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) + +/* often modified 
stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +} __attribute__((aligned(4*sizeof(unsigned long)))); + +static struct net_device_stats *ipip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} +/* + * Must be invoked with rcu_read_lock + */ +static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, + struct net_device *dev, __be32 remote, __be32 local) +{ + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); + struct ip_tunnel *t; + struct sit_net *sitn = net_generic(net, sit_net_id); + + for_each_ip_tunnel_rcu(sitn->tunnels_r_l[h0 ^ h1]) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(sitn->tunnels_r[h0]) { + if (remote == t->parms.iph.daddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + for_each_ip_tunnel_rcu(sitn->tunnels_l[h1]) { + if (local == t->parms.iph.saddr && + (!dev || !t->parms.link || dev->iflink == t->parms.link) && + (t->dev->flags & IFF_UP)) + return t; + } + t = rcu_dereference(sitn->tunnels_wc[0]); + if ((t != NULL) && (t->dev->flags & IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel_parm *parms) +{ + __be32 remote = parms->iph.daddr; + 
__be32 local = parms->iph.saddr; + unsigned int h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &sitn->tunnels[prio][h]; +} + +static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, + struct ip_tunnel *t) +{ + return __ipip6_bucket(sitn, &t->parms); +} + +static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip6_bucket(sitn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); + break; + } + } +} + +static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) +{ + struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); + + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); + rcu_assign_pointer(*tp, t); +} + +static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) +{ +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel *t = netdev_priv(dev); + + if (t->dev == sitn->fb_tunnel_dev) { + ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); + t->ip6rd.relay_prefix = 0; + t->ip6rd.prefixlen = 16; + t->ip6rd.relay_prefixlen = 0; + } else { + struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); + memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); + } +#endif +} + +static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, + struct ip_tunnel_parm *parms, int create) +{ + __be32 remote = parms->iph.daddr; + __be32 local = parms->iph.saddr; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; + struct net_device *dev; + char name[IFNAMSIZ]; + struct sit_net *sitn = net_generic(net, sit_net_id); + + for (tp = __ipip6_bucket(sitn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && + parms->link == t->parms.link) { + if (create) + return NULL; + else + return t; + } + } + if 
(!create) + goto failed; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else + strcpy(name, "sit%d"); + + dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + if (dev == NULL) + return NULL; + + dev_net_set(dev, net); + + nt = netdev_priv(dev); + + nt->parms = *parms; + if (ipip6_tunnel_init(dev) < 0) + goto failed_free; + ipip6_tunnel_clone_6rd(dev, sitn); + + if (parms->i_flags & SIT_ISATAP) + dev->priv_flags |= IFF_ISATAP; + + if (register_netdevice(dev) < 0) + goto failed_free; + + strcpy(nt->parms.name, dev->name); + + dev_hold(dev); + + ipip6_tunnel_link(sitn, nt); + return nt; + +failed_free: + ipip6_dev_free(dev); +failed: + return NULL; +} + +#define for_each_prl_rcu(start) \ + for (prl = rcu_dereference(start); \ + prl; \ + prl = rcu_dereference(prl->next)) + +static struct ip_tunnel_prl_entry * +__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) +{ + struct ip_tunnel_prl_entry *prl; + + for_each_prl_rcu(t->prl) + if (prl->addr == addr) + break; + return prl; + +} + +static int ipip6_tunnel_get_prl(struct ip_tunnel *t, + struct ip_tunnel_prl __user *a) +{ + struct ip_tunnel_prl kprl, *kp; + struct ip_tunnel_prl_entry *prl; + unsigned int cmax, c = 0, ca, len; + int ret = 0; + + if (copy_from_user(&kprl, a, sizeof(kprl))) + return -EFAULT; + cmax = kprl.datalen / sizeof(kprl); + if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) + cmax = 1; + + /* For simple GET or for root users, + * we try harder to allocate. + */ + kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? + kcalloc(cmax, sizeof(*kp), GFP_KERNEL) : + NULL; + + rcu_read_lock(); + + ca = t->prl_count < cmax ? t->prl_count : cmax; + + if (!kp) { + /* We don't try hard to allocate much memory for + * non-root users. + * For root users, retry allocating enough memory for + * the answer. 
+ */ + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + if (!kp) { + ret = -ENOMEM; + goto out; + } + } + + c = 0; + for_each_prl_rcu(t->prl) { + if (c >= cmax) + break; + if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) + continue; + kp[c].addr = prl->addr; + kp[c].flags = prl->flags; + c++; + if (kprl.addr != htonl(INADDR_ANY)) + break; + } +out: + rcu_read_unlock(); + + len = sizeof(*kp) * c; + ret = 0; + if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) + ret = -EFAULT; + + kfree(kp); + + return ret; +} + +static int +ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) +{ + struct ip_tunnel_prl_entry *p; + int err = 0; + + if (a->addr == htonl(INADDR_ANY)) + return -EINVAL; + + ASSERT_RTNL(); + + for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { + if (p->addr == a->addr) { + if (chg) { + p->flags = a->flags; + goto out; + } + err = -EEXIST; + goto out; + } + } + + if (chg) { + err = -ENXIO; + goto out; + } + + p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); + if (!p) { + err = -ENOBUFS; + goto out; + } + + p->next = t->prl; + p->addr = a->addr; + p->flags = a->flags; + t->prl_count++; + rcu_assign_pointer(t->prl, p); +out: + return err; +} + +static void prl_list_destroy_rcu(struct rcu_head *head) +{ + struct ip_tunnel_prl_entry *p, *n; + + p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); + do { + n = rcu_dereference_protected(p->next, 1); + kfree(p); + p = n; + } while (p); +} + +static int +ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) +{ + struct ip_tunnel_prl_entry *x; + struct ip_tunnel_prl_entry __rcu **p; + int err = 0; + + ASSERT_RTNL(); + + if (a && a->addr != htonl(INADDR_ANY)) { + for (p = &t->prl; + (x = rtnl_dereference(*p)) != NULL; + p = &x->next) { + if (x->addr == a->addr) { + *p = x->next; + kfree_rcu(x, rcu_head); + t->prl_count--; + goto out; + } + } + err = -ENXIO; + } else { + x = rtnl_dereference(t->prl); + if (x) { 
+ t->prl_count = 0; + call_rcu(&x->rcu_head, prl_list_destroy_rcu); + t->prl = NULL; + } + } +out: + return err; +} + +static int +isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) +{ + struct ip_tunnel_prl_entry *p; + int ok = 1; + + rcu_read_lock(); + p = __ipip6_tunnel_locate_prl(t, iph->saddr); + if (p) { + if (p->flags & PRL_DEFAULT) + skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; + else + skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; + } else { + const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; + + if (ipv6_addr_is_isatap(addr6) && + (addr6->s6_addr32[3] == iph->saddr) && + ipv6_chk_prefix(addr6, t->dev)) + skb->ndisc_nodetype = NDISC_NODETYPE_HOST; + else + ok = 0; + } + rcu_read_unlock(); + return ok; +} + +static void ipip6_tunnel_uninit(struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + if (dev == sitn->fb_tunnel_dev) { + RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); + } else { + ipip6_tunnel_unlink(sitn, netdev_priv(dev)); + ipip6_tunnel_del_prl(netdev_priv(dev), NULL); + } + dev_put(dev); +} + + +static int ipip6_err(struct sk_buff *skb, u32 info) +{ + +/* All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + const struct iphdr *iph = (const struct iphdr *)skb->data; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct ip_tunnel *t; + int err; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return 0; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return 0; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return 0; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. 
--ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return 0; + break; + } + + err = -ENOENT; + + rcu_read_lock(); + t = ipip6_tunnel_lookup(dev_net(skb->dev), + skb->dev, + iph->daddr, + iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + + err = 0; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + rcu_read_unlock(); + return err; +} + +static inline void ipip6_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) + IP6_ECN_set_ce(ipv6_hdr(skb)); +} + +static int ipip6_rcv(struct sk_buff *skb) +{ + const struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + iph = ip_hdr(skb); + + rcu_read_lock(); + tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, + iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + + secpath_reset(skb); + skb->mac_header = skb->network_header; + skb_reset_network_header(skb); + IPCB(skb)->flags = 0; + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + + if ((tunnel->dev->priv_flags & IFF_ISATAP) && + !isatap_chksrc(skb, iph, tunnel)) { + tunnel->dev->stats.rx_errors++; + rcu_read_unlock(); + kfree_skb(skb); + return 0; + } + + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); + + ipip6_ecn_decapsulate(iph, skb); + + netif_rx(skb); + + rcu_read_unlock(); + return 0; + } + + /* no tunnel matched, let upstream know, ipsec may handle it */ + rcu_read_unlock(); + return 1; +out: + kfree_skb(skb); + return 0; +} + +/* + * Returns the embedded IPv4 address if the IPv6 address + * comes from 6rd / 6to4 (RFC 3056) addr space. 
+ */ +static inline +__be32 try_6rd(const struct in6_addr *v6dst, struct ip_tunnel *tunnel) +{ + __be32 dst = 0; + +#ifdef CONFIG_IPV6_SIT_6RD + if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, + tunnel->ip6rd.prefixlen)) { + unsigned int pbw0, pbi0; + int pbi1; + u32 d; + + pbw0 = tunnel->ip6rd.prefixlen >> 5; + pbi0 = tunnel->ip6rd.prefixlen & 0x1f; + + d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> + tunnel->ip6rd.relay_prefixlen; + + pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; + if (pbi1 > 0) + d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> + (32 - pbi1); + + dst = tunnel->ip6rd.relay_prefix | htonl(d); + } +#else + if (v6dst->s6_addr16[0] == htons(0x2002)) { + /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ + memcpy(&dst, &v6dst->s6_addr16[1], 4); + } +#endif + return dst; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct pcpu_tstats *tstats; + const struct iphdr *tiph = &tunnel->parms.iph; + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + u8 tos = tunnel->parms.iph.tos; + __be16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + __be32 dst = tiph->daddr; + struct flowi4 fl4; + int mtu; + const struct in6_addr *addr6; + int addr_type; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto tx_error; + + if (tos == 1) + tos = ipv6_get_dsfield(iph6); + + /* ISATAP (RFC4214) - must come before 6to4 */ + if (dev->priv_flags & IFF_ISATAP) { + struct neighbour *neigh = NULL; + bool do_tx_error = false; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG 
"sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (const struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) + dst = addr6->s6_addr32[3]; + else + do_tx_error = true; + + neigh_release(neigh); + if (do_tx_error) + goto tx_error; + } + + if (!dst) + dst = try_6rd(&iph6->daddr, tunnel); + + if (!dst) { + struct neighbour *neigh = NULL; + bool do_tx_error = false; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (const struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) + dst = addr6->s6_addr32[3]; + else + do_tx_error = true; + + neigh_release(neigh); + if (do_tx_error) + goto tx_error; + } + + rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPV6, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + if (rt->rt_type != RTN_UNICAST) { + ip_rt_put(rt); + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + dev->stats.collisions++; + goto tx_error; + } + + if (df) { + mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); + + if (mtu < 68) { + dev->stats.collisions++; + ip_rt_put(rt); + goto tx_error; + } + + if (mtu < IPV6_MIN_MTU) { + mtu = IPV6_MIN_MTU; + df = 0; + } + + if (tunnel->parms.iph.daddr && skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ip_rt_put(rt); + goto tx_error; + } + } + + if (tunnel->err_count > 0) { + if (time_before(jiffies, + tunnel->err_time + 
IPTUNNEL_ERR_TIMEO)) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom || skb_shared(skb) || + (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + dev->stats.tx_dropped++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + iph6 = ipv6_hdr(skb); + } + + skb->transport_header = skb->network_header; + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags = 0; + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + + /* + * Push down and install the IPIP header. + */ + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPV6; + iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + iph->daddr = fl4.daddr; + iph->saddr = fl4.saddr; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + nf_reset(skb); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); + return NETDEV_TX_OK; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev->stats.tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void ipip6_tunnel_bind_dev(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + const struct iphdr *iph; + struct flowi4 fl4; + + tunnel = netdev_priv(dev); + iph = &tunnel->parms.iph; + + if (iph->daddr) { + struct rtable *rt = ip_route_output_ports(dev_net(dev), &fl4, NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPV6, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { + tdev = rt->dst.dev; + 
ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dev->iflink = tunnel->parms.link; +} + +static int +ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel_prl prl; + struct ip_tunnel *t; + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); +#ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel_6rd ip6rd; +#endif + + switch (cmd) { + case SIOCGETTUNNEL: +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: +#endif + t = NULL; + if (dev == sitn->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(net, &p, 0); + } + if (t == NULL) + t = netdev_priv(dev); + + err = -EFAULT; + if (cmd == SIOCGETTUNNEL) { + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, + sizeof(p))) + goto done; +#ifdef CONFIG_IPV6_SIT_6RD + } else { + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, + sizeof(ip6rd))) + goto done; +#endif + } + err = 0; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); + + 
if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = netdev_priv(dev); + ipip6_tunnel_unlink(sitn, t); + synchronize_net(); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip6_tunnel_link(sitn, t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + if (t->parms.link != p.link) { + t->parms.link = p.link; + ipip6_tunnel_bind_dev(dev); + netdev_state_change(dev); + } + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == sitn->fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(net, &p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == netdev_priv(sitn->fb_tunnel_dev)) + goto done; + dev = t->dev; + } + unregister_netdevice(dev); + err = 0; + break; + + case SIOCGETPRL: + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = -ENOENT; + if (!(t = netdev_priv(dev))) + goto done; + err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); + break; + + case SIOCADDPRL: + case SIOCDELPRL: + case SIOCCHGPRL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + err = -EINVAL; + if (dev == sitn->fb_tunnel_dev) + goto done; + err = -EFAULT; + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + goto done; + err = -ENOENT; + if (!(t = netdev_priv(dev))) + goto done; + + switch (cmd) { + case 
SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + netdev_state_change(dev); + break; + +#ifdef CONFIG_IPV6_SIT_6RD + case SIOCADD6RD: + case SIOCCHG6RD: + case SIOCDEL6RD: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, + sizeof(ip6rd))) + goto done; + + t = netdev_priv(dev); + + if (cmd != SIOCDEL6RD) { + struct in6_addr prefix; + __be32 relay_prefix; + + err = -EINVAL; + if (ip6rd.relay_prefixlen > 32 || + ip6rd.prefixlen + (32 - ip6rd.relay_prefixlen) > 64) + goto done; + + ipv6_addr_prefix(&prefix, &ip6rd.prefix, + ip6rd.prefixlen); + if (!ipv6_addr_equal(&prefix, &ip6rd.prefix)) + goto done; + if (ip6rd.relay_prefixlen) + relay_prefix = ip6rd.relay_prefix & + htonl(0xffffffffUL << + (32 - ip6rd.relay_prefixlen)); + else + relay_prefix = 0; + if (relay_prefix != ip6rd.relay_prefix) + goto done; + + t->ip6rd.prefix = prefix; + t->ip6rd.relay_prefix = relay_prefix; + t->ip6rd.prefixlen = ip6rd.prefixlen; + t->ip6rd.relay_prefixlen = ip6rd.relay_prefixlen; + } else + ipip6_tunnel_clone_6rd(dev, sitn); + + err = 0; + break; +#endif + + default: + err = -EINVAL; + } + +done: + return err; +} + +static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops ipip6_netdev_ops = { + .ndo_uninit = ipip6_tunnel_uninit, + .ndo_start_xmit = ipip6_tunnel_xmit, + .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_get_stats = ipip6_get_stats, +}; + +static void ipip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + +static void ipip6_tunnel_setup(struct net_device *dev) +{ + dev->netdev_ops = &ipip6_netdev_ops; + 
dev->destructor = ipip6_dev_free; + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + dev->iflink = 0; + dev->addr_len = 4; + dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; +} + +static int ipip6_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + tunnel->dev = dev; + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + ipip6_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; +} + +static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + struct iphdr *iph = &tunnel->parms.iph; + struct net *net = dev_net(dev); + struct sit_net *sitn = net_generic(net, sit_net_id); + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); + rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); + return 0; +} + +static struct xfrm_tunnel sit_handler __read_mostly = { + .handler = ipip6_rcv, + .err_handler = ipip6_err, + .priority = 1, +}; + +static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_head *head) +{ + int prio; + + for (prio = 1; prio < 4; prio++) { + int h; + for (h = 0; h < HASH_SIZE; h++) { + struct ip_tunnel *t; + + t = rtnl_dereference(sitn->tunnels[prio][h]); + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = rtnl_dereference(t->next); + } + } + } +} + +static int __net_init sit_init_net(struct net *net) +{ + struct sit_net *sitn = net_generic(net, sit_net_id); + struct ip_tunnel *t; + int err; + + sitn->tunnels[0] = 
sitn->tunnels_wc; + sitn->tunnels[1] = sitn->tunnels_l; + sitn->tunnels[2] = sitn->tunnels_r; + sitn->tunnels[3] = sitn->tunnels_r_l; + + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + ipip6_tunnel_setup); + if (!sitn->fb_tunnel_dev) { + err = -ENOMEM; + goto err_alloc_dev; + } + dev_net_set(sitn->fb_tunnel_dev, net); + + err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + if (err) + goto err_dev_free; + + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); + + if ((err = register_netdev(sitn->fb_tunnel_dev))) + goto err_reg_dev; + + t = netdev_priv(sitn->fb_tunnel_dev); + + strcpy(t->parms.name, sitn->fb_tunnel_dev->name); + return 0; + +err_reg_dev: + dev_put(sitn->fb_tunnel_dev); +err_dev_free: + ipip6_dev_free(sitn->fb_tunnel_dev); +err_alloc_dev: + return err; +} + +static void __net_exit sit_exit_net(struct net *net) +{ + struct sit_net *sitn = net_generic(net, sit_net_id); + LIST_HEAD(list); + + rtnl_lock(); + sit_destroy_tunnels(sitn, &list); + unregister_netdevice_queue(sitn->fb_tunnel_dev, &list); + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations sit_net_ops = { + .init = sit_init_net, + .exit = sit_exit_net, + .id = &sit_net_id, + .size = sizeof(struct sit_net), +}; + +static void __exit sit_cleanup(void) +{ + xfrm4_tunnel_deregister(&sit_handler, AF_INET6); + + unregister_pernet_device(&sit_net_ops); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ +} + +static int __init sit_init(void) +{ + int err; + + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + err = register_pernet_device(&sit_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&sit_handler, AF_INET6); + if (err < 0) { + unregister_pernet_device(&sit_net_ops); + printk(KERN_INFO "sit init: Can't add protocol\n"); + } + return err; +} + +module_init(sit_init); +module_exit(sit_cleanup); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c 
new file mode 100644 index 00000000..8e951d8d --- /dev/null +++ b/net/ipv6/syncookies.c @@ -0,0 +1,269 @@ +/* + * IPv6 Syncookies implementation for the Linux kernel + * + * Authors: + * Glenn Griffin <ggriffin.kernel@gmail.com> + * + * Based on IPv4 implementation by Andi Kleen + * linux/net/ipv4/syncookies.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/tcp.h> +#include <linux/random.h> +#include <linux/cryptohash.h> +#include <linux/kernel.h> +#include <net/ipv6.h> +#include <net/tcp.h> + +extern int sysctl_tcp_syncookies; +extern __u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS]; + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +/* Table must be sorted. */ +static __u16 const msstab[] = { + 64, + 512, + 536, + 1280 - 60, + 1480 - 60, + 1500 - 60, + 4460 - 60, + 9000 - 60, +}; + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. 
+ */ +#define COUNTER_TRIES 4 + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *child; + + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); + if (child) + inet_csk_reqsk_queue_add(sk, req, child); + else + reqsk_free(req); + + return child; +} + +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); + +static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, + __be16 sport, __be16 dport, u32 count, int c) +{ + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); + + /* + * we have 320 bits of information to hash, copy in the remaining + * 192 bits required for sha_transform, from the syncookie_secret + * and overwrite the digest with the secret + */ + memcpy(tmp + 10, syncookie_secret[c], 44); + memcpy(tmp, saddr, 16); + memcpy(tmp + 4, daddr, 16); + tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; + tmp[9] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __be16 sport, __be16 dport, __u32 sseq, + __u32 count, __u32 data) +{ + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, + __be16 dport, __u32 sseq, __u32 count, + __u32 maxdiff) +{ + __u32 diff; + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; +} + +__u32 
cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u16 *mssp) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int mssind; + const __u16 mss = *mssp; + + tcp_synq_overflow(sk); + + for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) + if (mss >= msstab[mssind]) + break; + + *mssp = msstab[mssind]; + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); + + return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, + th->dest, ntohl(th->seq), + jiffies / (HZ * 60), mssind); +} + +static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + __u32 seq = ntohl(th->seq) - 1; + __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, + th->source, th->dest, seq, + jiffies / (HZ * 60), COUNTER_TRIES); + + return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; +} + +struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tcp_opt; + const u8 *hash_location; + struct inet_request_sock *ireq; + struct inet6_request_sock *ireq6; + struct tcp_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = tcp_hdr(skb); + __u32 cookie = ntohl(th->ack_seq) - 1; + struct sock *ret = sk; + struct request_sock *req; + int mss; + struct dst_entry *dst; + __u8 rcv_wscale; + bool ecn_ok = false; + + if (!sysctl_tcp_syncookies || !th->ack || th->rst) + goto out; + + if (tcp_synq_no_recent_overflow(sk) || + (mss = cookie_check(skb, cookie)) == 0) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + + /* check for timestamp cookie support */ + memset(&tcp_opt, 0, sizeof(tcp_opt)); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); + + if (!cookie_check_timestamp(&tcp_opt, &ecn_ok)) + goto out; + + 
ret = NULL; + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (!req) + goto out; + + ireq = inet_rsk(req); + ireq6 = inet6_rsk(req); + treq = tcp_rsk(req); + + if (security_inet_conn_request(sk, skb, req)) + goto out_free; + + req->mss = mss; + ireq->rmt_port = th->source; + ireq->loc_port = th->dest; + ireq6->rmt_addr = ipv6_hdr(skb)->saddr; + ireq6->loc_addr = ipv6_hdr(skb)->daddr; + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + + ireq6->iif = sk->sk_bound_dev_if; + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + req->expires = 0UL; + req->retrans = 0; + ireq->ecn_ok = ecn_ok; + ireq->snd_wscale = tcp_opt.snd_wscale; + ireq->sack_ok = tcp_opt.sack_ok; + ireq->wscale_ok = tcp_opt.wscale_ok; + ireq->tstamp_ok = tcp_opt.saw_tstamp; + req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; + treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; + treq->rcv_isn = ntohl(th->seq) - 1; + treq->snt_isn = cookie; + + /* + * We need to lookup the dst_entry to get the correct window size. + * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten + * me if there is a preferred way. + */ + { + struct in6_addr *final_p, final; + struct flowi6 fl6; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = ireq6->rmt_addr; + final_p = fl6_update_dst(&fl6, np->opt, &final); + fl6.saddr = ireq6->loc_addr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_sk(sk)->inet_sport; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) + goto out_free; + } + + req->window_clamp = tp->window_clamp ? 
:dst_metric(dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + ireq->wscale_ok, &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); + + ireq->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, dst); +out: + return ret; +out_free: + reqsk_free(req); + return NULL; +} + diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c new file mode 100644 index 00000000..166a57c4 --- /dev/null +++ b/net/ipv6/sysctl_net_ipv6.c @@ -0,0 +1,177 @@ +/* + * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. + * + * Changes: + * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <net/ndisc.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/inet_frag.h> + +static struct ctl_table empty[1]; + +static ctl_table ipv6_static_skeleton[] = { + { + .procname = "neigh", + .maxlen = 0, + .mode = 0555, + .child = empty, + }, + { } +}; + +static ctl_table ipv6_table_template[] = { + { + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv6_route_table_template + }, + { + .procname = "icmp", + .maxlen = 0, + .mode = 0555, + .child = ipv6_icmp_table_template + }, + { + .procname = "bindv6only", + .data = &init_net.ipv6.sysctl.bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static ctl_table ipv6_rotable[] = { + { + .procname = "mld_max_msf", + .data = &sysctl_mld_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +struct ctl_path net_ipv6_ctl_path[] = { + { .procname = "net", }, + { .procname = "ipv6", }, + { }, +}; +EXPORT_SYMBOL_GPL(net_ipv6_ctl_path); + +static int __net_init ipv6_sysctl_net_init(struct net *net) +{ + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table 
*ipv6_icmp_table; + int err; + + err = -ENOMEM; + ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), + GFP_KERNEL); + if (!ipv6_table) + goto out; + + ipv6_route_table = ipv6_route_sysctl_init(net); + if (!ipv6_route_table) + goto out_ipv6_table; + ipv6_table[0].child = ipv6_route_table; + + ipv6_icmp_table = ipv6_icmp_sysctl_init(net); + if (!ipv6_icmp_table) + goto out_ipv6_route_table; + ipv6_table[1].child = ipv6_icmp_table; + + ipv6_table[2].data = &net->ipv6.sysctl.bindv6only; + + net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path, + ipv6_table); + if (!net->ipv6.sysctl.table) + goto out_ipv6_icmp_table; + + err = 0; +out: + return err; + +out_ipv6_icmp_table: + kfree(ipv6_icmp_table); +out_ipv6_route_table: + kfree(ipv6_route_table); +out_ipv6_table: + kfree(ipv6_table); + goto out; +} + +static void __net_exit ipv6_sysctl_net_exit(struct net *net) +{ + struct ctl_table *ipv6_table; + struct ctl_table *ipv6_route_table; + struct ctl_table *ipv6_icmp_table; + + ipv6_table = net->ipv6.sysctl.table->ctl_table_arg; + ipv6_route_table = ipv6_table[0].child; + ipv6_icmp_table = ipv6_table[1].child; + + unregister_net_sysctl_table(net->ipv6.sysctl.table); + + kfree(ipv6_table); + kfree(ipv6_route_table); + kfree(ipv6_icmp_table); +} + +static struct pernet_operations ipv6_sysctl_net_ops = { + .init = ipv6_sysctl_net_init, + .exit = ipv6_sysctl_net_exit, +}; + +static struct ctl_table_header *ip6_header; + +int ipv6_sysctl_register(void) +{ + int err = -ENOMEM; + + ip6_header = register_net_sysctl_rotable(net_ipv6_ctl_path, ipv6_rotable); + if (ip6_header == NULL) + goto out; + + err = register_pernet_subsys(&ipv6_sysctl_net_ops); + if (err) + goto err_pernet; +out: + return err; + +err_pernet: + unregister_net_sysctl_table(ip6_header); + goto out; +} + +void ipv6_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_header); + unregister_pernet_subsys(&ipv6_sysctl_net_ops); +} + +static struct ctl_table_header 
*ip6_base; + +int ipv6_static_sysctl_register(void) +{ + ip6_base = register_sysctl_paths(net_ipv6_ctl_path, ipv6_static_skeleton); + if (ip6_base == NULL) + return -ENOMEM; + return 0; +} + +void ipv6_static_sysctl_unregister(void) +{ + unregister_net_sysctl_table(ip6_base); +} diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c new file mode 100644 index 00000000..98256cf7 --- /dev/null +++ b/net/ipv6/tcp_ipv6.c @@ -0,0 +1,2196 @@ +/* + * TCP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on: + * linux/net/ipv4/tcp.c + * linux/net/ipv4/tcp_input.c + * linux/net/ipv4/tcp_output.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/bottom_half.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/jiffies.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/init.h> +#include <linux/jhash.h> +#include <linux/ipsec.h> +#include <linux/times.h> +#include <linux/slab.h> + +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> + +#include <net/tcp.h> +#include <net/ndisc.h> +#include <net/inet6_hashtables.h> +#include <net/inet6_connection_sock.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/ip6_checksum.h> +#include <net/inet_ecn.h> +#include <net/protocol.h> +#include <net/xfrm.h> +#include <net/snmp.h> +#include <net/dsfield.h> +#include <net/timewait_sock.h> +#include <net/netdma.h> +#include <net/inet_common.h> +#include <net/secure_seq.h> +#include <net/tcp_memcontrol.h> + +#include <asm/uaccess.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <linux/crypto.h> +#include <linux/scatterlist.h> + +static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb); +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +static void __tcp_v6_send_check(struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr); + +static const struct inet_connection_sock_af_ops ipv6_mapped; +static const struct inet_connection_sock_af_ops ipv6_specific; +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; +#else +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, + const struct in6_addr *addr) +{ + return NULL; +} +#endif + +static void 
tcp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { + tcp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(sk, NULL); + local_bh_enable(); + } +} + +static __inline__ __sum16 tcp_v6_check(int len, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __wsum base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); +} + +static __u32 tcp_v6_init_sequence(const struct sk_buff *skb) +{ + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct in6_addr *saddr = NULL, *final_p, final; + struct rt6_info *rt; + struct flowi6 fl6; + struct dst_entry *dst; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl6, 0, sizeof(fl6)); + + if (np->sndflow) { + fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6.flowlabel); + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + usin->sin6_addr = flowlabel->dst; + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). 
+ */ + + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + if (tp->rx_opt.ts_recent_stamp && + !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + np->daddr = usin->sin6_addr; + np->flow_label = fl6.flowlabel; + + /* + * TCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = icsk->icsk_ext_hdr_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + icsk->icsk_af_ops = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_mapped_specific; +#endif + + err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; + sk->sk_backlog_rcv = tcp_v6_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_specific; +#endif + goto failure; + } else { + ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); + ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, + &np->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = np->daddr; + fl6.saddr = saddr ? 
*saddr : np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = usin->sin6_port; + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, np->opt, &final); + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto failure; + } + + if (saddr == NULL) { + saddr = &fl6.saddr; + np->rcv_saddr = *saddr; + } + + /* set the source address */ + np->saddr = *saddr; + inet->inet_rcv_saddr = LOOPBACK4_IPV6; + + sk->sk_gso_type = SKB_GSO_TCPV6; + __ip6_dst_store(sk, dst, NULL, NULL); + + rt = (struct rt6_info *) dst; + if (tcp_death_row.sysctl_tw_recycle && + !tp->rx_opt.ts_recent_stamp && + ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { + struct inet_peer *peer = rt6_get_peer(rt); + /* + * VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state + * TIME-WAIT * and initialize rx_opt.ts_recent from it, + * when trying new connection. 
+ */ + if (peer) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + } + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); + + tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + + inet->inet_dport = usin->sin6_port; + + tcp_set_state(sk, TCP_SYN_SENT); + err = inet6_hash_connect(&tcp_death_row, sk); + if (err) + goto late_failure; + + if (!tp->write_seq) + tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); + + err = tcp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + tcp_set_state(sk, TCP_CLOSE); + __sk_dst_reset(sk); +failure: + inet->inet_dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *hdr = (const struct ipv6hdr*)skb->data; + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + struct tcp_sock *tp; + __u32 seq; + struct net *net = dev_net(skb->dev); + + sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, + th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + inet_twsk_put(inet_twsk(sk)); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != 
TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi6 fl6; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = np->daddr; + fl6.saddr = np->saddr; + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet->inet_dport; + fl6.fl6_sport = inet->inet_sport; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + dst = ip6_dst_lookup_flow(sk, &fl6, NULL, false); + if (IS_ERR(dst)) { + sk->sk_err_soft = -PTR_ERR(dst); + goto out; + } + + } else + dst_hold(dst); + + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk != NULL); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. 
+ It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + sk->sk_err = err; + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + + tcp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) +{ + struct inet6_request_sock *treq = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff * skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr * final_p, final; + struct flowi6 fl6; + struct dst_entry *dst; + int err; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_TCP; + fl6.daddr = treq->rmt_addr; + fl6.saddr = treq->loc_addr; + fl6.flowlabel = 0; + fl6.flowi6_oif = treq->iif; + fl6.flowi6_mark = sk->sk_mark; + fl6.fl6_dport = inet_rsk(req)->rmt_port; + fl6.fl6_sport = inet_rsk(req)->loc_port; + security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + + opt = np->opt; + final_p = fl6_update_dst(&fl6, opt, &final); + + dst = ip6_dst_lookup_flow(sk, &fl6, final_p, false); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto done; + } + skb = tcp_make_synack(sk, dst, req, rvp); + err = -ENOMEM; + if (skb) { + __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); + + fl6.daddr = treq->rmt_addr; + err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); + err = net_xmit_eval(err); + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return err; +} + +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) +{ + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v6_send_synack(sk, req, rvp); +} + +static void tcp_v6_reqsk_destructor(struct request_sock *req) +{ + 
kfree_skb(inet6_rsk(req)->pktopts); +} + +#ifdef CONFIG_TCP_MD5SIG +static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, + const struct in6_addr *addr) +{ + return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6); +} + +static struct tcp_md5sig_key *tcp_v6_md5_lookup(struct sock *sk, + struct sock *addr_sk) +{ + return tcp_v6_md5_do_lookup(sk, &inet6_sk(addr_sk)->daddr); +} + +static struct tcp_md5sig_key *tcp_v6_reqsk_md5_lookup(struct sock *sk, + struct request_sock *req) +{ + return tcp_v6_md5_do_lookup(sk, &inet6_rsk(req)->rmt_addr); +} + +static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, + int optlen) +{ + struct tcp_md5sig cmd; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; + + if (optlen < sizeof(cmd)) + return -EINVAL; + + if (copy_from_user(&cmd, optval, sizeof(cmd))) + return -EFAULT; + + if (sin6->sin6_family != AF_INET6) + return -EINVAL; + + if (!cmd.tcpm_keylen) { + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET); + return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6); + } + + if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) + return -EINVAL; + + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], + AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + + return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, + AF_INET6, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); +} + +static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, + const struct in6_addr *daddr, + const struct in6_addr *saddr, int nbytes) +{ + struct tcp6_pseudohdr *bp; + struct scatterlist sg; + + bp = &hp->md5_blk.ip6; + /* 1. 
TCP pseudo-header (RFC2460) */ + bp->saddr = *saddr; + bp->daddr = *daddr; + bp->protocol = cpu_to_be32(IPPROTO_TCP); + bp->len = cpu_to_be32(nbytes); + + sg_init_one(&sg, bp, sizeof(*bp)); + return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); +} + +static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) +{ + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} + +static int tcp_v6_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, + const struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) +{ + const struct in6_addr *saddr, *daddr; + struct tcp_md5sig_pool *hp; + struct hash_desc *desc; + const struct tcphdr *th = tcp_hdr(skb); + + if (sk) { + saddr = &inet6_sk(sk)->saddr; + daddr = &inet6_sk(sk)->daddr; + } else if (req) { + saddr = &inet6_rsk(req)->loc_addr; + daddr = &inet6_rsk(req)->rmt_addr; + } else { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; + } + + hp = tcp_get_md5sig_pool(); + if (!hp) + goto clear_hash_noput; + desc = &hp->md5_desc; + + if (crypto_hash_init(desc)) + goto clear_hash; + + if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) + goto clear_hash; + if (tcp_md5_hash_header(hp, th)) + goto clear_hash; + if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2)) + goto clear_hash; + if (tcp_md5_hash_key(hp, key)) + goto 
clear_hash; + if (crypto_hash_final(desc, md5_hash)) + goto clear_hash; + + tcp_put_md5sig_pool(); + return 0; + +clear_hash: + tcp_put_md5sig_pool(); +clear_hash_noput: + memset(md5_hash, 0, 16); + return 1; +} + +static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) +{ + const __u8 *hash_location = NULL; + struct tcp_md5sig_key *hash_expected; + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); + int genhash; + u8 newhash[16]; + + hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr); + hash_location = tcp_parse_md5sig_option(th); + + /* We've parsed the options - do we have a hash? */ + if (!hash_expected && !hash_location) + return 0; + + if (hash_expected && !hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + return 1; + } + + if (!hash_expected && hash_location) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + return 1; + } + + /* check the signature */ + genhash = tcp_v6_md5_hash_skb(newhash, + hash_expected, + NULL, NULL, skb); + + if (genhash || memcmp(hash_location, newhash, 16) != 0) { + if (net_ratelimit()) { + printk(KERN_INFO "MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", + genhash ? 
"failed" : "mismatch", + &ip6h->saddr, ntohs(th->source), + &ip6h->daddr, ntohs(th->dest)); + } + return 1; + } + return 0; +} +#endif + +struct request_sock_ops tcp6_request_sock_ops __read_mostly = { + .family = AF_INET6, + .obj_size = sizeof(struct tcp6_request_sock), + .rtx_syn_ack = tcp_v6_rtx_synack, + .send_ack = tcp_v6_reqsk_send_ack, + .destructor = tcp_v6_reqsk_destructor, + .send_reset = tcp_v6_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { + .md5_lookup = tcp_v6_reqsk_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, +}; +#endif + +static void __tcp_v6_send_check(struct sk_buff *skb, + const struct in6_addr *saddr, const struct in6_addr *daddr) +{ + struct tcphdr *th = tcp_hdr(skb); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v6_check(skb->len, saddr, daddr, + csum_partial(th, th->doff << 2, + skb->csum)); + } +} + +static void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + __tcp_v6_send_check(skb, &np->saddr, &np->daddr); +} + +static int tcp_v6_gso_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct tcphdr *th; + + if (!pskb_may_pull(skb, sizeof(*th))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + th = tcp_hdr(skb); + + th->check = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + __tcp_v6_send_check(skb, &ipv6h->saddr, &ipv6h->daddr); + return 0; +} + +static struct sk_buff **tcp6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr, + skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + 
break; + } + + /* fall through */ + case CHECKSUM_NONE: + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + return tcp_gro_receive(head, skb); +} + +static int tcp6_gro_complete(struct sk_buff *skb) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v6_check(skb->len - skb_transport_offset(skb), + &iph->saddr, &iph->daddr, 0); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + + return tcp_gro_complete(skb); +} + +static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, + u32 ts, struct tcp_md5sig_key *key, int rst, u8 tclass) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcphdr *t1; + struct sk_buff *buff; + struct flowi6 fl6; + struct net *net = dev_net(skb_dst(skb)->dev); + struct sock *ctl_sk = net->ipv6.tcp_sk; + unsigned int tot_len = sizeof(struct tcphdr); + struct dst_entry *dst; + __be32 *topt; + + if (ts) + tot_len += TCPOLEN_TSTAMP_ALIGNED; +#ifdef CONFIG_TCP_MD5SIG + if (key) + tot_len += TCPOLEN_MD5SIG_ALIGNED; +#endif + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); + + t1 = (struct tcphdr *) skb_push(buff, tot_len); + skb_reset_transport_header(buff); + + /* Swap the send and the receive. 
*/ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len / 4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = !rst || !th->ack; + t1->rst = rst; + t1->window = htons(win); + + topt = (__be32 *)(t1 + 1); + + if (ts) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *topt++ = htonl(tcp_time_stamp); + *topt++ = htonl(ts); + } + +#ifdef CONFIG_TCP_MD5SIG + if (key) { + *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); + tcp_v6_md5_hash_hdr((__u8 *)topt, key, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, t1); + } +#endif + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = ipv6_hdr(skb)->saddr; + fl6.saddr = ipv6_hdr(skb)->daddr; + + buff->ip_summed = CHECKSUM_PARTIAL; + buff->csum = 0; + + __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); + + fl6.flowi6_proto = IPPROTO_TCP; + fl6.flowi6_oif = inet6_iif(skb); + fl6.fl6_dport = t1->dest; + fl6.fl6_sport = t1->source; + security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + + /* Pass a socket to ip6_dst_lookup either it is for RST + * Underlying function will use this to retrieve the network + * namespace + */ + dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false); + if (!IS_ERR(dst)) { + skb_dst_set(buff, dst); + ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + if (rst) + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) +{ + const struct tcphdr *th = tcp_hdr(skb); + u32 seq = 0, ack_seq = 0; + struct tcp_md5sig_key *key = NULL; +#ifdef CONFIG_TCP_MD5SIG + const __u8 *hash_location = NULL; + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + unsigned char newhash[16]; + int genhash; + struct sock *sk1 = NULL; +#endif + + if (th->rst) + return; + + if (!ipv6_unicast_destination(skb)) + return; + +#ifdef CONFIG_TCP_MD5SIG 
+ hash_location = tcp_parse_md5sig_option(th); + if (!sk && hash_location) { + /* + * active side is lost. Try to find listening socket through + * source port, and then find md5 key through listening socket. + * we are not loose security here: + * Incoming packet is checked with md5 hash with finding key, + * no RST generated if md5 hash doesn't match. + */ + sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + &tcp_hashinfo, &ipv6h->daddr, + ntohs(th->source), inet6_iif(skb)); + if (!sk1) + return; + + rcu_read_lock(); + key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); + if (!key) + goto release_sk1; + + genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, NULL, skb); + if (genhash || memcmp(hash_location, newhash, 16) != 0) + goto release_sk1; + } else { + key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL; + } +#endif + + if (th->ack) + seq = ntohl(th->ack_seq); + else + ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - + (th->doff << 2); + + tcp_v6_send_response(skb, seq, ack_seq, 0, 0, key, 1, 0); + +#ifdef CONFIG_TCP_MD5SIG +release_sk1: + if (sk1) { + rcu_read_unlock(); + sock_put(sk1); + } +#endif +} + +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts, + struct tcp_md5sig_key *key, u8 tclass) +{ + tcp_v6_send_response(skb, seq, ack, win, ts, key, 0, tclass); +} + +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tcptw->tw_ts_recent, tcp_twsk_md5_key(tcptw), + tw->tw_tclass); + + inet_twsk_put(tw); +} + +static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent, + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr), 0); +} + + +static 
struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct request_sock *req, **prev; + const struct tcphdr *th = tcp_hdr(skb); + struct sock *nsk; + + /* Find possible connection requests. */ + req = inet6_csk_search_req(sk, &prev, th->source, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, inet6_iif(skb)); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo, + &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, ntohs(th->dest), inet6_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put(inet_twsk(nsk)); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->syn) + sk = cookie_v6_check(sk, skb); +#endif + return sk; +} + +/* FIXME: this is substantially similar to the ipv4 code. + * Can some kind of merge be done? -- erics + */ +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_extend_values tmp_ext; + struct tcp_options_received tmp_opt; + const u8 *hash_location; + struct request_sock *req; + struct inet6_request_sock *treq; + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; + int want_cookie = 0; + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6"); + if (!want_cookie) + goto drop; + } + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); + if (req == NULL) + goto drop; + +#ifdef CONFIG_TCP_MD5SIG + tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops; +#endif + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.user_mss = 
tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *d; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_free; + + /* Secret recipe starts with IP addresses */ + d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0]; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0]; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + *mess++ ^= *d++; + + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + + want_cookie = 0; /* not our kind of cookie */ + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. 
*/ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_free; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + treq = inet6_rsk(req); + treq->rmt_addr = ipv6_hdr(skb)->saddr; + treq->loc_addr = ipv6_hdr(skb)->daddr; + if (!want_cookie || tmp_opt.tstamp_ok) + TCP_ECN_create_request(req, tcp_hdr(skb)); + + treq->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL) + treq->iif = inet6_iif(skb); + + if (!isn) { + struct inet_peer *peer = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + treq->pktopts = skb; + } + + if (want_cookie) { + isn = cookie_v6_init_sequence(sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + goto have_isn; + } + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. 
+ */ + if (tmp_opt.saw_tstamp && + tcp_death_row.sysctl_tw_recycle && + (dst = inet6_csk_route_req(sk, req)) != NULL && + (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && + ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, + &treq->rmt_addr)) { + inet_peer_refcheck(peer); + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > + TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst_metric(dst, RTAX_RTT))) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. 
+ */ + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n", + &treq->rmt_addr, ntohs(tcp_hdr(skb)->source)); + goto drop_and_release; + } + + isn = tcp_v6_init_sequence(skb); + } +have_isn: + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + + security_inet_conn_request(sk, skb, req); + + if (tcp_v6_send_synack(sk, req, + (struct request_values *)&tmp_ext) || + want_cookie) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + return 0; /* don't send reset */ +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *treq; + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct tcp6_sock *newtcp6sk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct ipv6_txoptions *opt; +#ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *key; +#endif + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + if (newsk == NULL) + return NULL; + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + newtp = tcp_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set_v4mapped(newinet->inet_daddr, &newnp->daddr); + + ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr); + + newnp->rcv_saddr = newnp->saddr; + + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; + newsk->sk_backlog_rcv = tcp_v4_do_rcv; +#ifdef CONFIG_TCP_MD5SIG + newtp->af_specific = &tcp_sock_ipv6_mapped_specific; +#endif + + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_tclass 
= ipv6_tclass(ipv6_hdr(skb)); + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, tcp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. + */ + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); + + return newsk; + } + + treq = inet6_rsk(req); + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (!dst) { + dst = inet6_csk_route_req(sk, req); + if (!dst) + goto out; + } + + newsk = tcp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out_nonewsk; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, tcp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + newsk->sk_gso_type = SKB_GSO_TCPV6; + __ip6_dst_store(newsk, dst, NULL, NULL); + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + newnp->daddr = treq->rmt_addr; + newnp->saddr = treq->loc_addr; + newnp->rcv_saddr = treq->loc_addr; + newsk->sk_bound_dev_if = treq->iif; + + /* Now IPv6 options... + + First: no IPv4 options. 
+ */ + newinet->inet_opt = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (treq->pktopts != NULL) { + newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); + kfree_skb(treq->pktopts); + treq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; + newnp->rcv_tclass = ipv6_tclass(ipv6_hdr(skb)); + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + if (newnp->opt) + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); + + tcp_mtup_init(newsk); + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_initialize_rcv_mss(newsk); + if (tcp_rsk(req)->snt_synack) + tcp_valid_rtt_meas(newsk, + tcp_time_stamp - tcp_rsk(req)->snt_synack); + newtp->total_retrans = req->retrans; + + newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; + newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + +#ifdef CONFIG_TCP_MD5SIG + /* Copy over the MD5 key from the original socket */ + if ((key = tcp_v6_md5_do_lookup(sk, &newnp->daddr)) != NULL) { + /* We're using one, so create a matching key + * on the newsk structure. If we fail to get + * memory, then we end up not copying the key + * across. Shucks. 
+ */ + tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, + AF_INET6, key->key, key->keylen, GFP_ATOMIC); + } +#endif + + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } + __inet6_hash(newsk, NULL); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return NULL; +} + +static __sum16 tcp_v6_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + if (!tcp_v6_check(skb->len, &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; + } + } + + skb->csum = ~csum_unfold(tcp_v6_check(skb->len, + &ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, 0)); + + if (skb->len <= 76) { + return __skb_checksum_complete(skb); + } + return 0; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). 
--ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + +#ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash (sk, skb)) + goto discard; +#endif + + if (sk_filter(sk, skb)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + sock_rps_save_rxhash(sk, skb); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) + goto ipv6_pktoptions; + return 0; + } + + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + sock_rps_save_rxhash(nsk, skb); + if (tcp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } else + sock_rps_save_rxhash(sk, skb); + + if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) + goto reset; + if (opt_skb) + goto ipv6_pktoptions; + return 0; + +reset: + tcp_v6_send_reset(sk, skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +csum_err: + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + goto discard; + + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. 
skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. + */ + tp = tcp_sk(sk); + if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) + np->mcast_oif = inet6_iif(opt_skb); + if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) + np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; + if (np->rxopt.bits.rxtclass) + np->rcv_tclass = ipv6_tclass(ipv6_hdr(opt_skb)); /* read from our clone like the fields above; skb was already handed to the TCP receive path */ + if (ipv6_opt_accepted(sk, opt_skb)) { + skb_set_owner_r(opt_skb, sk); + opt_skb = xchg(&np->pktoptions, opt_skb); + } else { + __kfree_skb(opt_skb); + opt_skb = xchg(&np->pktoptions, NULL); + } + } + + kfree_skb(opt_skb); + return 0; +} + +static int tcp_v6_rcv(struct sk_buff *skb) +{ + const struct tcphdr *th; + const struct ipv6hdr *hdr; + struct sock *sk; + int ret; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* + * Count it even if it's bad. 
+ */ + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr)/4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff*4)) + goto discard_it; + + if (!skb_csum_unnecessary(skb) && tcp_v6_checksum_init(skb)) + goto bad_packet; + + th = tcp_hdr(skb); + hdr = ipv6_hdr(skb); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); + TCP_SKB_CB(skb)->sacked = 0; + + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock_nested(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = net_dma_find_channel(); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } + } else if (unlikely(sk_add_backlog(sk, skb))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } + bh_unlock_sock(sk); + + sock_put(sk); + return ret ? 
-1 : 0; + +no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + } else { + tcp_v6_send_reset(NULL, skb); + } + +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + inet_twsk_put(inet_twsk(sk)); + goto discard_it; + } + + switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { + case TCP_TW_SYN: + { + struct sock *sk2; + + sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + &ipv6_hdr(skb)->daddr, + ntohs(th->dest), inet6_iif(skb)); + if (sk2 != NULL) { + struct inet_timewait_sock *tw = inet_twsk(sk); + inet_twsk_deschedule(tw, &tcp_death_row); + inet_twsk_put(tw); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v6_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +static struct inet_peer *tcp_v6_get_peer(struct sock *sk, bool *release_it) +{ + struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_peer *peer; + + if (!rt || + !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) { + peer = inet_getpeer_v6(&np->daddr, 1); + *release_it = true; + } else { + if (!rt->rt6i_peer) + rt6_bind_peer(rt, 1); + peer = rt->rt6i_peer; + *release_it = false; + } + + return peer; +} + +static void *tcp_v6_tw_get_peer(struct sock *sk) +{ + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); + const struct inet_timewait_sock *tw = inet_twsk(sk); + + if (tw->tw_family == AF_INET) + return tcp_v4_tw_get_peer(sk); + + return 
inet_getpeer_v6(&tw6->tw_v6_daddr, 1); +} + +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, + .twsk_destructor= tcp_twsk_destructor, + .twsk_getpeer = tcp_v6_tw_get_peer, +}; + +static const struct inet_connection_sock_af_ops ipv6_specific = { + .queue_xmit = inet6_csk_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .get_peer = tcp_v6_get_peer, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { + .md5_lookup = tcp_v6_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, + .md5_parse = tcp_v6_parse_md5_keys, +}; +#endif + +/* + * TCP over IPv4 via INET6 API + */ + +static const struct inet_connection_sock_af_ops ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .get_peer = tcp_v4_get_peer, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_ipv6_setsockopt, + .compat_getsockopt = compat_ipv6_getsockopt, +#endif +}; + +#ifdef CONFIG_TCP_MD5SIG +static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { + .md5_lookup = tcp_v4_md5_lookup, + 
.calc_md5_hash = tcp_v4_md5_hash_skb, + .md5_parse = tcp_v6_parse_md5_keys, +}; +#endif + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v6_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + icsk->icsk_af_ops = &ipv6_specific; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcp_sock_ipv6_specific; +#endif + + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. 
*/ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + local_bh_disable(); + sock_update_memcg(sk); + sk_sockets_allocated_inc(sk); + local_bh_enable(); + + return 0; +} + +static void tcp_v6_destroy_sock(struct sock *sk) +{ + tcp_v4_destroy_sock(sk); + inet6_destroy_sock(sk); +} + +#ifdef CONFIG_PROC_FS +/* Proc filesystem TCPv6 sock list dumping. */ +static void get_openreq6(struct seq_file *seq, + const struct sock *sk, struct request_sock *req, int i, int uid) +{ + int ttd = req->expires - jiffies; + const struct in6_addr *src = &inet6_rsk(req)->loc_addr; + const struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; + + if (ttd < 0) + ttd = 0; + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(inet_rsk(req)->loc_port), + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], + ntohs(inet_rsk(req)->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. 
*/ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + 0, req); +} + +static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) +{ + const struct in6_addr *dest, *src; + __u16 destp, srcp; + int timer_active; + unsigned long timer_expires; + const struct inet_sock *inet = inet_sk(sp); + const struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); + const struct ipv6_pinfo *np = inet6_sk(sp); + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + timer_active = 1; + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + timer_active = 4; + timer_expires = icsk->icsk_timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %lu %lu %u %u %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + tp->write_seq-tp->snd_una, + (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq), + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + icsk->icsk_retransmits, + sock_i_uid(sp), + icsk->icsk_probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + jiffies_to_clock_t(icsk->icsk_rto), + jiffies_to_clock_t(icsk->icsk_ack.ato), + (icsk->icsk_ack.quick << 1 ) | icsk->icsk_ack.pingpong, + tp->snd_cwnd, + tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh + ); +} + +static void get_timewait6_sock(struct seq_file *seq, + struct inet_timewait_sock *tw, int i) +{ + const struct in6_addr *dest, *src; + __u16 destp, srcp; + const struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +static int tcp6_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp6_sock(seq, v, st->num); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait6_sock(seq, v, st->num); + break; + } +out: + return 0; +} + +static const struct file_operations tcp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = tcp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + +static struct tcp_seq_afinfo tcp6_seq_afinfo = { + .name = "tcp6", + .family = AF_INET6, + .seq_fops = &tcp6_afinfo_seq_fops, + .seq_ops = { + .show = tcp6_seq_show, + }, +}; + +int __net_init tcp6_proc_init(struct net *net) +{ + return tcp_proc_register(net, &tcp6_seq_afinfo); +} + +void tcp6_proc_exit(struct 
net *net) +{ + tcp_proc_unregister(net, &tcp6_seq_afinfo); +} +#endif + +struct proto tcpv6_prot = { + .name = "TCPv6", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v6_connect, + .disconnect = tcp_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = tcp_v6_init_sock, + .destroy = tcp_v6_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v6_do_rcv, + .hash = tcp_v6_hash, + .unhash = inet_unhash, + .get_port = inet_csk_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .orphan_count = &tcp_orphan_count, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .twsk_prot = &tcp6_timewait_sock_ops, + .rsk_prot = &tcp6_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_tcp_setsockopt, + .compat_getsockopt = compat_tcp_getsockopt, +#endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM + .proto_cgroup = tcp_proto_cgroup, +#endif +}; + +static const struct inet6_protocol tcpv6_protocol = { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .gso_send_check = tcp_v6_gso_send_check, + .gso_segment = tcp_tso_segment, + .gro_receive = tcp6_gro_receive, + .gro_complete = tcp6_gro_complete, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static struct inet_protosw tcpv6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcpv6_prot, + .ops = &inet6_stream_ops, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, +}; + +static int __net_init tcpv6_net_init(struct net *net) +{ + return 
inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); +} + +static void __net_exit tcpv6_net_exit(struct net *net) +{ + inet_ctl_sock_destroy(net->ipv6.tcp_sk); +} + +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); +} + +static struct pernet_operations tcpv6_net_ops = { + .init = tcpv6_net_init, + .exit = tcpv6_net_exit, + .exit_batch = tcpv6_net_exit_batch, +}; + +int __init tcpv6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); + if (ret) + goto out; + + /* register inet6 protocol */ + ret = inet6_register_protosw(&tcpv6_protosw); + if (ret) + goto out_tcpv6_protocol; + + ret = register_pernet_subsys(&tcpv6_net_ops); + if (ret) + goto out_tcpv6_protosw; +out: + return ret; + +out_tcpv6_protosw: /* pernet registration failed: drop the protosw, then fall through */ + inet6_unregister_protosw(&tcpv6_protosw); +out_tcpv6_protocol: /* protosw registration failed: only the protocol handler is registered */ + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); + goto out; +} + +void tcpv6_exit(void) +{ + unregister_pernet_subsys(&tcpv6_net_ops); + inet6_unregister_protosw(&tcpv6_protosw); + inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); +} diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c new file mode 100644 index 00000000..4f3cec12 --- /dev/null +++ b/net/ipv6/tunnel6.c @@ -0,0 +1,184 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA <mk@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + */ + +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/xfrm.h> + +static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static DEFINE_MUTEX(tunnel6_mutex); + +int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) +{ + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; + int ret = -EEXIST; + int priority = handler->priority; + + mutex_lock(&tunnel6_mutex); + + for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) + break; + if (t->priority == priority) + goto err; + } + + handler->next = *pprev; + rcu_assign_pointer(*pprev, handler); + + ret = 0; + +err: + mutex_unlock(&tunnel6_mutex); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_register); + +int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) +{ + struct xfrm6_tunnel __rcu **pprev; + struct xfrm6_tunnel *t; + int ret = -ENOENT; + + mutex_lock(&tunnel6_mutex); + + for (pprev = (family == AF_INET6) ? 
&tunnel6_handlers : &tunnel46_handlers; + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel6_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { + *pprev = handler->next; + ret = 0; + break; + } + } + + mutex_unlock(&tunnel6_mutex); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_deregister); + +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + +static int tunnel6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnel6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +static int tunnel46_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + for_each_tunnel_rcu(tunnel46_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + +static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnel6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + break; +} + +static const struct inet6_protocol tunnel6_protocol = { + .handler = tunnel6_rcv, + .err_handler = tunnel6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static const struct inet6_protocol tunnel46_protocol = { + .handler = tunnel46_rcv, + .err_handler = tunnel6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static int __init tunnel6_init(void) +{ + if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { + printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + return 
-EAGAIN; + } + if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { + printk(KERN_ERR "tunnel6 init(): can't add protocol\n"); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + return -EAGAIN; + } + return 0; +} + +static void __exit tunnel6_fini(void) +{ + if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) + printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); + if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) + printk(KERN_ERR "tunnel6 close: can't remove protocol\n"); +} + +module_init(tunnel6_init); +module_exit(tunnel6_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c new file mode 100644 index 00000000..37b0699e --- /dev/null +++ b/net/ipv6/udp.c @@ -0,0 +1,1526 @@ +/* + * UDP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/ipv4/udp.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <asm/uaccess.h> + +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/raw.h> +#include <net/tcp_states.h> +#include <net/ip6_checksum.h> +#include <net/xfrm.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include "udp_impl.h" + +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +{ + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); + __be32 sk1_rcv_saddr = sk_rcv_saddr(sk); + __be32 sk2_rcv_saddr = sk_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = inet_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? 
ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + /* if both are mapped, treat as IPv4 */ + if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) + return (!sk2_ipv6only && + (!sk1_rcv_saddr || !sk2_rcv_saddr || + sk1_rcv_saddr == sk2_rcv_saddr)); + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + return 0; +} + +static unsigned int udp6_portaddr_hash(struct net *net, + const struct in6_addr *addr6, + unsigned int port) +{ + unsigned int hash, mix = net_hash_mix(net); + + if (ipv6_addr_any(addr6)) + hash = jhash_1word(0, mix); + else if (ipv6_addr_v4mapped(addr6)) + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); + else + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); + + return hash ^ port; +} + + +int udp_v6_get_port(struct sock *sk, unsigned short snum) +{ + unsigned int hash2_nulladdr = + udp6_portaddr_hash(sock_net(sk), &in6addr_any, snum); + unsigned int hash2_partial = + udp6_portaddr_hash(sock_net(sk), &inet6_sk(sk)->rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal, hash2_nulladdr); +} + +static void udp_v6_rehash(struct sock *sk) +{ + u16 new_hash = udp6_portaddr_hash(sock_net(sk), + &inet6_sk(sk)->rcv_saddr, + inet_sk(sk)->inet_num); + + udp_lib_rehash(sk, new_hash); +} + +static inline int compute_score(struct sock *sk, struct net *net, + unsigned short hnum, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = 
inet_sk(sk); + + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + +#define SCORE2_MAX (1 + 1 + 1) +static inline int compute_score2(struct sock *sk, struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned short hnum, + int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && + sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + return -1; + score = 0; + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + return -1; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + return -1; + score++; + } + } + return score; +} + + +/* called with read_rcu_lock() */ +static struct sock *udp6_lib_lookup2(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. 
+ * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + +struct sock *__udp6_lib_lookup(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, struct udp_table *udptable) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + unsigned short hnum = ntohs(dport); + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + int score, badness; + + rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp6_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp6_lib_lookup2(net, saddr, sport, + &in6addr_any, hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } +begin: + result = NULL; + badness = -1; + sk_nulls_for_each_rcu(sk, node, &hslot->head) { + score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); + if (score > badness) { + result = sk; + badness = score; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. 
+ */ + if (get_nulls_value(node) != slot) + goto begin; + + if (result) { + if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) + result = NULL; + else if (unlikely(compute_score(result, net, hnum, saddr, sport, + daddr, dport, dif) < badness)) { + sock_put(result); + goto begin; + } + } + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL_GPL(__udp6_lib_lookup); + +static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct udp_table *udptable) +{ + struct sock *sk; + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (unlikely(sk = skb_steal_sock(skb))) + return sk; + return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + udptable); +} + +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + unsigned int ulen, copied; + int peeked, off = 0; + int err; + int is_udplite = IS_UDPLITE(sk); + int is_udp4; + bool slow; + + if (addr_len) + *addr_len=sizeof(struct sockaddr_in6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + if (np->rxpmtu && np->rxopt.bits.rxpmtu) + return ipv6_recv_rxpmtu(sk, msg, len); + +try_again: + skb = __skb_recv_datagram(sk, flags | (noblock ? 
MSG_DONTWAIT : 0), + &peeked, &off, &err); + if (!skb) + goto out; + + ulen = skb->len - sizeof(struct udphdr); + copied = len; + if (copied > ulen) + copied = ulen; + else if (copied < ulen) + msg->msg_flags |= MSG_TRUNC; + + is_udp4 = (skb->protocol == htons(ETH_P_IP)); + + /* + * If checksum is needed at all, try to do it while copying the + * data. If the data is truncated, or if we only want a partial + * coverage checksum (UDP-Lite), do it before the copy. + */ + + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (udp_lib_checksum_complete(skb)) + goto csum_copy_err; + } + + if (skb_csum_unnecessary(skb)) + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), + msg->msg_iov, copied ); + else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + if (!peeked) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); + } + + sock_recv_ts_and_drops(msg, sk, skb); + + /* Copy the address. 
*/ + if (msg->msg_name) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = udp_hdr(skb)->source; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + + if (is_udp4) + ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, + &sin6->sin6_addr); + else { + sin6->sin6_addr = ipv6_hdr(skb)->saddr; + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + } + if (is_udp4) { + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } else { + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + } + + err = copied; + if (flags & MSG_TRUNC) + err = ulen; + +out_free: + skb_free_datagram_locked(sk, skb); +out: + return err; + +csum_copy_err: + slow = lock_sock_fast(sk); + if (!skb_kill_datagram(sk, skb, flags)) { + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } + unlock_sock_fast(sk, slow); + + if (noblock) + return -EAGAIN; + + /* starting over for a new packet */ + msg->msg_flags &= ~MSG_TRUNC; + goto try_again; +} + +void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) +{ + struct ipv6_pinfo *np; + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; + const struct in6_addr *saddr = &hdr->saddr; + const struct in6_addr *daddr = &hdr->daddr; + struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct sock *sk; + int err; + + sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest, + saddr, uh->source, inet6_iif(skb), udptable); + if (sk == NULL) + return; + + np = inet6_sk(sk); + + if (!icmpv6_err_convert(type, code, &err) && !np->recverr) + goto out; + + if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) + goto out; + + if (np->recverr) + ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + + sk->sk_err = err; + 
sk->sk_error_report(sk); +out: + sock_put(sk); +} + +static __inline__ void udpv6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info ) +{ + __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); +} + +int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + int rc; + int is_udplite = IS_UDPLITE(sk); + + if (!ipv6_addr_any(&inet6_sk(sk)->daddr)) + sock_rps_save_rxhash(sk, skb); + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto drop; + + /* + * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). + */ + if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) { + + if (up->pcrlen == 0) { /* full coverage was set */ + LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: partial coverage" + " %d while full coverage %d requested\n", + UDP_SKB_CB(skb)->cscov, skb->len); + goto drop; + } + if (UDP_SKB_CB(skb)->cscov < up->pcrlen) { + LIMIT_NETDEBUG(KERN_WARNING "UDPLITE6: coverage %d " + "too small, need min %d\n", + UDP_SKB_CB(skb)->cscov, up->pcrlen); + goto drop; + } + } + + if (rcu_access_pointer(sk->sk_filter)) { + if (udp_lib_checksum_complete(skb)) + goto drop; + } + + skb_dst_drop(skb); + rc = sock_queue_rcv_skb(sk, skb); + if (rc < 0) { + /* Note that an ENOMEM error is charged twice */ + if (rc == -ENOMEM) + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + goto drop_no_sk_drops_inc; + } + + return 0; +drop: + atomic_inc(&sk->sk_drops); +drop_no_sk_drops_inc: + UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; +} + +static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif) +{ + struct hlist_nulls_node *node; + struct sock *s = sk; + unsigned short num = ntohs(loc_port); + + sk_nulls_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); 
+ + if (!net_eq(sock_net(s), net)) + continue; + + if (udp_sk(s)->udp_port_hash == num && + s->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(s); + if (inet->inet_dport) { + if (inet->inet_dport != rmt_port) + continue; + } + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + continue; + } + if (!inet6_mc_check(s, loc_addr, rmt_addr)) + continue; + return s; + } + } + return NULL; +} + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sock *sk; + struct sk_buff *skb1; + + for (i = 0; i < count; i++) { + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + sk = stack[i]; + if (skb1) { + if (sk_rcvqueues_full(sk, skb1)) { + kfree_skb(skb1); + goto drop; + } + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb1); + else if (sk_add_backlog(sk, skb1)) { + kfree_skb(skb1); + bh_unlock_sock(sk); + goto drop; + } + bh_unlock_sock(sk); + continue; + } +drop: + atomic_inc(&sk->sk_drops); + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); + UDP6_INC_STATS_BH(sock_net(sk), + UDP_MIB_INERRORS, IS_UDPLITE(sk)); + } +} +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. 
+ */ +static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, const struct in6_addr *daddr, + struct udp_table *udptable) +{ + struct sock *sk, *stack[256 / sizeof(struct sock *)]; + const struct udphdr *uh = udp_hdr(skb); + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); + int dif; + unsigned int i, count = 0; + + spin_lock(&hslot->lock); + sk = sk_nulls_head(&hslot->head); + dif = inet6_iif(skb); + sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); + while (sk) { + stack[count++] = sk; + sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr, + uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing the lock, we must take reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); + + spin_unlock(&hslot->lock); + + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } + return 0; +} + +static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, + int proto) +{ + int err; + + UDP_SKB_CB(skb)->partial_cov = 0; + UDP_SKB_CB(skb)->cscov = skb->len; + + if (proto == IPPROTO_UDPLITE) { + err = udplite_checksum_init(skb, uh); + if (err) + return err; + } + + if (uh->check == 0) { + /* RFC 2460 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. 
+ */ + LIMIT_NETDEBUG(KERN_INFO "IPv6: udp checksum is 0\n"); + return 1; + } + if (skb->ip_summed == CHECKSUM_COMPLETE && + !csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + skb->len, proto, skb->csum)) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (!skb_csum_unnecessary(skb)) + skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len, proto, 0)); + + return 0; +} + +int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + int proto) +{ + struct net *net = dev_net(skb->dev); + struct sock *sk; + struct udphdr *uh; + const struct in6_addr *saddr, *daddr; + u32 ulen = 0; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto discard; + + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); + + ulen = ntohs(uh->len); + if (ulen > skb->len) + goto short_packet; + + if (proto == IPPROTO_UDP) { + /* UDP validates ulen. */ + + /* Check for jumbo payload */ + if (ulen == 0) + ulen = skb->len; + + if (ulen < sizeof(*uh)) + goto short_packet; + + if (ulen < skb->len) { + if (pskb_trim_rcsum(skb, ulen)) + goto short_packet; + saddr = &ipv6_hdr(skb)->saddr; + daddr = &ipv6_hdr(skb)->daddr; + uh = udp_hdr(skb); + } + } + + if (udp6_csum_init(skb, uh, proto)) + goto discard; + + /* + * Multicast receive code + */ + if (ipv6_addr_is_multicast(daddr)) + return __udp6_lib_mcast_deliver(net, skb, + saddr, daddr, udptable); + + /* Unicast */ + + /* + * check socket cache ... must talk to Alan about his plans + * for sock caches... i'll skip this for now. 
+ */ + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + + if (sk == NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (udp_lib_checksum_complete(skb)) + goto discard; + UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, + proto == IPPROTO_UDPLITE); + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + + kfree_skb(skb); + return 0; + } + + /* deliver */ + + if (sk_rcvqueues_full(sk, skb)) { + sock_put(sk); + goto discard; + } + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + udpv6_queue_rcv_skb(sk, skb); + else if (sk_add_backlog(sk, skb)) { + atomic_inc(&sk->sk_drops); + bh_unlock_sock(sk); + sock_put(sk); + goto discard; + } + bh_unlock_sock(sk); + sock_put(sk); + return 0; + +short_packet: + LIMIT_NETDEBUG(KERN_DEBUG "UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n", + proto == IPPROTO_UDPLITE ? "-Lite" : "", + saddr, + ntohs(uh->source), + ulen, + skb->len, + daddr, + ntohs(uh->dest)); + +discard: + UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + kfree_skb(skb); + return 0; +} + +static __inline__ int udpv6_rcv(struct sk_buff *skb) +{ + return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); +} + +/* + * Throw away all pending data and cancel the corking. Socket is locked. 
+ */ +static void udp_v6_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending == AF_INET) + udp_flush_pending_frames(sk); + else if (up->pending) { + up->len = 0; + up->pending = 0; + ip6_flush_pending_frames(sk); + } +} + +/** + * udp6_hwcsum_outgoing - handle outgoing HW checksumming + * @sk: socket we are sending on + * @skb: sk_buff containing the filled-in UDP header + * (checksum field must be zeroed out) + */ +static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr, int len) +{ + unsigned int offset; + struct udphdr *uh = udp_hdr(skb); + __wsum csum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* Only one fragment on the socket. */ + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + uh->check = ~csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, 0); + } else { + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together + */ + offset = skb_transport_offset(skb); + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + + skb_queue_walk(&sk->sk_write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + + uh->check = csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, + csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + +/* + * Sending + */ + +static int udp_v6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + struct udphdr *uh; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi6 *fl6 = &inet->cork.fl.u.ip6; + int err = 0; + int is_udplite = IS_UDPLITE(sk); + __wsum csum = 0; + + /* Grab the skbuff where UDP header space exists. 
*/ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = udp_hdr(skb); + uh->source = fl6->fl6_sport; + uh->dest = fl6->fl6_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (is_udplite) + csum = udplite_csum_outgoing(sk, skb); + else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ + udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, + up->len); + goto send; + } else + csum = udp_csum_outgoing(sk, skb); + + /* add protocol-dependent pseudo-header */ + uh->check = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, + up->len, fl6->flowi6_proto, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + +send: + err = ip6_push_pending_frames(sk); + if (err) { + if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + err = 0; + } + } else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); +out: + up->len = 0; + up->pending = 0; + return err; +} + +int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p, final; + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct flowi6 fl6; + struct dst_entry *dst; + int addr_len = msg->msg_namelen; + int ulen = len; + int hlimit = -1; + int tclass = -1; + int dontfrag = -1; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int err; + int connected = 0; + int is_udplite = IS_UDPLITE(sk); + int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + + /* destination address check */ + if (sin6) { + if (addr_len < offsetof(struct sockaddr, sa_data)) + return -EINVAL; + + switch (sin6->sin6_family) { + case AF_INET6: + if (addr_len < 
SIN6_LEN_RFC2133) + return -EINVAL; + daddr = &sin6->sin6_addr; + break; + case AF_INET: + goto do_udp_sendmsg; + case AF_UNSPEC: + msg->msg_name = sin6 = NULL; + msg->msg_namelen = addr_len = 0; + daddr = NULL; + break; + default: + return -EINVAL; + } + } else if (!up->pending) { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } else + daddr = NULL; + + if (daddr) { + if (ipv6_addr_v4mapped(daddr)) { + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_port = sin6 ? sin6->sin6_port : inet->inet_dport; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + msg->msg_name = &sin; + msg->msg_namelen = sizeof(sin); +do_udp_sendmsg: + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + return udp_sendmsg(iocb, sk, msg, len); + } + } + + if (up->pending == AF_INET) + return udp_sendmsg(iocb, sk, msg, len); + + /* Rough check on arithmetic overflow, + better check is made in ip6_append_data(). + */ + if (len > INT_MAX - sizeof(struct udphdr)) + return -EMSGSIZE; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. + */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET6)) { + release_sock(sk); + return -EAFNOSUPPORT; + } + dst = NULL; + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + memset(&fl6, 0, sizeof(fl6)); + + if (sin6) { + if (sin6->sin6_port == 0) + return -EINVAL; + + fl6.fl6_dport = sin6->sin6_port; + daddr = &sin6->sin6_addr; + + if (np->sndflow) { + fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. 
+ */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + fl6.fl6_dport = inet->inet_dport; + daddr = &np->daddr; + fl6.flowlabel = np->flow_label; + connected = 1; + } + + if (!fl6.flowi6_oif) + fl6.flowi6_oif = sk->sk_bound_dev_if; + + if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + + fl6.flowi6_mark = sk->sk_mark; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(*opt); + + err = datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, + &hlimit, &tclass, &dontfrag); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + connected = 0; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + opt = ipv6_fixup_options(&opt_space, opt); + + fl6.flowi6_proto = sk->sk_protocol; + if (!ipv6_addr_any(daddr)) + fl6.daddr = *daddr; + else + fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ + if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) + fl6.saddr = np->saddr; + fl6.fl6_sport = inet->inet_sport; + + final_p = fl6_update_dst(&fl6, opt, &final); + if (final_p) + connected = 0; + + if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) { + fl6.flowi6_oif = np->mcast_oif; + connected = 0; + } else if (!fl6.flowi6_oif) + fl6.flowi6_oif = np->ucast_oif; + + security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + + dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true); + if (IS_ERR(dst)) { + err 
= PTR_ERR(dst); + dst = NULL; + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl6.daddr)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = ip6_dst_hoplimit(dst); + } + + if (tclass < 0) + tclass = np->tclass; + + if (dontfrag < 0) + dontfrag = np->dontfrag; + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); + err = -EINVAL; + goto out; + } + + up->pending = AF_INET6; + +do_append_data: + up->len += ulen; + getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + err = ip6_append_data(sk, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), hlimit, tclass, opt, &fl6, + (struct rt6_info*)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); + if (err) + udp_v6_flush_pending_frames(sk); + else if (!corkreq) + err = udp_v6_push_pending_frames(sk); + else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) + up->pending = 0; + + if (dst) { + if (connected) { + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl6.daddr, &np->daddr) ? + &np->daddr : NULL, +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_equal(&fl6.saddr, &np->saddr) ? + &np->saddr : +#endif + NULL); + } else { + dst_release(dst); + } + dst = NULL; + } + + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: + dst_release(dst); + fl6_sock_release(flowlabel); + if (!err) + return len; + /* + * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting + * ENOBUFS might not be good (it's not tunable per se), but otherwise + * we don't have a good statistic (IpOutDiscards but it can be too many + * things). We could add another new stat but at least for now that + * seems like overkill. 
+ */ + if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); + } + return err; + +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +void udpv6_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_v6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); +} + +/* + * Socket option code for UDP + */ +int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_v6_push_pending_frames); + return ipv6_setsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_setsockopt(sk, level, optname, optval, optlen, + udp_v6_push_pending_frames); + return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); +} +#endif + +int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return ipv6_getsockopt(sk, level, optname, optval, optlen); +} + +#ifdef CONFIG_COMPAT +int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_UDP || level == SOL_UDPLITE) + return udp_lib_getsockopt(sk, level, optname, optval, optlen); + return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); +} +#endif + +static int udp6_ufo_send_check(struct sk_buff *skb) +{ + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(*uh))) + return -EINVAL; + + ipv6h = ipv6_hdr(skb); + uh = 
udp_hdr(skb); + + uh->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + return 0; +} + +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *mac_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + int offset; + __wsum csum; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int type = skb_shinfo(skb)->gso_type; + + if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) || + !(type & (SKB_GSO_UDP)))) + goto out; + + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + + segs = NULL; + goto out; + } + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len- offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + /* Check if there is enough headroom to insert fragment header. */ + if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && + pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) + goto out; + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. 
+ */ + unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + + unfrag_ip6hlen; + mac_start = skb_mac_header(skb); + memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); + + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + ipv6_select_ident(fptr, (struct rt6_info *)skb_dst(skb)); + + /* Fragment the skb. ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + +out: + return segs; +} + +static const struct inet6_protocol udpv6_protocol = { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .gso_send_check = udp6_ufo_send_check, + .gso_segment = udp6_ufo_fragment, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + const struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->inet_dport); + srcp = ntohs(inet->inet_sport); + seq_printf(seq, + "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %pK %d\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + sk_wmem_alloc_get(sp), + sk_rmem_alloc_get(sp), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops)); +} + +int udp6_seq_show(struct seq_file *seq, void *v) +{ + if (v == 
SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode ref pointer drops\n"); + else + udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + return 0; +} + +static const struct file_operations udp6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + +static struct udp_seq_afinfo udp6_seq_afinfo = { + .name = "udp6", + .family = AF_INET6, + .udp_table = &udp_table, + .seq_fops = &udp6_afinfo_seq_fops, + .seq_ops = { + .show = udp6_seq_show, + }, +}; + +int __net_init udp6_proc_init(struct net *net) +{ + return udp_proc_register(net, &udp6_seq_afinfo); +} + +void udp6_proc_exit(struct net *net) { + udp_proc_unregister(net, &udp6_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +/* ------------------------------------------------------------------------ */ + +struct proto udpv6_prot = { + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, + .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem = &sysctl_udp_wmem_min, + .sysctl_rmem = &sysctl_udp_rmem_min, + .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udp_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; + +static struct inet_protosw udpv6_protosw = { + .type = SOCK_DGRAM, + .protocol = 
IPPROTO_UDP, + .prot = &udpv6_prot, + .ops = &inet6_dgram_ops, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, +}; + + +int __init udpv6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP); + if (ret) + goto out; + + ret = inet6_register_protosw(&udpv6_protosw); + if (ret) + goto out_udpv6_protocol; +out: + return ret; + +out_udpv6_protocol: + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); + goto out; +} + +void udpv6_exit(void) +{ + inet6_unregister_protosw(&udpv6_protosw); + inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP); +} diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h new file mode 100644 index 00000000..d7571046 --- /dev/null +++ b/net/ipv6/udp_impl.h @@ -0,0 +1,37 @@ +#ifndef _UDP6_IMPL_H +#define _UDP6_IMPL_H +#include <net/udp.h> +#include <net/udplite.h> +#include <net/protocol.h> +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/transp_v6.h> + +extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int ); +extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, + u8 , u8 , int , __be32 , struct udp_table *); + +extern int udp_v6_get_port(struct sock *sk, unsigned short snum); + +extern int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +extern int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +#ifdef CONFIG_COMPAT +extern int compat_udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned int optlen); +extern int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); +#endif +extern int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len); +extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len); +extern int udpv6_queue_rcv_skb(struct sock * 
sk, struct sk_buff *skb); +extern void udpv6_destroy_sock(struct sock *sk); + +#ifdef CONFIG_PROC_FS +extern int udp6_seq_show(struct seq_file *seq, void *v); +#endif +#endif /* _UDP6_IMPL_H */ diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c new file mode 100644 index 00000000..1d08e21d --- /dev/null +++ b/net/ipv6/udplite.c @@ -0,0 +1,140 @@ +/* + * UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6. + * See also net/ipv4/udplite.c + * + * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> + * + * Changes: + * Fixes: + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/export.h> +#include "udp_impl.h" + +static int udplitev6_rcv(struct sk_buff *skb) +{ + return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); +} + +static void udplitev6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); +} + +static const struct inet6_protocol udplitev6_protocol = { + .handler = udplitev6_rcv, + .err_handler = udplitev6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +struct proto udplitev6_prot = { + .name = "UDPLITEv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udplite_sk_init, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .get_port = udp_v6_get_port, + .obj_size = sizeof(struct udp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, + .h.udp_table = &udplite_table, +#ifdef CONFIG_COMPAT + .compat_setsockopt = 
compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, +#endif + .clear_sk = sk_prot_clear_portaddr_nulls, +}; + +static struct inet_protosw udplite6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDPLITE, + .prot = &udplitev6_prot, + .ops = &inet6_dgram_ops, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, +}; + +int __init udplitev6_init(void) +{ + int ret; + + ret = inet6_add_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + if (ret) + goto out; + + ret = inet6_register_protosw(&udplite6_protosw); + if (ret) + goto out_udplitev6_protocol; +out: + return ret; + +out_udplitev6_protocol: + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); + goto out; +} + +void udplitev6_exit(void) +{ + inet6_unregister_protosw(&udplite6_protosw); + inet6_del_protocol(&udplitev6_protocol, IPPROTO_UDPLITE); +} + +#ifdef CONFIG_PROC_FS + +static const struct file_operations udplite6_afinfo_seq_fops = { + .owner = THIS_MODULE, + .open = udp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net +}; + +static struct udp_seq_afinfo udplite6_seq_afinfo = { + .name = "udplite6", + .family = AF_INET6, + .udp_table = &udplite_table, + .seq_fops = &udplite6_afinfo_seq_fops, + .seq_ops = { + .show = udp6_seq_show, + }, +}; + +static int __net_init udplite6_proc_init_net(struct net *net) +{ + return udp_proc_register(net, &udplite6_seq_afinfo); +} + +static void __net_exit udplite6_proc_exit_net(struct net *net) +{ + udp_proc_unregister(net, &udplite6_seq_afinfo); +} + +static struct pernet_operations udplite6_net_ops = { + .init = udplite6_proc_init_net, + .exit = udplite6_proc_exit_net, +}; + +int __init udplite6_proc_init(void) +{ + return register_pernet_subsys(&udplite6_net_ops); +} + +void udplite6_proc_exit(void) +{ + unregister_pernet_subsys(&udplite6_net_ops); +} +#endif diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c new file mode 100644 index 00000000..f8c3cf84 --- /dev/null +++ b/net/ipv6/xfrm6_input.c @@ 
-0,0 +1,146 @@ +/* + * xfrm6_input.c: based on net/ipv4/xfrm4_input.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * YOSHIFUJI Hideaki @USAGI + * IPv6 support + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return xfrm6_extract_header(skb); +} + +int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, 0); +} +EXPORT_SYMBOL(xfrm6_rcv_spi); + +int xfrm6_transport_finish(struct sk_buff *skb, int async) +{ + skb_network_header(skb)[IP6CB(skb)->nhoff] = + XFRM_MODE_SKB_CB(skb)->protocol; + +#ifndef CONFIG_NETFILTER + if (!async) + return 1; +#endif + + ipv6_hdr(skb)->payload_len = htons(skb->len); + __skb_push(skb, skb->data - skb_network_header(skb)); + + NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL, + ip6_rcv_finish); + return -1; +} + +int xfrm6_rcv(struct sk_buff *skb) +{ + return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], + 0); +} + +EXPORT_SYMBOL(xfrm6_rcv); + +int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, + xfrm_address_t *saddr, u8 proto) +{ + struct net *net = dev_net(skb->dev); + struct xfrm_state *x = NULL; + int i = 0; + + /* Allocate new secpath or COW existing one. 
*/ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); + goto drop; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (1 + skb->sp->len == XFRM_MAX_DEPTH) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + for (i = 0; i < 3; i++) { + xfrm_address_t *dst, *src; + + switch (i) { + case 0: + dst = daddr; + src = saddr; + break; + case 1: + /* lookup state with wild-card source address */ + dst = daddr; + src = (xfrm_address_t *)&in6addr_any; + break; + default: + /* lookup state with wild-card addresses */ + dst = (xfrm_address_t *)&in6addr_any; + src = (xfrm_address_t *)&in6addr_any; + break; + } + + x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); + if (!x) + continue; + + spin_lock(&x->lock); + + if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && + likely(x->km.state == XFRM_STATE_VALID) && + !xfrm_state_check_expire(x)) { + spin_unlock(&x->lock); + if (x->type->input(x, skb) > 0) { + /* found a valid state */ + break; + } + } else + spin_unlock(&x->lock); + + xfrm_state_put(x); + x = NULL; + } + + if (!x) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + xfrm_audit_state_notfound_simple(skb, AF_INET6); + goto drop; + } + + skb->sp->xvec[skb->sp->len++] = x; + + spin_lock(&x->lock); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + return 1; + +drop: + return -1; +} + +EXPORT_SYMBOL(xfrm6_input_addr); diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c new file mode 100644 index 00000000..9949a356 --- /dev/null +++ b/net/ipv6/xfrm6_mode_beet.c @@ -0,0 +1,131 @@ +/* + * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6. 
+ * + * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com> + * Miika Komu <miika@iki.fi> + * Herbert Xu <herbert@gondor.apana.org.au> + * Abhinav Pathak <abhinav.pathak@hiit.fi> + * Jeff Ahrenholz <ahrenholz@gmail.com> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dsfield.h> +#include <net/dst.h> +#include <net/inet_ecn.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +static void xfrm6_beet_make_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + iph->version = 6; + + memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(iph->flow_lbl)); + iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; + + ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); + iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. + */ +static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *top_iph; + struct ip_beet_phdr *ph; + int optlen, hdr_len; + + hdr_len = 0; + optlen = XFRM_MODE_SKB_CB(skb)->optlen; + if (unlikely(optlen)) + hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4); + + skb_set_network_header(skb, -x->props.header_len - hdr_len); + if (x->sel.family != AF_INET6) + skb->network_header += IPV4_BEET_PHMAXLEN; + skb->mac_header = skb->network_header + + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + ph = (struct ip_beet_phdr *)__skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl-hdr_len); + + xfrm6_beet_make_header(skb); + + top_iph = ipv6_hdr(skb); + if (unlikely(optlen)) { + + BUG_ON(optlen < 0); + + ph->padlen = 4 - (optlen & 4); + ph->hdrlen = optlen / 8; + ph->nexthdr = top_iph->nexthdr; + if (ph->padlen) + memset(ph + 1, IPOPT_NOP, ph->padlen); + + top_iph->nexthdr = IPPROTO_BEETPH; + } + + top_iph->saddr = *(struct in6_addr 
*)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; + return 0; +} + +static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *ip6h; + int size = sizeof(struct ipv6hdr); + int err; + + err = skb_cow_head(skb, size + skb->mac_len); + if (err) + goto out; + + __skb_push(skb, size); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + xfrm6_beet_make_header(skb); + + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(skb->len - size); + ip6h->daddr = *(struct in6_addr *)&x->sel.daddr.a6; + ip6h->saddr = *(struct in6_addr *)&x->sel.saddr.a6; + err = 0; +out: + return err; +} + +static struct xfrm_mode xfrm6_beet_mode = { + .input2 = xfrm6_beet_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_beet_output, + .output = xfrm6_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_BEET, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm6_beet_init(void) +{ + return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6); +} + +static void __exit xfrm6_beet_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_beet_init); +module_exit(xfrm6_beet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET); diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c new file mode 100644 index 00000000..63d5d493 --- /dev/null +++ b/net/ipv6/xfrm6_mode_ro.c @@ -0,0 +1,84 @@ +/* + * xfrm6_mode_ro.c - Route optimization mode for IPv6. + * + * Copyright (C)2003-2006 Helsinki University of Technology + * Copyright (C)2003-2006 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * Authors: + * Noriaki TAKAMIYA @USAGI + * Masahide NAKAMURA @USAGI + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/stringify.h> +#include <linux/time.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +/* Add route optimization header space. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the route optimization header. + */ +static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + iph = ipv6_hdr(skb); + + hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); + skb_set_network_header(skb, -x->props.header_len); + skb->transport_header = skb->network_header + hdr_len; + __skb_pull(skb, hdr_len); + memmove(ipv6_hdr(skb), iph, hdr_len); + + x->lastused = get_seconds(); + + return 0; +} + +static struct xfrm_mode xfrm6_ro_mode = { + .output = xfrm6_ro_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_ROUTEOPTIMIZATION, +}; + +static int __init xfrm6_ro_init(void) +{ + return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6); +} + +static void __exit xfrm6_ro_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_ro_init); +module_exit(xfrm6_ro_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION); diff 
--git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 00000000..4e344105 --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c @@ -0,0 +1,85 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dst.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. + */ +static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + iph = ipv6_hdr(skb); + + hdr_len = x->type->hdr_offset(x, skb, &prevhdr); + skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); + skb_set_network_header(skb, -x->props.header_len); + skb->transport_header = skb->network_header + hdr_len; + __skb_pull(skb, hdr_len); + memmove(ipv6_hdr(skb), iph, hdr_len); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. 
+ */ +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb_transport_header(skb); + + if (skb->transport_header != skb->network_header) { + memmove(skb_transport_header(skb), + skb_network_header(skb), ihl); + skb->network_header = skb->transport_header; + } + ipv6_hdr(skb)->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb_reset_transport_header(skb); + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 00000000..9f2095b1 --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -0,0 +1,117 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. 
+ * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> + */ + +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dsfield.h> +#include <net/dst.h> +#include <net/inet_ecn.h> +#include <net/ip6_route.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + const struct ipv6hdr *outer_iph = ipv6_hdr(skb); + struct ipv6hdr *inner_iph = ipipv6_hdr(skb); + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. + */ +static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *top_iph; + int dsfield; + + skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + top_iph = ipv6_hdr(skb); + + top_iph->version = 6; + + memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, + sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family); + + dsfield = XFRM_MODE_SKB_CB(skb)->tos; + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->hop_limit = ip6_dst_hoplimit(dst->child); + top_iph->saddr = *(struct in6_addr *)&x->props.saddr; + top_iph->daddr = *(struct in6_addr *)&x->id.daddr; + return 0; +} + +static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) 
&& + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)), + ipipv6_hdr(skb)); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input2 = xfrm6_mode_tunnel_input, + .input = xfrm_prepare_input, + .output2 = xfrm6_mode_tunnel_output, + .output = xfrm6_prepare_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, + .flags = XFRM_MODE_FLAG_TUNNEL, +}; + +static int __init xfrm6_mode_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_mode_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_mode_tunnel_init); +module_exit(xfrm6_mode_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c new file mode 100644 index 00000000..8755a307 --- /dev/null +++ b/net/ipv6/xfrm6_output.c @@ -0,0 +1,161 @@ +/* + * xfrm6_output.c - Common IPsec encapsulation code for IPv6. + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/if_ether.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/icmpv6.h> +#include <linux/netfilter_ipv6.h> +#include <net/dst.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/xfrm.h> + +int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, + u8 **prevhdr) +{ + return ip6_find_1stfragopt(skb, prevhdr); +} + +EXPORT_SYMBOL(xfrm6_find_1stfragopt); + +static int xfrm6_local_dontfrag(struct sk_buff *skb) +{ + int proto; + struct sock *sk = skb->sk; + + if (sk) { + proto = sk->sk_protocol; + + if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) + return inet6_sk(sk)->dontfrag; + } + + return 0; +} + +static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + struct sock *sk = skb->sk; + + fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.daddr = ipv6_hdr(skb)->daddr; + + ipv6_local_rxpmtu(sk, &fl6, mtu); +} + +static void xfrm6_local_error(struct sk_buff *skb, u32 mtu) +{ + struct flowi6 fl6; + struct sock *sk = skb->sk; + + fl6.fl6_dport = inet_sk(sk)->inet_dport; + fl6.daddr = ipv6_hdr(skb)->daddr; + + ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); +} + +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb_dst(skb); + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (!skb->local_df && skb->len > mtu) { + skb->dev = dst->dev; + + if (xfrm6_local_dontfrag(skb)) + xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm6_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ret = -EMSGSIZE; + } + + return ret; +} + +int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm6_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; + + return xfrm6_extract_header(skb); +} + +int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; 
+ + err = xfrm_inner_extract_output(x, skb); + if (err) + return err; + + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + + skb->protocol = htons(ETH_P_IPV6); + skb->local_df = 1; + + return x->outer_mode->output2(x, skb); +} +EXPORT_SYMBOL(xfrm6_prepare_output); + +int xfrm6_output_finish(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + + skb->protocol = htons(ETH_P_IPV6); + return xfrm_output(skb); +} + +static int __xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + int mtu = ip6_skb_dst_mtu(skb); + + if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + xfrm6_local_rxpmtu(skb, mtu); + return -EMSGSIZE; + } else if (!skb->local_df && skb->len > mtu && skb->sk) { + xfrm6_local_error(skb, mtu); + return -EMSGSIZE; + } + + if (x->props.mode == XFRM_MODE_TUNNEL && + ((skb->len > mtu && !skb_is_gso(skb)) || + dst_allfrag(skb_dst(skb)))) { + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); + } + return x->outer_mode->afinfo->output_finish(skb); +} + +int xfrm6_output(struct sk_buff *skb) +{ + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, + skb_dst(skb)->dev, __xfrm6_output); +} diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c new file mode 100644 index 00000000..8ea65e03 --- /dev/null +++ b/net/ipv6/xfrm6_policy.c @@ -0,0 +1,357 @@ +/* + * xfrm6_policy.c: based on xfrm4_policy.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion + * + */ + +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <net/addrconf.h> +#include <net/dst.h> +#include <net/xfrm.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#if defined(CONFIG_IPV6_MIP6) 
|| defined(CONFIG_IPV6_MIP6_MODULE) +#include <net/mip6.h> +#endif + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo; + +static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) +{ + struct flowi6 fl6; + struct dst_entry *dst; + int err; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); + if (saddr) + memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); + + dst = ip6_route_output(net, NULL, &fl6); + + err = dst->error; + if (dst->error) { + dst_release(dst); + dst = ERR_PTR(err); + } + + return dst; +} + +static int xfrm6_get_saddr(struct net *net, + xfrm_address_t *saddr, xfrm_address_t *daddr) +{ + struct dst_entry *dst; + struct net_device *dev; + + dst = xfrm6_dst_lookup(net, 0, NULL, daddr); + if (IS_ERR(dst)) + return -EHOSTUNREACH; + + dev = ip6_dst_idev(dst)->dev; + ipv6_dev_get_saddr(dev_net(dev), dev, + (struct in6_addr *)&daddr->a6, 0, + (struct in6_addr *)&saddr->a6); + dst_release(dst); + return 0; +} + +static int xfrm6_get_tos(const struct flowi *fl) +{ + return 0; +} + +static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, + int nfheader_len) +{ + if (dst->ops->family == AF_INET6) { + struct rt6_info *rt = (struct rt6_info*)dst; + if (rt->rt6i_node) + path->path_cookie = rt->rt6i_node->fn_sernum; + } + + path->u.rt6.rt6i_nfheader_len = nfheader_len; + + return 0; +} + +static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + const struct flowi *fl) +{ + struct rt6_info *rt = (struct rt6_info*)xdst->route; + + xdst->u.dst.dev = dev; + dev_hold(dev); + + xdst->u.rt6.rt6i_idev = in6_dev_get(dev); + if (!xdst->u.rt6.rt6i_idev) + return -ENODEV; + + xdst->u.rt6.rt6i_peer = rt->rt6i_peer; + if (rt->rt6i_peer) + atomic_inc(&rt->rt6i_peer->refcnt); + + /* Sheit... I remember I did this right. 
Apparently, + * it was magically lost, so this code needs audit */ + xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | + RTF_LOCAL); + xdst->u.rt6.rt6i_metric = rt->rt6i_metric; + xdst->u.rt6.rt6i_node = rt->rt6i_node; + if (rt->rt6i_node) + xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; + xdst->u.rt6.rt6i_dst = rt->rt6i_dst; + xdst->u.rt6.rt6i_src = rt->rt6i_src; + + return 0; +} + +static inline void +_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) +{ + struct flowi6 *fl6 = &fl->u.ip6; + int onlyproto = 0; + u16 offset = skb_network_header_len(skb); + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct ipv6_opt_hdr *exthdr; + const unsigned char *nh = skb_network_header(skb); + u8 nexthdr = nh[IP6CB(skb)->nhoff]; + + memset(fl6, 0, sizeof(struct flowi6)); + fl6->flowi6_mark = skb->mark; + + fl6->daddr = reverse ? hdr->saddr : hdr->daddr; + fl6->saddr = reverse ? hdr->daddr : hdr->saddr; + + while (nh + offset + 1 < skb->data || + pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + nh = skb_network_header(skb); + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + onlyproto = 1; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + offset += ipv6_optlen(exthdr); + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr *)(nh + offset); + break; + + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + if (!onlyproto && (nh + offset + 4 < skb->data || + pskb_may_pull(skb, nh + offset + 4 - skb->data))) { + __be16 *ports = (__be16 *)exthdr; + + fl6->fl6_sport = ports[!!reverse]; + fl6->fl6_dport = ports[!reverse]; + } + fl6->flowi6_proto = nexthdr; + return; + + case IPPROTO_ICMPV6: + if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { + u8 *icmp = (u8 *)exthdr; + + fl6->fl6_icmp_type = icmp[0]; + fl6->fl6_icmp_code = icmp[1]; + } + fl6->flowi6_proto = nexthdr; + 
return; + +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case IPPROTO_MH: + if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { + struct ip6_mh *mh; + mh = (struct ip6_mh *)exthdr; + + fl6->fl6_mh_type = mh->ip6mh_type; + } + fl6->flowi6_proto = nexthdr; + return; +#endif + + /* XXX Why are there these headers? */ + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: + default: + fl6->fl6_ipsec_spi = 0; + fl6->flowi6_proto = nexthdr; + return; + } + } +} + +static inline int xfrm6_garbage_collect(struct dst_ops *ops) +{ + struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); + + xfrm6_policy_afinfo.garbage_collect(net); + return dst_entries_get_fast(ops) > ops->gc_thresh * 2; +} + +static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static void xfrm6_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (likely(xdst->u.rt6.rt6i_idev)) + in6_dev_put(xdst->u.rt6.rt6i_idev); + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt6.rt6i_peer)) + inet_putpeer(xdst->u.rt6.rt6i_peer); + xfrm_dst_destroy(xdst); +} + +static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + struct xfrm_dst *xdst; + + if (!unregister) + return; + + xdst = (struct xfrm_dst *)dst; + if (xdst->u.rt6.rt6i_idev->dev == dev) { + struct inet6_dev *loopback_idev = + in6_dev_get(dev_net(dev)->loopback_dev); + BUG_ON(!loopback_idev); + + do { + in6_dev_put(xdst->u.rt6.rt6i_idev); + xdst->u.rt6.rt6i_idev = loopback_idev; + in6_dev_hold(loopback_idev); + xdst = (struct xfrm_dst *)xdst->u.dst.child; + } while (xdst->u.dst.xfrm); + + __in6_dev_put(loopback_idev); + } + + xfrm_dst_ifdown(dst, dev); +} + +static struct dst_ops xfrm6_dst_ops = { + .family = AF_INET6, + .protocol = cpu_to_be16(ETH_P_IPV6), + .gc = 
xfrm6_garbage_collect, + .update_pmtu = xfrm6_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, + .destroy = xfrm6_dst_destroy, + .ifdown = xfrm6_dst_ifdown, + .local_out = __ip6_local_out, + .gc_thresh = 1024, +}; + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { + .family = AF_INET6, + .dst_ops = &xfrm6_dst_ops, + .dst_lookup = xfrm6_dst_lookup, + .get_saddr = xfrm6_get_saddr, + .decode_session = _decode_session6, + .get_tos = xfrm6_get_tos, + .init_path = xfrm6_init_path, + .fill_dst = xfrm6_fill_dst, + .blackhole_route = ip6_blackhole_route, +}; + +static int __init xfrm6_policy_init(void) +{ + return xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); +} + +static void xfrm6_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table xfrm6_policy_table[] = { + { + .procname = "xfrm6_gc_thresh", + .data = &init_net.xfrm.xfrm6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static struct ctl_table_header *sysctl_hdr; +#endif + +int __init xfrm6_init(void) +{ + int ret; + unsigned int gc_thresh; + + /* + * We need a good default value for the xfrm6 gc threshold. + * In ipv4 we set it to the route hash table size * 8, which + * is half the size of the maximaum route cache for ipv4. It + * would be good to do the same thing for v6, except the table is + * constructed differently here. Here each table for a net namespace + * can have FIB_TABLE_HASHSZ entries, so lets go with the same + * computation that we used for ipv4 here. Also, lets keep the initial + * gc_thresh to a minimum of 1024, since, the ipv6 route cache defaults + * to that as a minimum as well + */ + gc_thresh = FIB6_TABLE_HASHSZ * 8; + xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 
1024 : gc_thresh; + dst_entries_init(&xfrm6_dst_ops); + + ret = xfrm6_policy_init(); + if (ret) { + dst_entries_destroy(&xfrm6_dst_ops); + goto out; + } + ret = xfrm6_state_init(); + if (ret) + goto out_policy; + +#ifdef CONFIG_SYSCTL + sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv6_ctl_path, + xfrm6_policy_table); +#endif +out: + return ret; +out_policy: + xfrm6_policy_fini(); + goto out; +} + +void xfrm6_fini(void) +{ +#ifdef CONFIG_SYSCTL + if (sysctl_hdr) + unregister_net_sysctl_table(sysctl_hdr); +#endif + //xfrm6_input_fini(); + xfrm6_policy_fini(); + xfrm6_state_fini(); + dst_entries_destroy(&xfrm6_dst_ops); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c new file mode 100644 index 00000000..3f2f7c4a --- /dev/null +++ b/net/ipv6/xfrm6_state.c @@ -0,0 +1,197 @@ +/* + * xfrm6_state.c: based on xfrm4_state.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include <net/xfrm.h> +#include <linux/pfkeyv2.h> +#include <linux/ipsec.h> +#include <linux/netfilter_ipv6.h> +#include <linux/export.h> +#include <net/dsfield.h> +#include <net/ipv6.h> +#include <net/addrconf.h> + +static void +__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) +{ + const struct flowi6 *fl6 = &fl->u.ip6; + + /* Initialize temporary selector matching only + * to current session. 
*/ + *(struct in6_addr *)&sel->daddr = fl6->daddr; + *(struct in6_addr *)&sel->saddr = fl6->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl6->uli); + sel->dport_mask = htons(0xffff); + sel->sport = xfrm_flowi_sport(fl, &fl6->uli); + sel->sport_mask = htons(0xffff); + sel->family = AF_INET6; + sel->prefixlen_d = 128; + sel->prefixlen_s = 128; + sel->proto = fl6->flowi6_proto; + sel->ifindex = fl6->flowi6_oif; +} + +static void +xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) +{ + x->id = tmpl->id; + if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET6; +} + +/* distribution counting sort function for xfrm_state and xfrm_tmpl */ +static int +__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) +{ + int i; + int class[XFRM_MAX_DEPTH]; + int count[maxclass]; + + memset(count, 0, sizeof(count)); + + for (i = 0; i < n; i++) { + int c; + class[i] = c = cmp(src[i]); + count[c]++; + } + + for (i = 2; i < maxclass; i++) + count[i] += count[i - 1]; + + for (i = 0; i < n; i++) { + dst[count[class[i] - 1]++] = src[i]; + src[i] = NULL; + } + + return 0; +} + +/* + * Rule for xfrm_state: + * + * rule 1: select IPsec transport except AH + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec transport AH + * rule 4: select IPsec tunnel + * rule 5: others + */ +static int __xfrm6_state_sort_cmp(void *p) +{ + struct xfrm_state *v = p; + + switch (v->props.mode) { + case XFRM_MODE_TRANSPORT: + if (v->id.proto != IPPROTO_AH) + return 1; + else + return 3; +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case 
XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 4; + } + return 5; +} + +static int +__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_state_sort_cmp, 6); +} + +/* + * Rule for xfrm_tmpl: + * + * rule 1: select IPsec transport + * rule 2: select MIPv6 RO or inbound trigger + * rule 3: select IPsec tunnel + * rule 4: others + */ +static int __xfrm6_tmpl_sort_cmp(void *p) +{ + struct xfrm_tmpl *v = p; + switch (v->mode) { + case XFRM_MODE_TRANSPORT: + return 1; +#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) + case XFRM_MODE_ROUTEOPTIMIZATION: + case XFRM_MODE_IN_TRIGGER: + return 2; +#endif + case XFRM_MODE_TUNNEL: + case XFRM_MODE_BEET: + return 3; + } + return 4; +} + +static int +__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) +{ + return __xfrm6_sort((void **)dst, (void **)src, n, + __xfrm6_tmpl_sort_cmp, 5); +} + +int xfrm6_extract_header(struct sk_buff *skb) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); + + return 0; +} + +static struct xfrm_state_afinfo xfrm6_state_afinfo = { + .family = AF_INET6, + .proto = IPPROTO_IPV6, + .eth_proto = htons(ETH_P_IPV6), + .owner = THIS_MODULE, + .init_tempsel = __xfrm6_init_tempsel, + .init_temprop = xfrm6_init_temprop, + .tmpl_sort = __xfrm6_tmpl_sort, + .state_sort = __xfrm6_state_sort, + .output = xfrm6_output, + .output_finish = xfrm6_output_finish, + .extract_input = xfrm6_extract_input, + .extract_output = xfrm6_extract_output, + 
.transport_finish = xfrm6_transport_finish, +}; + +int __init xfrm6_state_init(void) +{ + return xfrm_state_register_afinfo(&xfrm6_state_afinfo); +} + +void xfrm6_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); +} + diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c new file mode 100644 index 00000000..4fe1db12 --- /dev/null +++ b/net/ipv6/xfrm6_tunnel.c @@ -0,0 +1,402 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA <mk@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * + * Based on net/ipv4/xfrm4_tunnel.c + * + */ +#include <linux/module.h> +#include <linux/xfrm.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/ipv6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/mutex.h> +#include <net/netns/generic.h> + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +struct xfrm6_tunnel_net { + struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; + struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + u32 spi; +}; + +static int xfrm6_tunnel_net_id __read_mostly; 
+static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) +{ + return net_generic(net, xfrm6_tunnel_net_id); +} + +/* + * xfrm_tunnel_spi things are for allocating unique id ("spi") + * per xfrm_address_t. + */ +struct xfrm6_tunnel_spi { + struct hlist_node list_byaddr; + struct hlist_node list_byspi; + xfrm_address_t addr; + u32 spi; + atomic_t refcnt; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); + +static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; + +static inline unsigned xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) +{ + unsigned h; + + h = (__force u32)(addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]); + h ^= h >> 16; + h ^= h >> 8; + h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; + + return h; +} + +static inline unsigned xfrm6_tunnel_spi_hash_byspi(u32 spi) +{ + return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; +} + +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + + hlist_for_each_entry_rcu(x6spi, pos, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) + return x6spi; + } + + return NULL; +} + +__be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + rcu_read_lock_bh(); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); + spi = x6spi ? 
x6spi->spi : 0; + rcu_read_unlock_bh(); + return htonl(spi); +} + +EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); + +static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + int index = xfrm6_tunnel_spi_hash_byspi(spi); + struct hlist_node *pos; + + hlist_for_each_entry(x6spi, pos, + &xfrm6_tn->spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + return -1; + } + return index; +} + +static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + u32 spi; + struct xfrm6_tunnel_spi *x6spi; + int index; + + if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; + else + xfrm6_tn->spi++; + + for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; + } + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { + index = __xfrm6_tunnel_spi_check(net, spi); + if (index >= 0) + goto alloc_spi; + } + spi = 0; + goto out; +alloc_spi: + xfrm6_tn->spi = spi; + x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); + if (!x6spi) + goto out; + + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); + x6spi->spi = spi; + atomic_set(&x6spi->refcnt, 1); + + hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); + + index = xfrm6_tunnel_spi_hash_byaddr(saddr); + hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); +out: + return spi; +} + +__be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + spin_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); + if (x6spi) { + atomic_inc(&x6spi->refcnt); + spi = x6spi->spi; + } else + spi = __xfrm6_tunnel_alloc_spi(net, saddr); + 
spin_unlock_bh(&xfrm6_tunnel_spi_lock); + + return htonl(spi); +} + +EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); + +static void x6spi_destroy_rcu(struct rcu_head *head) +{ + kmem_cache_free(xfrm6_tunnel_spi_kmem, + container_of(head, struct xfrm6_tunnel_spi, rcu_head)); +} + +static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos, *n; + + spin_lock_bh(&xfrm6_tunnel_spi_lock); + + hlist_for_each_entry_safe(x6spi, pos, n, + &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) + { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + if (atomic_dec_and_test(&x6spi->refcnt)) { + hlist_del_rcu(&x6spi->list_byaddr); + hlist_del_rcu(&x6spi->list_byspi); + call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); + break; + } + } + } + spin_unlock_bh(&xfrm6_tunnel_spi_lock); +} + +static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_push(skb, -skb_network_offset(skb)); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return skb_network_header(skb)[IP6CB(skb)->nhoff]; +} + +static int xfrm6_tunnel_rcv(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct ipv6hdr *iph = ipv6_hdr(skb); + __be32 spi; + + spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi); +} + +static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + /* xfrm6_tunnel native err handling */ + switch (type) { + case ICMPV6_DEST_UNREACH: + switch (code) { + case ICMPV6_NOROUTE: + case ICMPV6_ADM_PROHIBITED: + case ICMPV6_NOT_NEIGHBOUR: + case ICMPV6_ADDR_UNREACH: + case ICMPV6_PORT_UNREACH: + default: + break; + } + break; + case ICMPV6_PKT_TOOBIG: + break; + case ICMPV6_TIME_EXCEED: + switch (code) { + case 
ICMPV6_EXC_HOPLIMIT: + break; + case ICMPV6_EXC_FRAGTIME: + default: + break; + } + break; + case ICMPV6_PARAMPROB: + switch (code) { + case ICMPV6_HDR_FIELD: break; + case ICMPV6_UNK_NEXTHDR: break; + case ICMPV6_UNK_OPTION: break; + } + break; + default: + break; + } + + return 0; +} + +static int xfrm6_tunnel_init_state(struct xfrm_state *x) +{ + if (x->props.mode != XFRM_MODE_TUNNEL) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct ipv6hdr); + + return 0; +} + +static void xfrm6_tunnel_destroy(struct xfrm_state *x) +{ + struct net *net = xs_net(x); + + xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); +} + +static const struct xfrm_type xfrm6_tunnel_type = { + .description = "IP6IP6", + .owner = THIS_MODULE, + .proto = IPPROTO_IPV6, + .init_state = xfrm6_tunnel_init_state, + .destructor = xfrm6_tunnel_destroy, + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, +}; + +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .priority = 2, +}; + +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .priority = 2, +}; + +static int __net_init xfrm6_tunnel_net_init(struct net *net) +{ + struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); + unsigned int i; + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); + xfrm6_tn->spi = 0; + + return 0; +} + +static void __net_exit xfrm6_tunnel_net_exit(struct net *net) +{ +} + +static struct pernet_operations xfrm6_tunnel_net_ops = { + .init = xfrm6_tunnel_net_init, + .exit = xfrm6_tunnel_net_exit, + .id = &xfrm6_tunnel_net_id, + .size = sizeof(struct xfrm6_tunnel_net), +}; + +static int __init xfrm6_tunnel_init(void) +{ + int rv; + + 
xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL); + if (!xfrm6_tunnel_spi_kmem) + return -ENOMEM; + rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); + if (rv < 0) + goto out_pernet; + rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); + if (rv < 0) + goto out_type; + rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); + if (rv < 0) + goto out_xfrm6; + rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); + if (rv < 0) + goto out_xfrm46; + return 0; + +out_xfrm46: + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); +out_xfrm6: + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); +out_type: + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); +out_pernet: + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + return rv; +} + +static void __exit xfrm6_tunnel_fini(void) +{ + xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + unregister_pernet_subsys(&xfrm6_tunnel_net_ops); + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); |