From 871480933a1c28f8a9fed4c4d34d06c439a7a422 Mon Sep 17 00:00:00 2001
From: Srikant Patnaik
Date: Sun, 11 Jan 2015 12:28:04 +0530
Subject: Moved, renamed, and deleted files
The original directory structure was scattered and unorganized.
Changes are basically to make it look like kernel structure.
---
net/ipv4/Kconfig | 632 ++
net/ipv4/Makefile | 56 +
net/ipv4/af_inet.c | 1824 ++++++
net/ipv4/ah4.c | 538 ++
net/ipv4/arp.c | 1446 +++++
net/ipv4/cipso_ipv4.c | 2363 ++++++++
net/ipv4/datagram.c | 87 +
net/ipv4/devinet.c | 1851 ++++++
net/ipv4/esp4.c | 727 +++
net/ipv4/fib_frontend.c | 1135 ++++
net/ipv4/fib_lookup.h | 57 +
net/ipv4/fib_rules.c | 309 +
net/ipv4/fib_semantics.c | 1249 ++++
net/ipv4/fib_trie.c | 2636 +++++++++
net/ipv4/gre.c | 144 +
net/ipv4/icmp.c | 1204 ++++
net/ipv4/igmp.c | 2666 +++++++++
net/ipv4/inet_connection_sock.c | 787 +++
net/ipv4/inet_diag.c | 1086 ++++
net/ipv4/inet_fragment.c | 286 +
net/ipv4/inet_hashtables.c | 584 ++
net/ipv4/inet_lro.c | 548 ++
net/ipv4/inet_timewait_sock.c | 525 ++
net/ipv4/inetpeer.c | 596 ++
net/ipv4/ip_forward.c | 132 +
net/ipv4/ip_fragment.c | 873 +++
net/ipv4/ip_gre.c | 1767 ++++++
net/ipv4/ip_input.c | 453 ++
net/ipv4/ip_options.c | 651 +++
net/ipv4/ip_output.c | 1549 +++++
net/ipv4/ip_sockglue.c | 1387 +++++
net/ipv4/ipcomp.c | 185 +
net/ipv4/ipconfig.c | 1647 ++++++
net/ipv4/ipip.c | 911 +++
net/ipv4/ipmr.c | 2558 +++++++++
net/ipv4/netfilter.c | 249 +
net/ipv4/netfilter/Kconfig | 395 ++
net/ipv4/netfilter/Makefile | 71 +
net/ipv4/netfilter/arp_tables.c | 1914 +++++++
net/ipv4/netfilter/arpt_mangle.c | 91 +
net/ipv4/netfilter/arptable_filter.c | 93 +
net/ipv4/netfilter/ip_queue.c | 639 +++
net/ipv4/netfilter/ip_tables.c | 2271 ++++++++
net/ipv4/netfilter/ipt_CLUSTERIP.c | 745 +++
net/ipv4/netfilter/ipt_ECN.c | 138 +
net/ipv4/netfilter/ipt_MASQUERADE.c | 173 +
net/ipv4/netfilter/ipt_NETMAP.c | 98 +
net/ipv4/netfilter/ipt_REDIRECT.c | 110 +
net/ipv4/netfilter/ipt_REJECT.c | 220 +
net/ipv4/netfilter/ipt_ULOG.c | 440 ++
net/ipv4/netfilter/ipt_ah.c | 91 +
net/ipv4/netfilter/ipt_rpfilter.c | 141 +
net/ipv4/netfilter/iptable_filter.c | 116 +
net/ipv4/netfilter/iptable_mangle.c | 151 +
net/ipv4/netfilter/iptable_raw.c | 96 +
net/ipv4/netfilter/iptable_security.c | 113 +
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 467 ++
.../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 463 ++
net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 372 ++
net/ipv4/netfilter/nf_defrag_ipv4.c | 128 +
net/ipv4/netfilter/nf_nat_amanda.c | 85 +
net/ipv4/netfilter/nf_nat_core.c | 757 +++
net/ipv4/netfilter/nf_nat_ftp.c | 137 +
net/ipv4/netfilter/nf_nat_h323.c | 632 ++
net/ipv4/netfilter/nf_nat_helper.c | 445 ++
net/ipv4/netfilter/nf_nat_irc.c | 99 +
net/ipv4/netfilter/nf_nat_pptp.c | 308 +
net/ipv4/netfilter/nf_nat_proto_common.c | 114 +
net/ipv4/netfilter/nf_nat_proto_dccp.c | 106 +
net/ipv4/netfilter/nf_nat_proto_gre.c | 147 +
net/ipv4/netfilter/nf_nat_proto_icmp.c | 83 +
net/ipv4/netfilter/nf_nat_proto_sctp.c | 96 +
net/ipv4/netfilter/nf_nat_proto_tcp.c | 91 +
net/ipv4/netfilter/nf_nat_proto_udp.c | 82 +
net/ipv4/netfilter/nf_nat_proto_udplite.c | 98 +
net/ipv4/netfilter/nf_nat_proto_unknown.c | 52 +
net/ipv4/netfilter/nf_nat_rule.c | 214 +
net/ipv4/netfilter/nf_nat_sip.c | 568 ++
net/ipv4/netfilter/nf_nat_snmp_basic.c | 1314 +++++
net/ipv4/netfilter/nf_nat_standalone.c | 326 ++
net/ipv4/netfilter/nf_nat_tftp.c | 51 +
net/ipv4/ping.c | 1176 ++++
net/ipv4/proc.c | 498 ++
net/ipv4/protocol.c | 61 +
net/ipv4/raw.c | 1068 ++++
net/ipv4/route.c | 3510 ++++++++++++
net/ipv4/syncookies.c | 379 ++
net/ipv4/sysctl_net_ipv4.c | 872 +++
net/ipv4/sysfs_net_ipv4.c | 88 +
net/ipv4/tcp.c | 3448 +++++++++++
net/ipv4/tcp_bic.c | 242 +
net/ipv4/tcp_cong.c | 425 ++
net/ipv4/tcp_cubic.c | 494 ++
net/ipv4/tcp_diag.c | 69 +
net/ipv4/tcp_highspeed.c | 187 +
net/ipv4/tcp_htcp.c | 315 +
net/ipv4/tcp_hybla.c | 192 +
net/ipv4/tcp_illinois.c | 356 ++
net/ipv4/tcp_input.c | 6060 ++++++++++++++++++++
net/ipv4/tcp_ipv4.c | 2685 +++++++++
net/ipv4/tcp_lp.c | 344 ++
net/ipv4/tcp_memcontrol.c | 272 +
net/ipv4/tcp_minisocks.c | 794 +++
net/ipv4/tcp_output.c | 2877 ++++++++++
net/ipv4/tcp_probe.c | 260 +
net/ipv4/tcp_scalable.c | 62 +
net/ipv4/tcp_timer.c | 599 ++
net/ipv4/tcp_vegas.c | 339 ++
net/ipv4/tcp_vegas.h | 24 +
net/ipv4/tcp_veno.c | 234 +
net/ipv4/tcp_westwood.c | 304 +
net/ipv4/tcp_yeah.c | 260 +
net/ipv4/tunnel4.c | 192 +
net/ipv4/udp.c | 2288 ++++++++
net/ipv4/udp_diag.c | 209 +
net/ipv4/udp_impl.h | 34 +
net/ipv4/udplite.c | 142 +
net/ipv4/xfrm4_input.c | 166 +
net/ipv4/xfrm4_mode_beet.c | 156 +
net/ipv4/xfrm4_mode_transport.c | 80 +
net/ipv4/xfrm4_mode_tunnel.c | 121 +
net/ipv4/xfrm4_output.c | 101 +
net/ipv4/xfrm4_policy.c | 305 +
net/ipv4/xfrm4_state.c | 99 +
net/ipv4/xfrm4_tunnel.c | 117 +
125 files changed, 83543 insertions(+)
create mode 100644 net/ipv4/Kconfig
create mode 100644 net/ipv4/Makefile
create mode 100644 net/ipv4/af_inet.c
create mode 100644 net/ipv4/ah4.c
create mode 100644 net/ipv4/arp.c
create mode 100644 net/ipv4/cipso_ipv4.c
create mode 100644 net/ipv4/datagram.c
create mode 100644 net/ipv4/devinet.c
create mode 100644 net/ipv4/esp4.c
create mode 100644 net/ipv4/fib_frontend.c
create mode 100644 net/ipv4/fib_lookup.h
create mode 100644 net/ipv4/fib_rules.c
create mode 100644 net/ipv4/fib_semantics.c
create mode 100644 net/ipv4/fib_trie.c
create mode 100644 net/ipv4/gre.c
create mode 100644 net/ipv4/icmp.c
create mode 100644 net/ipv4/igmp.c
create mode 100644 net/ipv4/inet_connection_sock.c
create mode 100644 net/ipv4/inet_diag.c
create mode 100644 net/ipv4/inet_fragment.c
create mode 100644 net/ipv4/inet_hashtables.c
create mode 100644 net/ipv4/inet_lro.c
create mode 100644 net/ipv4/inet_timewait_sock.c
create mode 100644 net/ipv4/inetpeer.c
create mode 100644 net/ipv4/ip_forward.c
create mode 100644 net/ipv4/ip_fragment.c
create mode 100644 net/ipv4/ip_gre.c
create mode 100644 net/ipv4/ip_input.c
create mode 100644 net/ipv4/ip_options.c
create mode 100644 net/ipv4/ip_output.c
create mode 100644 net/ipv4/ip_sockglue.c
create mode 100644 net/ipv4/ipcomp.c
create mode 100644 net/ipv4/ipconfig.c
create mode 100644 net/ipv4/ipip.c
create mode 100644 net/ipv4/ipmr.c
create mode 100644 net/ipv4/netfilter.c
create mode 100644 net/ipv4/netfilter/Kconfig
create mode 100644 net/ipv4/netfilter/Makefile
create mode 100644 net/ipv4/netfilter/arp_tables.c
create mode 100644 net/ipv4/netfilter/arpt_mangle.c
create mode 100644 net/ipv4/netfilter/arptable_filter.c
create mode 100644 net/ipv4/netfilter/ip_queue.c
create mode 100644 net/ipv4/netfilter/ip_tables.c
create mode 100644 net/ipv4/netfilter/ipt_CLUSTERIP.c
create mode 100644 net/ipv4/netfilter/ipt_ECN.c
create mode 100644 net/ipv4/netfilter/ipt_MASQUERADE.c
create mode 100644 net/ipv4/netfilter/ipt_NETMAP.c
create mode 100644 net/ipv4/netfilter/ipt_REDIRECT.c
create mode 100644 net/ipv4/netfilter/ipt_REJECT.c
create mode 100644 net/ipv4/netfilter/ipt_ULOG.c
create mode 100644 net/ipv4/netfilter/ipt_ah.c
create mode 100644 net/ipv4/netfilter/ipt_rpfilter.c
create mode 100644 net/ipv4/netfilter/iptable_filter.c
create mode 100644 net/ipv4/netfilter/iptable_mangle.c
create mode 100644 net/ipv4/netfilter/iptable_raw.c
create mode 100644 net/ipv4/netfilter/iptable_security.c
create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
create mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
create mode 100644 net/ipv4/netfilter/nf_conntrack_proto_icmp.c
create mode 100644 net/ipv4/netfilter/nf_defrag_ipv4.c
create mode 100644 net/ipv4/netfilter/nf_nat_amanda.c
create mode 100644 net/ipv4/netfilter/nf_nat_core.c
create mode 100644 net/ipv4/netfilter/nf_nat_ftp.c
create mode 100644 net/ipv4/netfilter/nf_nat_h323.c
create mode 100644 net/ipv4/netfilter/nf_nat_helper.c
create mode 100644 net/ipv4/netfilter/nf_nat_irc.c
create mode 100644 net/ipv4/netfilter/nf_nat_pptp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_common.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_dccp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_gre.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_icmp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_sctp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_tcp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_udp.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_udplite.c
create mode 100644 net/ipv4/netfilter/nf_nat_proto_unknown.c
create mode 100644 net/ipv4/netfilter/nf_nat_rule.c
create mode 100644 net/ipv4/netfilter/nf_nat_sip.c
create mode 100644 net/ipv4/netfilter/nf_nat_snmp_basic.c
create mode 100644 net/ipv4/netfilter/nf_nat_standalone.c
create mode 100644 net/ipv4/netfilter/nf_nat_tftp.c
create mode 100644 net/ipv4/ping.c
create mode 100644 net/ipv4/proc.c
create mode 100644 net/ipv4/protocol.c
create mode 100644 net/ipv4/raw.c
create mode 100644 net/ipv4/route.c
create mode 100644 net/ipv4/syncookies.c
create mode 100644 net/ipv4/sysctl_net_ipv4.c
create mode 100644 net/ipv4/sysfs_net_ipv4.c
create mode 100644 net/ipv4/tcp.c
create mode 100644 net/ipv4/tcp_bic.c
create mode 100644 net/ipv4/tcp_cong.c
create mode 100644 net/ipv4/tcp_cubic.c
create mode 100644 net/ipv4/tcp_diag.c
create mode 100644 net/ipv4/tcp_highspeed.c
create mode 100644 net/ipv4/tcp_htcp.c
create mode 100644 net/ipv4/tcp_hybla.c
create mode 100644 net/ipv4/tcp_illinois.c
create mode 100644 net/ipv4/tcp_input.c
create mode 100644 net/ipv4/tcp_ipv4.c
create mode 100644 net/ipv4/tcp_lp.c
create mode 100644 net/ipv4/tcp_memcontrol.c
create mode 100644 net/ipv4/tcp_minisocks.c
create mode 100644 net/ipv4/tcp_output.c
create mode 100644 net/ipv4/tcp_probe.c
create mode 100644 net/ipv4/tcp_scalable.c
create mode 100644 net/ipv4/tcp_timer.c
create mode 100644 net/ipv4/tcp_vegas.c
create mode 100644 net/ipv4/tcp_vegas.h
create mode 100644 net/ipv4/tcp_veno.c
create mode 100644 net/ipv4/tcp_westwood.c
create mode 100644 net/ipv4/tcp_yeah.c
create mode 100644 net/ipv4/tunnel4.c
create mode 100644 net/ipv4/udp.c
create mode 100644 net/ipv4/udp_diag.c
create mode 100644 net/ipv4/udp_impl.h
create mode 100644 net/ipv4/udplite.c
create mode 100644 net/ipv4/xfrm4_input.c
create mode 100644 net/ipv4/xfrm4_mode_beet.c
create mode 100644 net/ipv4/xfrm4_mode_transport.c
create mode 100644 net/ipv4/xfrm4_mode_tunnel.c
create mode 100644 net/ipv4/xfrm4_output.c
create mode 100644 net/ipv4/xfrm4_policy.c
create mode 100644 net/ipv4/xfrm4_state.c
create mode 100644 net/ipv4/xfrm4_tunnel.c
(limited to 'net/ipv4')
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 00000000..d1832629
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,632 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+ bool "IP: multicasting"
+ help
+ This is code for addressing several networked computers at once,
+ enlarging your kernel by about 2 KB. You need multicasting if you
+ intend to participate in the MBONE, a high bandwidth network on top
+ of the Internet which carries audio and video broadcasts. More
+ information about the MBONE is on the WWW at
+ <http://www.savetz.com/mbone/>. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. For most people, it's
+ safe to say N.
+
+config IP_ADVANCED_ROUTER
+ bool "IP: advanced router"
+ ---help---
+ If you intend to run your Linux box mostly as a router, i.e. as a
+ computer that forwards and redistributes network packets, say Y; you
+ will then be presented with several options that allow more precise
+ control about the routing process.
+
+ The answer to this question won't directly affect the kernel:
+ answering N will just cause the configurator to skip all the
+ questions about advanced routing.
+
+ Note that your box can only act as a router if you enable IP
+ forwarding in your kernel; you can do that by saying Y to "/proc
+ file system support" and "Sysctl support" below and executing the
+ line
+
+ echo "1" > /proc/sys/net/ipv4/ip_forward
+
+ at boot time after the /proc file system has been mounted.
+
+ If you turn on IP forwarding, you should consider the rp_filter, which
+ automatically rejects incoming packets if the routing table entry
+ for their source address doesn't match the network interface they're
+ arriving on. This has security advantages because it prevents the
+ so-called IP spoofing, however it can pose problems if you use
+ asymmetric routing (packets from you to a host take a different path
+ than packets from that host to you) or if you operate a non-routing
+ host which has several IP addresses on different interfaces. To turn
+ rp_filter on use:
+
+ echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+ or
+ echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+ Note that some distributions enable it in startup scripts.
+ For details about rp_filter strict and loose mode read
+ <file:Documentation/networking/ip-sysctl.txt>.
+
+ If unsure, say N here.
+
+config IP_FIB_TRIE_STATS
+ bool "FIB TRIE statistics"
+ depends on IP_ADVANCED_ROUTER
+ ---help---
+ Keep track of statistics on structure of FIB TRIE table.
+ Useful for testing and measuring TRIE performance.
+
+config IP_MULTIPLE_TABLES
+ bool "IP: policy routing"
+ depends on IP_ADVANCED_ROUTER
+ select FIB_RULES
+ ---help---
+ Normally, a router decides what to do with a received packet based
+ solely on the packet's final destination address. If you say Y here,
+ the Linux router will also be able to take the packet's source
+ address into account. Furthermore, the TOS (Type-Of-Service) field
+ of the packet can be used for routing decisions as well.
+
+ If you are interested in this, please see the preliminary
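+ documentation at <http://www.compendium.com.ar/policy-routing.txt>
+ and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+ You will need supporting software from
+ <ftp://ftp.tux.org/pub/net/ip-routing/>.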
+
+ If unsure, say N.
+
+config IP_ROUTE_MULTIPATH
+ bool "IP: equal cost multipath"
+ depends on IP_ADVANCED_ROUTER
+ help
+ Normally, the routing tables specify a single action to be taken in
+ a deterministic manner for a given packet. If you say Y here
+ however, it becomes possible to attach several actions to a packet
+ pattern, in effect specifying several alternative paths to travel
+ for those packets. The router considers all these paths to be of
+ equal "cost" and chooses one of them in a non-deterministic fashion
+ if a matching packet arrives.
+
+config IP_ROUTE_VERBOSE
+ bool "IP: verbose route monitoring"
+ depends on IP_ADVANCED_ROUTER
+ help
+ If you say Y here, which is recommended, then the kernel will print
+ verbose messages regarding the routing, for example warnings about
+ received packets which look strange and could be evidence of an
+ attack or a misconfigured system somewhere. The information is
+ handled by the klogd daemon which is responsible for kernel messages
+ ("man klogd").
+
+config IP_ROUTE_CLASSID
+ bool
+
+config IP_PNP
+ bool "IP: kernel level autoconfiguration"
+ help
+ This enables automatic configuration of IP addresses of devices and
+ of the routing table during kernel boot, based on either information
+ supplied on the kernel command line or by BOOTP or RARP protocols.
+ You need to say Y only for diskless machines requiring network
+ access to boot (in which case you want to say Y to "Root file system
+ on NFS" as well), because all other machines configure the network
+ in their startup scripts.
+
+config IP_PNP_DHCP
+ bool "IP: DHCP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the DHCP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does DHCP itself, providing all necessary information on the kernel
+ command line, you can say N here.
+
+ If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+ must be operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+ bool "IP: BOOTP support"
+ depends on IP_PNP
+ ---help---
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the BOOTP protocol (a
+ special protocol designed for doing this job), say Y here. In case
+ the boot ROM of your network card was designed for booting Linux and
+ does BOOTP itself, providing all necessary information on the kernel
+ command line, you can say N here. If unsure, say Y. Note that if you
+ want to use BOOTP, a BOOTP server must be operating on your network.
+ Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+ bool "IP: RARP support"
+ depends on IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS and you want the IP address of your computer to be
+ discovered automatically at boot time using the RARP protocol (an
+ older protocol which is being obsoleted by BOOTP and DHCP), say Y
+ here. Note that if you want to use RARP, a RARP server must be
+ operating on your network. Read
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+
+config NET_IPIP
+ tristate "IP: tunneling"
+ select INET_TUNNEL
+ ---help---
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ encapsulation of IP within IP, which sounds kind of pointless, but
+ can be useful if you want to make your (or some other) machine
+ appear on a different network than it physically is, or to use
+ mobile-IP facilities (allowing laptops to seamlessly move between
+ networks without changing their IP addresses).
+
+ Saying Y to this option will produce two modules ( = code which can
+ be inserted in and removed from the running kernel whenever you
+ want). Most people won't need this and can say N.
+
+config NET_IPGRE_DEMUX
+ tristate "IP: GRE demultiplexer"
+ help
+ This is a helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
+
+config NET_IPGRE
+ tristate "IP: GRE tunnels over IP"
+ depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
+ help
+ Tunneling means encapsulating data of one protocol type within
+ another protocol and sending it over a channel that understands the
+ encapsulating protocol. This particular tunneling driver implements
+ GRE (Generic Routing Encapsulation) and at this time allows
+ encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+ This driver is useful if the other endpoint is a Cisco router: Cisco
+ likes GRE much better than the other Linux tunneling driver ("IP
+ tunneling" above). In addition, GRE allows multicast redistribution
+ through the tunnel.
+
+config NET_IPGRE_BROADCAST
+ bool "IP: broadcast GRE over IP"
+ depends on IP_MULTICAST && NET_IPGRE
+ help
+ One application of GRE/IP is to construct a broadcast WAN (Wide Area
+ Network), which looks like a normal Ethernet LAN (Local Area
+ Network), but can be distributed all over the Internet. If you want
+ to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+ bool "IP: multicast routing"
+ depends on IP_MULTICAST
+ help
+ This is used if you want your machine to act as a router for IP
+ packets that have several destination addresses. It is needed on the
+ MBONE, a high bandwidth network on top of the Internet which carries
+ audio and video broadcasts. In order to do that, you would most
+ likely run the program mrouted. Information about the multicast
+ capabilities of the various network cards is contained in
+ <file:Documentation/networking/multicast.txt>. If you haven't heard
+ about it, you don't need it.
+
+config IP_MROUTE_MULTIPLE_TABLES
+ bool "IP: multicast policy routing"
+ depends on IP_MROUTE && IP_ADVANCED_ROUTER
+ select FIB_RULES
+ help
+ Normally, a multicast router runs a userspace daemon and decides
+ what to do with a multicast packet based on the source and
+ destination addresses. If you say Y here, the multicast router
+ will also be able to take interfaces and packet marks into
+ account and run multiple instances of userspace daemons
+ simultaneously, each one handling a single table.
+
+ If unsure, say N.
+
+config IP_PIMSM_V1
+ bool "IP: PIM-SM version 1 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM (Protocol Independent
+ Multicast) version 1. This multicast routing protocol is used widely
+ because Cisco supports it. You need special software to use it
+ (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+ information about PIM.
+
+ Say Y if you want to use PIM-SM v1. Note that you can say N here if
+ you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+ bool "IP: PIM-SM version 2 support"
+ depends on IP_MROUTE
+ help
+ Kernel side support for Sparse Mode PIM version 2. In order to use
+ this, you need an experimental routing daemon supporting it (pimd or
+ gated-5). This routing protocol is not used widely, so say N unless
+ you want to play with it.
+
+config ARPD
+ bool "IP: ARP daemon support"
+ ---help---
+ The kernel maintains an internal cache which maps IP addresses to
+ hardware addresses on the local network, so that Ethernet/Token Ring/
+ etc. frames are sent to the proper address on the physical networking
+ layer. Normally, kernel uses the ARP protocol to resolve these
+ mappings.
+
+ Saying Y here adds support to have a user space daemon to do this
+ resolution instead. This is useful for implementing an alternate
+ address resolution protocol (e.g. NHRP on mGRE tunnels) and also for
+ testing purposes.
+
+ If unsure, say N.
+
+config SYN_COOKIES
+ bool "IP: TCP syncookie support"
+ ---help---
+ Normal TCP/IP networking is open to an attack known as "SYN
+ flooding". This denial-of-service attack prevents legitimate remote
+ users from being able to connect to your computer during an ongoing
+ attack and requires very little work from the attacker, who can
+ operate from anywhere on the Internet.
+
+ SYN cookies provide protection against this type of attack. If you
+ say Y here, the TCP/IP stack will use a cryptographic challenge
+ protocol known as "SYN cookies" to enable legitimate users to
+ continue to connect, even when your machine is under attack. There
+ is no need for the legitimate users to change their TCP/IP software;
+ SYN cookies work transparently to them. For technical information
+ about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+ If you are SYN flooded, the source address reported by the kernel is
+ likely to have been forged by the attacker; it is only reported as
+ an aid in tracing the packets to their actual source and should not
+ be taken as absolute truth.
+
+ SYN cookies may prevent correct error reporting on clients when the
+ server is really overloaded. If this happens frequently, it is
+ better to turn them off.
+
+ If you say Y here, you can disable SYN cookies at run time by
+ saying Y to "/proc file system support" and
+ "Sysctl support" below and executing the command
+
+ echo 0 > /proc/sys/net/ipv4/tcp_syncookies
+
+ after the /proc file system has been mounted.
+
+ If unsure, say N.
+
+config INET_AH
+ tristate "IP: AH transformation"
+ select XFRM
+ select CRYPTO
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_SHA1
+ ---help---
+ Support for IPsec AH.
+
+ If unsure, say Y.
+
+config INET_ESP
+ tristate "IP: ESP transformation"
+ select XFRM
+ select CRYPTO
+ select CRYPTO_AUTHENC
+ select CRYPTO_HMAC
+ select CRYPTO_MD5
+ select CRYPTO_CBC
+ select CRYPTO_SHA1
+ select CRYPTO_DES
+ ---help---
+ Support for IPsec ESP.
+
+ If unsure, say Y.
+
+config INET_IPCOMP
+ tristate "IP: IPComp transformation"
+ select INET_XFRM_TUNNEL
+ select XFRM_IPCOMP
+ ---help---
+ Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+ typically needed for IPsec.
+
+ If unsure, say Y.
+
+config INET_XFRM_TUNNEL
+ tristate
+ select INET_TUNNEL
+ default n
+
+config INET_TUNNEL
+ tristate
+ default n
+
+config INET_XFRM_MODE_TRANSPORT
+ tristate "IP: IPsec transport mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec transport mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_TUNNEL
+ tristate "IP: IPsec tunnel mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec tunnel mode.
+
+ If unsure, say Y.
+
+config INET_XFRM_MODE_BEET
+ tristate "IP: IPsec BEET mode"
+ default y
+ select XFRM
+ ---help---
+ Support for IPsec BEET mode.
+
+ If unsure, say Y.
+
+config INET_LRO
+ tristate "Large Receive Offload (ipv4/tcp)"
+ default y
+ ---help---
+ Support for Large Receive Offload (ipv4/tcp).
+
+ If unsure, say Y.
+
+config INET_DIAG
+ tristate "INET: socket monitoring interface"
+ default y
+ ---help---
+ Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ native Linux tools such as ss. ss is included in iproute2, currently
+ downloadable at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
+
+ If unsure, say Y.
+
+config INET_TCP_DIAG
+ depends on INET_DIAG
+ def_tristate INET_DIAG
+
+config INET_UDP_DIAG
+ tristate "UDP: socket monitoring interface"
+ depends on INET_DIAG && (IPV6 || IPV6=n)
+ default n
+ ---help---
+ Support for UDP socket monitoring interface used by the ss tool.
+ If unsure, say Y.
+
+menuconfig TCP_CONG_ADVANCED
+ bool "TCP: advanced congestion control"
+ ---help---
+ Support for selection of various TCP congestion control
+ modules.
+
+ Nearly all users can safely say no here, and a safe default
+ selection will be made (CUBIC with new Reno as a fallback).
+
+ If unsure, say N.
+
+if TCP_CONG_ADVANCED
+
+config TCP_CONG_BIC
+ tristate "Binary Increase Congestion (BIC) control"
+ default m
+ ---help---
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+config TCP_CONG_CUBIC
+ tristate "CUBIC TCP"
+ default y
+ ---help---
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+
+config TCP_CONG_WESTWOOD
+ tristate "TCP Westwood+"
+ default m
+ ---help---
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
+
+config TCP_CONG_HTCP
+ tristate "H-TCP"
+ default m
+ ---help---
+ H-TCP is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
+
+config TCP_CONG_HSTCP
+ tristate "High Speed TCP"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see http://www.icir.org/floyd/hstcp.html
+
+config TCP_CONG_HYBLA
+ tristate "TCP-Hybla congestion control algorithm"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
+
+config TCP_CONG_VEGAS
+ tristate "TCP Vegas"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
+
+config TCP_CONG_SCALABLE
+ tristate "Scalable TCP"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
+
+config TCP_CONG_LP
+ tristate "TCP Low Priority"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
+
+config TCP_CONG_VENO
+ tristate "TCP Veno"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+
+config TCP_CONG_YEAH
+ tristate "YeAH TCP"
+ depends on EXPERIMENTAL
+ select TCP_CONG_VEGAS
+ default n
+ ---help---
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. Its design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
+
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+
+config TCP_CONG_ILLINOIS
+ tristate "TCP Illinois"
+ depends on EXPERIMENTAL
+ default n
+ ---help---
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
+
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+
+choice
+ prompt "Default TCP congestion control"
+ default DEFAULT_CUBIC
+ help
+ Select the TCP congestion control that will be used by default
+ for all connections.
+
+ config DEFAULT_BIC
+ bool "Bic" if TCP_CONG_BIC=y
+
+ config DEFAULT_CUBIC
+ bool "Cubic" if TCP_CONG_CUBIC=y
+
+ config DEFAULT_HTCP
+ bool "Htcp" if TCP_CONG_HTCP=y
+
+ config DEFAULT_HYBLA
+ bool "Hybla" if TCP_CONG_HYBLA=y
+
+ config DEFAULT_VEGAS
+ bool "Vegas" if TCP_CONG_VEGAS=y
+
+ config DEFAULT_VENO
+ bool "Veno" if TCP_CONG_VENO=y
+
+ config DEFAULT_WESTWOOD
+ bool "Westwood" if TCP_CONG_WESTWOOD=y
+
+ config DEFAULT_RENO
+ bool "Reno"
+
+endchoice
+
+endif
+
+config TCP_CONG_CUBIC
+ tristate
+ depends on !TCP_CONG_ADVANCED
+ default y
+
+config DEFAULT_TCP_CONG
+ string
+ default "bic" if DEFAULT_BIC
+ default "cubic" if DEFAULT_CUBIC
+ default "htcp" if DEFAULT_HTCP
+ default "hybla" if DEFAULT_HYBLA
+ default "vegas" if DEFAULT_VEGAS
+ default "westwood" if DEFAULT_WESTWOOD
+ default "veno" if DEFAULT_VENO
+ default "reno" if DEFAULT_RENO
+ default "cubic"
+
+config TCP_MD5SIG
+ bool "TCP: MD5 Signature Option support (RFC2385) (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ select CRYPTO
+ select CRYPTO_MD5
+ ---help---
+ RFC2385 specifies a method of giving MD5 protection to TCP sessions.
+ Its main (only?) use is to protect BGP sessions between core routers
+ on the Internet.
+
+ If unsure, say N.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 00000000..c6f177cf
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,56 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y := route.o inetpeer.o protocol.o \
+ ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+ ip_output.o ip_sockglue.o inet_hashtables.o \
+ inet_timewait_sock.o inet_connection_sock.o \
+ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp_minisocks.o tcp_cong.o \
+ datagram.o raw.o udp.o udplite.o \
+ arp.o icmp.o devinet.o af_inet.o igmp.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
+ inet_fragment.o ping.o
+
+obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
+obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
+obj-$(CONFIG_INET_LRO) += inet_lro.o
+obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
+obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
+obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
+obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
+obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
+obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
+obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+ xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 00000000..425f36d5
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1824 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * PF_INET protocol family socket handler.
+ *
+ * Authors: Ross Biro
+ * Fred N. van Kempen,
+ * Florian La Roche,
+ * Alan Cox,
+ *
+ * Changes (see also sock.c)
+ *
+ * piggy,
+ * Karl Knutson : Socket protocol table
+ * A.N.Kuznetsov : Socket death error in accept().
+ * John Richardson : Fix non blocking error in connect()
+ * so sockets that fail to connect
+ * don't return -EINPROGRESS.
+ * Alan Cox : Asynchronous I/O support
+ * Alan Cox : Keep correct socket pointer on sock
+ * structures
+ * when accept() ed
+ * Alan Cox : Semantics of SO_LINGER aren't state
+ * moved to close when you look carefully.
+ * With this fixed and the accept bug fixed
+ * some RPC stuff seems happier.
+ * Niibe Yutaka : 4.4BSD style write async I/O
+ * Alan Cox,
+ * Tony Gale : Fixed reuse semantics.
+ * Alan Cox : bind() shouldn't abort existing but dead
+ * sockets. Stops FTP netin:.. I hope.
+ * Alan Cox : bind() works correctly for RAW sockets.
+ * Note that FreeBSD at least was broken
+ * in this respect so be careful with
+ * compatibility tests...
+ * Alan Cox : routing cache support
+ * Alan Cox : memzero the socket structure for
+ * compactness.
+ * Matt Day : nonblock connect error handler
+ * Alan Cox : Allow large numbers of pending sockets
+ * (eg for big web sites), but only if
+ * specifically application requested.
+ * Alan Cox : New buffering throughout IP. Used
+ * dumbly.
+ * Alan Cox : New buffering now used smartly.
+ * Alan Cox : BSD rather than common sense
+ * interpretation of listen.
+ * Germano Caronni : Assorted small races.
+ * Alan Cox : sendmsg/recvmsg basic support.
+ * Alan Cox : Only sendmsg/recvmsg now supported.
+ * Alan Cox : Locked down bind (see security list).
+ * Alan Cox : Loosened bind a little.
+ * Mike McLagan : ADD/DEL DLCI Ioctls
+ * Willy Konynenberg : Transparent proxying support.
+ * David S. Miller : New socket lookup architecture.
+ * Some other random speedups.
+ * Cyrus Durgin : Cleaned up file for kmod hacks.
+ * Andi Kleen : Fix inet_stream_connect TCP race.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "IPv4: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_IP_MROUTE
+#include
+#endif
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include
+
+ /*
+ * Permission gate used by inet_create(): nonzero when the calling task
+ * passes in_egroup_p(AID_INET) or holds CAP_NET_RAW, zero otherwise.
+ */
+ static inline int current_has_network(void)
+ {
+ return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+ }
+#else
+ /* Without CONFIG_ANDROID_PARANOID_NETWORK the check always succeeds. */
+ static inline int current_has_network(void)
+ {
+ return 1;
+ }
+#endif
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+struct ipv4_config ipv4_config;
+EXPORT_SYMBOL(ipv4_config);
+
+ /*
+ * Destructor installed as sk->sk_destruct by inet_create().
+ *
+ * Purges the receive and error queues, reclaims socket memory, and then
+ * frees the IP options and drops the cached route. If the socket is a
+ * SOCK_STREAM socket that is not in TCP_CLOSE, or is not flagged SOCK_DEAD,
+ * an error is logged and the function returns early without freeing the
+ * options or releasing the route.
+ */
+
+ void inet_sock_destruct(struct sock *sk)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_error_queue);
+
+ sk_mem_reclaim(sk);
+
+ /* Refuse to tear down a socket that still looks in use. */
+ if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+ pr_err("Attempt to release TCP socket in state %d %p\n",
+ sk->sk_state, sk);
+ return;
+ }
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ pr_err("Attempt to release alive inet socket %p\n", sk);
+ return;
+ }
+
+ /* All memory accounting is expected to be zero at this point. */
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
+
+ /* The constant 1 tells the RCU checkers that no lock is required here. */
+ kfree(rcu_dereference_protected(inet->inet_opt, 1));
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
+ sk_refcnt_debug_dec(sk);
+ }
+ EXPORT_SYMBOL(inet_sock_destruct);
+
+/*
+ * The routines beyond this point handle the behaviour of an AF_INET
+ * socket object. Mostly it punts to the subprotocols of IP to do
+ * the work.
+ */
+
+/*
+ * Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+ struct inet_sock *inet;
+ /* We may need to bind the socket. */
+ lock_sock(sk);
+ inet = inet_sk(sk);
+ if (!inet->inet_num) {
+ if (sk->sk_prot->get_port(sk, 0)) {
+ release_sock(sk);
+ return -EAGAIN;
+ }
+ inet->inet_sport = htons(inet->inet_num);
+ }
+ release_sock(sk);
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ unsigned char old_state;
+ int err;
+
+ lock_sock(sk);
+
+ err = -EINVAL;
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ old_state = sk->sk_state;
+ if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ goto out;
+
+ /* Really, if the socket is already in listen state
+ * we can only allow the backlog to be adjusted.
+ */
+ if (old_state != TCP_LISTEN) {
+ err = inet_csk_listen_start(sk, backlog);
+ if (err)
+ goto out;
+ }
+ sk->sk_max_ack_backlog = backlog;
+ err = 0;
+
+out:
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(inet_listen);
+
+u32 inet_ehash_secret __read_mostly;
+EXPORT_SYMBOL(inet_ehash_secret);
+
+/*
+ * inet_ehash_secret must be set exactly once
+ */
+void build_ehash_secret(void)
+{
+ u32 rnd;
+
+ do {
+ get_random_bytes(&rnd, sizeof(rnd));
+ } while (rnd == 0);
+
+ cmpxchg(&inet_ehash_secret, 0, rnd);
+}
+EXPORT_SYMBOL(build_ehash_secret);
+
+ /*
+  * Tell whether @protocol may be used in network namespace @net.
+  *
+  * Always true for init_net. Otherwise the handler registered in
+  * inet_protos[] (indexed by protocol masked with MAX_INET_PROTOS - 1)
+  * decides through its netns_ok field; a missing handler is accepted.
+  * Uses rcu_dereference(), so the caller is expected to be inside an RCU
+  * read-side section (inet_create() calls it under rcu_read_lock()).
+  */
+ static inline int inet_netns_ok(struct net *net, int protocol)
+ {
+ 	const struct net_protocol *handler;
+
+ 	if (net_eq(net, &init_net))
+ 		return 1;
+
+ 	handler = rcu_dereference(inet_protos[protocol & (MAX_INET_PROTOS - 1)]);
+
+ 	/* raw IP is OK */
+ 	return handler ? handler->netns_ok : 1;
+ }
+
+
+/*
+ * Create an inet socket.
+ */
+
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+ struct inet_protosw *answer;
+ struct inet_sock *inet;
+ struct proto *answer_prot;
+ unsigned char answer_flags;
+ char answer_no_check;
+ int try_loading_module = 0;
+ int err;
+
+ if (!current_has_network())
+ return -EACCES;
+
+ if (unlikely(!inet_ehash_secret))
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ build_ehash_secret();
+
+ sock->state = SS_UNCONNECTED;
+
+ /* Look for the requested type/protocol pair. */
+lookup_protocol:
+ err = -ESOCKTNOSUPPORT;
+ rcu_read_lock();
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+
+ err = 0;
+ /* Check the non-wild match. */
+ if (protocol == answer->protocol) {
+ if (protocol != IPPROTO_IP)
+ break;
+ } else {
+ /* Check for the two wild cases. */
+ if (IPPROTO_IP == protocol) {
+ protocol = answer->protocol;
+ break;
+ }
+ if (IPPROTO_IP == answer->protocol)
+ break;
+ }
+ err = -EPROTONOSUPPORT;
+ }
+
+ if (unlikely(err)) {
+ if (try_loading_module < 2) {
+ rcu_read_unlock();
+ /*
+ * Be more specific, e.g. net-pf-2-proto-132-type-1
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+ */
+ if (++try_loading_module == 1)
+ request_module("net-pf-%d-proto-%d-type-%d",
+ PF_INET, protocol, sock->type);
+ /*
+ * Fall back to generic, e.g. net-pf-2-proto-132
+ * (net-pf-PF_INET-proto-IPPROTO_SCTP)
+ */
+ else
+ request_module("net-pf-%d-proto-%d",
+ PF_INET, protocol);
+ goto lookup_protocol;
+ } else
+ goto out_rcu_unlock;
+ }
+
+ err = -EPERM;
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ goto out_rcu_unlock;
+
+ err = -EAFNOSUPPORT;
+ if (!inet_netns_ok(net, protocol))
+ goto out_rcu_unlock;
+
+ sock->ops = answer->ops;
+ answer_prot = answer->prot;
+ answer_no_check = answer->no_check;
+ answer_flags = answer->flags;
+ rcu_read_unlock();
+
+ WARN_ON(answer_prot->slab == NULL);
+
+ err = -ENOBUFS;
+ sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
+ if (sk == NULL)
+ goto out;
+
+ err = 0;
+ sk->sk_no_check = answer_no_check;
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = 1;
+
+ inet = inet_sk(sk);
+ inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+
+ inet->nodefrag = 0;
+
+ if (SOCK_RAW == sock->type) {
+ inet->inet_num = protocol;
+ if (IPPROTO_RAW == protocol)
+ inet->hdrincl = 1;
+ }
+
+ if (ipv4_config.no_pmtu_disc)
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+ else
+ inet->pmtudisc = IP_PMTUDISC_WANT;
+
+ inet->inet_id = 0;
+
+ sock_init_data(sock, sk);
+
+ sk->sk_destruct = inet_sock_destruct;
+ sk->sk_protocol = protocol;
+ sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+ inet->uc_ttl = -1;
+ inet->mc_loop = 1;
+ inet->mc_ttl = 1;
+ inet->mc_all = 1;
+ inet->mc_index = 0;
+ inet->mc_list = NULL;
+ inet->rcv_tos = 0;
+
+ sk_refcnt_debug_inc(sk);
+
+ if (inet->inet_num) {
+ /* It assumes that any protocol which allows
+ * the user to assign a number at socket
+ * creation time automatically
+ * shares.
+ */
+ inet->inet_sport = htons(inet->inet_num);
+ /* Add to protocol hash chains. */
+ sk->sk_prot->hash(sk);
+ }
+
+ if (sk->sk_prot->init) {
+ err = sk->sk_prot->init(sk);
+ if (err)
+ sk_common_release(sk);
+ }
+out:
+ return err;
+out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
+}
+
+
+/*
+ * The peer socket should always be NULL (or else). When we call this
+ * function we are destroying the object and from then on nobody
+ * should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+
+ if (sk) {
+ long timeout;
+
+ sock_rps_reset_flow(sk);
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+
+ /* If linger is set, we don't return until the close
+ * is complete. Otherwise we return immediately. The
+ * actually closing is done the same either way.
+ *
+ * If the close is due to the process exiting, we never
+ * linger..
+ */
+ timeout = 0;
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(inet_release);
+
+/* It is off by default, see below. */
+int sysctl_ip_nonlocal_bind __read_mostly;
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
+
+ /*
+ * Bind an inet socket to a local address and port.
+ *
+ * Protocols that supply their own bind() hook handle the request entirely
+ * themselves. Otherwise the address is validated, the port is obtained
+ * through the protocol's get_port() hook, and the socket's source address
+ * and port are recorded.
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL         addr_len too short, or the socket is not in TCP_CLOSE
+ *                   or already has a port
+ *   -EAFNOSUPPORT   family is neither AF_INET nor AF_UNSPEC+INADDR_ANY
+ *   -EADDRNOTAVAIL  address is not local/multicast/broadcast and non-local
+ *                   bind is not enabled
+ *   -EACCES         port below PROT_SOCK without CAP_NET_BIND_SERVICE
+ *   -EADDRINUSE     get_port() refused the port
+ */
+ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+ {
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ unsigned short snum;
+ int chk_addr_ret;
+ int err;
+
+ /* If the socket has its own bind function then use it. (RAW) */
+ if (sk->sk_prot->bind) {
+ err = sk->sk_prot->bind(sk, uaddr, addr_len);
+ goto out;
+ }
+ err = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ if (addr->sin_family != AF_INET) {
+ /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
+ * only if s_addr is INADDR_ANY.
+ */
+ err = -EAFNOSUPPORT;
+ if (addr->sin_family != AF_UNSPEC ||
+ addr->sin_addr.s_addr != htonl(INADDR_ANY))
+ goto out;
+ }
+
+ chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+ /* Not specified by any standard per-se, however it breaks too
+ * many applications when removed. It is unfortunate since
+ * allowing applications to make a non-local bind solves
+ * several problems with systems using dynamic addressing.
+ * (ie. your servers still start up even if your ISDN link
+ * is temporarily down)
+ */
+ err = -EADDRNOTAVAIL;
+ if (!sysctl_ip_nonlocal_bind &&
+ !(inet->freebind || inet->transparent) &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
+ chk_addr_ret != RTN_LOCAL &&
+ chk_addr_ret != RTN_MULTICAST &&
+ chk_addr_ret != RTN_BROADCAST)
+ goto out;
+
+ snum = ntohs(addr->sin_port);
+ err = -EACCES;
+ /* Port 0 (let the protocol choose) is exempt from the privileged-port check. */
+ if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ goto out;
+
+ /* We keep a pair of addresses. rcv_saddr is the one
+ * used by hash lookups, and saddr is used for transmit.
+ *
+ * In the BSD API these are the same except where it
+ * would be illegal to use them (multicast/broadcast) in
+ * which case the sending device address is used.
+ */
+ lock_sock(sk);
+
+ /* Check these errors (active socket, double bind). */
+ err = -EINVAL;
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
+ goto out_release_sock;
+
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
+ if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+ inet->inet_saddr = 0; /* Use device */
+
+ /* Make sure we are allowed to bind here. */
+ if (sk->sk_prot->get_port(sk, snum)) {
+ /* Undo the address assignment made above. */
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ err = -EADDRINUSE;
+ goto out_release_sock;
+ }
+
+ /* Record which parts of the binding were chosen explicitly. */
+ if (inet->inet_rcv_saddr)
+ sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+ if (snum)
+ sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
+ sk_dst_reset(sk);
+ err = 0;
+ out_release_sock:
+ release_sock(sk);
+ out:
+ return err;
+ }
+ EXPORT_SYMBOL(inet_bind);
+
+ /*
+  * connect() for datagram-style sockets.
+  *
+  * An AF_UNSPEC address is a request to disconnect. Otherwise an unbound
+  * socket is auto-bound first and the protocol's connect() does the rest.
+  * Returns -EINVAL if @addr_len cannot even hold the family field and
+  * -EAGAIN if auto-binding fails.
+  */
+ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+ 		       int addr_len, int flags)
+ {
+ 	struct sock *sk = sock->sk;
+
+ 	if (addr_len < sizeof(uaddr->sa_family))
+ 		return -EINVAL;
+
+ 	if (uaddr->sa_family == AF_UNSPEC)
+ 		return sk->sk_prot->disconnect(sk, flags);
+
+ 	if (!inet_sk(sk)->inet_num) {
+ 		if (inet_autobind(sk))
+ 			return -EAGAIN;
+ 	}
+
+ 	return sk->sk_prot->connect(sk, uaddr, addr_len);
+ }
+ EXPORT_SYMBOL(inet_dgram_connect);
+
+ /*
+ * Sleep until the socket leaves TCP_SYN_SENT/TCP_SYN_RECV, a signal is
+ * pending, or the timeout expires.
+ *
+ * Called with the socket lock held; the lock is dropped around each
+ * schedule_timeout() and re-taken afterwards. Returns the remaining
+ * timeout as reported by schedule_timeout() (0 once it has expired), or
+ * @timeo unchanged if the socket was not in a SYN state on entry.
+ */
+ static long inet_wait_for_connect(struct sock *sk, long timeo)
+ {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
+ /* Basic assumption: if someone sets sk->sk_err, he _must_
+ * change state of the socket from TCP_SYN_*.
+ * Connect() does not allow to get error notifications
+ * without closing the socket.
+ */
+ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ lock_sock(sk);
+ if (signal_pending(current) || !timeo)
+ break;
+ /* Re-arm the wait entry before re-checking the state. */
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ }
+ finish_wait(sk_sleep(sk), &wait);
+ return timeo;
+ }
+
+/*
+ * Connect to a remote host. There is regrettably still a little
+ * TCP 'magic' in here.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len, int flags)
+{
+ struct sock *sk = sock->sk;
+ int err;
+ long timeo;
+
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ if (uaddr->sa_family == AF_UNSPEC) {
+ err = sk->sk_prot->disconnect(sk, flags);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ goto out;
+ }
+
+ switch (sock->state) {
+ default:
+ err = -EINVAL;
+ goto out;
+ case SS_CONNECTED:
+ err = -EISCONN;
+ goto out;
+ case SS_CONNECTING:
+ err = -EALREADY;
+ /* Fall out of switch with err, set for this state */
+ break;
+ case SS_UNCONNECTED:
+ err = -EISCONN;
+ if (sk->sk_state != TCP_CLOSE)
+ goto out;
+
+ err = sk->sk_prot->connect(sk, uaddr, addr_len);
+ if (err < 0)
+ goto out;
+
+ sock->state = SS_CONNECTING;
+
+ /* Just entered SS_CONNECTING state; the only
+ * difference is that return value in non-blocking
+ * case is EINPROGRESS, rather than EALREADY.
+ */
+ err = -EINPROGRESS;
+ break;
+ }
+
+ timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ /* Error code is set above */
+ if (!timeo || !inet_wait_for_connect(sk, timeo))
+ goto out;
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out;
+ }
+
+ /* Connection was closed by RST, timeout, ICMP error
+ * or another process disconnected us.
+ */
+ if (sk->sk_state == TCP_CLOSE)
+ goto sock_error;
+
+ /* sk->sk_err may be not zero now, if RECVERR was ordered by user
+ * and error was received after socket entered established state.
+ * Hence, it is handled normally after connect() return successfully.
+ */
+
+ sock->state = SS_CONNECTED;
+ err = 0;
+out:
+ release_sock(sk);
+ return err;
+
+sock_error:
+ err = sock_error(sk) ? : -ECONNABORTED;
+ sock->state = SS_UNCONNECTED;
+ if (sk->sk_prot->disconnect(sk, flags))
+ sock->state = SS_DISCONNECTING;
+ goto out;
+}
+EXPORT_SYMBOL(inet_stream_connect);
+
+/*
+ * Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+ struct sock *sk1 = sock->sk;
+ int err = -EINVAL;
+ struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+ if (!sk2)
+ goto do_err;
+
+ lock_sock(sk2);
+
+ sock_rps_record_flow(sk2);
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+
+ sock_graft(sk2, newsock);
+
+ newsock->state = SS_CONNECTED;
+ err = 0;
+ release_sock(sk2);
+do_err:
+ return err;
+}
+EXPORT_SYMBOL(inet_accept);
+
+
+/*
+ * This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+ int *uaddr_len, int peer)
+{
+ struct sock *sk = sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+
+ sin->sin_family = AF_INET;
+ if (peer) {
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1))
+ return -ENOTCONN;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ } else {
+ __be32 addr = inet->inet_rcv_saddr;
+ if (!addr)
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
+ sin->sin_addr.s_addr = addr;
+ }
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ *uaddr_len = sizeof(*sin);
+ return 0;
+}
+EXPORT_SYMBOL(inet_getname);
+
+ /*
+  * sendmsg() entry point shared by the inet proto_ops tables.
+  *
+  * Records the RPS flow, auto-binds a socket that has no local port yet
+  * (unless the protocol sets no_autobind) and forwards to the protocol's
+  * sendmsg(). Returns -EAGAIN if auto-binding fails, otherwise whatever
+  * the protocol returns.
+  */
+ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ 		 size_t size)
+ {
+ 	struct sock *sk = sock->sk;
+
+ 	sock_rps_record_flow(sk);
+
+ 	/* We may need to bind the socket. */
+ 	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind) {
+ 		if (inet_autobind(sk))
+ 			return -EAGAIN;
+ 	}
+
+ 	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+ }
+ EXPORT_SYMBOL(inet_sendmsg);
+
+ /*
+  * sendpage() entry point shared by the inet proto_ops tables.
+  *
+  * Records the RPS flow and auto-binds an unbound socket (unless the
+  * protocol sets no_autobind). Uses the protocol's sendpage() hook when
+  * there is one and sock_no_sendpage() otherwise. Returns -EAGAIN if
+  * auto-binding fails.
+  */
+ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ 		      size_t size, int flags)
+ {
+ 	struct sock *sk = sock->sk;
+
+ 	sock_rps_record_flow(sk);
+
+ 	/* We may need to bind the socket. */
+ 	if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind) {
+ 		if (inet_autobind(sk))
+ 			return -EAGAIN;
+ 	}
+
+ 	if (!sk->sk_prot->sendpage)
+ 		return sock_no_sendpage(sock, page, offset, size, flags);
+
+ 	return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+ }
+ EXPORT_SYMBOL(inet_sendpage);
+
+ /*
+  * recvmsg() entry point shared by the inet proto_ops tables.
+  *
+  * Splits MSG_DONTWAIT out of @flags into the protocol's separate
+  * non-blocking argument and forwards the call. On a non-negative result
+  * the address length reported by the protocol is stored in
+  * msg->msg_namelen. Returns the protocol's result unchanged.
+  */
+ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ 		 size_t size, int flags)
+ {
+ 	struct sock *sk = sock->sk;
+ 	int addr_len = 0;
+ 	int ret;
+
+ 	sock_rps_record_flow(sk);
+
+ 	ret = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ 				   flags & ~MSG_DONTWAIT, &addr_len);
+ 	if (ret < 0)
+ 		return ret;
+
+ 	msg->msg_namelen = addr_len;
+ 	return ret;
+ }
+ EXPORT_SYMBOL(inet_recvmsg);
+
+ /*
+ * shutdown() for inet sockets.
+ *
+ * @how is incremented so that it becomes a bit mask checked against
+ * SHUTDOWN_MASK; a value with no valid bits or with extra bits yields
+ * -EINVAL. The mask is OR-ed into sk->sk_shutdown and passed to the
+ * protocol's shutdown() hook, except for listening sockets (disconnected
+ * when the receive side is shut down) and sockets in TCP_SYN_SENT (always
+ * disconnected). A socket in TCP_CLOSE still gets the mask applied but
+ * the call returns -ENOTCONN. sk_state_change() is always invoked so that
+ * pollers wake up.
+ */
+ int inet_shutdown(struct socket *sock, int how)
+ {
+ struct sock *sk = sock->sk;
+ int err = 0;
+
+ /* This should really check to make sure
+ * the socket is a TCP socket. (WHY AC...)
+ */
+ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+ 1->2 bit 2 snds.
+ 2->3 */
+ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */
+ return -EINVAL;
+
+ lock_sock(sk);
+ /* Resolve an in-progress connect into a definite socket state first. */
+ if (sock->state == SS_CONNECTING) {
+ if ((1 << sk->sk_state) &
+ (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+ sock->state = SS_DISCONNECTING;
+ else
+ sock->state = SS_CONNECTED;
+ }
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ err = -ENOTCONN;
+ /* Hack to wake up other listeners, who can poll for
+ POLLHUP, even on eg. unconnected UDP sockets -- RR */
+ /* Deliberate fall through: the shutdown bits are still applied. */
+ default:
+ sk->sk_shutdown |= how;
+ if (sk->sk_prot->shutdown)
+ sk->sk_prot->shutdown(sk, how);
+ break;
+
+ /* Remaining two branches are temporary solution for missing
+ * close() in multithreaded environment. It is _not_ a good idea,
+ * but we have no choice until close() is repaired at VFS level.
+ */
+ case TCP_LISTEN:
+ if (!(how & RCV_SHUTDOWN))
+ break;
+ /* Fall through */
+ case TCP_SYN_SENT:
+ err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+ sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+ break;
+ }
+
+ /* Wake up anyone sleeping in poll. */
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ return err;
+ }
+ EXPORT_SYMBOL(inet_shutdown);
+
+/*
+ * ioctl() calls you can issue on an INET socket. Most of these are
+ * device configuration and stuff and very rarely used. Some ioctls
+ * pass on to the socket itself.
+ *
+ * NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ * loads the devconfigure module does its configuring and unloads it.
+ * There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ struct sock *sk = sock->sk;
+ int err = 0;
+ struct net *net = sock_net(sk);
+
+ switch (cmd) {
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ case SIOCKILLADDR:
+ err = devinet_ioctl(net, cmd, (void __user *)arg);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ return err;
+}
+EXPORT_SYMBOL(inet_ioctl);
+
+ #ifdef CONFIG_COMPAT
+ /*
+  * Compat ioctl entry point: forwards to the protocol's compat_ioctl()
+  * hook, or returns -ENOIOCTLCMD when the protocol does not provide one.
+  */
+ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+ {
+ 	struct sock *sk = sock->sk;
+
+ 	if (!sk->sk_prot->compat_ioctl)
+ 		return -ENOIOCTLCMD;
+
+ 	return sk->sk_prot->compat_ioctl(sk, cmd, arg);
+ }
+ #endif
+
+ /*
+ * Socket-layer operations for the SOCK_STREAM/IPPROTO_TCP entry of
+ * inetsw_array[]. Differs from the datagram table in supporting
+ * accept/listen, using tcp_poll and providing splice_read.
+ */
+ const struct proto_ops inet_stream_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ .poll = tcp_poll,
+ .ioctl = inet_ioctl,
+ .listen = inet_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
+ #ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+ #endif
+ };
+ EXPORT_SYMBOL(inet_stream_ops);
+
+ /*
+ * Socket-layer operations for the SOCK_DGRAM entries of inetsw_array[]
+ * (IPPROTO_UDP and IPPROTO_ICMP). accept and listen are stubbed out with
+ * the sock_no_* handlers.
+ */
+ const struct proto_ops inet_dgram_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = udp_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+ #ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+ #endif
+ };
+ EXPORT_SYMBOL(inet_dgram_ops);
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static const struct proto_ops inet_sockraw_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_dgram_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = inet_getname,
+ .poll = datagram_poll,
+ .ioctl = inet_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = inet_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet_compat_ioctl,
+#endif
+};
+
+ /* Protocol-family descriptor for PF_INET; names inet_create() as its create handler. */
+ static const struct net_proto_family inet_family_ops = {
+ .family = PF_INET,
+ .create = inet_create,
+ .owner = THIS_MODULE,
+ };
+
+ /* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ *
+ * Built-in (type, protocol) pairs: TCP streams, UDP datagrams, ICMP "ping"
+ * datagram sockets, and a wildcard raw-socket entry. The TCP and UDP
+ * entries are flagged INET_PROTOSW_PERMANENT, which
+ * inet_register_protosw() refuses to override and
+ * inet_unregister_protosw() refuses to remove.
+ */
+ static struct inet_protosw inetsw_array[] =
+ {
+ {
+ .type = SOCK_STREAM,
+ .protocol = IPPROTO_TCP,
+ .prot = &tcp_prot,
+ .ops = &inet_stream_ops,
+ .no_check = 0,
+ .flags = INET_PROTOSW_PERMANENT |
+ INET_PROTOSW_ICSK,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_UDP,
+ .prot = &udp_prot,
+ .ops = &inet_dgram_ops,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_PERMANENT,
+ },
+
+ {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMP,
+ .prot = &ping_prot,
+ .ops = &inet_dgram_ops,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_REUSE,
+ },
+
+ {
+ .type = SOCK_RAW,
+ .protocol = IPPROTO_IP, /* wild card */
+ .prot = &raw_prot,
+ .ops = &inet_sockraw_ops,
+ .no_check = UDP_CSUM_DEFAULT,
+ .flags = INET_PROTOSW_REUSE,
+ }
+ };
+
+#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
+
+ /*
+  * Add @p to the inetsw[] list for its socket type.
+  *
+  * The request is rejected, with an error logged, when p->type is not
+  * below SOCK_MAX or when a permanent entry for the same protocol already
+  * exists. Otherwise @p is linked in right after the last permanent
+  * entry (or at the list head if there is none): it can then never shadow
+  * a permanent entry on a wildcard match, but it does take precedence over
+  * older non-permanent entries, and removing it restores the previous
+  * behaviour.
+  */
+ void inet_register_protosw(struct inet_protosw *p)
+ {
+ 	struct inet_protosw *entry;
+ 	struct list_head *insert_after;
+ 	int protocol = p->protocol;
+
+ 	spin_lock_bh(&inetsw_lock);
+
+ 	if (p->type >= SOCK_MAX) {
+ 		pr_err("Ignoring attempt to register invalid socket type %d\n",
+ 		       p->type);
+ 		goto unlock;
+ 	}
+
+ 	insert_after = &inetsw[p->type];
+ 	list_for_each_entry(entry, &inetsw[p->type], list) {
+ 		/* Only permanent entries matter here. */
+ 		if (!(INET_PROTOSW_PERMANENT & entry->flags))
+ 			continue;
+
+ 		/* If we are trying to override a permanent protocol, bail. */
+ 		if (protocol == entry->protocol) {
+ 			pr_err("Attempt to override permanent protocol %d\n",
+ 			       protocol);
+ 			goto unlock;
+ 		}
+ 		insert_after = &entry->list;
+ 	}
+
+ 	list_add_rcu(&p->list, insert_after);
+
+ unlock:
+ 	spin_unlock_bh(&inetsw_lock);
+ }
+ EXPORT_SYMBOL(inet_register_protosw);
+
+ /*
+  * Remove @p from its inetsw[] list.
+  *
+  * Permanent entries cannot be removed; the attempt is only logged.
+  * After unlinking, synchronize_net() is called before returning.
+  */
+ void inet_unregister_protosw(struct inet_protosw *p)
+ {
+ 	if (INET_PROTOSW_PERMANENT & p->flags) {
+ 		pr_err("Attempt to unregister permanent protocol %d\n",
+ 		       p->protocol);
+ 		return;
+ 	}
+
+ 	spin_lock_bh(&inetsw_lock);
+ 	list_del_rcu(&p->list);
+ 	spin_unlock_bh(&inetsw_lock);
+
+ 	synchronize_net();
+ }
+ EXPORT_SYMBOL(inet_unregister_protosw);
+
+/*
+ * Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr __read_mostly;
+
+ /*
+ * Re-route the connection and adopt whatever source address the new
+ * route selects.
+ *
+ * If the route lookup fails its error is returned. If the selected
+ * source address equals the current one nothing else changes. Otherwise
+ * inet_saddr and inet_rcv_saddr are replaced and the socket is rehashed.
+ * Returns 0 on success. The rcu_dereference_protected() condition
+ * indicates the caller is expected to own the socket lock.
+ */
+ static int inet_sk_reselect_saddr(struct sock *sk)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+ __be32 old_saddr = inet->inet_saddr;
+ __be32 daddr = inet->inet_daddr;
+ struct flowi4 *fl4;
+ struct rtable *rt;
+ __be32 new_saddr;
+ struct ip_options_rcu *inet_opt;
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ /* With a source-route option, route towards its faddr instead. */
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+
+ /* Query new route. */
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if, sk->sk_protocol,
+ inet->inet_sport, inet->inet_dport, sk, false);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ sk_setup_caps(sk, &rt->dst);
+
+ new_saddr = fl4->saddr;
+
+ if (new_saddr == old_saddr)
+ return 0;
+
+ /* Values above 1 additionally log the address change. */
+ if (sysctl_ip_dynaddr > 1) {
+ pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
+ }
+
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
+
+ /*
+ * XXX The only one ugly spot where we need to
+ * XXX really change the sockets identity after
+ * XXX it has entered the hashes. -DaveM
+ *
+ * Besides that, it does not check for connection
+ * uniqueness. Wait for troubles.
+ */
+ __sk_prot_rehash(sk);
+ return 0;
+ }
+
+ /*
+ * Make sure the socket has a usable route, looking one up if needed.
+ *
+ * Returns 0 immediately when __sk_dst_check() still yields a route.
+ * Otherwise a new output route is looked up; on success the socket's
+ * capabilities are refreshed and 0 is returned. On failure
+ * sk_route_caps is cleared and, if sysctl_ip_dynaddr is set, the socket
+ * is in TCP_SYN_SENT and the source address is not locked by bind, a new
+ * source address is tried through inet_sk_reselect_saddr(). Unless that
+ * reselection succeeds, the error is recorded in sk_err_soft. Returns
+ * the resulting error (0 if the reselection succeeded).
+ */
+ int inet_sk_rebuild_header(struct sock *sk)
+ {
+ struct inet_sock *inet = inet_sk(sk);
+ struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+ __be32 daddr;
+ struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4;
+ int err;
+
+ /* Route is OK, nothing to do. */
+ if (rt)
+ return 0;
+
+ /* Reroute. */
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
+ daddr = inet->inet_daddr;
+ /* With a source-route option, route towards its faddr instead. */
+ if (inet_opt && inet_opt->opt.srr)
+ daddr = inet_opt->opt.faddr;
+ rcu_read_unlock();
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
+ inet->inet_dport, inet->inet_sport,
+ sk->sk_protocol, RT_CONN_FLAGS(sk),
+ sk->sk_bound_dev_if);
+ if (!IS_ERR(rt)) {
+ err = 0;
+ sk_setup_caps(sk, &rt->dst);
+ } else {
+ err = PTR_ERR(rt);
+
+ /* Routing failed... */
+ sk->sk_route_caps = 0;
+ /*
+ * Other protocols have to map its equivalent state to TCP_SYN_SENT.
+ * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
+ */
+ /* The last operand overwrites err with the reselection result. */
+ if (!sysctl_ip_dynaddr ||
+ sk->sk_state != TCP_SYN_SENT ||
+ (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+ (err = inet_sk_reselect_saddr(sk)) != 0)
+ sk->sk_err_soft = -err;
+ }
+
+ return err;
+ }
+ EXPORT_SYMBOL(inet_sk_rebuild_header);
+
+/*
+ * GSO pre-send hook for IPv4: validate and pull the IP header, then hand
+ * the packet to the transport protocol's gso_send_check handler.
+ *
+ * Returns the handler's result, -EINVAL for a short or malformed header,
+ * or -EPROTONOSUPPORT when the protocol has no such handler.
+ */
+static int inet_gso_send_check(struct sk_buff *skb)
+{
+	const struct net_protocol *ops;
+	const struct iphdr *iph;
+	int proto;
+	int ihl;
+	int err;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		return -EINVAL;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		return -EINVAL;
+
+	if (unlikely(!pskb_may_pull(skb, ihl)))
+		return -EINVAL;
+
+	__skb_pull(skb, ihl);
+	skb_reset_transport_header(skb);
+	/* Re-read the header pointer after the second pskb_may_pull(). */
+	iph = ip_hdr(skb);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+	err = -EPROTONOSUPPORT;
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (likely(ops && ops->gso_send_check))
+		err = ops->gso_send_check(skb);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * GSO segmentation hook for IPv4.
+ *
+ * Pulls the IP header, lets the transport protocol's gso_segment handler
+ * split the payload, then rewrites id, fragment fields, total length and
+ * checksum in the IP header of every resulting segment.
+ *
+ * Returns the segment list, NULL if the handler returned NULL, or an
+ * ERR_PTR: -EINVAL for unsupported GSO types or a bad header,
+ * -EPROTONOSUPPORT when the protocol has no handler.
+ */
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+					netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_protocol *ops;
+	unsigned int offset = 0;
+	struct iphdr *iph;
+	int proto;
+	int ihl;
+	int id;
+
+	/* Without IPv4 checksum offload, scatter-gather is dropped too. */
+	if (!(features & NETIF_F_V4_CSUM))
+		features &= ~NETIF_F_SG;
+
+	if (unlikely(skb_shinfo(skb)->gso_type &
+		     ~(SKB_GSO_TCPV4 |
+		       SKB_GSO_UDP |
+		       SKB_GSO_DODGY |
+		       SKB_GSO_TCP_ECN |
+		       0)))
+		return segs;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		return segs;
+
+	iph = ip_hdr(skb);
+	ihl = iph->ihl * 4;
+	if (ihl < sizeof(*iph))
+		return segs;
+
+	if (unlikely(!pskb_may_pull(skb, ihl)))
+		return segs;
+
+	__skb_pull(skb, ihl);
+	skb_reset_transport_header(skb);
+	iph = ip_hdr(skb);
+	id = ntohs(iph->id);
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (likely(ops && ops->gso_segment))
+		segs = ops->gso_segment(skb, features);
+	rcu_read_unlock();
+
+	if (!segs || IS_ERR(segs))
+		return segs;
+
+	for (skb = segs; skb; skb = skb->next) {
+		iph = ip_hdr(skb);
+		if (proto == IPPROTO_UDP) {
+			/*
+			 * UDP segments share one id; frag_off carries the
+			 * running payload offset (>> 3) and IP_MF is set on
+			 * every segment but the last.
+			 */
+			iph->id = htons(id);
+			iph->frag_off = htons(offset >> 3);
+			if (skb->next != NULL)
+				iph->frag_off |= htons(IP_MF);
+			offset += (skb->len - skb->mac_len - iph->ihl * 4);
+		} else {
+			/* Other protocols get consecutive ids. */
+			iph->id = htons(id);
+			id++;
+		}
+		iph->tot_len = htons(skb->len - skb->mac_len);
+		iph->check = 0;
+		iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
+	}
+
+	return segs;
+}
+
+/*
+ * GRO receive hook for IPv4.
+ *
+ * Validates the IP header of @skb, compares it with the packets held on
+ * @head to update their same_flow/flush state, then passes @skb on to the
+ * transport protocol's gro_receive handler.
+ *
+ * Returns whatever the transport handler returns, or NULL when the packet
+ * is not handed to one.  NAPI_GRO_CB(skb)->flush is OR-ed with the local
+ * flush verdict on every path.
+ */
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	const struct net_protocol *ops;
+	const struct iphdr *iph;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *held;
+	unsigned int hlen;
+	unsigned int off;
+	unsigned int id;
+	int flush = 1;
+	int proto;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+	iph = skb_gro_header_fast(skb, off);
+	if (skb_gro_header_hard(skb, hlen)) {
+		iph = skb_gro_header_slow(skb, hlen, off);
+		if (unlikely(!iph))
+			goto out;
+	}
+
+	proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (!ops || !ops->gro_receive)
+		goto out_unlock;
+
+	/* First header byte must be 0x45: version 4, ihl 5 (no options). */
+	if (*(u8 *)iph != 0x45)
+		goto out_unlock;
+
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto out_unlock;
+
+	/*
+	 * Read id and the following 16 bits as one 32-bit word.  The low
+	 * 16 bits of the flush value are non-zero when the length field
+	 * differs from the GRO length or the word after id is not exactly
+	 * IP_DF.
+	 */
+	id = ntohl(*(__be32 *)&iph->id);
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+	id >>= 16;
+
+	for (held = *head; held; held = held->next) {
+		struct iphdr *held_iph;
+
+		if (!NAPI_GRO_CB(held)->same_flow)
+			continue;
+
+		held_iph = ip_hdr(held);
+
+		if ((iph->protocol ^ held_iph->protocol) |
+		    (iph->tos ^ held_iph->tos) |
+		    ((__force u32)iph->saddr ^ (__force u32)held_iph->saddr) |
+		    ((__force u32)iph->daddr ^ (__force u32)held_iph->daddr)) {
+			NAPI_GRO_CB(held)->same_flow = 0;
+			continue;
+		}
+
+		/* All fields must match except length and checksum. */
+		NAPI_GRO_CB(held)->flush |=
+			(iph->ttl ^ held_iph->ttl) |
+			((u16)(ntohs(held_iph->id) + NAPI_GRO_CB(held)->count) ^ id);
+
+		NAPI_GRO_CB(held)->flush |= flush;
+	}
+
+	NAPI_GRO_CB(skb)->flush |= flush;
+	skb_gro_pull(skb, sizeof(*iph));
+	skb_set_transport_header(skb, skb_gro_offset(skb));
+
+	pp = ops->gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+/*
+ * GRO completion hook for IPv4: patch tot_len (and the header checksum
+ * incrementally) to the merged length, then let the transport protocol
+ * finish the packet.
+ *
+ * Returns the handler's result, or -ENOSYS (after a WARN) when the
+ * protocol has no gro_complete handler.
+ */
+static int inet_gro_complete(struct sk_buff *skb)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	__be16 newlen = htons(skb->len - skb_network_offset(skb));
+	int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+	const struct net_protocol *ops;
+	int err = -ENOSYS;
+
+	csum_replace2(&iph->check, iph->tot_len, newlen);
+	iph->tot_len = newlen;
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_protos[proto]);
+	if (!WARN_ON(!ops || !ops->gro_complete))
+		err = ops->gro_complete(skb);
+	rcu_read_unlock();
+
+	return err;
+}
+
+/*
+ * Create a kernel-internal control socket and move it into @net.
+ *
+ * On success *@sk points at the new sock, which uses GFP_ATOMIC
+ * allocations and has been unhashed.  Returns 0 or the error from
+ * sock_create_kern(); *@sk is left untouched on failure.
+ */
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+			 unsigned short type, unsigned char protocol,
+			 struct net *net)
+{
+	struct socket *sock;
+	struct sock *ctl_sk;
+	int rc;
+
+	rc = sock_create_kern(family, type, protocol, &sock);
+	if (rc != 0)
+		return rc;
+
+	ctl_sk = sock->sk;
+	*sk = ctl_sk;
+	ctl_sk->sk_allocation = GFP_ATOMIC;
+	/*
+	 * Unhash it so that IP input processing does not even see it,
+	 * we do not wish this socket to see incoming packets.
+	 */
+	ctl_sk->sk_prot->unhash(ctl_sk);
+
+	sk_change_net(ctl_sk, net);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+/*
+ * Sum one unsigned long counter, at index @offt, over every possible CPU
+ * and every one of the SNMP_ARRAY_SZ per-cpu areas in @mib.
+ */
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int idx;
+
+		for (idx = 0; idx < SNMP_ARRAY_SZ; idx++) {
+			unsigned long *counters;
+
+			counters = (unsigned long *) per_cpu_ptr(mib[idx], cpu);
+			sum += *(counters + offt);
+		}
+	}
+	return sum;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field);
+
+#if BITS_PER_LONG==32
+
+/*
+ * Sum one u64 counter, at index @offt, over every possible CPU.  Only
+ * mib[0] is read.  @syncp_offset is the byte offset of the
+ * struct u64_stats_sync inside each per-cpu area; each read is retried
+ * until u64_stats_fetch_retry_bh() reports it was not disturbed.
+ */
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 total = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		void *stats = per_cpu_ptr(mib[0], cpu);
+		struct u64_stats_sync *syncp =
+			(struct u64_stats_sync *)(stats + syncp_offset);
+		unsigned int seq;
+		u64 val;
+
+		do {
+			seq = u64_stats_fetch_begin_bh(syncp);
+			val = *(((u64 *) stats) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, seq));
+
+		total += val;
+	}
+	return total;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
+/*
+ * Allocate the per-cpu counter areas of one MIB: ptr[0] always, ptr[1]
+ * only when SNMP_ARRAY_SZ is 2.
+ *
+ * Returns 0 on success or -ENOMEM.  If the second allocation fails the
+ * first one is freed and ptr[0] is reset to NULL.
+ */
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
+{
+	BUG_ON(ptr == NULL);
+
+	ptr[0] = __alloc_percpu(mibsize, align);
+	if (ptr[0] == NULL)
+		return -ENOMEM;
+
+#if SNMP_ARRAY_SZ == 2
+	ptr[1] = __alloc_percpu(mibsize, align);
+	if (ptr[1] != NULL)
+		return 0;
+
+	/* Second area failed: undo the first allocation. */
+	free_percpu(ptr[0]);
+	ptr[0] = NULL;
+	return -ENOMEM;
+#else
+	return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(snmp_mib_init);
+
+/*
+ * Free every per-cpu area of a MIB and reset the slots to NULL.
+ */
+void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
+{
+	int idx = 0;
+
+	BUG_ON(ptr == NULL);
+	while (idx < SNMP_ARRAY_SZ) {
+		free_percpu(ptr[idx]);
+		ptr[idx] = NULL;
+		idx++;
+	}
+}
+EXPORT_SYMBOL_GPL(snmp_mib_free);
+
+#ifdef CONFIG_IP_MULTICAST
+/* IGMP input hook; inet_init() registers it for IPPROTO_IGMP. */
+static const struct net_protocol igmp_protocol = {
+	.netns_ok =	1,
+	.handler =	igmp_rcv,
+};
+#endif
+
+/*
+ * TCP hooks (input, ICMP error, GSO and GRO); inet_init() registers them
+ * for IPPROTO_TCP.
+ */
+static const struct net_protocol tcp_protocol = {
+	.no_policy =		1,
+	.netns_ok =		1,
+	.handler =		tcp_v4_rcv,
+	.err_handler =		tcp_v4_err,
+	.gso_send_check =	tcp_v4_gso_send_check,
+	.gso_segment =		tcp_tso_segment,
+	.gro_receive =		tcp4_gro_receive,
+	.gro_complete =		tcp4_gro_complete,
+};
+
+/*
+ * UDP hooks (input, ICMP error and GSO); inet_init() registers them for
+ * IPPROTO_UDP.  No GRO handlers are provided.
+ */
+static const struct net_protocol udp_protocol = {
+	.no_policy =		1,
+	.netns_ok =		1,
+	.handler =		udp_rcv,
+	.err_handler =		udp_err,
+	.gso_send_check =	udp4_ufo_send_check,
+	.gso_segment =		udp4_ufo_fragment,
+};
+
+/*
+ * ICMP hooks; inet_init() registers them for IPPROTO_ICMP.  Errors go to
+ * ping_v4_err.
+ */
+static const struct net_protocol icmp_protocol = {
+	.no_policy =	1,
+	.netns_ok =	1,
+	.handler =	icmp_rcv,
+	.err_handler =	ping_v4_err,
+};
+
+/*
+ * Per-namespace init: allocate all IPv4 MIB counter areas for @net.
+ *
+ * Returns 0 on success.  On any allocation failure, everything allocated
+ * so far is released in reverse order and -ENOMEM is returned.
+ */
+static __net_init int ipv4_mib_init_net(struct net *net)
+{
+/* Allocate the per-cpu areas of one net->mib member of the given type. */
+#define IPV4_MIB_INIT(member, type)					\
+	snmp_mib_init((void __percpu **)net->mib.member,		\
+		      sizeof(type),					\
+		      __alignof__(type))
+
+	if (IPV4_MIB_INIT(tcp_statistics, struct tcp_mib) < 0)
+		goto err_tcp_mib;
+	if (IPV4_MIB_INIT(ip_statistics, struct ipstats_mib) < 0)
+		goto err_ip_mib;
+	if (IPV4_MIB_INIT(net_statistics, struct linux_mib) < 0)
+		goto err_net_mib;
+	if (IPV4_MIB_INIT(udp_statistics, struct udp_mib) < 0)
+		goto err_udp_mib;
+	if (IPV4_MIB_INIT(udplite_statistics, struct udp_mib) < 0)
+		goto err_udplite_mib;
+	if (IPV4_MIB_INIT(icmp_statistics, struct icmp_mib) < 0)
+		goto err_icmp_mib;
+
+#undef IPV4_MIB_INIT
+
+	/* The ICMP message counters are a plain allocation, not per-cpu. */
+	net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib),
+					      GFP_KERNEL);
+	if (!net->mib.icmpmsg_statistics)
+		goto err_icmpmsg_mib;
+
+	tcp_mib_init(net);
+	return 0;
+
+err_icmpmsg_mib:
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+err_icmp_mib:
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+err_udplite_mib:
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+err_udp_mib:
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+err_net_mib:
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+err_ip_mib:
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+err_tcp_mib:
+	return -ENOMEM;
+}
+
+/*
+ * Per-namespace exit: release every MIB area that ipv4_mib_init_net()
+ * allocated, in reverse order of allocation.
+ */
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+	kfree(net->mib.icmpmsg_statistics);
+
+	snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+	snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+	snmp_mib_free((void __percpu **)net->mib.net_statistics);
+	snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+	snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+}
+
+/* Per-namespace hooks that allocate and free the IPv4 MIB counters. */
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+	.init	= ipv4_mib_init_net,
+	.exit	= ipv4_mib_exit_net,
+};
+
+/* Register the MIB per-namespace hooks; returns the registration result. */
+static int __init init_ipv4_mibs(void)
+{
+	return register_pernet_subsys(&ipv4_mib_ops);
+}
+
+static int ipv4_proc_init(void);
+
+/*
+ * IP protocol layer initialiser
+ */
+
+/*
+ * Packet handler for ETH_P_IP frames: ip_rcv plus the GSO/GRO hooks
+ * defined in this file.  inet_init() installs it with dev_add_pack().
+ */
+static struct packet_type ip_packet_type __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_IP),
+	.func		= ip_rcv,
+	.gso_send_check	= inet_gso_send_check,
+	.gso_segment	= inet_gso_segment,
+	.gro_receive	= inet_gro_receive,
+	.gro_complete	= inet_gro_complete,
+};
+
+/*
+ * Boot-time initialisation of the IPv4 stack.
+ *
+ * Registers the tcp/udp/raw/ping protos (the only steps that can make
+ * this function fail), then the socket family, the base protocols, the
+ * inetsw tables, the ARP/IP/TCP/UDP/UDP-Lite/ping/ICMP layers, the MIBs,
+ * the proc entries, fragment handling and finally the IP packet handler.
+ *
+ * Returns 0 on success, -EINVAL if the reserved-ports allocation fails,
+ * or the error from a failing proto_register().
+ */
+static int __init inet_init(void)
+{
+	struct sk_buff *dummy_skb;
+	struct inet_protosw *q;
+	struct list_head *r;
+	int rc;
+
+	/* inet_skb_parm must fit in the skb control buffer. */
+	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+
+	/* 65536 / 8 bytes - presumably one bit per 16-bit port number. */
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		return -EINVAL;
+
+	rc = proto_register(&tcp_prot, 1);
+	if (rc)
+		goto out_free_reserved_ports;
+
+	rc = proto_register(&udp_prot, 1);
+	if (rc)
+		goto out_unregister_tcp_proto;
+
+	rc = proto_register(&raw_prot, 1);
+	if (rc)
+		goto out_unregister_udp_proto;
+
+	rc = proto_register(&ping_prot, 1);
+	if (rc)
+		goto out_unregister_raw_proto;
+
+	/*
+	 *	Tell SOCKET that we are alive...
+	 */
+
+	(void)sock_register(&inet_family_ops);
+
+#ifdef CONFIG_SYSCTL
+	ip_static_sysctl_init();
+#endif
+
+	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
+
+	/*
+	 *	Add all the base protocols.  Failures are logged only.
+	 */
+
+	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
+	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+		pr_crit("%s: Cannot add UDP protocol\n", __func__);
+	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+		pr_crit("%s: Cannot add TCP protocol\n", __func__);
+#ifdef CONFIG_IP_MULTICAST
+	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
+#endif
+
+	/* Register the socket-side information for inet_create. */
+	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+		inet_register_protosw(q);
+
+	/*
+	 *	Set the ARP module up
+	 */
+
+	arp_init();
+
+	/*
+	 *	Set the IP module up
+	 */
+
+	ip_init();
+
+	tcp_v4_init();
+
+	/* Setup TCP slab cache for open requests. */
+	tcp_init();
+
+	/* Setup UDP memory threshold */
+	udp_init();
+
+	/* Add UDP-Lite (RFC 3828) */
+	udplite4_register();
+
+	ping_init();
+
+	/*
+	 *	Set the ICMP layer up
+	 */
+
+	if (icmp_init() < 0)
+		panic("Failed to create the ICMP control socket.\n");
+
+	/*
+	 *	Initialise the multicast router
+	 */
+#if defined(CONFIG_IP_MROUTE)
+	if (ip_mr_init())
+		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
+#endif
+	/*
+	 *	Initialise per-cpu ipv4 mibs
+	 */
+
+	if (init_ipv4_mibs())
+		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
+
+	ipv4_proc_init();
+
+	ipfrag_init();
+
+	dev_add_pack(&ip_packet_type);
+
+	return 0;
+
+	/* Unwind the proto registrations in reverse order. */
+out_unregister_raw_proto:
+	proto_unregister(&raw_prot);
+out_unregister_udp_proto:
+	proto_unregister(&udp_prot);
+out_unregister_tcp_proto:
+	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
+	return rc;
+}
+
+fs_initcall(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the raw, tcp4, udp4, ping and misc proc entries.
+ *
+ * Returns 0 on success.  If any step fails, the entries created before
+ * it are removed again and -ENOMEM is returned.
+ */
+static int __init ipv4_proc_init(void)
+{
+	if (raw_proc_init())
+		goto out_raw;
+	if (tcp4_proc_init())
+		goto out_tcp;
+	if (udp4_proc_init())
+		goto out_udp;
+	if (ping_proc_init())
+		goto out_ping;
+	if (ip_misc_proc_init())
+		goto out_misc;
+
+	return 0;
+
+out_misc:
+	ping_proc_exit();
+out_ping:
+	udp4_proc_exit();
+out_udp:
+	tcp4_proc_exit();
+out_tcp:
+	raw_proc_exit();
+out_raw:
+	return -ENOMEM;
+}
+
+#else /* CONFIG_PROC_FS */
+/* Without CONFIG_PROC_FS there is nothing to create; always succeeds. */
+static int __init ipv4_proc_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 00000000..fd508b52
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,538 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
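+#include <crypto/hash.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/scatterlist.h>
+#include <net/icmp.h>
+#include <net/protocol.h>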
+
+/*
+ * AH's view of the skb control buffer: the xfrm control block followed
+ * by a pointer to the scratch buffer from ah_alloc_tmp(), so that the
+ * async completion callbacks can find and free it.
+ */
+struct ah_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+/* Access skb->cb as a struct ah_skb_cb. */
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate one scratch buffer that holds, in this order: @size bytes of
+ * caller data, the ICV (with alignment slack), the ahash request with
+ * its context, and @nfrags scatterlist entries.
+ *
+ * Uses GFP_ATOMIC.  Returns the buffer or NULL on allocation failure.
+ */
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+			  unsigned int size)
+{
+	unsigned int len = size;
+
+	/* ICV, plus whatever the hash alignment needs beyond ctx alignment. */
+	len += crypto_ahash_digestsize(ahash);
+	len += crypto_ahash_alignmask(ahash) &
+	       ~(crypto_tfm_ctx_alignment() - 1);
+	len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+	/* Request structure and its per-transform context. */
+	len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	/* Scatterlist array. */
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+/* Return the address @offset bytes into the scratch buffer @tmp. */
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+	return (u8 *)tmp + offset;
+}
+
+/*
+ * Return the ICV slot: @offset bytes into @tmp, rounded up to the hash's
+ * alignment (alignmask + 1).
+ */
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+			     unsigned int offset)
+{
+	u8 *unaligned = (u8 *)tmp + offset;
+
+	return PTR_ALIGN(unaligned, crypto_ahash_alignmask(ahash) + 1);
+}
+
+/*
+ * Return the ahash request that follows the ICV in the scratch buffer,
+ * aligned to the crypto context alignment and bound to @ahash.
+ */
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+					       u8 *icv)
+{
+	u8 *icv_end = icv + crypto_ahash_digestsize(ahash);
+	struct ahash_request *req;
+
+	req = (void *)PTR_ALIGN(icv_end, crypto_tfm_ctx_alignment());
+	ahash_request_set_tfm(req, ahash);
+
+	return req;
+}
+
+/*
+ * Return the scatterlist array that follows the request and its context
+ * in the scratch buffer, aligned for struct scatterlist.
+ */
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+					     struct ahash_request *req)
+{
+	unsigned long ctx_end;
+
+	ctx_end = (unsigned long)(req + 1) + crypto_ahash_reqsize(ahash);
+
+	return (void *)ALIGN(ctx_end, __alignof__(struct scatterlist));
+}
+
+/*
+ * Zero the mutable IP options in place and, for a source-route option,
+ * copy its last address into *@daddr so the caller can use the final
+ * destination for the ICV calculation.  The options listed in the switch
+ * below are left untouched.
+ *
+ * Options are already checked for validity, so paranoia is not required.
+ *
+ * Returns 0 on success, -EINVAL on an inconsistent option length.
+ */
+static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
+{
+	unsigned char *opt = (unsigned char *)(iph + 1);
+	int remaining = iph->ihl*4 - sizeof(struct iphdr);
+
+	while (remaining > 0) {
+		int optlen;
+
+		/* The two single-byte options carry no length field. */
+		if (*opt == IPOPT_END)
+			return 0;
+		if (*opt == IPOPT_NOOP) {
+			remaining--;
+			opt++;
+			continue;
+		}
+
+		optlen = opt[1];
+		if (optlen < 2 || optlen > remaining)
+			return -EINVAL;
+
+		switch (*opt) {
+		case IPOPT_SEC:
+		case 0x85:	/* Some "Extended Security" crap. */
+		case IPOPT_CIPSO:
+		case IPOPT_RA:
+		case 0x80|21:	/* RFC1770 */
+			/* Kept as is. */
+			break;
+		case IPOPT_LSRR:
+		case IPOPT_SSRR:
+			if (optlen < 6)
+				return -EINVAL;
+			memcpy(daddr, opt + optlen - 4, 4);
+			/* Fall through */
+		default:
+			memset(opt, 0, optlen);
+		}
+
+		remaining -= optlen;
+		opt += optlen;
+	}
+	return 0;
+}
+
+/*
+ * Completion callback for the asynchronous digest started by ah_output():
+ * copy the ICV into the AH header, restore the IP header fields that were
+ * cleared for hashing, free the scratch buffer and resume xfrm output
+ * with @err.
+ */
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+	struct ah_data *ahp = x->data;
+	struct iphdr *top_iph = ip_hdr(skb);
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	struct iphdr *saved_iph = AH_SKB_CB(skb)->tmp;
+	int ihl = ip_hdrlen(skb);
+	u8 *icv = ah_tmp_icv(ahp->ahash, saved_iph, ihl);
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	top_iph->tos = saved_iph->tos;
+	top_iph->ttl = saved_iph->ttl;
+	top_iph->frag_off = saved_iph->frag_off;
+	if (top_iph->ihl != 5) {
+		/* Options were present: restore them and the destination. */
+		top_iph->daddr = saved_iph->daddr;
+		memcpy(top_iph+1, saved_iph+1,
+		       top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	kfree(saved_iph);
+	xfrm_output_resume(skb, err);
+}
+
+/*
+ * xfrm output handler for AH: fill in the AH header and compute the ICV
+ * over the packet with the mutable IP header fields zeroed.
+ *
+ * The original tos/ttl/frag_off (and options plus daddr, when options
+ * are present) are saved in a scratch buffer and restored after hashing.
+ *
+ * Returns 0 on success, -EINPROGRESS when the digest completes
+ * asynchronously (ah_output_done() then finishes the job and owns the
+ * scratch buffer), NET_XMIT_DROP when the digest returns -EBUSY, or a
+ * negative errno.
+ */
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ah_data *ahp = x->data;
+	struct crypto_ahash *ahash = ahp->ahash;
+	struct iphdr *iph, *top_iph;
+	struct ahash_request *req;
+	struct ip_auth_hdr *ah;
+	struct sk_buff *trailer;
+	struct scatterlist *sg;
+	int nfrags;
+	int ihl;
+	int err;
+	u8 *icv;
+
+	/* On success skb_cow_data() returns the sg entry count we need. */
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+	nfrags = err;
+
+	skb_push(skb, -skb_network_offset(skb));
+	ah = ip_auth_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	err = -ENOMEM;
+	iph = ah_alloc_tmp(ahash, nfrags, ihl);
+	if (!iph)
+		goto out;
+
+	icv = ah_tmp_icv(ahash, iph, ihl);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	top_iph = ip_hdr(skb);
+
+	/* Save the fields that are about to be cleared. */
+	iph->tos = top_iph->tos;
+	iph->ttl = top_iph->ttl;
+	iph->frag_off = top_iph->frag_off;
+
+	if (top_iph->ihl != 5) {
+		iph->daddr = top_iph->daddr;
+		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+		if (err)
+			goto out_free;
+	}
+
+	/* Swap the next-protocol byte: AH records the old value. */
+	ah->nexthdr = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_AH;
+
+	top_iph->tos = 0;
+	top_iph->tot_len = htons(skb->len);
+	top_iph->frag_off = 0;
+	top_iph->ttl = 0;
+	top_iph->check = 0;
+
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		ah->hdrlen  = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+	else
+		ah->hdrlen  = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+
+	ah->reserved = 0;
+	ah->spi = x->id.spi;
+	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+	/* Let the completion callback find the scratch buffer. */
+	AH_SKB_CB(skb)->tmp = iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async: the callback frees the scratch buffer. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		if (err == -EBUSY)
+			err = NET_XMIT_DROP;
+		goto out_free;
+	}
+
+	memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+	/* Restore the saved header fields. */
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+out_free:
+	kfree(iph);
+out:
+	return err;
+}
+
+/*
+ * Completion callback for the asynchronous digest started by ah_input().
+ *
+ * @err is the digest result.  If the digest failed, the ICV buffer holds
+ * no meaningful value, so the error is passed straight on to
+ * xfrm_input_resume() instead of being replaced by the result of an ICV
+ * comparison.  Otherwise the computed ICV is compared with the received
+ * one; on a match the AH header is removed, the saved IP header is put
+ * back in front of the payload and the next-header value is passed on,
+ * on a mismatch -EBADMSG is.
+ *
+ * The scratch buffer is freed on every path.
+ */
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+	u8 *auth_data;
+	u8 *icv;
+	struct iphdr *work_iph;
+	struct sk_buff *skb = base->data;
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ah_data *ahp = x->data;
+	struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+	int ihl = ip_hdrlen(skb);
+	int ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/* Do not compare against an ICV the crypto layer never produced. */
+	if (err)
+		goto out;
+
+	work_iph = AH_SKB_CB(skb)->tmp;
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out;
+
+	/* Success is reported as the next-header value. */
+	err = ah->nexthdr;
+
+	/* Drop the AH header and restore the original IP header. */
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+out:
+	kfree(AH_SKB_CB(skb)->tmp);
+	xfrm_input_resume(skb, err);
+}
+
+/*
+ * xfrm input handler for AH: verify the ICV of a received packet and
+ * strip the AH header.
+ *
+ * The IP header and the received ICV are saved in a scratch buffer, the
+ * mutable header fields and the ICV field are zeroed, and the digest is
+ * computed over the whole packet.
+ *
+ * Returns the next-header value on success, -EINPROGRESS when the digest
+ * completes asynchronously (ah_input_done() then finishes and owns the
+ * scratch buffer), -EBADMSG on ICV mismatch, or another negative errno.
+ * NOTE(review): header-length validation failures return the initial
+ * -ENOMEM, not a dedicated error code.
+ */
+static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph, *work_iph;
+	struct crypto_ahash *ahash;
+	struct ahash_request *req;
+	struct ip_auth_hdr *ah;
+	struct sk_buff *trailer;
+	struct scatterlist *sg;
+	struct ah_data *ahp;
+	int err = -ENOMEM;
+	u8 *auth_data;
+	int ah_hlen;
+	int nexthdr;
+	int nfrags;
+	u8 *icv;
+	int ihl;
+
+	if (!pskb_may_pull(skb, sizeof(*ah)))
+		goto out;
+
+	ah = (struct ip_auth_hdr *)skb->data;
+	ahp = x->data;
+	ahash = ahp->ahash;
+
+	nexthdr = ah->nexthdr;
+	ah_hlen = (ah->hdrlen + 2) << 2;
+
+	/* Header length must match the full or the truncated ICV size. */
+	if (x->props.flags & XFRM_STATE_ALIGN4) {
+		if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	} else {
+		if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+		    ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+			goto out;
+	}
+
+	if (!pskb_may_pull(skb, ah_hlen))
+		goto out;
+
+	/* We are going to _remove_ AH header to keep sockets happy,
+	 * so... Later this can change. */
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* On success skb_cow_data() returns the sg entry count we need. */
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+	nfrags = err;
+
+	/* Re-read the pointers after the calls above. */
+	ah = (struct ip_auth_hdr *)skb->data;
+	iph = ip_hdr(skb);
+	ihl = ip_hdrlen(skb);
+
+	work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+	if (!work_iph)
+		goto out;
+
+	auth_data = ah_tmp_auth(work_iph, ihl);
+	icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+	req = ah_tmp_req(ahash, icv);
+	sg = ah_req_sg(ahash, req);
+
+	/* Save the IP header and received ICV, then blank the ICV field. */
+	memcpy(work_iph, iph, ihl);
+	memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+	memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
+	iph->ttl = 0;
+	iph->tos = 0;
+	iph->frag_off = 0;
+	iph->check = 0;
+	if (ihl > sizeof(*iph)) {
+		__be32 dummy;
+		err = ip_clear_mutable_options(iph, &dummy);
+		if (err)
+			goto out_free;
+	}
+
+	/* Include the IP header in the hashed data. */
+	skb_push(skb, ihl);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
+
+	ahash_request_set_crypt(req, sg, icv, skb->len);
+	ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+	/* Let the completion callback find the scratch buffer. */
+	AH_SKB_CB(skb)->tmp = work_iph;
+
+	err = crypto_ahash_digest(req);
+	if (err) {
+		/* Async: the callback frees the scratch buffer. */
+		if (err == -EINPROGRESS)
+			goto out;
+
+		goto out_free;
+	}
+
+	err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+	if (err)
+		goto out_free;
+
+	/* Drop the AH header and restore the original IP header. */
+	skb->network_header += ah_hlen;
+	memcpy(skb_network_header(skb), work_iph, ihl);
+	__skb_pull(skb, ah_hlen + ihl);
+	skb_set_transport_header(skb, -ihl);
+
+	err = nexthdr;
+
+out_free:
+	kfree (work_iph);
+out:
+	return err;
+}
+
+/*
+ * ICMP error handler for AH.  Only "destination unreachable /
+ * fragmentation needed" is looked at: if an SA matches the offending
+ * packet's daddr and SPI, a debug line is logged.  Nothing else is done.
+ */
+static void ah4_err(struct sk_buff *skb, u32 info)
+{
+	struct net *net = dev_net(skb->dev);
+	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	struct ip_auth_hdr *ah;
+	struct xfrm_state *x;
+
+	if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+	    icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+		return;
+
+	/* The AH header follows the embedded IP header. */
+	ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+
+	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+			      ah->spi, IPPROTO_AH, AF_INET);
+	if (!x)
+		return;
+
+	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
+	       ntohl(ah->spi), ntohl(iph->daddr));
+	xfrm_state_put(x);
+}
+
+/*
+ * xfrm init_state hook for AH: allocate the hash transform, set its key,
+ * record the ICV lengths and compute the header length for this SA.
+ *
+ * Returns 0 on success with x->data pointing at the new ah_data,
+ * -ENOMEM if the ah_data allocation fails, and -EINVAL for every other
+ * failure (no auth algorithm, encapsulation requested, transform
+ * allocation, setkey or digest-size mismatch).
+ */
+static int ah_init_state(struct xfrm_state *x)
+{
+	struct xfrm_algo_desc *aalg_desc;
+	struct crypto_ahash *ahash;
+	struct ah_data *ahp;
+
+	/* AH needs an auth algorithm and is rejected with encapsulation. */
+	if (!x->aalg || x->encap)
+		return -EINVAL;
+
+	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
+	if (!ahp)
+		return -ENOMEM;
+
+	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+	if (IS_ERR(ahash))
+		goto error;
+
+	ahp->ahash = ahash;
+	/* alg_key_len is in bits; round up to whole bytes. */
+	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+				(x->aalg->alg_key_len + 7) / 8))
+		goto error;
+
+	/*
+	 * Lookup the algorithm description maintained by xfrm_algo,
+	 * verify crypto transform properties, and store information
+	 * we need for AH processing.  This lookup cannot fail here
+	 * after a successful crypto_alloc_ahash().
+	 */
+	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+	BUG_ON(!aalg_desc);
+
+	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+	    crypto_ahash_digestsize(ahash)) {
+		pr_info("%s: %s digestsize %u != %hu\n",
+			__func__, x->aalg->alg_name,
+			crypto_ahash_digestsize(ahash),
+			aalg_desc->uinfo.auth.icv_fullbits / 8);
+		goto error;
+	}
+
+	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+	ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
+
+	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+
+	if (x->props.flags & XFRM_STATE_ALIGN4)
+		x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	else
+		x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+						  ahp->icv_trunc_len);
+	/* Tunnel mode adds an outer IP header. */
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	x->data = ahp;
+
+	return 0;
+
+error:
+	/* ahp->ahash is still NULL here if the transform allocation failed. */
+	crypto_free_ahash(ahp->ahash);
+	kfree(ahp);
+	return -EINVAL;
+}
+
+/*
+ * xfrm destructor hook for AH: free the hash transform and the ah_data
+ * attached to the state, if any.
+ */
+static void ah_destroy(struct xfrm_state *x)
+{
+	struct ah_data *ahp = x->data;
+
+	if (ahp) {
+		crypto_free_ahash(ahp->ahash);
+		kfree(ahp);
+	}
+}
+
+
+/*
+ * xfrm type descriptor for IPv4 AH; ah4_init() registers it for AF_INET.
+ */
+static const struct xfrm_type ah_type = {
+	.description	= "AH4",
+	.owner		= THIS_MODULE,
+	.proto		= IPPROTO_AH,
+	.flags		= XFRM_TYPE_REPLAY_PROT,
+	.init_state	= ah_init_state,
+	.destructor	= ah_destroy,
+	.input		= ah_input,
+	.output		= ah_output,
+};
+
+/*
+ * IP protocol hooks for AH: input goes to xfrm4_rcv, ICMP errors to
+ * ah4_err.  ah4_init() registers them for IPPROTO_AH.
+ */
+static const struct net_protocol ah4_protocol = {
+	.no_policy	=	1,
+	.netns_ok	=	1,
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	ah4_err,
+};
+
+/*
+ * Module init: register the AH xfrm type, then the AH IP protocol.
+ *
+ * Returns 0 on success or -EAGAIN if either registration fails; the xfrm
+ * type is unregistered again when the protocol cannot be added.
+ */
+static int __init ah4_init(void)
+{
+	if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+
+	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) >= 0)
+		return 0;
+
+	pr_info("%s: can't add protocol\n", __func__);
+	xfrm_unregister_type(&ah_type, AF_INET);
+	return -EAGAIN;
+}
+
+/*
+ * Module exit: remove the AH IP protocol, then the AH xfrm type.
+ * Failures are logged and otherwise ignored.
+ */
+static void __exit ah4_fini(void)
+{
+	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+		pr_info("%s: can't remove protocol\n", __func__);
+
+	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+		pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 00000000..18d9b81e
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1446 @@
+/* linux/net/ipv4/arp.c
+ *
+ * Copyright (C) 1994 by Florian La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Alan Cox : Removed the Ethernet assumptions in
+ * Florian's code
+ * Alan Cox : Fixed some small errors in the ARP
+ * logic
+ * Alan Cox : Allow >4K in /proc
+ * Alan Cox : Make ARP add its own protocol entry
+ * Ross Martin : Rewrote arp_rcv() and arp_get_info()
+ * Stephen Henson : Add AX25 support to arp_get_info()
+ * Alan Cox : Drop data when a device is downed.
+ * Alan Cox : Use init_timer().
+ * Alan Cox : Double lock fixes.
+ * Martin Seine : Move the arphdr structure
+ * to if_arp.h for compatibility.
+ * with BSD based programs.
+ * Andrew Tridgell : Added ARP netmask code and
+ * re-arranged proxy handling.
+ * Alan Cox : Changed to use notifiers.
+ * Niibe Yutaka : Reply for this device or proxies only.
+ * Alan Cox : Don't proxy across hardware types!
+ * Jonathan Naylor : Added support for NET/ROM.
+ * Mike Shaver : RFC1122 checks.
+ * Jonathan Naylor : Only lookup the hardware address for
+ * the correct hardware type.
+ * Germano Caronni : Assorted subtle races.
+ * Craig Schlenter : Don't modify permanent entry
+ * during arp_rcv.
+ * Russ Nelson : Tidied up a few bits.
+ * Alexey Kuznetsov: Major changes to caching and behaviour,
+ * eg intelligent arp probing and
+ * generation
+ * of host down events.
+ * Alan Cox : Missing unlock in device events.
+ * Eckes : ARP ioctl control errors.
+ * Alexey Kuznetsov: Arp free fix.
+ * Manuel Rodriguez: Gratuitous ARP.
+ * Jonathan Layes : Added arpd support through kerneld
+ * message queue (960314)
+ * Mike Shaver : /proc/sys/net/ipv4/arp_* support
+ * Mike McLagan : Routing by source
+ * Stuart Cheshire : Metricom and grat arp fixes
+ * *** FOR 2.1 clean this up ***
+ * Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ * Alan Cox : Took the AP1000 nasty FDDI hack and
+ * folded into the mainstream FDDI code.
+ * Ack spit, Linus how did you allow that
+ * one in...
+ * Jes Sorensen : Make FDDI work again in 2.1.x and
+ * clean up the APFDDI & gen. FDDI bits.
+ * Alexey Kuznetsov: new arp state machine;
+ * now it is in net/core/neighbour.c.
+ * Krzysztof Halasa: Added Frame Relay ARP support.
+ * Arnaldo C. Melo : convert /proc/net/arp to seq_file
+ * Shmulik Hen: Split arp_send to arp_create and
+ * arp_xmit so intermediate drivers like
+ * bonding can change the skb before
+ * sending (e.g. insert 8021q tag).
+ * Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_SYSCTL
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+/*
+ * Interface to generic neighbour cache.
+ *
+ * Forward declarations of the callbacks referenced by the neigh_ops
+ * tables and by arp_tbl below; their definitions follow later in
+ * this file.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+/*
+ * Ops chosen by arp_constructor() for devices that have header_ops but
+ * no header_ops->cache callback.
+ */
+static const struct neigh_ops arp_generic_ops = {
+	.family			= AF_INET,
+	.solicit		= arp_solicit,
+	.error_report		= arp_error_report,
+	.connected_output	= neigh_connected_output,
+	.output			= neigh_resolve_output,
+};
+
+/*
+ * Ops chosen by arp_constructor() for devices whose header_ops provide
+ * a ->cache callback; both output paths use neigh_resolve_output.
+ */
+static const struct neigh_ops arp_hh_ops = {
+	.family			= AF_INET,
+	.solicit		= arp_solicit,
+	.error_report		= arp_error_report,
+	.connected_output	= neigh_resolve_output,
+	.output			= neigh_resolve_output,
+};
+
+/*
+ * Ops chosen by arp_constructor() for devices without header_ops:
+ * no solicitation or error reporting, packets are output directly.
+ */
+static const struct neigh_ops arp_direct_ops = {
+	.family			= AF_INET,
+	.connected_output	= neigh_direct_output,
+	.output			= neigh_direct_output,
+};
+
+/*
+ * Ops chosen by arp_constructor() for the legacy ROSE/AX.25/NET-ROM
+ * device types, which go through the compatibility output path.
+ */
+static const struct neigh_ops arp_broken_ops = {
+	.family			= AF_INET,
+	.solicit		= arp_solicit,
+	.error_report		= arp_error_report,
+	.connected_output	= neigh_compat_output,
+	.output			= neigh_compat_output,
+};
+
+/*
+ * The IPv4 (ARP) neighbour table.  Keys are 4-byte IPv4 addresses
+ * hashed by arp_hash(); new entries are set up by arp_constructor().
+ * All time values in .parms and .gc_interval are expressed in jiffies
+ * (multiples of HZ).
+ */
+struct neigh_table arp_tbl = {
+	.id		= "arp_cache",
+	.family		= AF_INET,
+	.key_len	= 4,
+	.hash		= arp_hash,
+	.constructor	= arp_constructor,
+	.proxy_redo	= parp_redo,
+	.gc_interval	= 30 * HZ,
+	.gc_thresh1	= 128,
+	.gc_thresh2	= 512,
+	.gc_thresh3	= 1024,
+	/* Default parameters; they point back at this table. */
+	.parms		= {
+		.tbl			= &arp_tbl,
+		.base_reachable_time	= 30 * HZ,
+		.reachable_time		= 30 * HZ,
+		.retrans_time		= 1 * HZ,
+		.gc_staletime		= 60 * HZ,
+		.delay_probe_time	= 5 * HZ,
+		.locktime		= 1 * HZ,
+		.anycast_delay		= 1 * HZ,
+		.proxy_delay		= (8 * HZ) / 10,
+		.proxy_qlen		= 64,
+		.queue_len_bytes	= 64*1024,
+		.ucast_probes		= 3,
+		.mcast_probes		= 3,
+	},
+};
+EXPORT_SYMBOL(arp_tbl);
+
+/*
+ * Map the IPv4 multicast address @addr to a link-layer address in @haddr.
+ *
+ * Device types with a dedicated mapping helper use it.  For every other
+ * type the device broadcast address is copied when @dir is non-zero.
+ * Returns 0 when @haddr was filled in, -EINVAL otherwise.
+ */
+int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+	switch (dev->type) {
+	case ARPHRD_IEEE802_TR:
+		ip_tr_mc_map(addr, haddr);
+		return 0;
+	case ARPHRD_INFINIBAND:
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
+		return 0;
+	case ARPHRD_IPGRE:
+		ip_ipgre_mc_map(addr, dev->broadcast, haddr);
+		return 0;
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		ip_eth_mc_map(addr, haddr);
+		return 0;
+	default:
+		break;
+	}
+
+	/* No specific mapping known: fall back to broadcast if allowed. */
+	if (!dir)
+		return -EINVAL;
+
+	memcpy(haddr, dev->broadcast, dev->addr_len);
+	return 0;
+}
+
+
+/* neigh_table hash callback: hash the 32-bit key at @pkey via arp_hashfn(). */
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 *hash_rnd)
+{
+	const u32 *key = pkey;
+
+	return arp_hashfn(*key, dev, *hash_rnd);
+}
+
+/*
+ * neigh_table constructor for ARP entries (installed as arp_tbl.constructor).
+ *
+ * Classifies the neighbour's IPv4 address, switches the entry over to the
+ * ARP parameters of its device and selects the neigh_ops and output
+ * function matching the device.  Returns 0 on success, or -EINVAL when
+ * the device has no IPv4 in_device attached.
+ */
+static int arp_constructor(struct neighbour *neigh)
+{
+ __be32 addr = *(__be32 *)neigh->primary_key;
+ struct net_device *dev = neigh->dev;
+ struct in_device *in_dev;
+ struct neigh_parms *parms;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ neigh->type = inet_addr_type(dev_net(dev), addr);
+
+ parms = in_dev->arp_parms;
+ __neigh_parms_put(neigh->parms); /* drop the reference to the previous parms */
+ neigh->parms = neigh_parms_clone(parms); /* and hold the per-device ones instead */
+ rcu_read_unlock();
+
+ if (!dev->header_ops) { /* no link-layer header to build: never ARP */
+ neigh->nud_state = NUD_NOARP;
+ neigh->ops = &arp_direct_ops;
+ neigh->output = neigh_direct_output;
+ } else {
+ /* Good devices (checked by reading texts, but only Ethernet is
+ tested)
+
+ ARPHRD_ETHER: (ethernet, apfddi)
+ ARPHRD_FDDI: (fddi)
+ ARPHRD_IEEE802: (tr)
+ ARPHRD_METRICOM: (strip)
+ ARPHRD_ARCNET:
+ etc. etc. etc.
+
+ ARPHRD_IPDDP will also work, if author repairs it.
+ I did not it, because this driver does not work even
+ in old paradigm.
+ */
+
+#if 1
+ /* So... these "amateur" devices are hopeless.
+ The only thing, that I can say now:
+ It is very sad that we need to keep ugly obsolete
+ code to make them happy.
+
+ They should be moved to more reasonable state, now
+ they use rebuild_header INSTEAD OF hard_start_xmit!!!
+ Besides that, they are sort of out of date
+ (a lot of redundant clones/copies, useless in 2.1),
+ I wonder why people believe that they work.
+
+ With CONFIG_AX25 enabled, ROSE, AX25 (and NETROM when
+ CONFIG_NETROM is enabled) get arp_broken_ops and return
+ early; without CONFIG_AX25 ROSE just breaks out of the
+ switch and is handled like any other device.
+ */
+ switch (dev->type) {
+ default:
+ break;
+ case ARPHRD_ROSE:
+#if IS_ENABLED(CONFIG_AX25)
+ case ARPHRD_AX25:
+#if IS_ENABLED(CONFIG_NETROM)
+ case ARPHRD_NETROM:
+#endif
+ neigh->ops = &arp_broken_ops;
+ neigh->output = neigh->ops->output;
+ return 0;
+#else
+ break;
+#endif
+ }
+#endif
+ if (neigh->type == RTN_MULTICAST) { /* address derived from the group, no ARP */
+ neigh->nud_state = NUD_NOARP;
+ arp_mc_map(addr, neigh->ha, dev, 1);
+ } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { /* use the device's own address */
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+ } else if (neigh->type == RTN_BROADCAST ||
+ (dev->flags & IFF_POINTOPOINT)) { /* use the device broadcast address */
+ neigh->nud_state = NUD_NOARP;
+ memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+ }
+
+ if (dev->header_ops->cache)
+ neigh->ops = &arp_hh_ops;
+ else
+ neigh->ops = &arp_generic_ops;
+
+ if (neigh->nud_state & NUD_VALID) /* already resolved above */
+ neigh->output = neigh->ops->connected_output;
+ else
+ neigh->output = neigh->ops->output;
+ }
+ return 0;
+}
+
+/*
+ * neigh_ops error_report callback: report a link failure for the
+ * packet's destination, then free the packet.
+ */
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+ dst_link_failure(skb);
+ kfree_skb(skb);
+}
+
+/*
+ * neigh_ops solicit callback: send an ARP request for @neigh.
+ *
+ * @skb, when non-NULL, is a packet waiting for resolution; its IPv4
+ * source address may be used as the sender address of the request,
+ * subject to the device's arp_announce setting.  If no address is
+ * chosen that way, inet_select_addr() picks one.
+ *
+ * While fewer than ucast_probes probes have been sent the request is
+ * unicast to the currently known hardware address; after that, for the
+ * next app_probes probes, resolution is left to the userspace daemon
+ * (CONFIG_ARPD) and nothing is sent; beyond that the request is
+ * broadcast.
+ *
+ * The hardware address is copied with neigh_ha_snapshot() instead of
+ * holding neigh->lock across arp_send(): the transmit path runs the
+ * netfilter hook and dev_queue_xmit(), and must not be entered with
+ * the neighbour lock held.
+ */
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	__be32 saddr = 0;
+	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+	struct net_device *dev = neigh->dev;
+	__be32 target = *(__be32 *)neigh->primary_key;
+	int probes = atomic_read(&neigh->probes);
+	struct in_device *in_dev;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
+		rcu_read_unlock();
+		return;
+	}
+	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+	default:
+	case 0:		/* By default announce any local IP */
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
+			saddr = ip_hdr(skb)->saddr;
+		break;
+	case 1:		/* Restrict announcements of saddr in same subnet */
+		if (!skb)
+			break;
+		saddr = ip_hdr(skb)->saddr;
+		if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+			/* saddr should be known to target */
+			if (inet_addr_onlink(in_dev, target, saddr))
+				break;
+		}
+		saddr = 0;
+		break;
+	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
+		break;
+	}
+	rcu_read_unlock();
+
+	if (!saddr)
+		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
+		/* Take a consistent private copy; no lock is held while sending. */
+		neigh_ha_snapshot(dst_ha, neigh, dev);
+		dst_hw = dst_ha;
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
+#ifdef CONFIG_ARPD
+			neigh_app_ns(neigh);
+#endif
+			return;
+		}
+	}
+
+	/* dst_hw == NULL makes arp_create() use the broadcast address. */
+	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+		 dst_hw, dev->dev_addr, NULL);
+}
+
+/*
+ * Apply the device's arp_ignore setting to a request from @sip for @tip.
+ * Returns non-zero when the request must NOT be answered.
+ *
+ *   0       reply (tip has already been validated by the caller)
+ *   1       reply only if tip is configured on the incoming interface
+ *   2       as 1, and tip must be in the same subnet as sip
+ *   3       do not reply for scope host addresses
+ *   4-7     reserved, treated like 0
+ *   8       never reply
+ *   other   treated like 0
+ */
+static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
+{
+	int scope = RT_SCOPE_HOST;
+
+	switch (IN_DEV_ARP_IGNORE(in_dev)) {
+	case 1:
+		sip = 0;
+		break;
+	case 2:
+		break;
+	case 3:
+		sip = 0;
+		scope = RT_SCOPE_LINK;
+		break;
+	case 8:
+		return 1;
+	default:	/* 0, the reserved values 4-7 and anything unknown */
+		return 0;
+	}
+
+	return !inet_confirm_addr(in_dev, sip, tip, scope);
+}
+
+/*
+ * arp_filter check: look up the output route for (sip, tip) and see
+ * whether it leaves through @dev.  Returns 1 (filter the request) when
+ * the lookup fails or the route uses another device, 0 otherwise.  A
+ * device mismatch is counted in LINUX_MIB_ARPFILTER.
+ */
+static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+	struct rtable *route;
+	int filtered;
+
+	route = ip_route_output(net, sip, tip, 0, 0);
+	if (IS_ERR(route))
+		return 1;
+
+	filtered = route->dst.dev != dev;
+	if (filtered)
+		NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+
+	ip_rt_put(route);
+	return filtered;
+}
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ * Find an ARP mapping in the cache. If none is found, post a request.
+ *
+ * This is a very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ * even if it exists. It assumes that skb->dev was mangled by a
+ * virtual device (eql, shaper). Only broken devices are allowed to
+ * use this function; it is scheduled to be removed. --ANK
+ */
+
+/*
+ * Fill @haddr for address types that need no ARP lookup: the device's
+ * own address for RTN_LOCAL, the mapped group address for
+ * RTN_MULTICAST and the device broadcast address for RTN_BROADCAST.
+ * Returns 1 when @haddr was handled here, 0 for any other @addr_hint.
+ */
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+			      __be32 paddr, struct net_device *dev)
+{
+	if (addr_hint == RTN_LOCAL) {
+		printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+		memcpy(haddr, dev->dev_addr, dev->addr_len);
+		return 1;
+	}
+	if (addr_hint == RTN_MULTICAST) {
+		arp_mc_map(paddr, haddr, dev, 1);
+		return 1;
+	}
+	if (addr_hint == RTN_BROADCAST) {
+		memcpy(haddr, dev->broadcast, dev->addr_len);
+		return 1;
+	}
+	return 0;
+}
+
+
+/*
+ * Resolve the hardware address for the gateway of @skb's route into
+ * @haddr.
+ *
+ * Returns 0 when @haddr has been filled in, 1 otherwise.  When 1 is
+ * returned the skb is no longer the caller's: it was either freed
+ * here or handed to neigh_event_send().
+ */
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct neighbour *entry;
+	__be32 gateway;
+	int resolved;
+
+	if (!skb_dst(skb)) {
+		printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+		kfree_skb(skb);
+		return 1;
+	}
+
+	gateway = skb_rtable(skb)->rt_gateway;
+
+	/* Local, multicast and broadcast targets need no cache lookup. */
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), gateway), haddr,
+			       gateway, dev))
+		return 0;
+
+	entry = __neigh_lookup(&arp_tbl, &gateway, dev, 1);
+	if (!entry) {
+		kfree_skb(skb);
+		return 1;
+	}
+
+	entry->used = jiffies;
+	/* neigh_event_send() is only tried when the entry is not valid. */
+	resolved = (entry->nud_state & NUD_VALID) ||
+		   neigh_event_send(entry, skb) == 0;
+	if (resolved)
+		neigh_ha_snapshot(haddr, entry, dev);
+
+	neigh_release(entry);
+	return resolved ? 0 : 1;
+}
+EXPORT_SYMBOL(arp_find);
+
+/* END OF OBSOLETE FUNCTIONS */
+
+/*
+ * Check if we can use proxy ARP for this path.
+ *
+ * Never proxy when the route goes back out of the receiving device or
+ * when proxy_arp is off on it.  Otherwise the medium ids decide: an
+ * incoming id of 0 always proxies, -1 never does, and any other value
+ * proxies only towards an output device whose id is neither -1 nor
+ * equal to the incoming one.  Returns 1 to proxy, 0 otherwise.
+ */
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+				struct net_device *dev, struct rtable *rt)
+{
+	struct in_device *out_dev;
+	int in_medium, out_medium;
+
+	if (rt->dst.dev == dev || !IN_DEV_PROXY_ARP(in_dev))
+		return 0;
+
+	in_medium = IN_DEV_MEDIUM_ID(in_dev);
+	if (in_medium == 0 || in_medium == -1)
+		return in_medium == 0;
+
+	/* place to check for proxy_arp for routes */
+
+	out_dev = __in_dev_get_rcu(rt->dst.dev);
+	out_medium = out_dev ? IN_DEV_MEDIUM_ID(out_dev) : -1;
+
+	return out_medium != -1 && out_medium != in_medium;
+}
+
+/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+				struct net_device *dev, struct rtable *rt,
+				__be32 sip, __be32 tip)
+{
+	/*
+	 * Private VLAN only concerns replies sent back on the same
+	 * ethernet segment, and self probes (sip == tip, often sent by
+	 * windowz boxes) are never answered.
+	 */
+	if (rt->dst.dev != dev || sip == tip)
+		return 0;
+
+	return IN_DEV_PROXY_ARP_PVLAN(in_dev) ? 1 : 0;
+}
+
+/*
+ * Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ * Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ * message.
+ *
+ * @type is the ARP opcode and @ptype the protocol passed to
+ * dev_hard_header(). @src_hw defaults to dev->dev_addr when NULL.
+ * @target_hw, when NULL, leaves the target hardware address field
+ * zeroed. The ARP payload is laid out as: sender hw address, sender
+ * IP (@src_ip), target hw address, target IP (@dest_ip).
+ *
+ * Returns the new skb, or NULL when allocation fails or the device
+ * cannot build its link-layer header (the skb is freed in that case).
+ */
+struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
+ struct net_device *dev, __be32 src_ip,
+ const unsigned char *dest_hw,
+ const unsigned char *src_hw,
+ const unsigned char *target_hw)
+{
+ struct sk_buff *skb;
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+
+ /*
+ * Allocate a buffer
+ */
+
+ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
+ if (skb == NULL)
+ return NULL;
+
+ skb_reserve(skb, hlen);
+ skb_reset_network_header(skb);
+ arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
+ skb->dev = dev;
+ skb->protocol = htons(ETH_P_ARP);
+ if (src_hw == NULL)
+ src_hw = dev->dev_addr;
+ if (dest_hw == NULL)
+ dest_hw = dev->broadcast;
+
+ /*
+ * Fill the device header for the ARP frame
+ */
+ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
+ goto out;
+
+ /*
+ * Fill out the arp protocol part.
+ *
+ * The arp hardware type should match the device type, except for FDDI,
+ * which (according to RFC 1390) should always equal 1 (Ethernet).
+ */
+ /*
+ * Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+ * DIX code for the protocol. Make these device structure fields.
+ */
+ switch (dev->type) {
+ default:
+ arp->ar_hrd = htons(dev->type);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_AX25)
+ case ARPHRD_AX25:
+ arp->ar_hrd = htons(ARPHRD_AX25);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+
+#if IS_ENABLED(CONFIG_NETROM)
+ case ARPHRD_NETROM:
+ arp->ar_hrd = htons(ARPHRD_NETROM);
+ arp->ar_pro = htons(AX25_P_IP);
+ break;
+#endif
+#endif
+
+#if IS_ENABLED(CONFIG_FDDI)
+ case ARPHRD_FDDI:
+ arp->ar_hrd = htons(ARPHRD_ETHER);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_TR)
+ case ARPHRD_IEEE802_TR:
+ arp->ar_hrd = htons(ARPHRD_IEEE802);
+ arp->ar_pro = htons(ETH_P_IP);
+ break;
+#endif
+ }
+
+ arp->ar_hln = dev->addr_len;
+ arp->ar_pln = 4;
+ arp->ar_op = htons(type);
+
+ arp_ptr = (unsigned char *)(arp + 1); /* payload starts right after the fixed header */
+
+ memcpy(arp_ptr, src_hw, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &src_ip, 4);
+ arp_ptr += 4;
+ if (target_hw != NULL)
+ memcpy(arp_ptr, target_hw, dev->addr_len);
+ else
+ memset(arp_ptr, 0, dev->addr_len);
+ arp_ptr += dev->addr_len;
+ memcpy(arp_ptr, &dest_ip, 4);
+
+ return skb;
+
+out:
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(arp_create);
+
+/*
+ * Send an arp packet.
+ *
+ * The skb is passed through the NF_ARP_OUT netfilter hook with
+ * skb->dev as output device and dev_queue_xmit as the continuation.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+ /* Send it off, maybe filter it using firewalling first. */
+ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+}
+EXPORT_SYMBOL(arp_xmit);
+
+/*
+ * Create and send an arp packet.
+ *
+ * Does nothing on IFF_NOARP devices or when arp_create() cannot build
+ * the packet.  The parameters are handed to arp_create() unchanged.
+ */
+void arp_send(int type, int ptype, __be32 dest_ip,
+	      struct net_device *dev, __be32 src_ip,
+	      const unsigned char *dest_hw, const unsigned char *src_hw,
+	      const unsigned char *target_hw)
+{
+	struct sk_buff *pkt;
+
+	/* No arp on this interface. */
+	if (dev->flags & IFF_NOARP)
+		return;
+
+	pkt = arp_create(type, ptype, dest_ip, dev, src_ip,
+			 dest_hw, src_hw, target_hw);
+	if (pkt != NULL)
+		arp_xmit(pkt);
+}
+EXPORT_SYMBOL(arp_send);
+
+/*
+ * Process an arp request.
+ *
+ * Continuation of the NF_ARP_IN hook in arp_rcv(), also re-entered
+ * through parp_redo() for delayed proxy requests. Validates the
+ * hardware/protocol types and opcode, answers requests for local or
+ * proxied addresses, and updates the neighbour cache from the sender
+ * fields. Always returns 0. The skb is consumed on every path except
+ * when it is queued with pneigh_enqueue() for a delayed proxy reply.
+ */
+
+static int arp_process(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct arphdr *arp;
+ unsigned char *arp_ptr;
+ struct rtable *rt;
+ unsigned char *sha;
+ __be32 sip, tip;
+ u16 dev_type = dev->type;
+ int addr_type;
+ struct neighbour *n;
+ struct net *net = dev_net(dev);
+
+ /* arp_rcv below verifies the ARP header and verifies the device
+ * is ARP'able.
+ */
+
+ if (in_dev == NULL)
+ goto out;
+
+ arp = arp_hdr(skb);
+
+ switch (dev_type) {
+ default:
+ if (arp->ar_pro != htons(ETH_P_IP) ||
+ htons(dev_type) != arp->ar_hrd)
+ goto out;
+ break;
+ case ARPHRD_ETHER:
+ case ARPHRD_IEEE802_TR:
+ case ARPHRD_FDDI:
+ case ARPHRD_IEEE802:
+ /*
+ * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+ * devices, according to RFC 2625) devices will accept ARP
+ * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+ * This is the case also of FDDI, where the RFC 1390 says that
+ * FDDI devices should accept ARP hardware of (1) Ethernet,
+ * however, to be more robust, we'll accept both 1 (Ethernet)
+ * or 6 (IEEE 802.2)
+ */
+ if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+ arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+ arp->ar_pro != htons(ETH_P_IP))
+ goto out;
+ break;
+ case ARPHRD_AX25:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_AX25))
+ goto out;
+ break;
+ case ARPHRD_NETROM:
+ if (arp->ar_pro != htons(AX25_P_IP) ||
+ arp->ar_hrd != htons(ARPHRD_NETROM))
+ goto out;
+ break;
+ }
+
+ /* Understand only these message types */
+
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
+ goto out;
+
+/*
+ * Extract fields: sender hw address (sha), sender IP (sip), then skip
+ * the target hw address and read the target IP (tip).
+ */
+ arp_ptr = (unsigned char *)(arp + 1);
+ sha = arp_ptr;
+ arp_ptr += dev->addr_len;
+ memcpy(&sip, arp_ptr, 4);
+ arp_ptr += 4;
+ arp_ptr += dev->addr_len;
+ memcpy(&tip, arp_ptr, 4);
+/*
+ * Check for bad requests for 127.x.x.x and requests for multicast
+ * addresses. If this is one such, delete it.
+ */
+ if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
+ goto out;
+
+/*
+ * Special case: We must set Frame Relay source Q.922 address
+ */
+ if (dev_type == ARPHRD_DLCI)
+ sha = dev->broadcast;
+
+/*
+ * Process entry. The idea here is we want to send a reply if it is a
+ * request for us or if it is a request for someone else that we hold
+ * a proxy for. We want to add an entry to our cache if it is a reply
+ * to us or if it is a request for our address.
+ * (The assumption for this last is that if someone is requesting our
+ * address, they are probably intending to talk to us, so it saves time
+ * if we cache their address. Their address is also probably not in
+ * our cache, since ours is not in their cache.)
+ *
+ * Putting this another way, we only care about replies if they are to
+ * us, in which case we add them to the cache. For requests, we care
+ * about those for us and those for our proxies. We reply to both,
+ * and in the case of requests for us we add the requester to the arp
+ * cache.
+ */
+
+ /* Special case: IPv4 duplicate address detection packet (RFC2131) */
+ if (sip == 0) {
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ inet_addr_type(net, tip) == RTN_LOCAL &&
+ !arp_ignore(in_dev, sip, tip))
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
+ dev->dev_addr, sha);
+ goto out;
+ }
+
+ if (arp->ar_op == htons(ARPOP_REQUEST) &&
+ ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+
+ rt = skb_rtable(skb);
+ addr_type = rt->rt_type;
+
+ if (addr_type == RTN_LOCAL) { /* request for one of our own addresses */
+ int dont_send;
+
+ dont_send = arp_ignore(in_dev, sip, tip);
+ if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+ dont_send = arp_filter(sip, tip, dev);
+ if (!dont_send) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ neigh_release(n);
+ }
+ }
+ goto out;
+ } else if (IN_DEV_FORWARD(in_dev)) { /* candidate for a proxy reply */
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ (rt->dst.dev != dev &&
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+ n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+ if (n)
+ neigh_release(n);
+
+ if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
+ skb->pkt_type == PACKET_HOST ||
+ in_dev->arp_parms->proxy_delay == 0) {
+ arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+ dev, tip, sha, dev->dev_addr,
+ sha);
+ } else {
+ pneigh_enqueue(&arp_tbl,
+ in_dev->arp_parms, skb);
+ return 0; /* skb now belongs to the proxy queue: do not consume it */
+ }
+ goto out;
+ }
+ }
+ }
+
+ /* Update our ARP tables */
+
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ /* Unsolicited ARP is not accepted by default.
+ It is possible, that this option should be enabled for some
+ devices (strip is candidate)
+
+ When accepted, an entry is created for a unicast sender on a
+ reply, or on a request whose target IP equals its sender IP.
+ */
+ if (n == NULL &&
+ (arp->ar_op == htons(ARPOP_REPLY) ||
+ (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
+ inet_addr_type(net, sip) == RTN_UNICAST)
+ n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
+ }
+
+ if (n) {
+ int state = NUD_REACHABLE;
+ int override;
+
+ /* If several different ARP replies follows back-to-back,
+ use the FIRST one. It is possible, if several proxy
+ agents are active. Taking the first reply prevents
+ arp trashing and chooses the fastest router.
+ */
+ override = time_after(jiffies, n->updated + n->parms->locktime);
+
+ /* Broadcast replies and request packets
+ do not assert neighbour reachability.
+ */
+ if (arp->ar_op != htons(ARPOP_REPLY) ||
+ skb->pkt_type != PACKET_HOST)
+ state = NUD_STALE;
+ neigh_update(n, sha, state,
+ override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+ neigh_release(n);
+ }
+
+out:
+ consume_skb(skb);
+ return 0;
+}
+
+/* arp_tbl.proxy_redo callback: run a queued proxy request through arp_process(). */
+static void parp_redo(struct sk_buff *skb)
+{
+ arp_process(skb);
+}
+
+
+/*
+ * Receive an arp request from the device layer.
+ *
+ * Packet-type handler for ETH_P_ARP.  Frames that are too short, have
+ * unexpected address lengths, arrive on an IFF_NOARP device or were
+ * not addressed to us (PACKET_OTHERHOST / PACKET_LOOPBACK) are freed.
+ * Everything else is unshared, gets a zeroed NEIGH_CB and is passed
+ * through the NF_ARP_IN hook to arp_process().  Returns 0 on every
+ * drop path, otherwise the NF_HOOK() result.
+ */
+
+static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+		   struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct arphdr *hdr;
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
+	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+		goto drop;
+
+	hdr = arp_hdr(skb);
+	if (hdr->ar_hln != dev->addr_len ||
+	    (dev->flags & IFF_NOARP) ||
+	    skb->pkt_type == PACKET_OTHERHOST ||
+	    skb->pkt_type == PACKET_LOOPBACK ||
+	    hdr->ar_pln != 4)
+		goto drop;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		return 0;	/* nothing left to free here */
+
+	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+
+	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+ * User level interface (ioctl)
+ */
+
+/*
+ * Set (create) an ARP cache entry.
+ */
+
+/*
+ * Switch proxy ARP on or off: for the whole namespace when @dev is
+ * NULL, otherwise for @dev only.  Returns 0, or -ENXIO when @dev has
+ * no IPv4 in_device.
+ */
+static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
+{
+	struct in_device *in_dev;
+
+	if (!dev) {
+		IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
+		return 0;
+	}
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev)
+		return -ENXIO;
+
+	IN_DEV_CONF_SET(in_dev, PROXY_ARP, on);
+	return 0;
+}
+
+/*
+ * Handle SIOCSARP with ATF_PUBL set.
+ *
+ * Only netmasks of 0 and 255.255.255.255 are accepted (-EINVAL for
+ * anything else).  With ATF_COM and no device given, the device is
+ * looked up by the supplied hardware address (-ENODEV if not found).
+ * A full netmask creates a proxy neighbour entry for the address
+ * (-ENOBUFS on failure); a zero netmask enables proxy ARP on the
+ * device, or globally when there is no device.
+ */
+static int arp_req_set_public(struct net *net, struct arpreq *r,
+			      struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+	if (mask && mask != htonl(0xFFFFFFFF))
+		return -EINVAL;
+
+	if (!dev && (r->arp_flags & ATF_COM)) {
+		dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+					  r->arp_ha.sa_data);
+		if (!dev)
+			return -ENODEV;
+	}
+
+	if (!mask)
+		return arp_req_set_proxy(net, dev, 1);
+
+	return pneigh_lookup(&arp_tbl, net, &ip, dev, 1) ? 0 : -ENOBUFS;
+}
+
+/*
+ * Handle SIOCSARP: create or update the ARP entry described by @r.
+ *
+ * ATF_PUBL requests are delegated to arp_req_set_public().  ATF_PERM
+ * implies ATF_COM.  Without a device the output route for the address
+ * picks one.  The hardware type in @r must match the device type
+ * (FDDI devices also accept Ethernet and IEEE 802), else -EINVAL.
+ * Returns 0 or a negative errno.
+ */
+static int arp_req_set(struct net *net, struct arpreq *r,
+		       struct net_device *dev)
+{
+	struct neighbour *entry;
+	unsigned new_state;
+	char *lladdr;
+	__be32 ip;
+	int err;
+
+	if (r->arp_flags & ATF_PUBL)
+		return arp_req_set_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	if (r->arp_flags & ATF_PERM)
+		r->arp_flags |= ATF_COM;
+
+	if (dev == NULL) {
+		struct rtable *route;
+
+		route = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(route))
+			return PTR_ERR(route);
+		dev = route->dst.dev;
+		ip_rt_put(route);
+		if (!dev)
+			return -EINVAL;
+	}
+
+	switch (dev->type) {
+#if IS_ENABLED(CONFIG_FDDI)
+	case ARPHRD_FDDI:
+		/*
+		 * According to RFC 1390, FDDI devices should accept ARP
+		 * hardware types of 1 (Ethernet).  However, to be more
+		 * robust, we'll accept hardware types of either 1 (Ethernet)
+		 * or 6 (IEEE 802.2).
+		 */
+		if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+		    r->arp_ha.sa_family != ARPHRD_ETHER &&
+		    r->arp_ha.sa_family != ARPHRD_IEEE802)
+			return -EINVAL;
+		break;
+#endif
+	default:
+		if (r->arp_ha.sa_family != dev->type)
+			return -EINVAL;
+		break;
+	}
+
+	entry = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	new_state = (r->arp_flags & ATF_PERM) ? NUD_PERMANENT : NUD_STALE;
+	/* Only a "complete" request carries a hardware address. */
+	lladdr = (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL;
+
+	err = neigh_update(entry, lladdr, new_state,
+			   NEIGH_UPDATE_F_OVERRIDE |
+			   NEIGH_UPDATE_F_ADMIN);
+	neigh_release(entry);
+	return err;
+}
+
+/*
+ * Translate a neighbour's NUD state into ATF_* flags:
+ * permanent -> ATF_PERM | ATF_COM, any other valid state -> ATF_COM,
+ * everything else -> 0.
+ */
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+	unsigned flags = 0;
+
+	if (neigh->nud_state & NUD_PERMANENT)
+		flags = ATF_PERM | ATF_COM;
+	else if (neigh->nud_state & NUD_VALID)
+		flags = ATF_COM;
+
+	return flags;
+}
+
+/*
+ * Get an ARP cache entry.
+ *
+ * Looks up the IPv4 address in @r->arp_pa on @dev and, if an entry
+ * exists, fills in the hardware address, flags, hardware type and
+ * device name of @r.  Returns 0 on success, -ENXIO when there is no
+ * entry.
+ *
+ * The copy is limited to the size of arp_ha.sa_data: devices whose
+ * hardware address is longer than that field would otherwise
+ * overwrite the members of @r that follow it.  Such addresses are
+ * returned truncated.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh;
+	int err = -ENXIO;
+
+	neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	if (neigh) {
+		/* Address and state are read together under the entry lock. */
+		read_lock_bh(&neigh->lock);
+		memcpy(r->arp_ha.sa_data, neigh->ha,
+		       min_t(size_t, dev->addr_len,
+			     sizeof(r->arp_ha.sa_data)));
+		r->arp_flags = arp_state_to_flags(neigh);
+		read_unlock_bh(&neigh->lock);
+		r->arp_ha.sa_family = dev->type;
+		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+		neigh_release(neigh);
+		err = 0;
+	}
+	return err;
+}
+
+/*
+ * Force the ARP entry for @ip on @dev into NUD_FAILED.
+ *
+ * Returns -ENXIO when no entry exists, or when the entry has no state
+ * bit other than NUD_NOARP set; otherwise the neigh_update() result.
+ */
+int arp_invalidate(struct net_device *dev, __be32 ip)
+{
+	struct neighbour *entry = neigh_lookup(&arp_tbl, &ip, dev);
+	int err = -ENXIO;
+
+	if (!entry)
+		return err;
+
+	if (entry->nud_state & ~NUD_NOARP)
+		err = neigh_update(entry, NULL, NUD_FAILED,
+				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_ADMIN);
+	neigh_release(entry);
+
+	return err;
+}
+EXPORT_SYMBOL(arp_invalidate);
+
+/*
+ * Handle SIOCDARP with ATF_PUBL set.
+ *
+ * A zero netmask turns proxy ARP off (per device, or globally without
+ * a device); a netmask of 255.255.255.255 deletes the proxy neighbour
+ * entry for the address; any other netmask is rejected with -EINVAL.
+ */
+static int arp_req_delete_public(struct net *net, struct arpreq *r,
+				 struct net_device *dev)
+{
+	__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+
+	if (!mask)
+		return arp_req_set_proxy(net, dev, 0);
+
+	if (mask != htonl(0xFFFFFFFF))
+		return -EINVAL;
+
+	return pneigh_delete(&arp_tbl, net, &ip, dev);
+}
+
+/*
+ * Handle SIOCDARP: invalidate the ARP entry described by @r.
+ *
+ * ATF_PUBL requests go to arp_req_delete_public().  Without a device
+ * the output route for the address selects one.  Returns 0 or a
+ * negative errno.
+ */
+static int arp_req_delete(struct net *net, struct arpreq *r,
+			  struct net_device *dev)
+{
+	__be32 ip;
+
+	if (r->arp_flags & ATF_PUBL)
+		return arp_req_delete_public(net, r, dev);
+
+	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+	if (!dev) {
+		struct rtable *route;
+
+		route = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		if (IS_ERR(route))
+			return PTR_ERR(route);
+
+		dev = route->dst.dev;
+		ip_rt_put(route);
+		if (!dev)
+			return -EINVAL;
+	}
+
+	return arp_invalidate(dev, ip);
+}
+
+/*
+ * Handle an ARP layer I/O control request.
+ *
+ * Supports SIOCDARP, SIOCSARP (both require CAP_NET_ADMIN) and
+ * SIOCGARP; any other command yields -EINVAL. @arg points to a user
+ * space struct arpreq, which is copied in, validated and dispatched
+ * under the RTNL lock. For SIOCGARP a device name is mandatory and
+ * the filled-in request is copied back to user space on success.
+ * Returns 0 or a negative errno.
+ */
+
+int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+ int err;
+ struct arpreq r;
+ struct net_device *dev = NULL;
+
+ switch (cmd) {
+ case SIOCDARP:
+ case SIOCSARP:
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ case SIOCGARP: /* SIOCDARP/SIOCSARP deliberately fall through to here */
+ err = copy_from_user(&r, arg, sizeof(struct arpreq));
+ if (err)
+ return -EFAULT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (r.arp_pa.sa_family != AF_INET)
+ return -EPFNOSUPPORT;
+
+ if (!(r.arp_flags & ATF_PUBL) &&
+ (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) /* these flags need ATF_PUBL */
+ return -EINVAL;
+ if (!(r.arp_flags & ATF_NETMASK)) /* no netmask given: use a host mask */
+ ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+ htonl(0xFFFFFFFFUL);
+ rtnl_lock();
+ if (r.arp_dev[0]) {
+ err = -ENODEV;
+ dev = __dev_get_by_name(net, r.arp_dev);
+ if (dev == NULL)
+ goto out;
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+ if (!r.arp_ha.sa_family)
+ r.arp_ha.sa_family = dev->type;
+ err = -EINVAL;
+ if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+ goto out;
+ } else if (cmd == SIOCGARP) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ switch (cmd) {
+ case SIOCDARP:
+ err = arp_req_delete(net, &r, dev);
+ break;
+ case SIOCSARP:
+ err = arp_req_set(net, &r, dev);
+ break;
+ case SIOCGARP:
+ err = arp_req_get(&r, dev);
+ break;
+ }
+out:
+ rtnl_unlock();
+ if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
+ return err;
+}
+
+/*
+ * Netdevice notifier callback.  On NETDEV_CHANGEADDR the neighbour
+ * table is told about the address change and the route cache of the
+ * device's namespace is flushed; all other events are ignored.
+ * Always returns NOTIFY_DONE.
+ */
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (event == NETDEV_CHANGEADDR) {
+		neigh_changeaddr(&arp_tbl, dev);
+		rt_cache_flush(dev_net(dev), 0);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered by arp_init() to receive netdevice events. */
+static struct notifier_block arp_netdev_notifier = {
+ .notifier_call = arp_netdev_event,
+};
+
+/* Note that this is deliberately not on the notifier chain.
+ It must be called only after the route cache has been
+ flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+ neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ * Called once on startup.
+ */
+
+/* Packet-type hook delivering ETH_P_ARP frames to arp_rcv(). */
+static struct packet_type arp_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_ARP),
+ .func = arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+/*
+ * Boot-time initialisation of the ARP layer: set up the neighbour
+ * table, hook the ARP packet type, create the /proc interface,
+ * register the sysctls (CONFIG_SYSCTL only) and subscribe to
+ * netdevice events. Return values of the individual steps are not
+ * checked.
+ */
+void __init arp_init(void)
+{
+ neigh_table_init(&arp_tbl);
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if IS_ENABLED(CONFIG_AX25)
+
+/* ------------------------------------------------------------------------ */
+/*
+ * ax25 -> ASCII conversion
+ *
+ * Writes the callsign held in @a into @buf as "CALL-SSID": each of the
+ * first six bytes is shifted right by one bit and spaces are dropped,
+ * then '-' and the decimal SSID (low four bits of the shifted seventh
+ * byte) follow.  Returns @buf, or the literal "*" when the result
+ * would be empty or start with '-'.  @buf needs room for at most ten
+ * characters including the terminating NUL.
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+	int len = 0;
+	int ssid;
+	int i;
+
+	for (i = 0; i < 6; i++) {
+		char ch = (a->ax25_call[i] >> 1) & 0x7F;
+
+		if (ch != ' ')
+			buf[len++] = ch;
+	}
+
+	buf[len++] = '-';
+
+	ssid = (a->ax25_call[6] >> 1) & 0x0F;
+	if (ssid > 9) {
+		buf[len++] = '1';
+		ssid -= 10;
+	}
+	buf[len++] = ssid + '0';
+	buf[len] = '\0';
+
+	return (buf[0] == '\0' || buf[0] == '-') ? "*" : buf;
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30 /* size of the textual hardware address buffer */
+
+/*
+ * Print one neighbour entry as a line of /proc/net/arp: IP address,
+ * hardware type, flags, hardware address, a literal "*" for the mask
+ * and the device name. The entry is read under n->lock.
+ */
+static void arp_format_neigh_entry(struct seq_file *seq,
+ struct neighbour *n)
+{
+ char hbuffer[HBUFFERLEN];
+ int k, j;
+ char tbuf[16];
+ struct net_device *dev = n->dev;
+ int hatype = dev->type;
+
+ read_lock(&n->lock);
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
+#if IS_ENABLED(CONFIG_AX25)
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((ax25_address *)n->ha, hbuffer);
+ else {
+#endif
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { /* 3 chars per byte */
+ hbuffer[k++] = hex_asc_hi(n->ha[j]);
+ hbuffer[k++] = hex_asc_lo(n->ha[j]);
+ hbuffer[k++] = ':';
+ }
+ if (k != 0)
+ --k; /* overwrite the trailing ':' with the terminator */
+ hbuffer[k] = 0;
+#if IS_ENABLED(CONFIG_AX25)
+ }
+#endif
+ sprintf(tbuf, "%pI4", n->primary_key);
+ seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+ tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+ read_unlock(&n->lock);
+}
+
+/*
+ * Print one proxy neighbour entry as a line of /proc/net/arp.  Proxy
+ * entries have no hardware address, so an all-zero one is shown, with
+ * flags ATF_PUBL | ATF_PERM.  Entries not bound to a device show
+ * hardware type 0 and "*" as device name.
+ */
+static void arp_format_pneigh_entry(struct seq_file *seq,
+				    struct pneigh_entry *n)
+{
+	struct net_device *dev = n->dev;
+	const char *devname = "*";
+	int hatype = 0;
+	char tbuf[16];
+
+	if (dev) {
+		hatype = dev->type;
+		devname = dev->name;
+	}
+
+	sprintf(tbuf, "%pI4", n->key);
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
+		   tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+		   devname);
+}
+
+/*
+ * seq_file show callback for /proc/net/arp: print the column header
+ * for the start token, otherwise one line for the proxy or regular
+ * neighbour entry @v.  Always returns 0.
+ */
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+	struct neigh_seq_state *state;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "IP address HW type Flags "
+			      "HW address Mask Device\n");
+		return 0;
+	}
+
+	state = seq->private;
+	if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+		arp_format_pneigh_entry(seq, v);
+	else
+		arp_format_neigh_entry(seq, v);
+
+	return 0;
+}
+
+/* seq_file start callback: begin iterating over arp_tbl. */
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* Don't want to confuse "arp -a" w/ magic entries,
+ * so we tell the generic iterator to skip NUD_NOARP.
+ */
+ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+/* Iterator for /proc/net/arp: only start and show are ARP specific. */
+static const struct seq_operations arp_seq_ops = {
+ .start = arp_seq_start,
+ .next = neigh_seq_next,
+ .stop = neigh_seq_stop,
+ .show = arp_seq_show,
+};
+
+/* open() handler: set up a seq_file with a struct neigh_seq_state as private data. */
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &arp_seq_ops,
+ sizeof(struct neigh_seq_state));
+}
+
+/* File operations of the "arp" proc entry created in arp_net_init(). */
+static const struct file_operations arp_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = arp_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+
+/*
+ * Per-namespace init: create the read-only "arp" proc entry.
+ * Returns 0, or -ENOMEM when the entry cannot be created.
+ */
+static int __net_init arp_net_init(struct net *net)
+{
+	return proc_net_fops_create(net, "arp", S_IRUGO, &arp_seq_fops) ?
+	       0 : -ENOMEM;
+}
+
+/* Per-namespace exit: remove the "arp" proc entry again. */
+static void __net_exit arp_net_exit(struct net *net)
+{
+ proc_net_remove(net, "arp");
+}
+
+/* Per-namespace hooks creating and removing the "arp" proc entry. */
+static struct pernet_operations arp_net_ops = {
+ .init = arp_net_init,
+ .exit = arp_net_exit,
+};
+
+/* CONFIG_PROC_FS variant: register the per-namespace proc hooks. */
+static int __init arp_proc_init(void)
+{
+ return register_pernet_subsys(&arp_net_ops);
+}
+
+#else /* CONFIG_PROC_FS */
+
+/* Stub used without CONFIG_PROC_FS: nothing to register. */
+static int __init arp_proc_init(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
new file mode 100644
index 00000000..c48adc56
--- /dev/null
+++ b/net/ipv4/cipso_ipv4.c
@@ -0,0 +1,2363 @@
+/*
+ * CIPSO - Commercial IP Security Option
+ *
+ * This is an implementation of the CIPSO 2.2 protocol as specified in
+ * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
+ * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
+ * have chosen to adopt the protocol and over the years it has become a
+ * de-facto standard for labeled networking.
+ *
+ * The CIPSO draft specification can be found in the kernel's Documentation
+ * directory as well as the following URL:
+ * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * The FIPS-188 specification can be found at the following URL:
+ * http://www.itl.nist.gov/fipspubs/fip188.htm
+ *
+ * Author: Paul Moore
+ *
+ */
+
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* List of available DOI definitions */
+/* XXX - This currently assumes a minimal number of different DOIs in use,
+ * if in practice there are a lot of different DOIs this list should
+ * probably be turned into a hash table or something similar so we
+ * can do quick lookups. */
+/* Additions and removals take cipso_v4_doi_list_lock; lookups walk the list
+ * with the RCU list primitives. */
+static DEFINE_SPINLOCK(cipso_v4_doi_list_lock);
+static LIST_HEAD(cipso_v4_doi_list);
+
+/* Label mapping cache */
+/* non-zero enables the cache; tested on every lookup and insert */
+int cipso_v4_cache_enabled = 1;
+/* upper bound on the number of entries kept in one bucket */
+int cipso_v4_cache_bucketsize = 10;
+#define CIPSO_V4_CACHE_BUCKETBITS 7
+#define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS)
+/* activity-counter lead an entry needs over its predecessor before the
+ * lookup code moves it one position towards the head of the bucket */
+#define CIPSO_V4_CACHE_REORDERLIMIT 10
+/* One hash bucket; @lock is held (with bottom halves disabled) around every
+ * access to @size and @list. */
+struct cipso_v4_map_cache_bkt {
+ spinlock_t lock;
+ /* number of entries currently linked on @list */
+ u32 size;
+ struct list_head list;
+};
+/* One cached mapping from a raw CIPSO option to LSM data. */
+struct cipso_v4_map_cache_entry {
+ /* jhash of @key, also used to select the bucket */
+ u32 hash;
+ /* private copy of the CIPSO option bytes, @key_len bytes long */
+ unsigned char *key;
+ size_t key_len;
+
+ /* reference-counted LSM data handed out on a cache hit */
+ struct netlbl_lsm_cache *lsm_data;
+
+ /* hit counter used to reorder entries within a bucket */
+ u32 activity;
+ struct list_head list;
+};
+/* bucket array, allocated by cipso_v4_cache_init() */
+static struct cipso_v4_map_cache_bkt *cipso_v4_cache = NULL;
+
+/* Restricted bitmap (tag #1) flags */
+/* NOTE(review): neither flag is read in this part of the file; presumably
+ * they are runtime tunables consulted by the tag #1 generation/validation
+ * code further down - confirm against their users. */
+int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_strictvalid = 1;
+
+/*
+ * Protocol Constants
+ */
+
+/* Maximum size of the CIPSO IP option, derived from the fact that the maximum
+ * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */
+#define CIPSO_V4_OPT_LEN_MAX 40
+
+/* Length of the base CIPSO option, this includes the option type (1 byte), the
+ * option length (1 byte), and the DOI (4 bytes). */
+#define CIPSO_V4_HDR_LEN 6
+
+/* Base length of the restrictive category bitmap tag (tag #1). */
+#define CIPSO_V4_TAG_RBM_BLEN 4
+
+/* Base length of the enumerated category tag (tag #2). */
+#define CIPSO_V4_TAG_ENUM_BLEN 4
+
+/* Base length of the ranged categories bitmap tag (tag #5). */
+#define CIPSO_V4_TAG_RNG_BLEN 4
+/* The maximum number of category ranges permitted in the ranged category tag
+ * (tag #5). You may note that the IETF draft states that the maximum number
+ * of category ranges is 7, but if the low end of the last category range is
+ * zero then it is possible to fit 8 category ranges because the zero should
+ * be omitted. */
+#define CIPSO_V4_TAG_RNG_CAT_MAX 8
+
+/* Base length of the local tag (non-standard tag).
+ * Tag definition (may change between kernel versions)
+ *
+ *  0          8          16         24         32
+ *  +----------+----------+----------+----------+
+ *  | 10000000 | 00000110 | 32-bit secid value  |
+ *  +----------+----------+----------+----------+
+ *  | in (host byte order)|
+ *  +----------+----------+
+ *
+ */
+#define CIPSO_V4_TAG_LOC_BLEN 6
+
+/*
+ * Helper Functions
+ */
+
+/**
+ * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset, in bits
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right (most significant
+ * bit of each byte first) until either the desired bit is found or we reach
+ * the end.  Returns the bit offset of the match, or -1 if there is none,
+ * which includes the case where @offset is at or beyond @bitmap_len.  No
+ * other negative value is currently returned.
+ *
+ */
+static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
+				u32 bitmap_len,
+				u32 offset,
+				u8 state)
+{
+	u32 bit_spot;
+	u32 byte_offset;
+	unsigned char bitmask;
+	unsigned char byte;
+
+	/* callers resume with "last hit + 1", which can equal @bitmap_len;
+	 * bail out before touching memory past the end of the bitmap */
+	if (offset >= bitmap_len)
+		return -1;
+
+	byte_offset = offset / 8;
+	byte = bitmap[byte_offset];
+	bit_spot = offset;
+	bitmask = 0x80 >> (offset % 8);
+
+	while (bit_spot < bitmap_len) {
+		if ((state && (byte & bitmask) == bitmask) ||
+		    (state == 0 && (byte & bitmask) == 0))
+			return bit_spot;
+
+		/* stop here rather than load the byte that follows the
+		 * bitmap when the last bit sits on a byte boundary */
+		if (++bit_spot >= bitmap_len)
+			return -1;
+		bitmask >>= 1;
+		if (bitmask == 0) {
+			byte = bitmap[++byte_offset];
+			bitmask = 0x80;
+		}
+	}
+
+	return -1;
+}
+
+/**
+ * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit offset, counted from the most significant bit of byte zero
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set or clear a single bit in the bitmap.  No bounds checking is done, the
+ * caller must make sure @bit lies inside @bitmap.  There is no return value.
+ *
+ */
+static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
+				   u32 bit,
+				   u8 state)
+{
+	unsigned char *target = &bitmap[bit / 8];
+	u8 mask = 0x80 >> (bit % 8);
+
+	if (state)
+		*target |= mask;
+	else
+		*target &= ~mask;
+}
+
+/**
+ * cipso_v4_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * Releases the entry's hold on its LSM cache data (if it has any) through
+ * netlbl_secattr_cache_free(), then frees the key copy and the entry itself.
+ *
+ */
+static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry)
+{
+	struct netlbl_lsm_cache *lsm_cache = entry->lsm_data;
+
+	if (lsm_cache != NULL)
+		netlbl_secattr_cache_free(lsm_cache);
+	kfree(entry->key);
+	kfree(entry);
+}
+
+/**
+ * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CIPSO tag hashing function: jhash() over the key bytes with a fixed
+ * initial value of zero.  Returns a 32-bit hash value.
+ *
+ */
+static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+	const u32 seed = 0;
+
+	return jhash(key, key_len, seed);
+}
+
+/*
+ * Label Mapping Cache Functions
+ */
+
+/**
+ * cipso_v4_cache_init - Initialize the CIPSO cache
+ *
+ * Description:
+ * Allocates the CIPSO_V4_CACHE_BUCKETS bucket array of the label mapping
+ * cache and initializes every bucket's lock, entry count and list head.
+ * This function should be called before any of the other cache functions
+ * defined in this file, all of which index the array without checking it.
+ * Returns zero on success, -ENOMEM if the array could not be allocated.
+ *
+ */
+static int cipso_v4_cache_init(void)
+{
+ u32 iter;
+
+ cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS,
+ sizeof(struct cipso_v4_map_cache_bkt),
+ GFP_KERNEL);
+ if (cipso_v4_cache == NULL)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&cipso_v4_cache[iter].lock);
+ cipso_v4_cache[iter].size = 0;
+ INIT_LIST_HEAD(&cipso_v4_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CIPSO cache, one bucket at a time
+ * with that bucket's lock held.  There is no return value.
+ *
+ */
+void cipso_v4_cache_invalidate(void)
+{
+	struct cipso_v4_map_cache_bkt *bucket;
+	struct cipso_v4_map_cache_entry *cur, *next;
+	u32 idx;
+
+	for (idx = 0; idx < CIPSO_V4_CACHE_BUCKETS; idx++) {
+		bucket = &cipso_v4_cache[idx];
+
+		spin_lock_bh(&bucket->lock);
+		list_for_each_entry_safe(cur, next, &bucket->list, list) {
+			list_del(&cur->list);
+			cipso_v4_cache_entry_free(cur);
+		}
+		bucket->size = 0;
+		spin_unlock_bh(&bucket->lock);
+	}
+}
+
+/**
+ * cipso_v4_cache_check - Check the CIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key.  If there is a match a reference is taken on the cached LSM
+ * data and @secattr is pointed at it.  When the matching entry is not already
+ * the first in its cache bucket the bucket is also adjusted as follows:
+ *
+ *  1. The cache entry's activity counter is incremented
+ *  2. The previous (higher ranking) entry's activity counter is decremented
+ *  3. If the difference between the two activity counters is greater than
+ *     CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success and -ENOENT for a cache miss or when the cache is
+ * disabled.
+ *
+ */
+static int cipso_v4_cache_check(const unsigned char *key,
+				u32 key_len,
+				struct netlbl_lsm_secattr *secattr)
+{
+	struct cipso_v4_map_cache_bkt *bucket;
+	struct cipso_v4_map_cache_entry *entry;
+	struct cipso_v4_map_cache_entry *prev_entry = NULL;
+	int ret_val = -ENOENT;
+	u32 hash;
+
+	if (!cipso_v4_cache_enabled)
+		return -ENOENT;
+
+	hash = cipso_v4_map_cache_hash(key, key_len);
+	bucket = &cipso_v4_cache[hash & (CIPSO_V4_CACHE_BUCKETS - 1)];
+
+	spin_lock_bh(&bucket->lock);
+	list_for_each_entry(entry, &bucket->list, list) {
+		if (entry->hash != hash ||
+		    entry->key_len != key_len ||
+		    memcmp(entry->key, key, key_len) != 0) {
+			prev_entry = entry;
+			continue;
+		}
+
+		/* cache hit: take a reference and hand the LSM data back */
+		entry->activity += 1;
+		atomic_inc(&entry->lsm_data->refcount);
+		secattr->cache = entry->lsm_data;
+		secattr->flags |= NETLBL_SECATTR_CACHE;
+		secattr->type = NETLBL_NLTYPE_CIPSOV4;
+		ret_val = 0;
+
+		/* already first in the bucket, nothing to reorder */
+		if (prev_entry == NULL)
+			break;
+
+		if (prev_entry->activity > 0)
+			prev_entry->activity -= 1;
+		if (entry->activity > prev_entry->activity &&
+		    entry->activity - prev_entry->activity >
+		    CIPSO_V4_CACHE_REORDERLIMIT) {
+			/* relink the entry directly ahead of its predecessor */
+			__list_del(entry->list.prev, entry->list.next);
+			__list_add(&entry->list,
+				   prev_entry->list.prev,
+				   &prev_entry->list);
+		}
+		break;
+	}
+	spin_unlock_bh(&bucket->lock);
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_cache_add - Add an entry to the CIPSO cache
+ * @skb: the packet
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CIPSO label mapping cache, keyed by the packet's
+ * CIPSO option.  The new entry goes to the head of the cache bucket's list;
+ * if the bucket is already full the last entry in the list is evicted to
+ * make room.  It is important to note that there is currently no checking
+ * for duplicate keys.  Returns zero on success (including when the cache is
+ * disabled and nothing is added) and -ENOMEM if memory could not be
+ * allocated.
+ *
+ */
+int cipso_v4_cache_add(const struct sk_buff *skb,
+		       const struct netlbl_lsm_secattr *secattr)
+{
+	struct cipso_v4_map_cache_bkt *bucket;
+	struct cipso_v4_map_cache_entry *new_entry;
+	struct cipso_v4_map_cache_entry *victim;
+	unsigned char *opt;
+	u32 opt_len;
+
+	if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+		return 0;
+
+	/* the second byte of the option is its total length */
+	opt = CIPSO_V4_OPTPTR(skb);
+	opt_len = opt[1];
+
+	new_entry = kzalloc(sizeof(*new_entry), GFP_ATOMIC);
+	if (new_entry == NULL)
+		return -ENOMEM;
+	new_entry->key = kmemdup(opt, opt_len, GFP_ATOMIC);
+	if (new_entry->key == NULL) {
+		/* no LSM reference has been taken yet, so this releases
+		 * nothing but the entry itself */
+		cipso_v4_cache_entry_free(new_entry);
+		return -ENOMEM;
+	}
+	new_entry->key_len = opt_len;
+	new_entry->hash = cipso_v4_map_cache_hash(opt, opt_len);
+	atomic_inc(&secattr->cache->refcount);
+	new_entry->lsm_data = secattr->cache;
+
+	bucket = &cipso_v4_cache[new_entry->hash &
+				 (CIPSO_V4_CACHE_BUCKETS - 1)];
+	spin_lock_bh(&bucket->lock);
+	if (bucket->size < cipso_v4_cache_bucketsize) {
+		list_add(&new_entry->list, &bucket->list);
+		bucket->size += 1;
+	} else {
+		/* bucket is full: swap out the entry at the tail, the size
+		 * stays the same */
+		victim = list_entry(bucket->list.prev,
+				    struct cipso_v4_map_cache_entry, list);
+		list_del(&victim->list);
+		list_add(&new_entry->list, &bucket->list);
+		cipso_v4_cache_entry_free(victim);
+	}
+	spin_unlock_bh(&bucket->lock);
+
+	return 0;
+}
+
+/*
+ * DOI List Functions
+ */
+
+/**
+ * cipso_v4_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a definition whose DOI value matches
+ * @doi and whose reference count is non-zero.  The caller is responsible for
+ * calling rcu_read_[un]lock().  Returns a pointer to the DOI definition on
+ * success and NULL on failure.
+ */
+static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi)
+{
+	struct cipso_v4_doi *doi_def;
+
+	list_for_each_entry_rcu(doi_def, &cipso_v4_doi_list, list) {
+		if (doi_def->doi != doi)
+			continue;
+		/* a zero refcount marks a definition that is going away */
+		if (atomic_read(&doi_def->refcount) == 0)
+			continue;
+		return doi_def;
+	}
+
+	return NULL;
+}
+
+/**
+ * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CIPSO engine and calls this
+ * function to add it to the list of acceptable domains.  The DOI value and
+ * the tag list are sanity checked here, but the caller must ensure that the
+ * mapping table specified in @doi_def->map meets all of the requirements of
+ * the mapping type (see cipso_ipv4.h for details).  An audit record is
+ * generated whether or not the addition succeeds.  Returns zero on success,
+ * -EEXIST if the DOI is already defined and -EINVAL if the definition is
+ * rejected.
+ *
+ */
+int cipso_v4_doi_add(struct cipso_v4_doi *doi_def,
+		     struct netlbl_audit *audit_info)
+{
+	int ret_val = -EINVAL;
+	u32 tag_idx;
+	/* copies taken up front; the audit record below is built from these
+	 * rather than from @doi_def */
+	u32 doi = doi_def->doi;
+	u32 doi_type = doi_def->type;
+	const char *type_str;
+	struct audit_buffer *audit_buf;
+
+	if (doi == CIPSO_V4_DOI_UNKNOWN)
+		goto doi_add_return;
+	for (tag_idx = 0; tag_idx < CIPSO_V4_TAG_MAXCNT; tag_idx++) {
+		switch (doi_def->tags[tag_idx]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			/* accepted with every mapping type */
+			break;
+		case CIPSO_V4_TAG_RANGE:
+		case CIPSO_V4_TAG_ENUM:
+			if (doi_type != CIPSO_V4_MAP_PASS)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			if (doi_type != CIPSO_V4_MAP_LOCAL)
+				goto doi_add_return;
+			break;
+		case CIPSO_V4_TAG_INVALID:
+			/* an unused slot is fine, just not as the first tag */
+			if (tag_idx == 0)
+				goto doi_add_return;
+			break;
+		default:
+			goto doi_add_return;
+		}
+	}
+
+	/* this initial reference belongs to the DOI list */
+	atomic_set(&doi_def->refcount, 1);
+
+	spin_lock(&cipso_v4_doi_list_lock);
+	if (cipso_v4_doi_search(doi) == NULL) {
+		list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list);
+		ret_val = 0;
+	} else {
+		ret_val = -EEXIST;
+	}
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+doi_add_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info);
+	if (audit_buf == NULL)
+		return ret_val;
+
+	switch (doi_type) {
+	case CIPSO_V4_MAP_TRANS:
+		type_str = "trans";
+		break;
+	case CIPSO_V4_MAP_PASS:
+		type_str = "pass";
+		break;
+	case CIPSO_V4_MAP_LOCAL:
+		type_str = "local";
+		break;
+	default:
+		type_str = "(unknown)";
+	}
+	audit_log_format(audit_buf,
+			 " cipso_doi=%u cipso_type=%s res=%u",
+			 doi, type_str, ret_val == 0 ? 1 : 0);
+	audit_log_end(audit_buf);
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ * For translated (CIPSO_V4_MAP_TRANS) definitions that is the four level and
+ * category lookup arrays plus the mapping table structure which points at
+ * them.  Passing NULL is a no-op.
+ *
+ */
+void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_TRANS:
+		kfree(doi_def->map.std->lvl.cipso);
+		kfree(doi_def->map.std->lvl.local);
+		kfree(doi_def->map.std->cat.cipso);
+		kfree(doi_def->map.std->cat.local);
+		/* map.std is only a pointer inside @doi_def; the table it
+		 * refers to has to be released as well or it is leaked each
+		 * time a translated DOI is freed */
+		kfree(doi_def->map.std);
+		break;
+	}
+	kfree(doi_def);
+}
+
+/**
+ * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the RCU head embedded in the DOI definition
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.  It maps @entry back to its enclosing definition and frees that.
+ *
+ */
+static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
+{
+	cipso_v4_doi_free(container_of(entry, struct cipso_v4_doi, rcu));
+}
+
+/**
+ * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found a reference is
+ * taken on it and it is returned to the caller.  Otherwise NULL is returned.
+ * The caller must release the reference with cipso_v4_doi_putdef() when it
+ * is done with the definition.
+ *
+ */
+struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi)
+{
+	struct cipso_v4_doi *doi_def;
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def == NULL)
+		goto doi_getdef_return;
+	/* lost a race with the final put, treat it as not found */
+	if (!atomic_inc_not_zero(&doi_def->refcount))
+		doi_def = NULL;
+
+doi_getdef_return:
+	rcu_read_unlock();
+	return doi_def;
+}
+
+/**
+ * cipso_v4_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(),
+ * or the DOI list's own reference when called from cipso_v4_doi_remove().
+ * When the last reference goes away the label cache is invalidated and the
+ * definition is freed after an RCU grace period.  The definition is not
+ * unlinked from the DOI list here: the list owns a reference of its own, so
+ * the count can only reach zero once cipso_v4_doi_remove() has already
+ * taken the definition off the list.  Passing NULL is a no-op.
+ *
+ */
+void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
+{
+	if (doi_def == NULL)
+		return;
+
+	if (!atomic_dec_and_test(&doi_def->refcount))
+		return;
+
+	cipso_v4_cache_invalidate();
+	call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+}
+
+/**
+ * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes a DOI definition from the CIPSO engine.  The definition is taken
+ * off the DOI list right away, so no new users can find it, and the list's
+ * reference is dropped exactly once; the memory is released when the last
+ * remaining user calls cipso_v4_doi_putdef().  Earlier versions decremented
+ * the reference count on every call, including the ones rejected with
+ * -EBUSY, which let repeated removal attempts eat references held by other
+ * users; -EBUSY is therefore no longer returned.  An audit record is
+ * generated whether or not the removal succeeds.  Returns zero on success
+ * and -ENOENT if the DOI is not defined.
+ *
+ */
+int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+	int ret_val;
+	struct cipso_v4_doi *doi_def;
+	struct audit_buffer *audit_buf;
+
+	spin_lock(&cipso_v4_doi_list_lock);
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def == NULL) {
+		spin_unlock(&cipso_v4_doi_list_lock);
+		ret_val = -ENOENT;
+		goto doi_remove_return;
+	}
+	list_del_rcu(&doi_def->list);
+	spin_unlock(&cipso_v4_doi_list_lock);
+
+	/* drop the reference cipso_v4_doi_add() took on behalf of the list */
+	cipso_v4_doi_putdef(doi_def);
+	ret_val = 0;
+
+doi_remove_return:
+	audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info);
+	if (audit_buf != NULL) {
+		audit_log_format(audit_buf,
+				 " cipso_doi=%u res=%u",
+				 doi, ret_val == 0 ? 1 : 0);
+		audit_log_end(audit_buf);
+	}
+
+	return ret_val;
+}
+
+/**
+ * cipso_v4_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * Definitions whose reference count is not positive are ignored and not
+ * counted.  For each remaining entry call @callback, if @callback returns a
+ * negative value stop 'walking' through the list and return that value.  On
+ * return @skip_cnt holds the number of definitions that were skipped or
+ * handled successfully, so a later call can resume where this one stopped.
+ * Returns the last callback result, or -ENOENT if the callback was never
+ * invoked.
+ *
+ */
+int cipso_v4_doi_walk(u32 *skip_cnt,
+		      int (*callback) (struct cipso_v4_doi *doi_def, void *arg),
+		      void *cb_arg)
+{
+	int ret_val = -ENOENT;
+	u32 doi_cnt = 0;
+	struct cipso_v4_doi *cur;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &cipso_v4_doi_list, list) {
+		if (atomic_read(&cur->refcount) <= 0)
+			continue;
+		if (doi_cnt++ < *skip_cnt)
+			continue;
+
+		ret_val = callback(cur, cb_arg);
+		if (ret_val < 0) {
+			/* the failed entry does not count as visited */
+			doi_cnt--;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	*skip_cnt = doi_cnt;
+	return ret_val;
+}
+
+/*
+ * Label Mapping Functions
+ */
+
+/**
+ * cipso_v4_map_lvl_valid - Checks to see if the given level is understood
+ * @doi_def: the DOI definition
+ * @level: the level to check
+ *
+ * Description:
+ * Checks the given level against the given DOI definition and returns a
+ * negative value if the level does not have a valid mapping and a zero value
+ * if the level is defined by the DOI.  Pass-through DOIs accept every level;
+ * translated DOIs accept a level only if it lies inside the CIPSO level
+ * table and maps to something other than CIPSO_V4_INV_LVL; every other
+ * mapping type is rejected.
+ *
+ */
+static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
+{
+	switch (doi_def->type) {
+	case CIPSO_V4_MAP_PASS:
+		return 0;
+	case CIPSO_V4_MAP_TRANS:
+		/* @level comes from the packet; make sure it is a valid index
+		 * before using it to look into the table */
+		if (level < doi_def->map.std->lvl.cipso_size &&
+		    doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+			return 0;
+		break;
+	}
+
+	return -EFAULT;
+}
+
+/**
+ * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network
+ * @doi_def: the DOI definition
+ * @host_lvl: the host MLS level
+ * @net_lvl: the network/CIPSO MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS level to the correct
+ * CIPSO level using the given DOI definition.  Returns zero on success,
+ * -EPERM if a translated DOI has no mapping for @host_lvl and -EINVAL for
+ * mapping types that carry no level mapping.
+ *
+ */
+static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def,
+				 u32 host_lvl,
+				 u32 *net_lvl)
+{
+	const struct cipso_v4_std_map_tbl *map_tbl;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS) {
+		*net_lvl = host_lvl;
+		return 0;
+	}
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EINVAL;
+
+	map_tbl = doi_def->map.std;
+	if (host_lvl >= map_tbl->lvl.local_size ||
+	    map_tbl->lvl.local[host_lvl] >= CIPSO_V4_INV_LVL)
+		return -EPERM;
+
+	*net_lvl = map_tbl->lvl.local[host_lvl];
+	return 0;
+}
+
+/**
+ * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host
+ * @doi_def: the DOI definition
+ * @net_lvl: the network/CIPSO MLS level
+ * @host_lvl: the host MLS level
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO level to the correct local MLS
+ * level using the given DOI definition.  Returns zero on success, -EPERM if
+ * a translated DOI has no mapping for @net_lvl and -EINVAL for mapping types
+ * that carry no level mapping.
+ *
+ */
+static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def,
+				 u32 net_lvl,
+				 u32 *host_lvl)
+{
+	const struct cipso_v4_std_map_tbl *map_tbl;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS) {
+		*host_lvl = net_lvl;
+		return 0;
+	}
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EINVAL;
+
+	map_tbl = doi_def->map.std;
+	if (net_lvl >= map_tbl->lvl.cipso_size ||
+	    map_tbl->lvl.cipso[net_lvl] >= CIPSO_V4_INV_LVL)
+		return -EPERM;
+
+	*host_lvl = map_tbl->lvl.cipso[net_lvl];
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid
+ * @doi_def: the DOI definition
+ * @bitmap: category bitmap
+ * @bitmap_len: bitmap length in bytes
+ *
+ * Description:
+ * Checks the given category bitmap against the given DOI definition and
+ * returns a negative value if any of the categories in the bitmap do not have
+ * a valid mapping and a zero value if all of the categories are valid.
+ * Pass-through DOIs accept any bitmap; mapping types other than pass-through
+ * and translated are always rejected.
+ *
+ */
+static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *bitmap,
+				      u32 bitmap_len)
+{
+	int cat;
+	u32 total_bits = bitmap_len * 8;
+	u32 map_size;
+	u32 *map;
+
+	if (doi_def->type == CIPSO_V4_MAP_PASS)
+		return 0;
+	if (doi_def->type != CIPSO_V4_MAP_TRANS)
+		return -EFAULT;
+
+	map_size = doi_def->map.std->cat.cipso_size;
+	map = doi_def->map.std->cat.cipso;
+
+	/* visit every set bit and make sure the table can translate it */
+	cat = cipso_v4_bitmap_walk(bitmap, total_bits, 0, 1);
+	while (cat >= 0) {
+		if (cat >= map_size || map[cat] >= CIPSO_V4_INV_CAT)
+			return -EFAULT;
+		cat = cipso_v4_bitmap_walk(bitmap, total_bits, cat + 1, 1);
+	}
+
+	/* -1 is the normal end of the walk, anything else is an error */
+	return cat == -1 ? 0 : -EFAULT;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO bitmap using the given DOI definition.  Returns the minimum
+ * size in bytes of the network bitmap on success (never less than one),
+ * -EPERM if a category has no translation and -ENOSPC if a category does not
+ * fit into @net_cat.
+ *
+ */
+static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int host_spot;
+	int net_bytes;
+	u32 net_spot = CIPSO_V4_INV_CAT;
+	u32 highest_bit = 0;
+	u32 bits_used;
+	u32 net_bits = net_cat_len * 8;
+	u32 local_size = 0;
+	u32 *local_map = NULL;
+
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+		local_size = doi_def->map.std->cat.local_size;
+		local_map = doi_def->map.std->cat.local;
+	}
+
+	host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat, 0);
+	while (host_spot >= 0) {
+		if (doi_def->type == CIPSO_V4_MAP_PASS) {
+			net_spot = host_spot;
+		} else if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+			if (host_spot >= local_size)
+				return -EPERM;
+			net_spot = local_map[host_spot];
+			if (net_spot >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+		}
+		if (net_spot >= net_bits)
+			return -ENOSPC;
+		cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+
+		if (net_spot > highest_bit)
+			highest_bit = net_spot;
+
+		host_spot = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						       host_spot + 1);
+	}
+
+	/* bytes needed to hold every bit up to and including the highest */
+	bits_used = highest_bit + 1;
+	net_bytes = bits_used / 8;
+	if (bits_used % 8)
+		net_bytes++;
+	return net_bytes;
+}
+
+/**
+ * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition.  Returns zero on
+ * success, -EPERM if a category has no translation, -EFAULT if the bitmap
+ * walk reports an error and otherwise whatever error
+ * netlbl_secattr_catmap_setbit() returns.
+ *
+ */
+static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	int net_spot;
+	u32 host_spot = CIPSO_V4_INV_CAT;
+	u32 net_bits = net_cat_len * 8;
+	u32 cipso_size = 0;
+	u32 *cipso_map = NULL;
+
+	if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+		cipso_size = doi_def->map.std->cat.cipso_size;
+		cipso_map = doi_def->map.std->cat.cipso;
+	}
+
+	net_spot = cipso_v4_bitmap_walk(net_cat, net_bits, 0, 1);
+	while (net_spot >= 0) {
+		if (doi_def->type == CIPSO_V4_MAP_PASS) {
+			host_spot = net_spot;
+		} else if (doi_def->type == CIPSO_V4_MAP_TRANS) {
+			if (net_spot >= cipso_size)
+				return -EPERM;
+			host_spot = cipso_map[net_spot];
+			if (host_spot >= CIPSO_V4_INV_CAT)
+				return -EPERM;
+		}
+		ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
+						       host_spot,
+						       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+
+		net_spot = cipso_v4_bitmap_walk(net_cat, net_bits,
+						net_spot + 1, 1);
+	}
+
+	/* -2 flags a walk error, any other negative value ends the walk */
+	return net_spot == -2 ? -EFAULT : 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @enumcat: category list
+ * @enumcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given categories against the given DOI definition and returns a
+ * negative value if the list is not acceptable and a zero value if it is.
+ * The list is only accepted for pass-through DOIs, must consist of whole
+ * 16-bit big-endian values and must be in strictly ascending order.
+ *
+ */
+static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def,
+				       const unsigned char *enumcat,
+				       u32 enumcat_len)
+{
+	u16 cat;
+	int last_cat = -1;
+	u32 offset = 0;
+
+	if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01)
+		return -EFAULT;
+
+	while (offset < enumcat_len) {
+		cat = get_unaligned_be16(&enumcat[offset]);
+		if (cat <= last_cat)
+			return -EFAULT;
+		last_cat = cat;
+		offset += 2;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category list using the given DOI definition.  Every
+ * category is written as a 16-bit big-endian value.  Returns the size in
+ * bytes of the network category list on success and -ENOSPC if the list
+ * does not fit into @net_cat.
+ *
+ */
+static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def,
+				      const struct netlbl_lsm_secattr *secattr,
+				      unsigned char *net_cat,
+				      u32 net_cat_len)
+{
+	int cat = -1;
+	u32 cat_iter = 0;
+
+	for (;;) {
+		cat = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						 cat + 1);
+		if (cat < 0)
+			break;
+		if ((cat_iter + 2) > net_cat_len)
+			return -ENOSPC;
+
+		/* @net_cat is a plain byte buffer with no alignment
+		 * guarantee, so store through the unaligned helper just as
+		 * the readers in this file load through one */
+		put_unaligned_be16(cat, &net_cat[cat_iter]);
+		cat_iter += 2;
+	}
+
+	return cat_iter;
+}
+
+/**
+ * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition.  Each 16-bit
+ * big-endian value in the list is set in the local bitmap as is.  Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *net_cat,
+				      u32 net_cat_len,
+				      struct netlbl_lsm_secattr *secattr)
+{
+	int ret_val;
+	u16 cat;
+	u32 offset = 0;
+
+	while (offset < net_cat_len) {
+		cat = get_unaligned_be16(&net_cat[offset]);
+		ret_val = netlbl_secattr_catmap_setbit(secattr->attr.mls.cat,
+						       cat,
+						       GFP_ATOMIC);
+		if (ret_val != 0)
+			return ret_val;
+		offset += 2;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid
+ * @doi_def: the DOI definition
+ * @rngcat: category list
+ * @rngcat_len: length of the category list in bytes
+ *
+ * Description:
+ * Checks the given category ranges against the given DOI definition and
+ * returns a negative value if the list is not acceptable and a zero value if
+ * it is.  The list is only accepted for pass-through DOIs, must have an even
+ * length, and the high end of each range must not be above the low end of
+ * the range before it.  A final range without a low end is taken to end at
+ * zero.
+ *
+ */
+static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def,
+				      const unsigned char *rngcat,
+				      u32 rngcat_len)
+{
+	u16 range_top;
+	u16 range_bottom;
+	u32 prev_bottom = CIPSO_V4_MAX_REM_CATS + 1;
+	u32 offset = 0;
+
+	if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01)
+		return -EFAULT;
+
+	while (offset < rngcat_len) {
+		range_top = get_unaligned_be16(&rngcat[offset]);
+		/* the low end of the last range may have been left out */
+		range_bottom = 0;
+		if ((offset + 4) <= rngcat_len)
+			range_bottom = get_unaligned_be16(&rngcat[offset + 2]);
+
+		if (range_top > prev_bottom)
+			return -EFAULT;
+
+		prev_bottom = range_bottom;
+		offset += 4;
+	}
+
+	return 0;
+}
+
+/**
+ * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO category list in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CIPSO category range list using the given DOI definition.  Ranges
+ * are written as 16-bit big-endian values, high end first, in the reverse of
+ * the order in which the category bitmap walk produced them; a low end of
+ * zero is left out.  Returns the size in bytes of the network category list
+ * on success, -ENOSPC if the list does not fit and -EFAULT if the end of a
+ * range cannot be determined.
+ *
+ */
+static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def,
+				     const struct netlbl_lsm_secattr *secattr,
+				     unsigned char *net_cat,
+				     u32 net_cat_len)
+{
+	int iter = -1;
+	u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2];
+	u32 array_cnt = 0;
+	u32 cat_size = 0;
+
+	/* make sure we don't overflow the 'array[]' variable */
+	if (net_cat_len >
+	    (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN))
+		return -ENOSPC;
+
+	/* collect every range as a (start, end) pair while keeping track of
+	 * how many bytes the pairs will need on the wire */
+	for (;;) {
+		iter = netlbl_secattr_catmap_walk(secattr->attr.mls.cat,
+						  iter + 1);
+		if (iter < 0)
+			break;
+		/* a range starting at zero is sent without its low end */
+		cat_size += (iter == 0 ? 0 : sizeof(u16));
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+
+		iter = netlbl_secattr_catmap_walk_rng(secattr->attr.mls.cat,
+						      iter);
+		if (iter < 0)
+			return -EFAULT;
+		cat_size += sizeof(u16);
+		if (cat_size > net_cat_len)
+			return -ENOSPC;
+		array[array_cnt++] = iter;
+	}
+
+	/* emit the pairs back to front; @net_cat is a plain byte buffer with
+	 * no alignment guarantee, so store through the unaligned helper */
+	for (iter = 0; array_cnt > 0;) {
+		put_unaligned_be16(array[--array_cnt], &net_cat[iter]);
+		iter += 2;
+		array_cnt--;
+		if (array[array_cnt] != 0) {
+			put_unaligned_be16(array[array_cnt], &net_cat[iter]);
+			iter += 2;
+		}
+	}
+
+	return cat_size;
+}
+
+/**
+ * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category list in network/CIPSO format
+ * @net_cat_len: the length of the CIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CIPSO category list to the correct
+ * local MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def,
+				     const unsigned char *net_cat,
+				     u32 net_cat_len,
+				     struct netlbl_lsm_secattr *secattr)
+{
+	u32 off;
+
+	/* each list entry is a (high, low) pair of 16-bit categories */
+	for (off = 0; off < net_cat_len; off += 4) {
+		int err;
+		u16 high = get_unaligned_be16(&net_cat[off]);
+		u16 low = 0;
+
+		/* a trailing entry without a low end starts at zero */
+		if (off + 4 <= net_cat_len)
+			low = get_unaligned_be16(&net_cat[off + 2]);
+
+		err = netlbl_secattr_catmap_setrng(secattr->attr.mls.cat,
+						   low, high, GFP_ATOMIC);
+		if (err != 0)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Protocol Handling Functions
+ */
+
+/**
+ * cipso_v4_gentag_hdr - Generate a CIPSO option header
+ * @doi_def: the DOI definition
+ * @buf: the CIPSO option buffer
+ * @len: the total tag length in bytes, not including this header
+ *
+ * Description:
+ * Write a CIPSO header into the beginning of @buf: the option type, the
+ * total option length (header plus @len) and the DOI in network byte order.
+ *
+ */
+static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
+				unsigned char *buf,
+				u32 len)
+{
+	buf[0] = IPOPT_CIPSO;
+	buf[1] = CIPSO_V4_HDR_LEN + len;
+	/* the DOI sits at offset 2, which is not 32-bit aligned relative to
+	 * the start of the option, so use the unaligned store helper */
+	put_unaligned_be32(doi_def->doi, &buf[2]);
+}
+
+/**
+ * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The
+ * actual buffer length may be larger than the indicated size due to
+ * translation between host and network category bitmaps. Returns the size of
+ * the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	int rc;
+	u32 level;
+	u32 tag_len = 4;	/* tag size when no categories are present */
+
+	if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL))
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		/* the category bitmap follows the four byte tag header */
+		rc = cipso_v4_map_cat_rbm_hton(doi_def, secattr,
+					       &buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		/* This will send packets using the "optimized" format when
+		 * possible as specified in section 3.4.2.6 of the
+		 * CIPSO draft. */
+		if (cipso_v4_rbm_optfmt && rc > 0 && rc <= 10)
+			tag_len = 14;
+		else
+			tag_len = 4 + rc;
+	}
+
+	/* buffer[2] is not written here */
+	buffer[0] = CIPSO_V4_TAG_RBITMAP;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security
+ * attributes in @secattr. Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	int rc;
+	u32 level;
+	u8 tag_len = tag[1];
+
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing follows the four byte tag header, so no categories */
+	if (tag_len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def,
+				const struct netlbl_lsm_secattr *secattr,
+				unsigned char *buffer,
+				u32 buffer_len)
+{
+	int rc;
+	u32 level;
+	u32 tag_len = 4;	/* tag size when no categories are present */
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		/* the category list follows the four byte tag header */
+		rc = cipso_v4_map_cat_enum_hton(doi_def, secattr,
+						&buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		tag_len = 4 + rc;
+	}
+
+	/* buffer[2] is not written here */
+	buffer[0] = CIPSO_V4_TAG_ENUM;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO enumerated tag (tag type #2) and return the security
+ * attributes in @secattr. Return zero on success, negative values on
+ * failure.
+ *
+ */
+static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def,
+				  const unsigned char *tag,
+				  struct netlbl_lsm_secattr *secattr)
+{
+	int rc;
+	u32 level;
+	u8 tag_len = tag[1];
+
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing follows the four byte tag header, so no categories */
+	if (tag_len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the ranged tag, tag type #5. Returns the
+ * size of the tag on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	int rc;
+	u32 level;
+	u32 tag_len = 4;	/* tag size when no categories are present */
+
+	if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+		return -EPERM;
+
+	rc = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level);
+	if (rc != 0)
+		return rc;
+
+	if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+		/* the category ranges follow the four byte tag header */
+		rc = cipso_v4_map_cat_rng_hton(doi_def, secattr,
+					       &buffer[4], buffer_len - 4);
+		if (rc < 0)
+			return rc;
+
+		tag_len = 4 + rc;
+	}
+
+	/* buffer[2] is not written here */
+	buffer[0] = CIPSO_V4_TAG_RANGE;
+	buffer[1] = tag_len;
+	buffer[3] = level;
+
+	return tag_len;
+}
+
+/**
+ * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO ranged tag (tag type #5) and return the security attributes
+ * in @secattr. Return zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	int rc;
+	u32 level;
+	u8 tag_len = tag[1];
+
+	rc = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level);
+	if (rc != 0)
+		return rc;
+	secattr->attr.mls.lvl = level;
+	secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+	/* nothing follows the four byte tag header, so no categories */
+	if (tag_len <= 4)
+		return 0;
+
+	secattr->attr.mls.cat = netlbl_secattr_catmap_alloc(GFP_ATOMIC);
+	if (secattr->attr.mls.cat == NULL)
+		return -ENOMEM;
+
+	rc = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr);
+	if (rc != 0) {
+		netlbl_secattr_catmap_free(secattr->attr.mls.cat);
+		return rc;
+	}
+
+	secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+	return 0;
+}
+
+/**
+ * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard)
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @buffer: the option buffer
+ * @buffer_len: length of buffer in bytes
+ *
+ * Description:
+ * Generate a CIPSO option using the local tag. Returns the size of the tag
+ * on success, negative values on failure.
+ *
+ */
+static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def,
+			       const struct netlbl_lsm_secattr *secattr,
+			       unsigned char *buffer,
+			       u32 buffer_len)
+{
+	/* the local tag carries a secid, so one must be present */
+	if (!(secattr->flags & NETLBL_SECATTR_SECID))
+		return -EPERM;
+
+	buffer[0] = CIPSO_V4_TAG_LOCAL;
+	buffer[1] = CIPSO_V4_TAG_LOC_BLEN;
+	/* the secid is stored in host byte order; copy it byte-wise because
+	 * &buffer[2] carries no alignment guarantee for a 32-bit store */
+	memcpy(&buffer[2], &secattr->attr.secid, sizeof(u32));
+
+	return CIPSO_V4_TAG_LOC_BLEN;
+}
+
+/**
+ * cipso_v4_parsetag_loc - Parse a CIPSO local tag
+ * @doi_def: the DOI definition
+ * @tag: the CIPSO tag
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse a CIPSO local tag and return the security attributes in @secattr.
+ * Return zero on success, negative values on failure.
+ *
+ */
+static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
+				 const unsigned char *tag,
+				 struct netlbl_lsm_secattr *secattr)
+{
+	/* the secid is stored in host byte order right after the two byte
+	 * tag header; @tag points into option data at an arbitrary offset,
+	 * so copy it byte-wise instead of doing a 32-bit load */
+	memcpy(&secattr->attr.secid, &tag[2], sizeof(u32));
+	secattr->flags |= NETLBL_SECATTR_SECID;
+
+	return 0;
+}
+
+/**
+ * cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet the option arrived in, may be NULL if there is no packet
+ * @option: the start of the option, on error it is set to point to the error
+ *
+ * Description:
+ * This routine is called to validate a CIPSO option, it checks all of the
+ * fields to ensure that they are at least valid, see the draft snippet below
+ * for details. If the option is valid then a zero value is returned and
+ * the value of @option is unchanged. If the option is invalid then a
+ * non-zero value is returned and @option is adjusted to point to the
+ * offending portion of the option. The local (non-standard) tag is only
+ * accepted for packets received on a loopback device; when @skb is NULL the
+ * incoming device is unknown and the tag is rejected. From the IETF draft ...
+ *
+ * "If any field within the CIPSO options, such as the DOI identifier, is not
+ * recognized the IP datagram is discarded and an ICMP 'parameter problem'
+ * (type 12) is generated and returned. The ICMP code field is set to 'bad
+ * parameter' (code 0) and the pointer is set to the start of the CIPSO field
+ * that is unrecognized."
+ *
+ */
+int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
+{
+	unsigned char *opt = *option;
+	unsigned char *tag;
+	unsigned char opt_iter;
+	unsigned char err_offset = 0;
+	u8 opt_len;
+	u8 tag_len;
+	struct cipso_v4_doi *doi_def = NULL;
+	u32 tag_iter;
+
+	/* caller already checks for length values that are too large */
+	opt_len = opt[1];
+	if (opt_len < 8) {
+		err_offset = 1;
+		goto validate_return;
+	}
+
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2]));
+	if (doi_def == NULL) {
+		err_offset = 2;
+		goto validate_return_locked;
+	}
+
+	opt_iter = CIPSO_V4_HDR_LEN;
+	tag = opt + opt_iter;
+	while (opt_iter < opt_len) {
+		/* the tag type must be one of those listed in the DOI
+		 * definition; the list ends at the first invalid entry */
+		for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];)
+			if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID ||
+			    ++tag_iter == CIPSO_V4_TAG_MAXCNT) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+
+		/* the tag may not run past the end of the option */
+		tag_len = tag[1];
+		if (tag_len > (opt_len - opt_iter)) {
+			err_offset = opt_iter + 1;
+			goto validate_return_locked;
+		}
+
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			if (tag_len < CIPSO_V4_TAG_RBM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			/* We are already going to do all the verification
+			 * necessary at the socket layer so from our point of
+			 * view it is safe to turn these checks off (and less
+			 * work), however, the CIPSO draft says we should do
+			 * all the CIPSO validations here but it doesn't
+			 * really specify _exactly_ what we need to validate
+			 * ... so, just make it a sysctl tunable. */
+			if (cipso_v4_rbm_strictvalid) {
+				if (cipso_v4_map_lvl_valid(doi_def,
+							   tag[3]) < 0) {
+					err_offset = opt_iter + 3;
+					goto validate_return_locked;
+				}
+				if (tag_len > CIPSO_V4_TAG_RBM_BLEN &&
+				    cipso_v4_map_cat_rbm_valid(doi_def,
+							       &tag[4],
+							       tag_len - 4) < 0) {
+					err_offset = opt_iter + 4;
+					goto validate_return_locked;
+				}
+			}
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_ENUM_BLEN &&
+			    cipso_v4_map_cat_enum_valid(doi_def,
+							&tag[4],
+							tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			if (tag_len < CIPSO_V4_TAG_RNG_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+
+			if (cipso_v4_map_lvl_valid(doi_def,
+						   tag[3]) < 0) {
+				err_offset = opt_iter + 3;
+				goto validate_return_locked;
+			}
+			if (tag_len > CIPSO_V4_TAG_RNG_BLEN &&
+			    cipso_v4_map_cat_rng_valid(doi_def,
+						       &tag[4],
+						       tag_len - 4) < 0) {
+				err_offset = opt_iter + 4;
+				goto validate_return_locked;
+			}
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			/* This is a non-standard tag that we only allow for
+			 * local connections, so if the incoming interface is
+			 * not the loopback device drop the packet. Without a
+			 * packet there is no incoming interface to check, so
+			 * a NULL skb is rejected as well instead of being
+			 * dereferenced. */
+			if (skb == NULL || !(skb->dev->flags & IFF_LOOPBACK)) {
+				err_offset = opt_iter;
+				goto validate_return_locked;
+			}
+			if (tag_len != CIPSO_V4_TAG_LOC_BLEN) {
+				err_offset = opt_iter + 1;
+				goto validate_return_locked;
+			}
+			break;
+		default:
+			err_offset = opt_iter;
+			goto validate_return_locked;
+		}
+
+		tag += tag_len;
+		opt_iter += tag_len;
+	}
+
+validate_return_locked:
+	rcu_read_unlock();
+validate_return:
+	/* err_offset is zero on success, leaving *option unchanged */
+	*option = opt + err_offset;
+	return err_offset;
+}
+
+/**
+ * cipso_v4_error - Send the correct response for a bad packet
+ * @skb: the packet
+ * @error: the error code
+ * @gateway: CIPSO gateway flag
+ *
+ * Description:
+ * Based on the error code given in @error, send an ICMP error message back to
+ * the originating host. From the IETF draft ...
+ *
+ * "If the contents of the CIPSO [option] are valid but the security label is
+ * outside of the configured host or port label range, the datagram is
+ * discarded and an ICMP 'destination unreachable' (type 3) is generated and
+ * returned. The code field of the ICMP is set to 'communication with
+ * destination network administratively prohibited' (code 9) or to
+ * 'communication with destination host administratively prohibited'
+ * (code 10). The value of the code is dependent on whether the originator
+ * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The
+ * recipient of the ICMP message MUST be able to handle either value. The
+ * same procedure is performed if a CIPSO [option] can not be added to an
+ * IP packet because it is too large to fit in the IP options area."
+ *
+ * "If the error is triggered by receipt of an ICMP message, the message is
+ * discarded and no response is permitted (consistent with general ICMP
+ * processing rules)."
+ *
+ */
+void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
+{
+	int icmp_code;
+
+	/* stay silent for ICMP packets and for every error other than
+	 * -EACCES */
+	if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
+		return;
+
+	/* gateways report a prohibited network, hosts a prohibited host */
+	icmp_code = gateway ? ICMP_NET_ANO : ICMP_HOST_ANO;
+	icmp_send(skb, ICMP_DEST_UNREACH, icmp_code, 0);
+}
+
+/**
+ * cipso_v4_genopt - Generate a CIPSO option
+ * @buf: the option buffer
+ * @buf_len: the size of @buf in bytes
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CIPSO option using the DOI definition and security attributes
+ * passed to the function. Returns the length of the option on success and
+ * negative values on failure.
+ *
+ */
+static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
+			   const struct cipso_v4_doi *doi_def,
+			   const struct netlbl_lsm_secattr *secattr)
+{
+	int tag_ret;
+	u32 idx = 0;
+	unsigned char *tag_buf;
+	u32 tag_buf_len;
+
+	if (buf_len <= CIPSO_V4_HDR_LEN)
+		return -ENOSPC;
+
+	/* the tag is generated right behind the option header */
+	tag_buf = &buf[CIPSO_V4_HDR_LEN];
+	tag_buf_len = buf_len - CIPSO_V4_HDR_LEN;
+
+	/* XXX - This code assumes only one tag per CIPSO option which isn't
+	 * really a good assumption to make but since we only support the MAC
+	 * tags right now it is a safe assumption. */
+	/* try the DOI's tag types in order until one of them can encode
+	 * @secattr, starting from a zeroed buffer on every attempt */
+	do {
+		memset(buf, 0, buf_len);
+		switch (doi_def->tags[idx]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			tag_ret = cipso_v4_gentag_rbm(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			tag_ret = cipso_v4_gentag_enum(doi_def, secattr,
+						       tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			tag_ret = cipso_v4_gentag_rng(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			tag_ret = cipso_v4_gentag_loc(doi_def, secattr,
+						      tag_buf, tag_buf_len);
+			break;
+		default:
+			return -EPERM;
+		}
+
+		idx++;
+	} while (tag_ret < 0 &&
+		 idx < CIPSO_V4_TAG_MAXCNT &&
+		 doi_def->tags[idx] != CIPSO_V4_TAG_INVALID);
+
+	if (tag_ret < 0)
+		return tag_ret;
+
+	cipso_v4_gentag_hdr(doi_def, buf, tag_ret);
+	return CIPSO_V4_HDR_LEN + tag_ret;
+}
+
+/**
+ * cipso_v4_sock_setattr - Add a CIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int cipso_v4_sock_setattr(struct sock *sk,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *old, *opt = NULL;
+ struct inet_sock *sk_inet;
+ struct inet_connection_sock *sk_conn;
+
+ /* In the case of sock_create_lite(), the sock->sk field is not
+ * defined yet but it is not a problem as the only users of these
+ * "lite" PF_INET sockets are functions which do an accept() call
+ * afterwards so we will label the socket as part of the accept(). */
+ if (sk == NULL)
+ return 0;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (buf == NULL) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto socket_setattr_failure;
+ /* from here on buf_len is the length of the generated option */
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ /* round the option length up to a multiple of four bytes; the
+ * padding stays zero thanks to kzalloc() */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (opt == NULL) {
+ ret_val = -ENOMEM;
+ goto socket_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ /* the CIPSO option is placed at the very start of the option data;
+ * the offset is counted from the beginning of the IP header */
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ sk_inet = inet_sk(sk);
+
+ old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ /* connection sockets account for the option length in
+ * icsk_ext_hdr_len: replace the old length with the new one and
+ * let the socket recompute its MSS */
+ if (sk_inet->is_icsk) {
+ sk_conn = inet_csk(sk);
+ if (old)
+ sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
+ sk_conn->icsk_ext_hdr_len += opt->opt.optlen;
+ sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+ }
+ /* publish the new options, any previous options are replaced as a
+ * whole and released through kfree_rcu() */
+ rcu_assign_pointer(sk_inet->inet_opt, opt);
+ if (old)
+ kfree_rcu(old, rcu);
+
+ return 0;
+
+socket_setattr_failure:
+ /* either pointer may still be NULL here */
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int cipso_v4_req_setattr(struct request_sock *req,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ unsigned char *buf = NULL;
+ u32 buf_len;
+ u32 opt_len;
+ struct ip_options_rcu *opt = NULL;
+ struct inet_request_sock *req_inet;
+
+ /* We allocate the maximum CIPSO option size here so we are probably
+ * being a little wasteful, but it makes our life _much_ easier later
+ * on and after all we are only talking about 40 bytes. */
+ buf_len = CIPSO_V4_OPT_LEN_MAX;
+ buf = kmalloc(buf_len, GFP_ATOMIC);
+ if (buf == NULL) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ goto req_setattr_failure;
+ /* from here on buf_len is the length of the generated option */
+ buf_len = ret_val;
+
+ /* We can't use ip_options_get() directly because it makes a call to
+ * ip_options_get_alloc() which allocates memory with GFP_KERNEL and
+ * we won't always have CAP_NET_RAW even though we _always_ want to
+ * set the IPOPT_CIPSO option. */
+ /* round the option length up to a multiple of four bytes; the
+ * padding stays zero thanks to kzalloc() */
+ opt_len = (buf_len + 3) & ~3;
+ opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC);
+ if (opt == NULL) {
+ ret_val = -ENOMEM;
+ goto req_setattr_failure;
+ }
+ memcpy(opt->opt.__data, buf, buf_len);
+ opt->opt.optlen = opt_len;
+ /* the CIPSO option is placed at the very start of the option data;
+ * the offset is counted from the beginning of the IP header */
+ opt->opt.cipso = sizeof(struct iphdr);
+ kfree(buf);
+ buf = NULL;
+
+ req_inet = inet_rsk(req);
+ /* install the new options; after the exchange 'opt' refers to the
+ * options that were attached before, if any */
+ opt = xchg(&req_inet->opt, opt);
+ if (opt)
+ kfree_rcu(opt, rcu);
+
+ return 0;
+
+req_setattr_failure:
+ /* either pointer may still be NULL here */
+ kfree(buf);
+ kfree(opt);
+ return ret_val;
+}
+
+/**
+ * cipso_v4_delopt - Delete the CIPSO option from a set of IP options
+ * @opt_ptr: IP option pointer
+ *
+ * Description:
+ * Deletes the CIPSO IP option from a set of IP options and makes the necessary
+ * adjustments to the IP option structure. Returns the number of bytes by
+ * which the total IP option length was reduced.
+ *
+ */
+static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
+{
+ int hdr_delta = 0;
+ struct ip_options_rcu *opt = *opt_ptr;
+
+ /* other options are present next to CIPSO, so the CIPSO option has to
+ * be cut out of the option data in place */
+ if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+ u8 cipso_len;
+ u8 cipso_off;
+ unsigned char *cipso_ptr;
+ int iter;
+ int optlen_new;
+
+ /* opt.cipso is counted from the start of the IP header, the
+ * option data starts right after the fixed header */
+ cipso_off = opt->opt.cipso - sizeof(struct iphdr);
+ cipso_ptr = &opt->opt.__data[cipso_off];
+ cipso_len = cipso_ptr[1];
+
+ /* options located behind the CIPSO option move towards the
+ * front by the length of the removed option */
+ if (opt->opt.srr > opt->opt.cipso)
+ opt->opt.srr -= cipso_len;
+ if (opt->opt.rr > opt->opt.cipso)
+ opt->opt.rr -= cipso_len;
+ if (opt->opt.ts > opt->opt.cipso)
+ opt->opt.ts -= cipso_len;
+ if (opt->opt.router_alert > opt->opt.cipso)
+ opt->opt.router_alert -= cipso_len;
+ opt->opt.cipso = 0;
+
+ /* close the gap left by the CIPSO option */
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ opt->opt.optlen - cipso_off - cipso_len);
+
+ /* determining the new total option length is tricky because of
+ * the padding necessary, the only thing i can think to do at
+ * this point is walk the options one-by-one, skipping the
+ * padding at the end to determine the actual option size and
+ * from there we can determine the new total option length */
+ /* NOTE(review): the walk relies on every non-NOP option having a
+ * non-zero length byte, otherwise 'iter' would not advance;
+ * presumably the options were validated when they were set -
+ * confirm */
+ iter = 0;
+ optlen_new = 0;
+ while (iter < opt->opt.optlen)
+ if (opt->opt.__data[iter] != IPOPT_NOP) {
+ iter += opt->opt.__data[iter + 1];
+ optlen_new = iter;
+ } else
+ iter++;
+ /* hdr_delta = old padded length - new padded length */
+ hdr_delta = opt->opt.optlen;
+ opt->opt.optlen = (optlen_new + 3) & ~3;
+ hdr_delta -= opt->opt.optlen;
+ } else {
+ /* only the cipso option was present on the socket so we can
+ * remove the entire option struct */
+ *opt_ptr = NULL;
+ hdr_delta = opt->opt.optlen;
+ kfree_rcu(opt, rcu);
+ }
+
+ return hdr_delta;
+}
+
+/**
+ * cipso_v4_sock_delattr - Delete the CIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CIPSO option from a socket, if present.
+ *
+ */
+void cipso_v4_sock_delattr(struct sock *sk)
+{
+	struct inet_sock *sk_inet = inet_sk(sk);
+	struct inet_connection_sock *sk_conn;
+	struct ip_options_rcu *opt;
+	int hdr_delta;
+
+	/* nothing to do unless a CIPSO option is attached to the socket */
+	opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+	if (opt == NULL || opt->opt.cipso == 0)
+		return;
+
+	hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+	if (!sk_inet->is_icsk || hdr_delta <= 0)
+		return;
+
+	/* the options got shorter, so connection sockets have to update
+	 * their header length accounting and recompute the MSS */
+	sk_conn = inet_csk(sk);
+	sk_conn->icsk_ext_hdr_len -= hdr_delta;
+	sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
+}
+
+/**
+ * cipso_v4_req_delattr - Delete the CIPSO option from a request socket
+ * @req: the request socket
+ *
+ * Description:
+ * Removes the CIPSO option from a request socket, if present.
+ *
+ */
+void cipso_v4_req_delattr(struct request_sock *req)
+{
+	struct inet_request_sock *req_inet = inet_rsk(req);
+	struct ip_options_rcu *opt = req_inet->opt;
+
+	/* only act when options exist and include a CIPSO option; the
+	 * length delta returned by cipso_v4_delopt() is not needed here */
+	if (opt != NULL && opt->opt.cipso != 0)
+		cipso_v4_delopt(&req_inet->opt);
+}
+
+/**
+ * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions
+ * @cipso: the CIPSO v4 option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @cipso and return the security attributes in @secattr. Returns zero
+ * on success and negative values on failure.
+ *
+ */
+static int cipso_v4_getattr(const unsigned char *cipso,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	int rc = -ENOMSG;
+	u32 doi;
+	struct cipso_v4_doi *doi_def;
+	const unsigned char *tag = &cipso[6];
+
+	/* a cache hit fills in @secattr and saves us the parsing */
+	if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0)
+		return 0;
+
+	doi = get_unaligned_be32(&cipso[2]);
+	rcu_read_lock();
+	doi_def = cipso_v4_doi_search(doi);
+	if (doi_def != NULL) {
+		/* XXX - This code assumes only one tag per CIPSO option
+		 * which isn't really a good assumption to make but since we
+		 * only support the MAC tags right now it is a safe
+		 * assumption. */
+		switch (tag[0]) {
+		case CIPSO_V4_TAG_RBITMAP:
+			rc = cipso_v4_parsetag_rbm(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_ENUM:
+			rc = cipso_v4_parsetag_enum(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_RANGE:
+			rc = cipso_v4_parsetag_rng(doi_def, tag, secattr);
+			break;
+		case CIPSO_V4_TAG_LOCAL:
+			rc = cipso_v4_parsetag_loc(doi_def, tag, secattr);
+			break;
+		}
+		/* unknown tag types leave rc at -ENOMSG */
+		if (rc == 0)
+			secattr->type = NETLBL_NLTYPE_CIPSOV4;
+	}
+	rcu_read_unlock();
+
+	return rc;
+}
+
+/**
+ * cipso_v4_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CIPSO option attached to the sock and if
+ * there is return the CIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+	int rc = -ENOMSG;
+	struct ip_options_rcu *opt;
+
+	rcu_read_lock();
+	opt = rcu_dereference(inet_sk(sk)->inet_opt);
+	if (opt != NULL && opt->opt.cipso != 0) {
+		const unsigned char *cipso_ptr;
+
+		/* opt.cipso is counted from the start of the IP header
+		 * while the option data begins right after it */
+		cipso_ptr = opt->opt.__data +
+			    opt->opt.cipso -
+			    sizeof(struct iphdr);
+		rc = cipso_v4_getattr(cipso_ptr, secattr);
+	}
+	rcu_read_unlock();
+
+	return rc;
+}
+
+/**
+ * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CIPSO option on the given packet based on the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_setattr(struct sk_buff *skb,
+ const struct cipso_v4_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct iphdr *iph;
+ struct ip_options *opt = &IPCB(skb)->opt;
+ unsigned char buf[CIPSO_V4_OPT_LEN_MAX];
+ u32 buf_len = CIPSO_V4_OPT_LEN_MAX;
+ u32 opt_len;
+ int len_delta;
+
+ ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+ /* buf_len is now the generated option length, opt_len the same value
+ * rounded up to a multiple of four bytes */
+ buf_len = ret_val;
+ opt_len = (buf_len + 3) & ~3;
+
+ /* we overwrite any existing options to ensure that we have enough
+ * room for the CIPSO option, the reason is that we _need_ to guarantee
+ * that the security label is applied to the packet - we do the same
+ * thing when using the socket options and it hasn't caused a problem,
+ * if we need to we can always revisit this choice later */
+
+ /* positive when the option area has to grow, negative when the new
+ * option is shorter than the options already in the packet */
+ len_delta = opt_len - opt->optlen;
+ /* if we don't ensure enough headroom we could panic on the skb_push()
+ * call below so make sure we have enough, we are also "mangling" the
+ * packet so we should probably do a copy-on-write call anyway */
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta > 0) {
+ /* we assume that the header + opt->optlen have already been
+ * "pushed" in ip_options_build() or similar */
+ /* grow the packet at the front and move the existing IP header
+ * (iph->ihl << 2 bytes) to the new start of the packet */
+ iph = ip_hdr(skb);
+ skb_push(skb, len_delta);
+ memmove((char *)iph - len_delta, iph, iph->ihl << 2);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ } else if (len_delta < 0) {
+ /* the packet keeps its size, wipe the old option area with
+ * NOPs before the new option is copied in below */
+ iph = ip_hdr(skb);
+ memset(iph + 1, IPOPT_NOP, opt->optlen);
+ } else
+ iph = ip_hdr(skb);
+
+ /* forget everything that was recorded about the old options */
+ if (opt->optlen > 0)
+ memset(opt, 0, sizeof(*opt));
+ opt->optlen = opt_len;
+ opt->cipso = sizeof(struct iphdr);
+ opt->is_changed = 1;
+
+ /* we have to do the following because we are being called from a
+ * netfilter hook which means the packet already has had the header
+ * fields populated and the checksum calculated - yes this means we
+ * are doing more work than needed but we do it to keep the core
+ * stack clean and tidy */
+ memcpy(iph + 1, buf, buf_len);
+ /* zero the padding between the option and the four byte boundary */
+ if (opt_len > buf_len)
+ memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
+ /* NOTE(review): for a negative len_delta the header length is
+ * reduced here although no bytes were removed from the packet -
+ * confirm this is intended */
+ if (len_delta != 0) {
+ iph->ihl = 5 + (opt_len >> 2);
+ iph->tot_len = htons(skb->len);
+ }
+ ip_send_check(iph);
+
+ return 0;
+}
+
+/**
+ * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_delattr(struct sk_buff *skb)
+{
+	struct ip_options *opt = &IPCB(skb)->opt;
+	struct iphdr *iph;
+	unsigned char *cipso_ptr;
+	int rc;
+
+	/* no CIPSO option recorded for this packet */
+	if (opt->cipso == 0)
+		return 0;
+
+	/* the packet is about to be modified, so get a private copy */
+	rc = skb_cow(skb, skb_headroom(skb));
+	if (rc < 0)
+		return rc;
+
+	/* overwrite the option with no-ops rather than removing it, that
+	 * way the packet size stays the same and only the checksum needs
+	 * to be recomputed */
+	iph = ip_hdr(skb);
+	cipso_ptr = (unsigned char *)iph + opt->cipso;
+	memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+
+	opt->cipso = 0;
+	opt->is_changed = 1;
+	ip_send_check(iph);
+
+	return 0;
+}
+
+/**
+ * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option
+ * @skb: the packet
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Parse the given packet's CIPSO option and return the security attributes.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int cipso_v4_skbuff_getattr(const struct sk_buff *skb,
+			    struct netlbl_lsm_secattr *secattr)
+{
+	/* locate the CIPSO option inside the packet and parse it */
+	const unsigned char *cipso_ptr = CIPSO_V4_OPTPTR(skb);
+
+	return cipso_v4_getattr(cipso_ptr, secattr);
+}
+
+/*
+ * Setup Functions
+ */
+
+/**
+ * cipso_v4_init - Initialize the CIPSO module
+ *
+ * Description:
+ * Initialize the CIPSO module and prepare it for use. Returns zero on success
+ * and negative values on failure.
+ *
+ */
+static int __init cipso_v4_init(void)
+{
+	int err = cipso_v4_cache_init();
+
+	/* a failure to set up the cache is treated as fatal */
+	if (err != 0)
+		panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n",
+		      err);
+
+	return 0;
+}
+
+subsys_initcall(cipso_v4_init);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 00000000..424fafbc
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,87 @@
+/*
+ * common UDP/RAW code
+ * Linux INET implementation
+ *
+ * Authors:
+ * Hideaki YOSHIFUJI
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/**
+ * ip4_datagram_connect - connect a datagram socket to an IPv4 peer
+ * @sk: socket to connect
+ * @uaddr: peer address, interpreted as a struct sockaddr_in
+ * @addr_len: length of @uaddr in bytes
+ *
+ * Resolves a route to the peer with ip_route_connect(), fills in the source
+ * and receive addresses if they are still unset (rehashing the socket when
+ * the protocol provides ->rehash), records the peer address/port, marks the
+ * socket TCP_ESTABLISHED and caches the route on the socket.
+ *
+ * Returns 0 on success, -EINVAL if @addr_len is too short, -EAFNOSUPPORT if
+ * the family is not AF_INET, -EACCES for a broadcast route on a socket
+ * without SOCK_BROADCAST, or the error reported by ip_route_connect().
+ */
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+	struct flowi4 *fl4;
+	struct rtable *rt;
+	__be32 saddr;
+	int oif;
+	int err;
+
+
+	if (addr_len < sizeof(*usin))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	lock_sock(sk);
+
+	/*
+	 * Drop the cached route only while holding the socket lock, so the
+	 * reset cannot race with another context using or replacing
+	 * sk's cached dst.
+	 */
+	sk_dst_reset(sk);
+
+	oif = sk->sk_bound_dev_if;
+	saddr = inet->inet_saddr;
+	if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
+		/* For multicast peers fall back to the multicast defaults. */
+		if (!oif)
+			oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+	fl4 = &inet->cork.fl.u.ip4;
+	rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
+			      RT_CONN_FLAGS(sk), oif,
+			      sk->sk_protocol,
+			      inet->inet_sport, usin->sin_port, sk, true);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		if (err == -ENETUNREACH)
+			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+		goto out;
+	}
+
+	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+		ip_rt_put(rt);
+		err = -EACCES;
+		goto out;
+	}
+	if (!inet->inet_saddr)
+		inet->inet_saddr = fl4->saddr;	/* Update source address */
+	if (!inet->inet_rcv_saddr) {
+		inet->inet_rcv_saddr = fl4->saddr;
+		if (sk->sk_prot->rehash)
+			sk->sk_prot->rehash(sk);
+	}
+	inet->inet_daddr = fl4->daddr;
+	inet->inet_dport = usin->sin_port;
+	sk->sk_state = TCP_ESTABLISHED;
+	inet->inet_id = jiffies;
+
+	sk_dst_set(sk, &rt->dst);
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+}
+EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 00000000..8a9aab37
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1851 @@
+/*
+ * NET3 IP device support routines.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Derived from the IP parts of dev.c 1.0.19
+ * Authors: Ross Biro
+ * Fred N. van Kempen,
+ * Mark Evans,
+ *
+ * Additional Authors:
+ * Alan Cox,
+ * Alexey Kuznetsov,
+ *
+ * Changes:
+ * Alexey Kuznetsov: pa_* fields are replaced with ifaddr
+ * lists.
+ * Cyrus Durgin: updated for kmod
+ * Matthias Andree: in devinet_ioctl, compare label and
+ * address (4.4BSD alias style support),
+ * fall back to comparing just the label
+ * if no match found.
+ */
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef CONFIG_SYSCTL
+#include
+#endif
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "fib_lookup.h"
+/*
+ * File-local devconf instance.  Slots in .data are indexed by the
+ * IPV4_DEVCONF_* constant minus one; the four slots listed here start at 1
+ * and every slot not listed is implicitly zero.
+ */
+static struct ipv4_devconf ipv4_devconf = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ },
+};
+/*
+ * File-local devconf instance with five slots preset to 1 (the redirect and
+ * shared-media ones plus ACCEPT_SOURCE_ROUTE); all other slots are zero.
+ * NOTE(review): presumably the template reached through
+ * net->ipv4.devconf_dflt that inetdev_init() copies into new in_devices;
+ * the wiring is not visible in this part of the file - confirm.
+ */
+static struct ipv4_devconf ipv4_devconf_dflt = {
+ .data = {
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ },
+};
+
+/*
+ * Read devconf attribute @attr from the default devconf block of namespace
+ * @net.  @net is parenthesized so that any pointer-valued expression can be
+ * passed without operator-precedence surprises.
+ */
+#define IPV4_DEVCONF_DFLT(net, attr) \
+	IPV4_DEVCONF((*(net)->ipv4.devconf_dflt), attr)
+/*
+ * Attribute policy passed to nlmsg_parse() when IFA_* netlink attributes are
+ * decoded in this file: three 32-bit values and a string label limited to
+ * IFNAMSIZ - 1 characters.
+ */
+static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
+ [IFA_LOCAL] = { .type = NLA_U32 },
+ [IFA_ADDRESS] = { .type = NLA_U32 },
+ [IFA_BROADCAST] = { .type = NLA_U32 },
+ [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+};
+
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value. So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ *
+ * inet_addr_lst holds the buckets of in_ifaddr entries, linked through their
+ * ->hash member.  inet_addr_hash_lock is taken around every insertion and
+ * removal; __ip_dev_find() walks a bucket under rcu_read_lock() instead.
+ */
+#define IN4_ADDR_HSIZE 256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+/*
+ * Bucket index for @addr within namespace @net: the address is mixed with an
+ * 8-bit hash of the namespace pointer, then all four bytes of the mix are
+ * XOR-folded together and masked to the table size.
+ */
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+	u32 mix = (__force u32) addr ^ hash_ptr(net, 8);
+
+	/* Fold the upper half onto the lower half, then byte 1 onto byte 0. */
+	mix ^= mix >> 16;
+	mix ^= mix >> 8;
+
+	return mix & (IN4_ADDR_HSIZE - 1);
+}
+
+/* Link @ifa into the address hash, bucketed by its local address and @net. */
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+	struct hlist_head *bucket;
+
+	bucket = &inet_addr_lst[inet_addr_hash(net, ifa->ifa_local)];
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, bucket);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/* Unlink @ifa from the address hash while holding the hash lock. */
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	struct hlist_node *entry = &ifa->hash;
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(entry);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ *
+ * The inet_addr_lst bucket chosen by inet_addr_hash() is searched first,
+ * skipping entries whose device belongs to a different namespace.  If that
+ * finds nothing, the RT_TABLE_LOCAL FIB table is looked up and an RTN_LOCAL
+ * result supplies the device.  Returns the device, or NULL when both
+ * lookups come up empty.
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ unsigned int hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+ struct hlist_node *node; /* cursor for hlist_for_each_entry_rcu() */
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ if (ifa->ifa_local == addr) {
+ result = dev;
+ break;
+ }
+ }
+ if (!result) {
+ struct flowi4 fl4 = { .daddr = addr };
+ struct fib_result res = { 0 };
+ struct fib_table *local;
+
+ /* Fallback to FIB local table so that communication
+ * over loopback subnets work.
+ */
+ local = fib_get_table(net, RT_TABLE_LOCAL);
+ if (local &&
+ !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
+ res.type == RTN_LOCAL)
+ result = FIB_RES_DEV(res);
+ }
+ if (result && devref)
+ dev_hold(result); /* taken before leaving the RCU section */
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+/*
+ * Forward declarations used by the code below, the inetaddr notifier chain,
+ * and empty devinet_sysctl_{register,unregister}() stubs for builds without
+ * CONFIG_SYSCTL.
+ */
+static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
+
+static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); /* NETDEV_UP/NETDEV_DOWN per address */
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *idev);
+static void devinet_sysctl_unregister(struct in_device *idev);
+#else
+static inline void devinet_sysctl_register(struct in_device *idev)
+{
+}
+static inline void devinet_sysctl_unregister(struct in_device *idev)
+{
+}
+#endif
+
+/* Locks all the inet devices. */
+
+/* Allocate a zero-filled in_ifaddr with GFP_KERNEL; NULL on failure. */
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+	struct in_ifaddr *ifa;
+
+	ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
+
+	return ifa;
+}
+
+/*
+ * RCU callback queued by inet_free_ifa(): drops the in_device reference held
+ * through ifa->ifa_dev (if any) and frees the address.
+ */
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+	struct in_device *owner = ifa->ifa_dev;
+
+	if (owner)
+		in_dev_put(owner);
+
+	kfree(ifa);
+}
+
+/* Defer freeing of @ifa to inet_rcu_free_ifa() via call_rcu(). */
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
+{
+	struct rcu_head *head = &ifa->rcu_head;
+
+	call_rcu(head, inet_rcu_free_ifa);
+}
+
+/*
+ * Final teardown of an in_device: warns if address or multicast lists are
+ * still populated, drops the reference on the net_device, and frees @idev -
+ * unless it was never marked dead, in which case only an error is logged.
+ */
+void in_dev_finish_destroy(struct in_device *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	WARN_ON(idev->ifa_list);
+	WARN_ON(idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+	printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
+	       idev, dev ? dev->name : "NIL");
+#endif
+	dev_put(dev);
+
+	if (idev->dead) {
+		kfree(idev);
+		return;
+	}
+
+	pr_err("Freeing alive in_device %p\n", idev);
+}
+EXPORT_SYMBOL(in_dev_finish_destroy);
+
+/*
+ * Create the in_device for @dev (RTNL held): copy the namespace's default
+ * devconf, allocate ARP parameters, take the device and in_device
+ * references, register sysctls and multicast state, and finally publish the
+ * result through dev->ip_ptr.  Returns NULL if either allocation fails.
+ */
+static struct in_device *inetdev_init(struct net_device *dev)
+{
+	struct in_device *idev;
+
+	ASSERT_RTNL();
+
+	idev = kzalloc(sizeof(*idev), GFP_KERNEL);
+	if (!idev)
+		return NULL;
+
+	memcpy(&idev->cnf, dev_net(dev)->ipv4.devconf_dflt,
+	       sizeof(idev->cnf));
+	idev->cnf.sysctl = NULL;
+	idev->dev = dev;
+
+	idev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+	if (!idev->arp_parms) {
+		kfree(idev);
+		return NULL;
+	}
+
+	if (IPV4_DEVCONF(idev->cnf, FORWARDING))
+		dev_disable_lro(dev);
+
+	/* Reference held through idev->dev */
+	dev_hold(dev);
+	/* Reference held through dev->ip_ptr, assigned at the end */
+	in_dev_hold(idev);
+
+	devinet_sysctl_register(idev);
+	ip_mc_init_dev(idev);
+	if (dev->flags & IFF_UP)
+		ip_mc_up(idev);
+
+	/* Packets can arrive as soon as ip_ptr is set, so publish last. */
+	rcu_assign_pointer(dev->ip_ptr, idev);
+
+	return idev;
+}
+
+/* RCU callback queued by inetdev_destroy(): drop one in_device reference. */
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+	struct in_device *in_dev;
+
+	in_dev = container_of(head, struct in_device, rcu_head);
+	in_dev_put(in_dev);
+}
+
+/*
+ * Tear down @in_dev (RTNL held): mark it dead, destroy multicast state,
+ * delete and free every address, clear dev->ip_ptr, release sysctl/ARP
+ * state and drop the in_device reference from an RCU callback.
+ */
+static void inetdev_destroy(struct in_device *in_dev)
+{
+	struct net_device *dev;
+	struct in_ifaddr *head;
+
+	ASSERT_RTNL();
+
+	dev = in_dev->dev;
+
+	in_dev->dead = 1;
+
+	ip_mc_destroy_dev(in_dev);
+
+	/* Keep removing the list head until the list is empty. */
+	for (head = in_dev->ifa_list; head; head = in_dev->ifa_list) {
+		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+		inet_free_ifa(head);
+	}
+
+	RCU_INIT_POINTER(dev->ip_ptr, NULL);
+
+	devinet_sysctl_unregister(in_dev);
+	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+	arp_ifdown(dev);
+
+	call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+/*
+ * Returns 1 if some primary address of @in_dev has a subnet that contains
+ * @a and - when @b is non-zero - @b as well; returns 0 otherwise.
+ */
+int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
+{
+	int onlink = 0;
+
+	rcu_read_lock();
+	for_primary_ifa(in_dev) {
+		if (!inet_ifa_match(a, ifa))
+			continue;
+		if (b && !inet_ifa_match(b, ifa))
+			continue;
+		onlink = 1;
+		break;
+	} endfor_ifa(in_dev);
+	rcu_read_unlock();
+
+	return onlink;
+}
+/*
+ * __inet_del_ifa - unlink the address that *@ifap points at from @in_dev
+ * @in_dev: device whose address list is modified
+ * @ifap: the list link (pointer slot) currently pointing at the victim
+ * @destroy: non-zero to also free the victim through inet_free_ifa()
+ * @nlh: passed through to rtmsg_ifa() for every notification sent
+ * @pid: passed through to rtmsg_ifa() for every notification sent
+ *
+ * Runs under RTNL.  When a primary address goes away, its secondaries in
+ * the same subnet are either deleted too or, if IN_DEV_PROMOTE_SECONDARIES
+ * is set, the first one is promoted to primary and the remaining ones get
+ * their routes re-added.
+ */
+static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 pid)
+{
+ struct in_ifaddr *promote = NULL; /* secondary chosen for promotion */
+ struct in_ifaddr *ifa, *ifa1 = *ifap; /* ifa1: the address being deleted */
+ struct in_ifaddr *last_prim = in_dev->ifa_list; /* insert point for promote */
+ struct in_ifaddr *prev_prom = NULL; /* last entry skipped before promote */
+ int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
+
+ ASSERT_RTNL();
+
+ /* 1. Deleting primary ifaddr forces deletion all secondaries
+ * unless alias promotion is set
+ **/
+
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+ struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+ while ((ifa = *ifap1) != NULL) {
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
+ ifa1->ifa_scope <= ifa->ifa_scope)
+ last_prim = ifa;
+
+ /* Skip primaries and entries outside ifa1's subnet. */
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+ ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa)) {
+ ifap1 = &ifa->ifa_next;
+ prev_prom = ifa;
+ continue;
+ }
+
+ if (!do_promote) {
+ inet_hash_remove(ifa);
+ *ifap1 = ifa->ifa_next;
+
+ rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_DOWN, ifa);
+ inet_free_ifa(ifa);
+ } else {
+ promote = ifa;
+ break;
+ }
+ }
+ }
+
+ /* On promotion all secondaries from subnet are changing
+ * the primary IP, we must remove all their routes silently
+ * and later to add them back with new prefsrc. Do this
+ * while all addresses are on the device list.
+ */
+ for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa))
+ fib_del_ifaddr(ifa, ifa1);
+ }
+
+ /* 2. Unlink it */
+
+ *ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
+
+ /* 3. Announce address deletion */
+
+ /* Send message first, then call notifier.
+ At first sight, FIB update triggered by notifier
+ will refer to already deleted ifaddr, that could confuse
+ netlink listeners. It is not true: look, gated sees
+ that route deleted and if it still thinks that ifaddr
+ is valid, it will try to restore deleted routes... Grr.
+ So that, this order is correct.
+ */
+ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, pid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+
+ if (promote) {
+ struct in_ifaddr *next_sec = promote->ifa_next; /* saved before relinking */
+
+ if (prev_prom) {
+ /* Move promote so it directly follows last_prim. */
+ prev_prom->ifa_next = promote->ifa_next;
+ promote->ifa_next = last_prim->ifa_next;
+ last_prim->ifa_next = promote;
+ }
+
+ promote->ifa_flags &= ~IFA_F_SECONDARY;
+ rtmsg_ifa(RTM_NEWADDR, promote, nlh, pid);
+ blocking_notifier_call_chain(&inetaddr_chain,
+ NETDEV_UP, promote);
+ /* Re-add routes of the other addresses in ifa1's subnet. */
+ for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ if (ifa1->ifa_mask != ifa->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, ifa))
+ continue;
+ fib_add_ifaddr(ifa);
+ }
+
+ }
+ if (destroy)
+ inet_free_ifa(ifa1);
+}
+
+/* __inet_del_ifa() for callers that have no netlink request to report to. */
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy)
+{
+	struct nlmsghdr *no_request = NULL;
+
+	__inet_del_ifa(in_dev, ifap, destroy, no_request, 0);
+}
+/*
+ * __inet_insert_ifa - add @ifa to the address list of ifa->ifa_dev
+ * @ifa: address to insert; freed here on every path that does not insert it
+ * @nlh: passed through to rtmsg_ifa() for the RTM_NEWADDR notification
+ * @pid: passed through to rtmsg_ifa() for the RTM_NEWADDR notification
+ *
+ * Runs under RTNL.  Returns 0 on success (also when ifa_local is zero, in
+ * which case @ifa is freed and nothing is inserted), -EEXIST if the same
+ * local address already exists in the subnet, or -EINVAL if an address in
+ * the same subnet has a different scope.
+ */
+static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+ u32 pid)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+ ASSERT_RTNL();
+
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
+ ifa->ifa_flags &= ~IFA_F_SECONDARY;
+ last_primary = &in_dev->ifa_list;
+
+ /* After this loop ifap points at the terminating NULL link (list tail). */
+ for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+ ifap = &ifa1->ifa_next) {
+ if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+ ifa->ifa_scope <= ifa1->ifa_scope)
+ last_primary = &ifa1->ifa_next;
+ if (ifa1->ifa_mask == ifa->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, ifa)) {
+ if (ifa1->ifa_local == ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return -EEXIST;
+ }
+ if (ifa1->ifa_scope != ifa->ifa_scope) {
+ inet_free_ifa(ifa);
+ return -EINVAL;
+ }
+ /* Same subnet already present: new one is secondary. */
+ ifa->ifa_flags |= IFA_F_SECONDARY;
+ }
+ }
+
+ /* Primaries go after the last suitable primary, secondaries at the tail. */
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+ net_srandom(ifa->ifa_local);
+ ifap = last_primary;
+ }
+
+ ifa->ifa_next = *ifap;
+ *ifap = ifa;
+
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
+ /* Send message first, then call notifier.
+ Notifier will trigger FIB update, so that
+ listeners of netlink will know about new ifaddr */
+ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, pid);
+ blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+ return 0;
+}
+
+/* __inet_insert_ifa() for callers that have no netlink request to report to. */
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+	int err = __inet_insert_ifa(ifa, NULL, 0);
+
+	return err;
+}
+
+/*
+ * Attach @ifa to the in_device of @dev (RTNL held) and insert it.  If @dev
+ * has no in_device, @ifa is freed and -ENOBUFS returned.  Loopback addresses
+ * are forced to host scope.  Otherwise returns inet_insert_ifa()'s result.
+ */
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+	struct in_device *idev = __in_dev_get_rtnl(dev);
+
+	ASSERT_RTNL();
+
+	if (idev == NULL) {
+		inet_free_ifa(ifa);
+		return -ENOBUFS;
+	}
+
+	ipv4_devconf_setall(idev);
+
+	if (ifa->ifa_dev != idev) {
+		/* A different owner is unexpected; warn, then take over. */
+		WARN_ON(ifa->ifa_dev);
+		in_dev_hold(idev);
+		ifa->ifa_dev = idev;
+	}
+
+	if (ipv4_is_loopback(ifa->ifa_local))
+		ifa->ifa_scope = RT_SCOPE_HOST;
+
+	return inet_insert_ifa(ifa);
+}
+
+/* Caller must hold RCU or RTNL:
+ * no reference is taken on the in_device that is returned.
+ * Returns NULL when no device has index @ifindex in @net.
+ */
+struct in_device *inetdev_by_index(struct net *net, int ifindex)
+{
+	struct in_device *found = NULL;
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
+	if (dev != NULL)
+		found = rcu_dereference_rtnl(dev->ip_ptr);
+	rcu_read_unlock();
+
+	return found;
+}
+EXPORT_SYMBOL(inetdev_by_index);
+
+/* Called only from RTNL semaphored context. No locks.
+ * Returns the first primary address of @in_dev whose mask equals @mask and
+ * whose subnet contains @prefix, or NULL if there is none.
+ */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
+				    __be32 mask)
+{
+	struct in_ifaddr *match = NULL;
+
+	ASSERT_RTNL();
+
+	for_primary_ifa(in_dev) {
+		if (ifa->ifa_mask != mask || !inet_ifa_match(prefix, ifa))
+			continue;
+		match = ifa;
+		break;
+	} endfor_ifa(in_dev);
+
+	return match;
+}
+
+/*
+ * RTM_DELADDR handler (RTNL held).  Deletes the first address on the
+ * addressed interface that matches every attribute supplied in the request
+ * (IFA_LOCAL, IFA_LABEL, IFA_ADDRESS together with the prefix length).
+ * Returns 0 on success, the nlmsg_parse() error, -ENODEV if the interface
+ * has no in_device, or -EADDRNOTAVAIL if nothing matched.
+ */
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *tb[IFA_MAX+1];
+	struct in_ifaddr *cur, **link;
+	struct in_device *in_dev;
+	struct ifaddrmsg *ifm;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+	if (err < 0)
+		return err;
+
+	ifm = nlmsg_data(nlh);
+	in_dev = inetdev_by_index(net, ifm->ifa_index);
+	if (!in_dev)
+		return -ENODEV;
+
+	for (link = &in_dev->ifa_list; (cur = *link) != NULL;
+	     link = &cur->ifa_next) {
+		/* Any supplied attribute that disagrees rules this entry out. */
+		if ((tb[IFA_LOCAL] &&
+		     cur->ifa_local != nla_get_be32(tb[IFA_LOCAL])) ||
+		    (tb[IFA_LABEL] &&
+		     nla_strcmp(tb[IFA_LABEL], cur->ifa_label)) ||
+		    (tb[IFA_ADDRESS] &&
+		     (ifm->ifa_prefixlen != cur->ifa_prefixlen ||
+		      !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), cur))))
+			continue;
+
+		__inet_del_ifa(in_dev, link, 1, nlh, NETLINK_CB(skb).pid);
+		return 0;
+	}
+
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Build a new in_ifaddr from an RTM_NEWADDR request.  Returns the address
+ * (holding a reference on its in_device) or an ERR_PTR(): the nlmsg_parse()
+ * error, -EINVAL for a prefix length above 32 or a missing IFA_LOCAL,
+ * -ENODEV for an unknown interface index, -ENOBUFS if the device has no
+ * in_device or the allocation fails.
+ */
+static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
+{
+	struct nlattr *tb[IFA_MAX+1];
+	struct in_device *in_dev;
+	struct net_device *dev;
+	struct ifaddrmsg *ifm;
+	struct in_ifaddr *ifa;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	ifm = nlmsg_data(nlh);
+	if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
+		return ERR_PTR(-EINVAL);
+
+	dev = __dev_get_by_index(net, ifm->ifa_index);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev)
+		return ERR_PTR(-ENOBUFS);
+
+	/*
+	 * A potential indev allocation can be left alive, it stays
+	 * assigned to its device and is destroyed with it.
+	 */
+	ifa = inet_alloc_ifa();
+	if (!ifa)
+		return ERR_PTR(-ENOBUFS);
+
+	ipv4_devconf_setall(in_dev);
+	in_dev_hold(in_dev);
+
+	/* Without an explicit IFA_ADDRESS the local address doubles as it. */
+	if (!tb[IFA_ADDRESS])
+		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+
+	INIT_HLIST_NODE(&ifa->hash);
+	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+	ifa->ifa_flags = ifm->ifa_flags;
+	ifa->ifa_scope = ifm->ifa_scope;
+	ifa->ifa_dev = in_dev;
+
+	ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]);
+	ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]);
+
+	if (tb[IFA_BROADCAST])
+		ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]);
+
+	if (!tb[IFA_LABEL])
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+	else
+		nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+
+	return ifa;
+}
+
+/*
+ * RTM_NEWADDR handler (RTNL held): turn the request into an in_ifaddr and
+ * insert it.  Returns the rtm_to_ifaddr() error or __inet_insert_ifa()'s
+ * result.
+ */
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct in_ifaddr *new_ifa;
+
+	ASSERT_RTNL();
+
+	new_ifa = rtm_to_ifaddr(net, nlh);
+	if (!IS_ERR(new_ifa))
+		return __inet_insert_ifa(new_ifa, nlh, NETLINK_CB(skb).pid);
+
+	return PTR_ERR(new_ifa);
+}
+
+/*
+ * Classful default prefix length for @addr: 0 for the zero network, 8/16/24
+ * for class A/B/C, and -1 for anything else.
+ */
+
+static inline int inet_abc_len(__be32 addr)
+{
+	__u32 haddr;
+
+	if (ipv4_is_zeronet(addr))
+		return 0;
+
+	haddr = ntohl(addr);
+	if (IN_CLASSA(haddr))
+		return 8;
+	if (IN_CLASSB(haddr))
+		return 16;
+	if (IN_CLASSC(haddr))
+		return 24;
+
+	return -1;	/* Something else, probably a multicast. */
+}
+
+
+/**
+ * devinet_ioctl - handle the IPv4 interface-address ioctls
+ * @net: network namespace used for the device lookup
+ * @cmd: SIOC{G,S}IF{ADDR,BRDADDR,DSTADDR,NETMASK}, SIOCSIFFLAGS or
+ *       SIOCKILLADDR; anything else yields -EINVAL
+ * @arg: user pointer to a struct ifreq
+ *
+ * Copies the ifreq in, finds the device and (by label, optionally also by
+ * address) the matching in_ifaddr, then performs the request under RTNL.
+ * The "get" commands copy the updated ifreq back to user space.
+ *
+ * Returns 0 on success or a negative errno: -EFAULT for a bad user pointer,
+ * -EACCES without CAP_NET_ADMIN on a "set" command, -EINVAL for a bad
+ * command/family/address/mask, -ENODEV for an unknown device,
+ * -EADDRNOTAVAIL when no matching address exists, -ENOBUFS when a new
+ * address cannot be allocated.
+ */
+int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct ifreq ifr;
+	struct sockaddr_in sin_orig;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+	struct in_device *in_dev;
+	struct in_ifaddr **ifap = NULL;
+	struct in_ifaddr *ifa = NULL;
+	struct net_device *dev;
+	char *colon;
+	int ret = -EFAULT;
+	int tryaddrmatch = 0;
+
+	/*
+	 *	Fetch the caller's info block into kernel space
+	 */
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		goto out;
+	ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+	/* save original address for comparison */
+	memcpy(&sin_orig, sin, sizeof(*sin));
+
+	/* Temporarily cut an alias suffix (":N") off for the device lookup. */
+	colon = strchr(ifr.ifr_name, ':');
+	if (colon)
+		*colon = 0;
+
+	dev_load(net, ifr.ifr_name);
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		/* Note that these ioctls will not sleep,
+		   so that we do not impose a lock.
+		   One day we will be forced to put shlock here (I mean SMP)
+		 */
+		tryaddrmatch = (sin_orig.sin_family == AF_INET);
+		memset(sin, 0, sizeof(*sin));
+		sin->sin_family = AF_INET;
+		break;
+
+	case SIOCSIFFLAGS:
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		break;
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+	case SIOCSIFNETMASK:	/* Set the netmask for the interface */
+	case SIOCKILLADDR:	/* Nuke all sockets on this address */
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		ret = -EINVAL;
+		if (sin->sin_family != AF_INET)
+			goto out;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rtnl_lock();
+
+	ret = -ENODEV;
+	dev = __dev_get_by_name(net, ifr.ifr_name);
+	if (!dev)
+		goto done;
+
+	/* Restore the full label for the comparisons below. */
+	if (colon)
+		*colon = ':';
+
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
+		if (tryaddrmatch) {
+			/* Matthias Andree */
+			/* compare label and address (4.4BSD style) */
+			/* note: we only do this for a limited set of ioctls
+			   and only if the original address family was AF_INET.
+			   This is checked above. */
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next) {
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+				    sin_orig.sin_addr.s_addr ==
+							ifa->ifa_local) {
+					break; /* found */
+				}
+			}
+		}
+		/* we didn't get a match, maybe the application is
+		   4.3BSD-style and passed in junk so we fall back to
+		   comparing just the label */
+		if (!ifa) {
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next)
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+					break;
+		}
+	}
+
+	/* Only these three commands can proceed without an existing address. */
+	ret = -EADDRNOTAVAIL;
+	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS
+	    && cmd != SIOCKILLADDR)
+		goto done;
+
+	switch (cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+		sin->sin_addr.s_addr = ifa->ifa_local;
+		goto rarok;
+
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+		sin->sin_addr.s_addr = ifa->ifa_broadcast;
+		goto rarok;
+
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+		sin->sin_addr.s_addr = ifa->ifa_address;
+		goto rarok;
+
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		sin->sin_addr.s_addr = ifa->ifa_mask;
+		goto rarok;
+
+	case SIOCSIFFLAGS:
+		if (colon) {
+			/* On an alias, clearing IFF_UP deletes the address. */
+			ret = -EADDRNOTAVAIL;
+			if (!ifa)
+				break;
+			ret = 0;
+			if (!(ifr.ifr_flags & IFF_UP))
+				inet_del_ifa(in_dev, ifap, 1);
+			break;
+		}
+		ret = dev_change_flags(dev, ifr.ifr_flags);
+		break;
+
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+
+		if (!ifa) {
+			ret = -ENOBUFS;
+			ifa = inet_alloc_ifa();
+			/* Check the allocation before touching the object. */
+			if (!ifa)
+				break;
+			INIT_HLIST_NODE(&ifa->hash);
+			if (colon)
+				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+			else
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		} else {
+			ret = 0;
+			if (ifa->ifa_local == sin->sin_addr.s_addr)
+				break;
+			/* Unlink without freeing; it is re-inserted below. */
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = 0;
+			ifa->ifa_scope = 0;
+		}
+
+		ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+		if (!(dev->flags & IFF_POINTOPOINT)) {
+			ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+			ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+			if ((dev->flags & IFF_BROADCAST) &&
+			    ifa->ifa_prefixlen < 31)
+				ifa->ifa_broadcast = ifa->ifa_address |
+						     ~ifa->ifa_mask;
+		} else {
+			ifa->ifa_prefixlen = 32;
+			ifa->ifa_mask = inet_make_mask(32);
+		}
+		ret = inet_set_ifa(dev, ifa);
+		break;
+
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+		ret = 0;
+		if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = sin->sin_addr.s_addr;
+			inet_insert_ifa(ifa);
+		}
+		break;
+
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+		ret = 0;
+		if (ifa->ifa_address == sin->sin_addr.s_addr)
+			break;
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+		ret = 0;
+		inet_del_ifa(in_dev, ifap, 0);
+		ifa->ifa_address = sin->sin_addr.s_addr;
+		inet_insert_ifa(ifa);
+		break;
+
+	case SIOCSIFNETMASK:	/* Set the netmask for the interface */
+
+		/*
+		 *	The mask we set must be legal.
+		 */
+		ret = -EINVAL;
+		if (bad_mask(sin->sin_addr.s_addr, 0))
+			break;
+		ret = 0;
+		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+			__be32 old_mask = ifa->ifa_mask;
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_mask = sin->sin_addr.s_addr;
+			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+			/* See if current broadcast address matches
+			 * with current netmask, then recalculate
+			 * the broadcast address. Otherwise it's a
+			 * funny address, so don't touch it since
+			 * the user seems to know what (s)he's doing...
+			 */
+			if ((dev->flags & IFF_BROADCAST) &&
+			    (ifa->ifa_prefixlen < 31) &&
+			    (ifa->ifa_broadcast ==
+			     (ifa->ifa_local|~old_mask))) {
+				ifa->ifa_broadcast = (ifa->ifa_local |
+						      ~sin->sin_addr.s_addr);
+			}
+			inet_insert_ifa(ifa);
+		}
+		break;
+	case SIOCKILLADDR:	/* Nuke all connections on this address */
+		ret = tcp_nuke_addr(net, (struct sockaddr *) sin);
+		break;
+	}
+done:
+	rtnl_unlock();
+out:
+	return ret;
+rarok:
+	/* "Get" commands: drop RTNL, then hand the filled ifreq back. */
+	rtnl_unlock();
+	ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+	goto out;
+}
+
+/*
+ * inet_gifconf - report the IPv4 addresses of @dev as struct ifreq records
+ * @dev: device whose address list is walked
+ * @buf: user buffer to fill, or NULL to only compute the space required
+ * @len: room left in @buf, in bytes
+ *
+ * Each record carries the address label as the name and the local address
+ * with family AF_INET.  Returns the number of bytes written (or that would
+ * be needed when @buf is NULL), 0 if @dev has no in_device, or -EFAULT if a
+ * copy to user space fails.  The walk stops once @len cannot hold another
+ * record.
+ */
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct in_ifaddr *ifa;
+	struct ifreq ifr;
+	int done = 0;
+
+	if (!in_dev)
+		goto out;
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		if (!buf) {
+			done += sizeof(ifr);
+			continue;
+		}
+		if (len < (int) sizeof(ifr))
+			break;
+		memset(&ifr, 0, sizeof(struct ifreq));
+		/*
+		 * ifa_label is an array embedded in struct in_ifaddr, so the
+		 * former "if (ifa->ifa_label)" test was always true and its
+		 * dev->name fallback could never run; copy the label directly.
+		 */
+		strcpy(ifr.ifr_name, ifa->ifa_label);
+
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+								ifa->ifa_local;
+
+		if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+			done = -EFAULT;
+			break;
+		}
+		buf  += sizeof(struct ifreq);
+		len  -= sizeof(struct ifreq);
+		done += sizeof(struct ifreq);
+	}
+out:
+	return done;
+}
+/*
+ * inet_select_addr - pick a local address, preferring those on @dev
+ * @dev: device whose primary addresses are examined first
+ * @dst: destination to match against each address's subnet; 0 accepts the
+ *       first primary address within @scope
+ * @scope: addresses whose ifa_scope is greater than this are skipped
+ *
+ * On @dev, an address whose subnet contains @dst wins; failing that, the
+ * first in-scope primary address is used.  If @dev yields nothing, every
+ * device in the namespace is scanned for a primary address that is within
+ * @scope and not link-scoped.  Returns the address, or 0 if none is found.
+ */
+__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
+{
+ __be32 addr = 0;
+ struct in_device *in_dev;
+ struct net *net = dev_net(dev);
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto no_in_dev;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope > scope)
+ continue;
+ if (!dst || inet_ifa_match(dst, ifa)) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ if (!addr) /* remember the first in-scope address as fallback */
+ addr = ifa->ifa_local;
+ } endfor_ifa(in_dev);
+
+ if (addr)
+ goto out_unlock;
+no_in_dev:
+
+ /* Not loopback addresses on loopback should be preferred
+ in this case. It is important that lo is the first interface
+ in dev_base list.
+ */
+ for_each_netdev_rcu(net, dev) { /* the @dev parameter is reused as cursor */
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock;
+ }
+ } endfor_ifa(in_dev);
+ }
+out_unlock:
+ rcu_read_unlock();
+ return addr;
+}
+EXPORT_SYMBOL(inet_select_addr);
+/*
+ * confirm_addr_indev - look for a usable local address on one in_device
+ * @in_dev: device whose addresses are walked with for_ifa()
+ * @dst: if non-zero, an address's subnet must contain it to count as "same"
+ * @local: if non-zero, the candidate must equal it (and its subnet match)
+ * @scope: largest ifa_scope value accepted for the returned address
+ *
+ * Two things are tracked while walking: a candidate address (addr) and
+ * whether some address's subnet satisfied the @local/@dst match (same).
+ * The candidate is returned only when such a match was found; otherwise 0.
+ */
+static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
+ __be32 local, int scope)
+{
+ int same = 0;
+ __be32 addr = 0;
+
+ for_ifa(in_dev) {
+ /* First in-scope address that equals @local (or any, if @local is 0). */
+ if (!addr &&
+ (local == ifa->ifa_local || !local) &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ if (same)
+ break;
+ }
+ if (!same) {
+ same = (!local || inet_ifa_match(local, ifa)) &&
+ (!dst || inet_ifa_match(dst, ifa));
+ if (same && addr) {
+ if (local || !dst)
+ break;
+ /* Is the selected addr into dst subnet? */
+ if (inet_ifa_match(addr, ifa))
+ break;
+ /* No, then can we use new local src? */
+ if (ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ break;
+ }
+ /* search for large dst subnet for addr */
+ same = 0;
+ }
+ }
+ } endfor_ifa(in_dev);
+
+ return same ? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - in_dev: only on this interface, 0=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ *
+ * For any scope other than RT_SCOPE_LINK only @in_dev is checked.  For
+ * RT_SCOPE_LINK every device in @in_dev's namespace is tried in turn and the
+ * first non-zero answer is returned.
+ */
+__be32 inet_confirm_addr(struct in_device *in_dev,
+			 __be32 dst, __be32 local, int scope)
+{
+	struct net_device *dev;
+	struct net *net;
+	__be32 confirmed = 0;
+
+	if (scope != RT_SCOPE_LINK)
+		return confirm_addr_indev(in_dev, dst, local, scope);
+
+	net = dev_net(in_dev->dev);
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		struct in_device *idev = __in_dev_get_rcu(dev);
+
+		if (!idev)
+			continue;
+
+		confirmed = confirm_addr_indev(idev, dst, local, scope);
+		if (confirmed)
+			break;
+	}
+	rcu_read_unlock();
+
+	return confirmed;
+}
+EXPORT_SYMBOL(inet_confirm_addr);
+
+/*
+ * Device notifier
+ */
+
+/* Add @nb to the inetaddr notifier chain; returns the chain API's result. */
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+	int err = blocking_notifier_chain_register(&inetaddr_chain, nb);
+
+	return err;
+}
+EXPORT_SYMBOL(register_inetaddr_notifier);
+
+/* Remove @nb from the inetaddr notifier chain; returns the chain API's result. */
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+	int err = blocking_notifier_chain_unregister(&inetaddr_chain, nb);
+
+	return err;
+}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
+ *
+ * The first address simply takes the new device name.  Every later address
+ * keeps its old ":suffix" (or gets ":<position>" if it had none), appended
+ * to the new name or, when that would not fit in IFNAMSIZ, written over the
+ * tail of the label.  An RTM_NEWADDR message is sent for each address.
+ */
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa;
+	int count = 0;
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+		char prev_label[IFNAMSIZ], *suffix;
+
+		memcpy(prev_label, ifa->ifa_label, IFNAMSIZ);
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+		if (count++ != 0) {
+			size_t suffix_len;
+
+			suffix = strchr(prev_label, ':');
+			if (!suffix) {
+				sprintf(prev_label, ":%d", count);
+				suffix = prev_label;
+			}
+
+			suffix_len = strlen(suffix);
+			if (suffix_len + strlen(dev->name) < IFNAMSIZ)
+				strcat(ifa->ifa_label, suffix);
+			else
+				strcpy(ifa->ifa_label +
+				       (IFNAMSIZ - suffix_len - 1), suffix);
+		}
+
+		rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
+	}
+}
+
+/* An MTU is acceptable for IPv4 here when it is at least 68 bytes. */
+static inline bool inetdev_valid_mtu(unsigned mtu)
+{
+	return !(mtu < 68);
+}
+
+/*
+ * Send one ARP request per address of @in_dev out of @dev, with the address
+ * used as both source and target IP.
+ */
+static void inetdev_send_gratuitous_arp(struct net_device *dev,
+					struct in_device *in_dev)
+
+{
+	struct in_ifaddr *ifa = in_dev->ifa_list;
+
+	while (ifa) {
+		__be32 self = ifa->ifa_local;
+
+		arp_send(ARPOP_REQUEST, ETH_P_ARP,
+			 self, dev,
+			 self, NULL,
+			 dev->dev_addr, NULL);
+		ifa = ifa->ifa_next;
+	}
+}
+
+/* Called only under RTNL semaphore
+ *
+ * Netdevice notifier callback: @ptr is the struct net_device the @event is
+ * about.  Creates the in_device on NETDEV_REGISTER (or when a too-small MTU
+ * becomes valid again), destroys it on NETDEV_UNREGISTER or when the MTU
+ * drops below the minimum, and otherwise keeps multicast state, loopback
+ * address, labels and sysctls in step with the device.  Returns NOTIFY_DONE
+ * except when in_device creation fails on registration.
+ */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+ ASSERT_RTNL();
+
+ /* No in_device yet: only registration or an MTU change can create one. */
+ if (!in_dev) {
+ if (event == NETDEV_REGISTER) {
+ in_dev = inetdev_init(dev);
+ if (!in_dev)
+ return notifier_from_errno(-ENOMEM);
+ if (dev->flags & IFF_LOOPBACK) {
+ IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
+ IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
+ }
+ } else if (event == NETDEV_CHANGEMTU) {
+ /* Re-enabling IP */
+ if (inetdev_valid_mtu(dev->mtu))
+ in_dev = inetdev_init(dev);
+ }
+ goto out;
+ }
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ /* An in_device already existing at registration is unexpected. */
+ printk(KERN_DEBUG "inetdev_event: bug\n");
+ RCU_INIT_POINTER(dev->ip_ptr, NULL);
+ break;
+ case NETDEV_UP:
+ if (!inetdev_valid_mtu(dev->mtu))
+ break;
+ if (dev->flags & IFF_LOOPBACK) {
+ /* Give the loopback device INADDR_LOOPBACK/8, host scope. */
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
+ ifa->ifa_local =
+ ifa->ifa_address = htonl(INADDR_LOOPBACK);
+ ifa->ifa_prefixlen = 8;
+ ifa->ifa_mask = inet_make_mask(8);
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+ ifa->ifa_scope = RT_SCOPE_HOST;
+ memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ inet_insert_ifa(ifa);
+ }
+ }
+ ip_mc_up(in_dev);
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ if (!IN_DEV_ARP_NOTIFY(in_dev))
+ break;
+ /* fall through */
+ case NETDEV_NOTIFY_PEERS:
+ /* Send gratuitous ARP to notify of link change */
+ inetdev_send_gratuitous_arp(dev, in_dev);
+ break;
+ case NETDEV_DOWN:
+ ip_mc_down(in_dev);
+ break;
+ case NETDEV_PRE_TYPE_CHANGE:
+ ip_mc_unmap(in_dev);
+ break;
+ case NETDEV_POST_TYPE_CHANGE:
+ ip_mc_remap(in_dev);
+ break;
+ case NETDEV_CHANGEMTU:
+ if (inetdev_valid_mtu(dev->mtu))
+ break;
+ /* disable IP when MTU is not enough */
+ case NETDEV_UNREGISTER:
+ inetdev_destroy(in_dev);
+ break;
+ case NETDEV_CHANGENAME:
+ /* Do not notify about label change, this event is
+ * not interesting to applications using netlink.
+ */
+ inetdev_changename(dev, in_dev);
+
+ /* Re-register so the sysctl entries follow the new name. */
+ devinet_sysctl_unregister(in_dev);
+ devinet_sysctl_register(in_dev);
+ break;
+ }
+out:
+ return NOTIFY_DONE;
+}
+/* Notifier block whose callback is inetdev_event(). */
+static struct notifier_block ip_netdev_notifier = {
+ .notifier_call = inetdev_event,
+};
+
+/* Size budget for one address message: header, three 4-byte attrs, label. */
+static inline size_t inet_nlmsg_size(void)
+{
+	size_t addr_attrs = 3 * nla_total_size(4); /* ADDRESS, LOCAL, BROADCAST */
+
+	return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + addr_attrs +
+	       nla_total_size(IFNAMSIZ); /* IFA_LABEL */
+}
+
+/*
+ * Append one address message for @ifa to @skb, skipping zero addresses and an
+ * empty label.  Returns nlmsg_end()'s result, or -EMSGSIZE on failure.
+ */
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+			    u32 pid, u32 seq, int event, unsigned int flags)
+{
+	struct nlmsghdr *hdr = nlmsg_put(skb, pid, seq, event,
+					 sizeof(struct ifaddrmsg), flags);
+	struct ifaddrmsg *msg;
+
+	if (!hdr)
+		return -EMSGSIZE;
+
+	msg = nlmsg_data(hdr);
+	msg->ifa_index = ifa->ifa_dev->dev->ifindex;
+	msg->ifa_scope = ifa->ifa_scope;
+	msg->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
+	msg->ifa_prefixlen = ifa->ifa_prefixlen;
+	msg->ifa_family = AF_INET;
+
+	if (ifa->ifa_address)
+		NLA_PUT_BE32(skb, IFA_ADDRESS, ifa->ifa_address);
+	if (ifa->ifa_local)
+		NLA_PUT_BE32(skb, IFA_LOCAL, ifa->ifa_local);
+	if (ifa->ifa_broadcast)
+		NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast);
+	if (ifa->ifa_label[0])
+		NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label);
+	return nlmsg_end(skb, hdr);
+
+nla_put_failure:
+	nlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) /* dump callback: one RTM_NEWADDR per address, resumable through cb->args[] */
+{
+ struct net *net = sock_net(skb->sk);
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
+ struct net_device *dev;
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ s_h = cb->args[0]; /* resume cursor: hash bucket */
+ s_idx = idx = cb->args[1]; /* resume cursor: device position inside the bucket */
+ s_ip_idx = ip_idx = cb->args[2]; /* resume cursor: address position inside the device */
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { /* s_idx is reset after the first bucket visited */
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ if (idx < s_idx) /* device already dumped by an earlier call */
+ goto cont;
+ if (h > s_h || idx > s_idx) /* past the resume device: start at its first address */
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx) /* address already dumped by an earlier call */
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).pid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWADDR, NLM_F_MULTI) <= 0) { /* stop here; the cursor saved below points at this address */
+ rcu_read_unlock();
+ goto done;
+ }
+ }
+cont:
+ idx++;
+ }
+ rcu_read_unlock();
+ }
+
+done:
+ cb->args[0] = h; /* save the cursor for the next invocation */
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
+
+ return skb->len;
+}
+
+/* Notify RTNLGRP_IPV4_IFADDR listeners of @event on @ifa; report failures. */
+static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
+		      u32 pid)
+{
+	struct net *net = dev_net(ifa->ifa_dev->dev);
+	struct sk_buff *skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
+	u32 seq = 0;
+	int err;
+
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, -ENOBUFS);
+		return;
+	}
+	if (nlh)
+		seq = nlh->nlmsg_seq;
+
+	err = inet_fill_ifaddr(skb, ifa, pid, seq, event, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
+		return;
+	}
+	/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+}
+
+/* Space needed for IFLA_INET_CONF on @dev; zero when it has no in_device. */
+static size_t inet_get_link_af_size(const struct net_device *dev)
+{
+	size_t size = 0;
+
+	if (rcu_dereference_rtnl(dev->ip_ptr))
+		size = nla_total_size(IPV4_DEVCONF_MAX * 4);
+	return size;
+}
+
+/* Emit IFLA_INET_CONF: a copy of every devconf value of @dev. */
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct in_device *idev = rcu_dereference_rtnl(dev->ip_ptr);
+	struct nlattr *attr;
+	u32 *out;
+	int k;
+
+	if (!idev)
+		return -ENODATA;
+	attr = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
+	if (!attr)
+		return -EMSGSIZE;
+	out = nla_data(attr);
+	for (k = 0; k < IPV4_DEVCONF_MAX; k++)
+		out[k] = idev->cnf.data[k];
+	return 0;
+}
+
+static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { /* policy passed to nla_parse_nested() in inet_validate_link_af() */
+ [IFLA_INET_CONF] = { .type = NLA_NESTED }, /* IFLA_INET_CONF must be a nested attribute */
+};
+
+/*
+ * @dev (if given) must have an in_device; each IFLA_INET_CONF entry must carry
+ * at least 4 bytes and a type in 1..IPV4_DEVCONF_MAX.
+ */
+static int inet_validate_link_af(const struct net_device *dev,
+				 const struct nlattr *nla)
+{
+	struct nlattr *entry, *tb[IFLA_INET_MAX+1];
+	int remaining, rc;
+
+	if (dev && !__in_dev_get_rtnl(dev))
+		return -EAFNOSUPPORT;
+
+	rc = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy);
+	if (rc < 0)
+		return rc;
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+
+	nla_for_each_nested(entry, tb[IFLA_INET_CONF], remaining) {
+		int id = nla_type(entry);
+
+		if (nla_len(entry) < 4 || id <= 0 || id > IPV4_DEVCONF_MAX)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/* Store each IFLA_INET_CONF entry of @nla into @dev's devconf values. */
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+{
+	struct in_device *idev = __in_dev_get_rtnl(dev);
+	struct nlattr *entry, *tb[IFLA_INET_MAX+1];
+	int remaining, rc;
+
+	if (!idev)
+		return -EAFNOSUPPORT;
+	rc = nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL);
+	if (rc < 0)
+		BUG();
+	if (!tb[IFLA_INET_CONF])
+		return 0;
+
+	nla_for_each_nested(entry, tb[IFLA_INET_CONF], remaining)
+		ipv4_devconf_set(idev, nla_type(entry), nla_get_u32(entry));
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static void devinet_copy_dflt_conf(struct net *net, int i)
+{
+	struct net_device *dev;
+	struct in_device *idev;
+
+	rcu_read_lock();	/* walk every device of @net */
+	for_each_netdev_rcu(net, dev) {
+		idev = __in_dev_get_rcu(dev);
+		if (!idev || test_bit(i, idev->cnf.state))
+			continue; /* no in_device, or entry @i has its state bit set */
+		idev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
+	}
+	rcu_read_unlock();
+}
+
+/* Apply the "all" forwarding value to defaults and devices; RTNL held. */
+static void inet_forward_change(struct net *net)
+{
+	const int fwd = IPV4_DEVCONF_ALL(net, FORWARDING);
+	struct in_device *idev;
+	struct net_device *dev;
+
+	IPV4_DEVCONF_DFLT(net, FORWARDING) = fwd;
+	IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !fwd;
+
+	for_each_netdev(net, dev) {
+		if (fwd)
+			dev_disable_lro(dev);
+		rcu_read_lock();
+		idev = __in_dev_get_rcu(dev);
+		if (idev)
+			IN_DEV_CONF_SET(idev, FORWARDING, fwd);
+		rcu_read_unlock();
+	}
+}
+
+/* devconf sysctl handler: on write, mark the bit, sync defaults, maybe flush. */
+static int devinet_conf_proc(ctl_table *ctl, int write,
+			     void __user *buffer,
+			     size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int before = *valp;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	int after = *valp;
+	struct ipv4_devconf *cnf = ctl->extra1;
+	struct net *net = ctl->extra2;
+	int idx;
+
+	if (!write)
+		return ret;
+
+	idx = valp - cnf->data;
+	set_bit(idx, cnf->state);
+	if (cnf == net->ipv4.devconf_dflt)
+		devinet_copy_dflt_conf(net, idx);
+	if (idx == IPV4_DEVCONF_ACCEPT_LOCAL - 1 && !after && before)
+		rt_cache_flush(net, 0);
+	return ret;
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write, /* sysctl handler for the forwarding entries */
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int *valp = ctl->data;
+ int val = *valp; /* value before the write */
+ loff_t pos = *ppos; /* file position before the write */
+ int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+ if (write && *valp != val) { /* only act when the write changed the value */
+ struct net *net = ctl->extra2;
+
+ if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { /* the "default" entry needs nothing further here */
+ if (!rtnl_trylock()) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
+ return restart_syscall();
+ }
+ if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
+ inet_forward_change(net); /* "all" entry: propagate to defaults and every device */
+ } else if (*valp) { /* an in_device's own entry that is now non-zero */
+ struct ipv4_devconf *cnf = ctl->extra1;
+ struct in_device *idev =
+ container_of(cnf, struct in_device, cnf);
+ dev_disable_lro(idev->dev);
+ }
+ rtnl_unlock();
+ rt_cache_flush(net, 0); /* issued after RTNL has been dropped */
+ }
+ }
+
+ return ret;
+}
+
+/* sysctl handler: flush the route cache when a write changes the value. */
+static int ipv4_doint_and_flush(ctl_table *ctl, int write,
+				void __user *buffer,
+				size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	const int before = *valp;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
+	if (!write || *valp == before)
+		return ret;
+	rt_cache_flush(ctl->extra2, 0);
+	return ret;
+}
+
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) /* ctl_table initializer for one devconf attribute */ \
+ { \
+ .procname = name, \
+ .data = ipv4_devconf.data + \
+ IPV4_DEVCONF_ ## attr - 1, /* data[] index is the IPV4_DEVCONF_* id minus one */ \
+ .maxlen = sizeof(int), \
+ .mode = mval, \
+ .proc_handler = proc, \
+ .extra1 = &ipv4_devconf, /* replaced per table in __devinet_sysctl_register() */ \
+ }
+
+#define DEVINET_SYSCTL_RW_ENTRY(attr, name) /* mode 0644, handled by devinet_conf_proc */ \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_RO_ENTRY(attr, name) /* mode 0444, handled by devinet_conf_proc */ \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
+
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) /* mode 0644 with a caller-supplied handler */ \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
+
+#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) /* mode 0644, handled by ipv4_doint_and_flush */ \
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
+
+static struct devinet_sysctl_table { /* one registered devconf sysctl directory */
+ struct ctl_table_header *sysctl_header; /* handle returned by register_net_sysctl_table() */
+ struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; /* the fix-up loop in __devinet_sysctl_register() skips the last slot */
+ char *dev_name; /* kstrdup()ed directory name, freed on unregister */
+} devinet_sysctl = { /* template; __devinet_sysctl_register() kmemdup()s it for every registration */
+ .devinet_vars = {
+ DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
+ devinet_sysctl_forward),
+ DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), /* the only 0444 entry in this table */
+
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
+ DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
+ "accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), /* devinet_conf_proc() flushes routes when this goes non-zero -> 0 */
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
+ DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
+ DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
+ DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
+ DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
+ DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
+
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), /* entries below flush the route cache when changed */
+ DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(FORCE_IGMP_VERSION,
+ "force_igmp_version"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
+ "promote_secondaries"),
+ },
+};
+
+static int __devinet_sysctl_register(struct net *net, char *dev_name, /* register a copy of the template under net/ipv4/conf/<dev_name> for @p */
+ struct ipv4_devconf *p)
+{
+ int i;
+ struct devinet_sysctl_table *t;
+
+#define DEVINET_CTL_PATH_DEV 3 /* index of the path component filled in below */
+
+ struct ctl_path devinet_ctl_path[] = {
+ { .procname = "net", },
+ { .procname = "ipv4", },
+ { .procname = "conf", },
+ { /* to be set */ },
+ { },
+ };
+
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); /* private copy of the template table */
+ if (!t)
+ goto out;
+
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { /* the last array slot is not touched */
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; /* rebase .data from the global ipv4_devconf onto @p */
+ t->devinet_vars[i].extra1 = p;
+ t->devinet_vars[i].extra2 = net;
+ }
+
+ /*
+ * Make a copy of dev_name, because '.procname' is regarded as const
+ * by sysctl and we wouldn't want anyone to change it under our feet
+ * (see SIOCSIFNAME).
+ */
+ t->dev_name = kstrdup(dev_name, GFP_KERNEL);
+ if (!t->dev_name)
+ goto free;
+
+ devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
+
+ t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
+ t->devinet_vars);
+ if (!t->sysctl_header)
+ goto free_procname;
+
+ p->sysctl = t; /* remembered so __devinet_sysctl_unregister() can find the table */
+ return 0;
+
+free_procname:
+ kfree(t->dev_name);
+free:
+ kfree(t);
+out:
+ return -ENOBUFS; /* every failure path reports -ENOBUFS */
+}
+
+/* Undo __devinet_sysctl_register() for @cnf; a no-op if nothing is registered. */
+static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
+{
+	struct devinet_sysctl_table *table = cnf->sysctl;
+
+	if (table) {
+		cnf->sysctl = NULL;
+		unregister_net_sysctl_table(table->sysctl_header);
+		kfree(table->dev_name);
+		kfree(table);
+	}
+}
+
+static void devinet_sysctl_register(struct in_device *idev) /* register the neighbour and devconf sysctls of @idev's device */
+{
+ neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
+ __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, /* NOTE(review): the int result (-ENOBUFS on failure) is not checked */
+ &idev->cnf);
+}
+
+static void devinet_sysctl_unregister(struct in_device *idev) /* reverse of devinet_sysctl_register(), in the opposite order */
+{
+ __devinet_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->arp_parms);
+}
+
+static struct ctl_table ctl_forward_entry[] = { /* "ip_forward" entry; devinet_init_net() registers it under net_ipv4_path */
+ {
+ .procname = "ip_forward",
+ .data = &ipv4_devconf.data[
+ IPV4_DEVCONF_FORWARDING - 1], /* same slot the FORWARDING devconf entry uses */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = devinet_sysctl_forward,
+ .extra1 = &ipv4_devconf,
+ .extra2 = &init_net, /* devinet_init_net() rewrites data/extra1/extra2 in copies made for other namespaces */
+ },
+ { },
+};
+
+static __net_initdata struct ctl_path net_ipv4_path[] = { /* sysctl path "net/ipv4" used for the ip_forward entry */
+ { .procname = "net", },
+ { .procname = "ipv4", },
+ { },
+};
+#endif
+
+static __net_init int devinet_init_net(struct net *net) /* per-namespace init: "all"/"default" devconf plus their sysctl entries */
+{
+ int err;
+ struct ipv4_devconf *all, *dflt;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl = ctl_forward_entry;
+ struct ctl_table_header *forw_hdr;
+#endif
+
+ err = -ENOMEM; /* result of any allocation failure below */
+ all = &ipv4_devconf;
+ dflt = &ipv4_devconf_dflt;
+
+ if (!net_eq(net, &init_net)) { /* any namespace other than init_net works on kmemdup()ed copies */
+ all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (all == NULL)
+ goto err_alloc_all;
+
+ dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (dflt == NULL)
+ goto err_alloc_dflt;
+
+#ifdef CONFIG_SYSCTL
+ tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (tbl == NULL)
+ goto err_alloc_ctl;
+
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; /* point the copied ip_forward entry at this namespace's data */
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
+#endif
+ }
+
+#ifdef CONFIG_SYSCTL
+ err = __devinet_sysctl_register(net, "all", all);
+ if (err < 0)
+ goto err_reg_all;
+
+ err = __devinet_sysctl_register(net, "default", dflt);
+ if (err < 0)
+ goto err_reg_dflt;
+
+ err = -ENOMEM;
+ forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);
+ if (forw_hdr == NULL)
+ goto err_reg_ctl;
+ net->ipv4.forw_hdr = forw_hdr;
+#endif
+
+ net->ipv4.devconf_all = all;
+ net->ipv4.devconf_dflt = dflt;
+ return 0;
+
+#ifdef CONFIG_SYSCTL
+err_reg_ctl: /* labels unwind in reverse order of the setup steps above */
+ __devinet_sysctl_unregister(dflt);
+err_reg_dflt:
+ __devinet_sysctl_unregister(all);
+err_reg_all:
+ if (tbl != ctl_forward_entry) /* only a kmemdup()ed copy is freed, never the static table */
+ kfree(tbl);
+err_alloc_ctl:
+#endif
+ if (dflt != &ipv4_devconf_dflt) /* same rule for the static default object */
+ kfree(dflt);
+err_alloc_dflt:
+ if (all != &ipv4_devconf)
+ kfree(all);
+err_alloc_all:
+ return err;
+}
+
+static __net_exit void devinet_exit_net(struct net *net) /* per-namespace teardown, mirror of devinet_init_net() */
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ tbl = net->ipv4.forw_hdr->ctl_table_arg; /* fetched before the header is unregistered */
+ unregister_net_sysctl_table(net->ipv4.forw_hdr);
+ __devinet_sysctl_unregister(net->ipv4.devconf_dflt);
+ __devinet_sysctl_unregister(net->ipv4.devconf_all);
+ kfree(tbl); /* NOTE(review): unconditional frees - assumes this never runs for init_net, whose table and devconf objects are static; confirm */
+#endif
+ kfree(net->ipv4.devconf_dflt);
+ kfree(net->ipv4.devconf_all);
+}
+
+static __net_initdata struct pernet_operations devinet_ops = { /* registered by devinet_init() via register_pernet_subsys() */
+ .init = devinet_init_net,
+ .exit = devinet_exit_net,
+};
+
+static struct rtnl_af_ops inet_af_ops = { /* AF_INET link-attribute callbacks, registered by devinet_init() via rtnl_af_register() */
+ .family = AF_INET,
+ .fill_link_af = inet_fill_link_af,
+ .get_link_af_size = inet_get_link_af_size,
+ .validate_link_af = inet_validate_link_af,
+ .set_link_af = inet_set_link_af,
+};
+
+void __init devinet_init(void) /* boot-time setup: address hash, pernet ops, notifier and rtnetlink handlers */
+{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++) /* start with every bucket of inet_addr_lst empty */
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
+ register_pernet_subsys(&devinet_ops);
+
+ register_gifconf(PF_INET, inet_gifconf);
+ register_netdevice_notifier(&ip_netdev_notifier);
+
+ rtnl_af_register(&inet_af_ops);
+
+ rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL); /* GETADDR is served by the dump callback only */
+}
+
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 00000000..cb982a61
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,727 @@
+#define pr_fmt(fmt) "IPsec: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct esp_skb_cb { /* ESP's per-packet state, accessed through ESP_SKB_CB() */
+ struct xfrm_skb_cb xfrm;
+ void *tmp; /* scratch buffer from esp_alloc_tmp(); freed by the output/input completion paths */
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) /* view skb->cb as a struct esp_skb_cb */
+
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
+{
+	const int ctx_align = crypto_tfm_ctx_alignment();
+	unsigned int size = seqhilen + crypto_aead_ivsize(aead);
+
+	/* Leading seqhi/IV area: extra room so the IV can be aligned. */
+	if (size != 0) {
+		size += crypto_aead_alignmask(aead) & ~(ctx_align - 1);
+		size = ALIGN(size, ctx_align);
+	}
+
+	/* Request structure plus the transform's request context. */
+	size += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
+
+	/* Scatterlist array of @nfrags entries on its own alignment. */
+	size = ALIGN(size, __alignof__(struct scatterlist));
+	size += sizeof(struct scatterlist) * nfrags;
+
+	/* Callers check the result for NULL. */
+	return kmalloc(size, GFP_ATOMIC);
+}
+
+static inline __be32 *esp_tmp_seqhi(void *tmp) /* seqhi slot: start of @tmp, aligned for __be32 */
+{
+ return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+}
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
+{
+	if (!crypto_aead_ivsize(aead))	/* no IV: nothing to align */
+		return tmp + seqhilen;
+	return PTR_ALIGN((u8 *)tmp + seqhilen, crypto_aead_alignmask(aead) + 1);
+}
+
+/* Locate the givcrypt request that follows the IV and bind it to @aead. */
+static inline struct aead_givcrypt_request *esp_tmp_givreq(
+	struct crypto_aead *aead, u8 *iv)
+{
+	u8 *past_iv = iv + crypto_aead_ivsize(aead);
+	void *slot = PTR_ALIGN(past_iv, crypto_tfm_ctx_alignment());
+
+	aead_givcrypt_set_tfm(slot, aead);
+	return slot;
+}
+
+/* Locate the AEAD request that follows the IV and bind it to @aead. */
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+	u8 *past_iv = iv + crypto_aead_ivsize(aead);
+	void *slot = PTR_ALIGN(past_iv, crypto_tfm_ctx_alignment());
+
+	aead_request_set_tfm(slot, aead);
+	return slot;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+					     struct aead_request *req)
+{
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+
+	return (void *)ALIGN(end, __alignof__(struct scatterlist)); /* sg array */
+}
+
+static inline struct scatterlist *esp_givreq_sg(
+	struct crypto_aead *aead, struct aead_givcrypt_request *req)
+{
+	unsigned long end = (unsigned long)(req + 1) + crypto_aead_reqsize(aead);
+
+	return (void *)ALIGN(end, __alignof__(struct scatterlist)); /* sg array */
+}
+
+static void esp_output_done(struct crypto_async_request *base, int err) /* completion callback installed by esp_output() */
+{
+ struct sk_buff *skb = base->data; /* esp_output() passed the skb as callback data */
+
+ kfree(ESP_SKB_CB(skb)->tmp); /* scratch buffer stored by esp_output() */
+ xfrm_output_resume(skb, err);
+}
+
+/*
+ * xfrm output hook for ESP.  Appends TFC padding, ESP padding and the
+ * two-byte trailer to the payload, fills in the ESP header (behind a UDP
+ * header when x->encap is set) and submits the packet for encryption.
+ *
+ * Returns the crypto layer's result (-EINPROGRESS when it completes through
+ * esp_output_done(), -EBUSY mapped to NET_XMIT_DROP) or a negative errno.
+ */
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct aead_givcrypt_request *req;
+	struct scatterlist *sg, *asg;
+	struct esp_data *esp;
+	struct sk_buff *trailer;
+	void *tmp;
+	u8 *iv, *tail;
+	int blksize, clen, alen, plen, tfclen;
+	int nfrags, assoclen, sglists, seqhilen;
+	__be32 *seqhi;
+	int i;
+
+	/* skb is pure payload to encrypt */
+
+	esp = x->data;
+	aead = esp->aead;
+	alen = crypto_aead_authsize(aead);
+
+	tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			tfclen = padto - skb->len;
+	}
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	clen = ALIGN(skb->len + 2 + tfclen, blksize);
+	if (esp->padlen)
+		clen = ALIGN(clen, esp->padlen);
+	plen = clen - skb->len - tfclen;
+
+	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
+
+	assoclen = sizeof(*esph);
+	sglists = 1;
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		sglists += 2;
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+	if (!tmp) {
+		/* err still holds the positive count from skb_cow_data() */
+		err = -ENOMEM;
+		goto error;
+	}
+
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_givreq(aead, iv);
+	asg = esp_givreq_sg(aead, req);
+	sg = asg + sglists;
+
+	/* Fill padding... */
+	tail = skb_tail_pointer(trailer);
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	/* pad bytes count up from 1; the last two bytes form the trailer */
+	for (i = 0; i < plen - 2; i++)
+		tail[i] = i + 1;
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = *skb_mac_header(skb);
+	pskb_put(skb, trailer, clen - skb->len + alen);
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	/* this is non-NULL only with UDP Encapsulation */
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+		struct udphdr *uh;
+		__be32 *udpdata32;
+		__be16 sport, dport;
+		int encap_type;
+
+		spin_lock_bh(&x->lock);
+		sport = encap->encap_sport;
+		dport = encap->encap_dport;
+		encap_type = encap->encap_type;
+		spin_unlock_bh(&x->lock);
+
+		uh = (struct udphdr *)esph;
+		uh->source = sport;
+		uh->dest = dport;
+		uh->len = htons(skb->len - skb_transport_offset(skb));
+		uh->check = 0;
+
+		switch (encap_type) {
+		default:
+		case UDP_ENCAP_ESPINUDP:
+			esph = (struct ip_esp_hdr *)(uh + 1);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			udpdata32 = (__be32 *)(uh + 1);
+			udpdata32[0] = udpdata32[1] = 0;
+			esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+			break;
+		}
+
+		*skb_mac_header(skb) = IPPROTO_UDP;
+	}
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg,
+		     esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
+		     clen + alen);
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		sg_init_table(asg, 3);
+		sg_set_buf(asg, &esph->spi, sizeof(__be32));
+		*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+		sg_set_buf(asg + 1, seqhi, seqhilen);
+		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+	} else
+		sg_init_one(asg, esph, sizeof(*esph));
+
+	aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
+	aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
+	aead_givcrypt_set_assoc(req, asg, assoclen);
+	aead_givcrypt_set_giv(req, esph->enc_data,
+			      XFRM_SKB_CB(skb)->seq.output.low);
+
+	ESP_SKB_CB(skb)->tmp = tmp;
+	err = crypto_aead_givencrypt(req);
+	/* async case: esp_output_done() frees tmp and resumes the output path */
+	if (err == -EINPROGRESS)
+		goto error;
+
+	if (err == -EBUSY)
+		err = NET_XMIT_DROP;
+
+	kfree(tmp);
+
+error:
+	return err;
+}
+
+static int esp_input_done2(struct sk_buff *skb, int err) /* post-decryption step: frees tmp, checks the trailer, strips ESP framing */
+{
+ const struct iphdr *iph;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct esp_data *esp = x->data;
+ struct crypto_aead *aead = esp->aead;
+ int alen = crypto_aead_authsize(aead);
+ int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); /* ESP header plus IV */
+ int elen = skb->len - hlen;
+ int ihl;
+ u8 nexthdr[2]; /* [0] pad length, [1] next-header value; the two bytes just before the ICV */
+ int padlen;
+
+ kfree(ESP_SKB_CB(skb)->tmp); /* scratch buffer stored by esp_input() */
+
+ if (unlikely(err)) /* decryption failed: pass the error through */
+ goto out;
+
+ if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+ BUG();
+
+ err = -EINVAL;
+ padlen = nexthdr[0];
+ if (padlen + 2 + alen >= elen) /* padding, trailer and ICV would leave no payload */
+ goto out;
+
+ /* ... check padding bits here. Silly. :-) */
+
+ iph = ip_hdr(skb);
+ ihl = iph->ihl * 4;
+
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertize the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (iph->saddr != x->props.saddr.a4 ||
+ uh->source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ ipaddr.a4 = iph->saddr;
+ km_new_mapping(x, &ipaddr, uh->source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ pskb_trim(skb, skb->len - alen - padlen - 2); /* drop ICV, padding and the two trailer bytes */
+ __skb_pull(skb, hlen); /* drop ESP header and IV */
+ skb_set_transport_header(skb, -ihl);
+
+ err = nexthdr[1]; /* on success the next-header value is the return value */
+
+ /* RFC4303: Drop dummy packets without any error */
+ if (err == IPPROTO_NONE)
+ err = -EINVAL;
+
+out:
+ return err;
+}
+
+static void esp_input_done(struct crypto_async_request *base, int err) /* completion callback installed by esp_input() */
+{
+ struct sk_buff *skb = base->data; /* esp_input() passed the skb as callback data */
+
+ xfrm_input_resume(skb, esp_input_done2(skb, err)); /* finish as the synchronous path does, then resume xfrm input */
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* xfrm input hook: set up and start AEAD decryption of an ESP packet */
+{
+ struct ip_esp_hdr *esph;
+ struct esp_data *esp = x->data;
+ struct crypto_aead *aead = esp->aead;
+ struct aead_request *req;
+ struct sk_buff *trailer;
+ int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); /* bytes after the ESP header and IV */
+ int nfrags;
+ int assoclen;
+ int sglists;
+ int seqhilen;
+ __be32 *seqhi;
+ void *tmp;
+ u8 *iv;
+ struct scatterlist *sg;
+ struct scatterlist *asg;
+ int err = -EINVAL;
+
+ if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+ goto out;
+
+ if (elen <= 0)
+ goto out;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err; /* a non-negative skb_cow_data() result is used as the fragment count */
+
+ assoclen = sizeof(*esph);
+ sglists = 1;
+ seqhilen = 0;
+
+ if (x->props.flags & XFRM_STATE_ESN) {
+ sglists += 2; /* three assoc entries (spi, seqhi, seq_no) instead of one */
+ seqhilen += sizeof(__be32);
+ assoclen += seqhilen;
+ }
+
+ err = -ENOMEM;
+ tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
+ if (!tmp)
+ goto out;
+
+ ESP_SKB_CB(skb)->tmp = tmp; /* freed by esp_input_done2() */
+ seqhi = esp_tmp_seqhi(tmp);
+ iv = esp_tmp_iv(aead, tmp, seqhilen);
+ req = esp_tmp_req(aead, iv);
+ asg = esp_req_sg(aead, req);
+ sg = asg + sglists;
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ esph = (struct ip_esp_hdr *)skb->data;
+
+ /* Get ivec. This can be wrong, check against other implementations. */
+ iv = esph->enc_data; /* replaces the scratch-buffer IV pointer computed above */
+
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ sg_init_table(asg, 3);
+ sg_set_buf(asg, &esph->spi, sizeof(__be32));
+ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
+ sg_set_buf(asg + 1, seqhi, seqhilen);
+ sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
+ } else
+ sg_init_one(asg, esph, sizeof(*esph));
+
+ aead_request_set_callback(req, 0, esp_input_done, skb);
+ aead_request_set_crypt(req, sg, sg, elen, iv);
+ aead_request_set_assoc(req, asg, assoclen);
+
+ err = crypto_aead_decrypt(req);
+ if (err == -EINPROGRESS) /* async: esp_input_done() finishes the packet later */
+ goto out;
+
+ err = esp_input_done2(skb, err); /* synchronous completion, success or failure */
+
+out:
+ return err;
+}
+
+/* .get_mtu hook of esp_type: payload room left in @mtu after ESP overhead. */
+static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
+{
+	struct esp_data *esp = x->data;
+	u32 blksize = ALIGN(crypto_aead_blocksize(esp->aead), 4);
+	u32 align = max_t(u32, blksize, esp->padlen);
+	unsigned int net_adj;
+	u32 room;
+
+	if (x->props.mode == XFRM_MODE_TRANSPORT ||
+	    x->props.mode == XFRM_MODE_BEET)
+		net_adj = sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_TUNNEL)
+		net_adj = 0;
+	else
+		BUG();
+
+	/* Take off header and ICV, then round down to the alignment. */
+	room = mtu - x->props.header_len - crypto_aead_authsize(esp->aead) -
+	       net_adj;
+	return (room & ~(align - 1)) + (net_adj - 2);
+}
+
+static void esp4_err(struct sk_buff *skb, u32 info)
+{
+ struct net *net = dev_net(skb->dev);
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ struct xfrm_state *x;
+
+ if (icmp_hdr(skb)->type != ICMP_DEST_UNREACH ||
+ icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return;
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ esph->spi, IPPROTO_ESP, AF_INET);
+ if (!x)
+ return;
+ NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+ ntohl(esph->spi), ntohl(iph->daddr));
+ xfrm_state_put(x);
+}
+
+/* .destructor hook of esp_type: free the transform and the ESP state data. */
+static void esp_destroy(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+
+	if (esp) {
+		crypto_free_aead(esp->aead);
+		kfree(esp);
+	}
+}
+
+/*
+ * Set up the transform named by x->aead: allocate it, install the key and
+ * configure the ICV length.  The transform is stored in esp->aead as soon
+ * as it exists, so a later failure leaves it there (esp_destroy() frees it).
+ */
+static int esp_init_aead(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+	struct crypto_aead *tfm;
+	int rc;
+
+	tfm = crypto_alloc_aead(x->aead->alg_name, 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+	esp->aead = tfm;
+
+	/* (len + 7) / 8: presumably a bit count rounded up to whole bytes */
+	rc = crypto_aead_setkey(tfm, x->aead->alg_key,
+				(x->aead->alg_key_len + 7) / 8);
+	if (rc)
+		return rc;
+
+	/* alg_icv_len is divided by 8 the same way before use */
+	return crypto_aead_setauthsize(tfm, x->aead->alg_icv_len / 8);
+}
+
+static int esp_init_authenc(struct xfrm_state *x) /* build an "authenc"/"authencesn" transform from x->ealg and the optional x->aalg */
+{
+ struct esp_data *esp = x->data;
+ struct crypto_aead *aead;
+ struct crypto_authenc_key_param *param;
+ struct rtattr *rta;
+ char *key;
+ char *p;
+ char authenc_name[CRYPTO_MAX_ALG_NAME];
+ unsigned int keylen;
+ int err;
+
+ err = -EINVAL;
+ if (x->ealg == NULL) /* an encryption algorithm is mandatory on this path */
+ goto error;
+
+ err = -ENAMETOOLONG; /* reported when the composed name does not fit authenc_name */
+
+ if ((x->props.flags & XFRM_STATE_ESN)) {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authencesn(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null", /* no auth algorithm: "digest_null" is used */
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ } else {
+ if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+ "authenc(%s,%s)",
+ x->aalg ? x->aalg->alg_name : "digest_null",
+ x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
+ goto error;
+ }
+
+ aead = crypto_alloc_aead(authenc_name, 0, 0);
+ err = PTR_ERR(aead);
+ if (IS_ERR(aead))
+ goto error;
+
+ esp->aead = aead; /* stored right away; esp_destroy() frees esp->aead */
+
+ keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) + /* blob = rtattr parameter + auth key + enc key */
+ (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+ err = -ENOMEM;
+ key = kmalloc(keylen, GFP_KERNEL);
+ if (!key)
+ goto error;
+
+ p = key;
+ rta = (void *)p; /* the blob starts with an rtattr that carries param */
+ rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+ rta->rta_len = RTA_LENGTH(sizeof(*param));
+ param = RTA_DATA(rta);
+ p += RTA_SPACE(sizeof(*param));
+
+ if (x->aalg) {
+ struct xfrm_algo_desc *aalg_desc;
+
+ memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8); /* auth key follows the parameter */
+ p += (x->aalg->alg_key_len + 7) / 8;
+
+ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+ BUG_ON(!aalg_desc);
+
+ err = -EINVAL;
+ if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+ crypto_aead_authsize(aead)) { /* the transform's auth size must equal the descriptor's full ICV size */
+ NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+ x->aalg->alg_name,
+ crypto_aead_authsize(aead),
+ aalg_desc->uinfo.auth.icv_fullbits/8);
+ goto free_key;
+ }
+
+ err = crypto_aead_setauthsize(
+ aead, x->aalg->alg_trunc_len / 8);
+ if (err)
+ goto free_key;
+ }
+
+ param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+ memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8); /* enc key goes last */
+
+ err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+ kfree(key); /* the temporary key blob is freed on success and failure alike */
+
+error:
+ return err;
+}
+
+/* Set up ESP for @x: transform, header/trailer sizes; 0 or negative errno. */
+static int esp_init_state(struct xfrm_state *x)
+{
+	struct esp_data *esp;
+	struct crypto_aead *aead;
+	u32 align;
+	int err;
+
+	esp = kzalloc(sizeof(*esp), GFP_KERNEL);
+	if (esp == NULL)
+		return -ENOMEM;
+
+	x->data = esp;
+	if (x->aead)
+		err = esp_init_aead(x);
+	else
+		err = esp_init_authenc(x);
+	if (err)
+		goto error;
+
+	aead = esp->aead;
+	esp->padlen = 0;
+
+	x->props.header_len = sizeof(struct ip_esp_hdr) +
+			      crypto_aead_ivsize(aead);
+	if (x->props.mode == XFRM_MODE_TUNNEL)
+		x->props.header_len += sizeof(struct iphdr);
+	else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
+		x->props.header_len += IPV4_BEET_PHMAXLEN;
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+
+		switch (encap->encap_type) {
+		default:
+			/* err is 0 here; without this an unknown type "succeeds" */
+			err = -EINVAL;
+			goto error;
+		case UDP_ENCAP_ESPINUDP:
+			x->props.header_len += sizeof(struct udphdr);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+			break;
+		}
+	}
+
+	align = ALIGN(crypto_aead_blocksize(aead), 4);
+	if (esp->padlen)
+		align = max_t(u32, align, esp->padlen);
+	x->props.trailer_len = align + 1 + crypto_aead_authsize(esp->aead);
+
+error:
+	return err;
+}
+
+static const struct xfrm_type esp_type = /* ESP callbacks; registered for AF_INET in esp4_init() */
+{
+ .description = "ESP4",
+ .owner = THIS_MODULE,
+ .proto = IPPROTO_ESP,
+ .flags = XFRM_TYPE_REPLAY_PROT,
+ .init_state = esp_init_state,
+ .destructor = esp_destroy,
+ .get_mtu = esp4_get_mtu,
+ .input = esp_input,
+ .output = esp_output
+};
+
+static const struct net_protocol esp4_protocol = { /* handler added for IPPROTO_ESP in esp4_init() */
+ .handler = xfrm4_rcv,
+ .err_handler = esp4_err, /* receives ICMP errors; see esp4_err() */
+ .no_policy = 1,
+ .netns_ok = 1,
+};
+
+/* Module init: register the xfrm type, then the IP protocol handler. */
+static int __init esp4_init(void)
+{
+	if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+		pr_info("%s: can't add xfrm type\n", __func__);
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) >= 0)
+		return 0;
+	pr_info("%s: can't add protocol\n", __func__);
+	xfrm_unregister_type(&esp_type, AF_INET);
+	return -EAGAIN;
+}
+
+static void __exit esp4_fini(void) /* module exit: undo esp4_init() in reverse order */
+{
+ if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+ pr_info("%s: can't remove protocol\n", __func__); /* failures are only logged */
+ if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+ pr_info("%s: can't remove xfrm type\n", __func__);
+}
+
+module_init(esp4_init); /* entry point: esp4_init() */
+module_exit(esp4_fini); /* exit point: esp4_fini() */
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP); /* module alias for the AF_INET / XFRM_PROTO_ESP xfrm type */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 00000000..cbe3a685
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,1135 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Authors: Alexey Kuznetsov,
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+/*
+ * Single-table build (CONFIG_IP_MULTIPLE_TABLES unset): create the
+ * local and main tables for @net and hook them into their fixed hash
+ * slots. Returns 0 on success or -ENOMEM if either table cannot be
+ * allocated; nothing is left linked on failure.
+ */
+static int __net_init fib4_rules_init(struct net *net)
+{
+	struct fib_table *tb_local;
+	struct fib_table *tb_main;
+
+	tb_local = fib_trie_table(RT_TABLE_LOCAL);
+	if (!tb_local)
+		return -ENOMEM;
+
+	tb_main = fib_trie_table(RT_TABLE_MAIN);
+	if (!tb_main) {
+		kfree(tb_local);
+		return -ENOMEM;
+	}
+
+	hlist_add_head_rcu(&tb_local->tb_hlist,
+			   &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
+	hlist_add_head_rcu(&tb_main->tb_hlist,
+			   &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
+
+	return 0;
+}
+#else
+
+/*
+ * Return the table with the given id in @net, creating and hashing it
+ * if it does not exist yet. An id of 0 means RT_TABLE_MAIN. Returns
+ * NULL only when a new table could not be allocated.
+ */
+struct fib_table *fib_new_table(struct net *net, u32 id)
+{
+	struct fib_table *table;
+	unsigned int slot;
+
+	if (!id)
+		id = RT_TABLE_MAIN;
+
+	table = fib_get_table(net, id);
+	if (!table) {
+		table = fib_trie_table(id);
+		if (table) {
+			slot = id & (FIB_TABLE_HASHSZ - 1);
+			hlist_add_head_rcu(&table->tb_hlist,
+					   &net->ipv4.fib_table_hash[slot]);
+		}
+	}
+
+	return table;
+}
+
+/*
+ * Look up an existing table by id in @net's table hash. An id of 0
+ * means RT_TABLE_MAIN. The bucket is walked under rcu_read_lock().
+ * Returns the table, or NULL if no table with that id is hashed.
+ */
+struct fib_table *fib_get_table(struct net *net, u32 id)
+{
+	struct fib_table *found = NULL;
+	struct fib_table *cur;
+	struct hlist_node *pos;
+	struct hlist_head *bucket;
+
+	if (!id)
+		id = RT_TABLE_MAIN;
+
+	rcu_read_lock();
+	bucket = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];
+	hlist_for_each_entry_rcu(cur, pos, bucket, tb_hlist) {
+		if (cur->tb_id == id) {
+			found = cur;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+/*
+ * Flush every FIB table of @net. The routing cache is flushed
+ * afterwards only if the summed fib_table_flush() results are non-zero.
+ */
+static void fib_flush(struct net *net)
+{
+	struct fib_table *table;
+	struct hlist_node *pos;
+	struct hlist_head *bucket;
+	unsigned int slot;
+	int total = 0;
+
+	for (slot = 0; slot < FIB_TABLE_HASHSZ; slot++) {
+		bucket = &net->ipv4.fib_table_hash[slot];
+		hlist_for_each_entry(table, pos, bucket, tb_hlist)
+			total += fib_table_flush(table);
+	}
+
+	if (!total)
+		return;
+
+	rt_cache_flush(net, -1);
+}
+
+/*
+ * Classify @addr as if only @dev were present in the system. When
+ * @dev is NULL every interface is taken into consideration.
+ *
+ * Zeronet and limited-broadcast addresses are RTN_BROADCAST, multicast
+ * addresses are RTN_MULTICAST. Everything else is looked up in the
+ * local table: a hit on a matching device yields the route's type, a
+ * miss (or a hit on another device) yields RTN_UNICAST. If there is no
+ * local table at all the result is RTN_BROADCAST.
+ */
+static inline unsigned __inet_dev_addr_type(struct net *net,
+					    const struct net_device *dev,
+					    __be32 addr)
+{
+	struct flowi4 fl4 = { .daddr = addr };
+	struct fib_result res;
+	struct fib_table *tb;
+	unsigned type;
+
+	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
+		return RTN_BROADCAST;
+	if (ipv4_is_multicast(addr))
+		return RTN_MULTICAST;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
+	tb = fib_get_table(net, RT_TABLE_LOCAL);
+	if (!tb)
+		return RTN_BROADCAST;
+
+	type = RTN_UNICAST;
+	rcu_read_lock();
+	if (fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF) == 0 &&
+	    (!dev || dev == res.fi->fib_dev))
+		type = res.type;
+	rcu_read_unlock();
+
+	return type;
+}
+
+/*
+ * Return the route type of @addr in @net, taking all interfaces into
+ * account (passes a NULL device to __inet_dev_addr_type()).
+ */
+unsigned int inet_addr_type(struct net *net, __be32 addr)
+{
+ return __inet_dev_addr_type(net, NULL, addr);
+}
+EXPORT_SYMBOL(inet_addr_type);
+
+/*
+ * Return the route type of @addr in @net as seen from @dev only; see
+ * __inet_dev_addr_type() for how a NULL @dev is treated.
+ */
+unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
+ __be32 addr)
+{
+ return __inet_dev_addr_type(net, dev, addr);
+}
+EXPORT_SYMBOL(inet_dev_addr_type);
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ * address.
+ * - figure out what "logical" interface this packet arrived
+ * and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
+ *
+ * Returns -EINVAL if @dev has no in_device or the reverse route is
+ * neither unicast nor (with accept_local set) local, -EXDEV if the
+ * reverse-path filter rejects the packet, and otherwise 0 or 1, where
+ * 1 means the matched nexthop has scope >= RT_SCOPE_HOST.
+ * *spec_dst and *itag are filled in on the accepting paths.
+ */
+int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos,
+ int oif, struct net_device *dev, __be32 *spec_dst,
+ u32 *itag)
+{
+ struct in_device *in_dev;
+ struct flowi4 fl4;
+ struct fib_result res;
+ int no_addr, rpf, accept_local;
+ bool dev_match;
+ int ret;
+ struct net *net;
+
+ /* Reverse lookup: the packet's source becomes the flow destination
+ * and the packet's destination becomes the flow source.
+ */
+ fl4.flowi4_oif = 0;
+ fl4.flowi4_iif = oif;
+ fl4.daddr = src;
+ fl4.saddr = dst;
+ fl4.flowi4_tos = tos;
+ fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
+
+ no_addr = rpf = accept_local = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
+ no_addr = in_dev->ifa_list == NULL;
+
+ /* Ignore rp_filter for packets protected by IPsec. */
+ rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev);
+
+ accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
+ fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
+ }
+
+ if (in_dev == NULL)
+ goto e_inval;
+
+ net = dev_net(dev);
+ /* No route back to the source at all. */
+ if (fib_lookup(net, &fl4, &res))
+ goto last_resort;
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !accept_local)
+ goto e_inval;
+ }
+ *spec_dst = FIB_RES_PREFSRC(net, res);
+ fib_combine_itag(itag, &res);
+ dev_match = false;
+
+ /* Does the reverse route leave through the receiving device? */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ /* ret is only used as the nexthop index in this loop. */
+ for (ret = 0; ret < res.fi->fib_nhs; ret++) {
+ struct fib_nh *nh = &res.fi->fib_nh[ret];
+
+ if (nh->nh_dev == dev) {
+ dev_match = true;
+ break;
+ }
+ }
+#else
+ if (FIB_RES_DEV(res) == dev)
+ dev_match = true;
+#endif
+ if (dev_match) {
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ return ret;
+ }
+ if (no_addr)
+ goto last_resort;
+ /* Only rpf == 1 rejects here; other values go on to the second
+ * lookup below.
+ */
+ if (rpf == 1)
+ goto e_rpf;
+ /* Retry the lookup restricted to the receiving device. */
+ fl4.flowi4_oif = dev->ifindex;
+
+ ret = 0;
+ if (fib_lookup(net, &fl4, &res) == 0) {
+ if (res.type == RTN_UNICAST) {
+ *spec_dst = FIB_RES_PREFSRC(net, res);
+ ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ }
+ }
+ return ret;
+
+last_resort:
+ /* Any non-zero rpf setting rejects on this path. */
+ if (rpf)
+ goto e_rpf;
+ *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ *itag = 0;
+ return 0;
+
+e_inval:
+ return -EINVAL;
+e_rpf:
+ return -EXDEV;
+}
+
+/* View a generic sockaddr as sockaddr_in and return its IPv4 address field. */
+static inline __be32 sk_extract_addr(struct sockaddr *addr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+
+	return sin->sin_addr.s_addr;
+}
+
+/*
+ * Write one 4-byte attribute of the given type into the buffer @mx at
+ * byte offset @len and return the offset just past it
+ * (@len + nla_total_size(4)). The caller provides the space.
+ */
+static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
+{
+	struct nlattr *attr = (struct nlattr *) ((char *) mx + len);
+	u32 *payload;
+
+	attr->nla_type = type;
+	attr->nla_len = nla_attr_size(4);
+
+	payload = (u32 *) nla_data(attr);
+	*payload = value;
+
+	return len + nla_total_size(4);
+}
+
+/*
+ * Translate the struct rtentry passed to the SIOCADDRT/SIOCDELRT
+ * ioctls into a struct fib_config.
+ *
+ * @cfg is zeroed first. For non-delete requests with RTF_MTU,
+ * RTF_WINDOW or RTF_IRTT set, cfg->fc_mx points at a kzalloc()ed
+ * attribute buffer on return; ip_rt_ioctl() kfree()s it.
+ *
+ * Returns 0 on success, or -EAFNOSUPPORT, -EINVAL, -EFAULT, -ENODEV
+ * or -ENOMEM.
+ */
+static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
+ struct fib_config *cfg)
+{
+ __be32 addr;
+ int plen;
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->fc_nlinfo.nl_net = net;
+
+ if (rt->rt_dst.sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /*
+ * Check mask for validity:
+ * a) it must be contiguous.
+ * b) destination must have all host bits clear.
+ * c) if application forgot to set correct family (AF_INET),
+ * reject request unless it is absolutely clear i.e.
+ * both family and mask are zero.
+ */
+ /* Prefix length stays 32 for RTF_HOST routes. */
+ plen = 32;
+ addr = sk_extract_addr(&rt->rt_dst);
+ if (!(rt->rt_flags & RTF_HOST)) {
+ __be32 mask = sk_extract_addr(&rt->rt_genmask);
+
+ if (rt->rt_genmask.sa_family != AF_INET) {
+ if (mask || rt->rt_genmask.sa_family)
+ return -EAFNOSUPPORT;
+ }
+
+ if (bad_mask(mask, addr))
+ return -EINVAL;
+
+ plen = inet_mask_len(mask);
+ }
+
+ cfg->fc_dst_len = plen;
+ cfg->fc_dst = addr;
+
+ if (cmd != SIOCDELRT) {
+ cfg->fc_nlflags = NLM_F_CREATE;
+ cfg->fc_protocol = RTPROT_BOOT;
+ }
+
+ /* A non-zero rt_metric N becomes priority N - 1; zero leaves the
+ * priority at 0 from the memset above.
+ */
+ if (rt->rt_metric)
+ cfg->fc_priority = rt->rt_metric - 1;
+
+ /* Reject routes need no device, gateway or metrics. */
+ if (rt->rt_flags & RTF_REJECT) {
+ cfg->fc_scope = RT_SCOPE_HOST;
+ cfg->fc_type = RTN_UNREACHABLE;
+ return 0;
+ }
+
+ /* RT_SCOPE_NOWHERE is a placeholder; it is replaced below. */
+ cfg->fc_scope = RT_SCOPE_NOWHERE;
+ cfg->fc_type = RTN_UNICAST;
+
+ if (rt->rt_dev) {
+ char *colon;
+ struct net_device *dev;
+ char devname[IFNAMSIZ];
+
+ /* Copy IFNAMSIZ-1 bytes of the user-supplied name and
+ * force termination.
+ */
+ if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+
+ devname[IFNAMSIZ-1] = 0;
+ /* "dev:label" form: the part before the colon names the
+ * device, the full string is an address label.
+ */
+ colon = strchr(devname, ':');
+ if (colon)
+ *colon = 0;
+ dev = __dev_get_by_name(net, devname);
+ if (!dev)
+ return -ENODEV;
+ cfg->fc_oif = dev->ifindex;
+ if (colon) {
+ struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return -ENODEV;
+ /* Restore the full label before comparing. */
+ *colon = ':';
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+ if (strcmp(ifa->ifa_label, devname) == 0)
+ break;
+ if (ifa == NULL)
+ return -ENODEV;
+ cfg->fc_prefsrc = ifa->ifa_local;
+ }
+ }
+
+ addr = sk_extract_addr(&rt->rt_gateway);
+ if (rt->rt_gateway.sa_family == AF_INET && addr) {
+ cfg->fc_gw = addr;
+ if (rt->rt_flags & RTF_GATEWAY &&
+ inet_addr_type(net, addr) == RTN_UNICAST)
+ cfg->fc_scope = RT_SCOPE_UNIVERSE;
+ }
+
+ /* Delete requests skip the remaining checks and the metrics. */
+ if (cmd == SIOCDELRT)
+ return 0;
+
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ return -EINVAL;
+
+ if (cfg->fc_scope == RT_SCOPE_NOWHERE)
+ cfg->fc_scope = RT_SCOPE_LINK;
+
+ if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
+ struct nlattr *mx;
+ int len = 0;
+
+ /* Room for at most three 4-byte attributes. */
+ mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
+ if (mx == NULL)
+ return -ENOMEM;
+
+ /* NOTE(review): the 40 is presumably IPv4 + TCP header
+ * size; rt_mtu is not range-checked here.
+ */
+ if (rt->rt_flags & RTF_MTU)
+ len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
+
+ if (rt->rt_flags & RTF_WINDOW)
+ len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
+
+ if (rt->rt_flags & RTF_IRTT)
+ len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
+
+ cfg->fc_mx = mx;
+ cfg->fc_mx_len = len;
+ }
+
+ return 0;
+}
+
+/*
+ * Routing-table ioctl entry point. Only SIOCADDRT and SIOCDELRT are
+ * handled; any other command yields -EINVAL. The caller needs
+ * CAP_NET_ADMIN (-EPERM otherwise) and @arg must point to a readable
+ * struct rtentry (-EFAULT otherwise). The table update itself runs
+ * under the rtnl lock.
+ */
+int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct fib_config cfg;
+	struct fib_table *tb;
+	struct rtentry rt;
+	int err;
+
+	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+		return -EINVAL;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&rt, arg, sizeof(rt)))
+		return -EFAULT;
+
+	rtnl_lock();
+
+	err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
+	if (err != 0)
+		goto unlock;
+
+	if (cmd == SIOCDELRT) {
+		/* Deleting never creates a table. */
+		tb = fib_get_table(net, cfg.fc_table);
+		err = tb ? fib_table_delete(tb, &cfg) : -ESRCH;
+	} else {
+		tb = fib_new_table(net, cfg.fc_table);
+		err = tb ? fib_table_insert(tb, &cfg) : -ENOBUFS;
+	}
+
+	/* allocated by rtentry_to_fib_config() */
+	kfree(cfg.fc_mx);
+
+unlock:
+	rtnl_unlock();
+	return err;
+}
+
+/*
+ * Netlink attribute policy for IPv4 route messages; rtm_to_fib_config()
+ * passes it to nlmsg_validate() before reading any attribute.
+ *
+ * Every attribute that is later read with a fixed-width getter needs
+ * an entry here so that a too-short payload is rejected during
+ * validation. RTA_TABLE is read with nla_get_u32() in
+ * rtm_to_fib_config() and therefore must be listed as NLA_U32 too.
+ */
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+	[RTA_DST]		= { .type = NLA_U32 },
+	[RTA_SRC]		= { .type = NLA_U32 },
+	[RTA_IIF]		= { .type = NLA_U32 },
+	[RTA_OIF]		= { .type = NLA_U32 },
+	[RTA_GATEWAY]		= { .type = NLA_U32 },
+	[RTA_PRIORITY]		= { .type = NLA_U32 },
+	[RTA_PREFSRC]		= { .type = NLA_U32 },
+	[RTA_METRICS]		= { .type = NLA_NESTED },
+	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
+	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_TABLE]		= { .type = NLA_U32 },
+};
+
+/*
+ * Build a struct fib_config from a netlink route request.
+ *
+ * The message is validated against rtm_ipv4_policy first; @cfg is
+ * only zeroed and filled once validation succeeded. Metrics and
+ * multipath attributes are referenced in place (pointer + length),
+ * not copied. Returns 0, the nlmsg_validate() error, or -EINVAL when
+ * the route type is above RTN_MAX.
+ */
+static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
+			     struct nlmsghdr *nlh, struct fib_config *cfg)
+{
+	struct rtmsg *rtm;
+	struct nlattr *nla;
+	int rem;
+	int rc;
+
+	rc = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
+	if (rc < 0)
+		return rc;
+
+	memset(cfg, 0, sizeof(*cfg));
+
+	/* Fixed header fields. */
+	rtm = nlmsg_data(nlh);
+	cfg->fc_dst_len = rtm->rtm_dst_len;
+	cfg->fc_tos = rtm->rtm_tos;
+	cfg->fc_table = rtm->rtm_table;
+	cfg->fc_protocol = rtm->rtm_protocol;
+	cfg->fc_scope = rtm->rtm_scope;
+	cfg->fc_type = rtm->rtm_type;
+	cfg->fc_flags = rtm->rtm_flags;
+	cfg->fc_nlflags = nlh->nlmsg_flags;
+
+	/* Requester information. */
+	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
+	cfg->fc_nlinfo.nlh = nlh;
+	cfg->fc_nlinfo.nl_net = net;
+
+	if (cfg->fc_type > RTN_MAX)
+		return -EINVAL;
+
+	/* Optional attributes; unknown types are ignored. */
+	nlmsg_for_each_attr(nla, nlh, sizeof(struct rtmsg), rem) {
+		switch (nla_type(nla)) {
+		case RTA_TABLE:
+			/* Overrides the table id taken from the header. */
+			cfg->fc_table = nla_get_u32(nla);
+			break;
+		case RTA_DST:
+			cfg->fc_dst = nla_get_be32(nla);
+			break;
+		case RTA_GATEWAY:
+			cfg->fc_gw = nla_get_be32(nla);
+			break;
+		case RTA_PREFSRC:
+			cfg->fc_prefsrc = nla_get_be32(nla);
+			break;
+		case RTA_OIF:
+			cfg->fc_oif = nla_get_u32(nla);
+			break;
+		case RTA_PRIORITY:
+			cfg->fc_priority = nla_get_u32(nla);
+			break;
+		case RTA_FLOW:
+			cfg->fc_flow = nla_get_u32(nla);
+			break;
+		case RTA_METRICS:
+			cfg->fc_mx = nla_data(nla);
+			cfg->fc_mx_len = nla_len(nla);
+			break;
+		case RTA_MULTIPATH:
+			cfg->fc_mp = nla_data(nla);
+			cfg->fc_mp_len = nla_len(nla);
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * RTM_DELROUTE handler: parse the request and delete the route from
+ * the addressed table. Returns -ESRCH if that table does not exist.
+ */
+static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_config cfg;
+	struct fib_table *table;
+	int rc;
+
+	rc = rtm_to_fib_config(net, skb, nlh, &cfg);
+	if (rc < 0)
+		return rc;
+
+	table = fib_get_table(net, cfg.fc_table);
+	if (!table)
+		return -ESRCH;
+
+	return fib_table_delete(table, &cfg);
+}
+
+/*
+ * RTM_NEWROUTE handler: parse the request and insert the route,
+ * creating the addressed table on demand. Returns -ENOBUFS if the
+ * table cannot be obtained.
+ */
+static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct net *net = sock_net(skb->sk);
+	struct fib_config cfg;
+	struct fib_table *table;
+	int rc;
+
+	rc = rtm_to_fib_config(net, skb, nlh, &cfg);
+	if (rc < 0)
+		return rc;
+
+	table = fib_new_table(net, cfg.fc_table);
+	if (!table)
+		return -ENOBUFS;
+
+	return fib_table_insert(table, &cfg);
+}
+
+/*
+ * Netlink dump callback registered for RTM_GETROUTE in ip_fib_init().
+ *
+ * Requests carrying RTM_F_CLONED are handed to ip_rt_dump(). Otherwise
+ * every table in every hash bucket is dumped via fib_table_dump().
+ * The resume position is kept in cb->args[0] (hash bucket) and
+ * cb->args[1] (index of the table within the bucket). Returns skb->len.
+ */
+static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ unsigned int h, s_h;
+ unsigned int e = 0, s_e;
+ struct fib_table *tb;
+ struct hlist_node *node;
+ struct hlist_head *head;
+ int dumped = 0;
+
+ if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
+ ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
+ return ip_rt_dump(skb, cb);
+
+ /* Position saved by the previous invocation of this dump. */
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
+ /* s_e only applies to the first bucket visited; it is reset to 0
+ * for every following bucket.
+ */
+ for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
+ e = 0;
+ head = &net->ipv4.fib_table_hash[h];
+ hlist_for_each_entry(tb, node, head, tb_hlist) {
+ /* Skip tables already handled before the resume point. */
+ if (e < s_e)
+ goto next;
+ /* Once one table has been dumped in this call, clear
+ * cb->args[2] onwards before starting the next one
+ * (presumably per-table cursor state of
+ * fib_table_dump() -- confirm there).
+ */
+ if (dumped)
+ memset(&cb->args[2], 0, sizeof(cb->args) -
+ 2 * sizeof(cb->args[0]));
+ if (fib_table_dump(tb, skb, cb) < 0)
+ goto out;
+ dumped = 1;
+next:
+ e++;
+ }
+ }
+out:
+ /* Remember where to continue on the next call. */
+ cb->args[1] = e;
+ cb->args[0] = h;
+
+ return skb->len;
+}
+
+/* Build an in-kernel route request for an interface address and hand
+ * it straight to the FIB engine instead of going through a netlink
+ * message (netlink may not be configured). This is legal because all
+ * such events occur only when netlink is already locked.
+ *
+ * RTN_UNICAST routes go to the main table, everything else to the
+ * local table. RTN_LOCAL routes get host scope, all others link scope.
+ * @cmd selects insert (RTM_NEWROUTE) or delete (anything else).
+ */
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+	struct fib_table *tb;
+	struct fib_config cfg = {
+		.fc_protocol = RTPROT_KERNEL,
+		.fc_type = type,
+		.fc_dst = dst,
+		.fc_dst_len = dst_len,
+		.fc_prefsrc = ifa->ifa_local,
+		.fc_oif = dev->ifindex,
+		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
+		.fc_nlinfo = {
+			.nl_net = net,
+		},
+	};
+
+	tb = fib_new_table(net, type == RTN_UNICAST ? RT_TABLE_MAIN
+						    : RT_TABLE_LOCAL);
+	if (!tb)
+		return;
+
+	cfg.fc_table = tb->tb_id;
+	cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;
+
+	if (cmd == RTM_NEWROUTE)
+		fib_table_insert(tb, &cfg);
+	else
+		fib_table_delete(tb, &cfg);
+}
+
+/*
+ * Install the routes implied by interface address @ifa.
+ *
+ * The local route for the address is always added. Only when the
+ * device is up are the explicit broadcast route, the prefix route and
+ * the two network-specific broadcast routes added as well. A secondary
+ * address uses the primary address of its subnet as route owner and
+ * does not get its own prefix route.
+ */
+void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+	struct in_device *idev = ifa->ifa_dev;
+	struct net_device *ndev = idev->dev;
+	struct in_ifaddr *primary = ifa;
+	__be32 netmask = ifa->ifa_mask;
+	__be32 local = ifa->ifa_local;
+	__be32 network = ifa->ifa_address & netmask;
+
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
+		primary = inet_ifa_byprefix(idev, network, netmask);
+		if (!primary) {
+			pr_warn("%s: bug: prim == NULL\n", __func__);
+			return;
+		}
+	}
+
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, local, 32, primary);
+
+	if (!(ndev->flags & IFF_UP))
+		return;
+
+	/* Add broadcast address, if it is explicitly assigned. */
+	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast,
+			  32, primary);
+
+	/* No prefix route for zeronet prefixes, for secondaries, or for
+	 * a /32 whose prefix equals the address itself.
+	 */
+	if (ipv4_is_zeronet(network) || (ifa->ifa_flags & IFA_F_SECONDARY))
+		return;
+	if (network == local && ifa->ifa_prefixlen >= 32)
+		return;
+
+	fib_magic(RTM_NEWROUTE,
+		  ndev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+		  network, ifa->ifa_prefixlen, primary);
+
+	/* Network-specific broadcasts only make sense below /31. */
+	if (ifa->ifa_prefixlen >= 31)
+		return;
+
+	fib_magic(RTM_NEWROUTE, RTN_BROADCAST, network, 32, primary);
+	fib_magic(RTM_NEWROUTE, RTN_BROADCAST, network | ~netmask,
+		  32, primary);
+}
+
+/* Delete primary or secondary address.
+ * Optionally, on secondary address promotion consider the addresses
+ * from subnet iprim as deleted, even if they are in device list.
+ * In this case the secondary ifa can be in device list.
+ *
+ * The routes that fib_add_ifaddr() would have installed for @ifa are
+ * removed, except those that another address remaining on the device
+ * still needs; the scan below records those in the "ok" bitmask.
+ */
+void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *ifa1;
+ struct in_ifaddr *prim = ifa, *prim1 = NULL;
+ __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+ __be32 any = ifa->ifa_address & ifa->ifa_mask;
+ /* Bits in "ok": set when a remaining address still uses the
+ * corresponding route, so it must NOT be deleted.
+ * LOCAL_OK - local route for ifa->ifa_local
+ * BRD_OK - broadcast route for ifa->ifa_broadcast
+ * BRD0_OK - broadcast route for "any" (network address)
+ * BRD1_OK - broadcast route for "brd" (all host bits set)
+ */
+#define LOCAL_OK 1
+#define BRD_OK 2
+#define BRD0_OK 4
+#define BRD1_OK 8
+ unsigned ok = 0;
+ int subnet = 0; /* Primary network */
+ int gone = 1; /* Address is missing */
+ int same_prefsrc = 0; /* Another primary with same IP */
+
+ if (ifa->ifa_flags & IFA_F_SECONDARY) {
+ prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+ if (prim == NULL) {
+ pr_warn("%s: bug: prim == NULL\n", __func__);
+ return;
+ }
+ if (iprim && iprim != prim) {
+ pr_warn("%s: bug: iprim != prim\n", __func__);
+ return;
+ }
+ } else if (!ipv4_is_zeronet(any) &&
+ (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
+ /* Primary address: drop its prefix route right away. */
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ any, ifa->ifa_prefixlen, prim);
+ subnet = 1;
+ }
+
+ /* Deletion is more complicated than add.
+ * We should take care not to delete too much :-)
+ *
+ * Scan address list to be sure that addresses are really gone.
+ */
+
+ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ if (ifa1 == ifa) {
+ /* promotion, keep the IP */
+ gone = 0;
+ continue;
+ }
+ /* Ignore IFAs from our subnet */
+ if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, iprim))
+ continue;
+
+ /* Ignore ifa1 if it uses different primary IP (prefsrc) */
+ if (ifa1->ifa_flags & IFA_F_SECONDARY) {
+ /* Another address from our subnet? */
+ if (ifa1->ifa_mask == prim->ifa_mask &&
+ inet_ifa_match(ifa1->ifa_address, prim))
+ prim1 = prim;
+ else {
+ /* We reached the secondaries, so
+ * same_prefsrc should be determined.
+ */
+ if (!same_prefsrc)
+ continue;
+ /* Search new prim1 if ifa1 is not
+ * using the current prim1
+ */
+ if (!prim1 ||
+ ifa1->ifa_mask != prim1->ifa_mask ||
+ !inet_ifa_match(ifa1->ifa_address, prim1))
+ prim1 = inet_ifa_byprefix(in_dev,
+ ifa1->ifa_address,
+ ifa1->ifa_mask);
+ if (!prim1)
+ continue;
+ if (prim1->ifa_local != prim->ifa_local)
+ continue;
+ }
+ } else {
+ if (prim->ifa_local != ifa1->ifa_local)
+ continue;
+ prim1 = ifa1;
+ if (prim != prim1)
+ same_prefsrc = 1;
+ }
+ /* ifa1 shares our prefsrc: note which of our routes it
+ * still needs.
+ */
+ if (ifa->ifa_local == ifa1->ifa_local)
+ ok |= LOCAL_OK;
+ if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+ ok |= BRD_OK;
+ if (brd == ifa1->ifa_broadcast)
+ ok |= BRD1_OK;
+ if (any == ifa1->ifa_broadcast)
+ ok |= BRD0_OK;
+ /* primary has network specific broadcasts */
+ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
+ __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
+ __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
+
+ if (!ipv4_is_zeronet(any1)) {
+ if (ifa->ifa_broadcast == brd1 ||
+ ifa->ifa_broadcast == any1)
+ ok |= BRD_OK;
+ if (brd == brd1 || brd == any1)
+ ok |= BRD1_OK;
+ if (any == brd1 || any == any1)
+ ok |= BRD0_OK;
+ }
+ }
+ }
+
+ /* Delete only the routes nobody else claimed during the scan. */
+ if (!(ok & BRD_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ if (subnet && ifa->ifa_prefixlen < 31) {
+ if (!(ok & BRD1_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ if (!(ok & BRD0_OK))
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ }
+ if (!(ok & LOCAL_OK)) {
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+ /* Check, that this local address finally disappeared. */
+ if (gone &&
+ inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
+ /* And the last, but not the least thing.
+ * We must flush stray FIB entries.
+ *
+ * First of all, we scan fib_info list searching
+ * for stray nexthop entries, then ignite fib_flush.
+ */
+ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ fib_flush(dev_net(dev));
+ }
+ }
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+/*
+ * Run the lookup described by @frn against table @tb and write the
+ * outcome back into @frn. With no table, frn->err is -ENOENT and
+ * nothing else is touched. The result fields are filled only when the
+ * lookup succeeds.
+ */
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
+{
+	struct flowi4 fl4 = {
+		.daddr = frn->fl_addr,
+		.flowi4_tos = frn->fl_tos,
+		.flowi4_scope = frn->fl_scope,
+		.flowi4_mark = frn->fl_mark,
+	};
+	struct fib_result res;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
+	frn->err = -ENOENT;
+	if (!tb)
+		return;
+
+	local_bh_disable();
+
+	frn->tb_id = tb->tb_id;
+	rcu_read_lock();
+	frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
+	if (frn->err == 0) {
+		frn->prefixlen = res.prefixlen;
+		frn->nh_sel = res.nh_sel;
+		frn->type = res.type;
+		frn->scope = res.scope;
+	}
+	rcu_read_unlock();
+
+	local_bh_enable();
+}
+
+/*
+ * Input handler of the NETLINK_FIB_LOOKUP socket. Messages too short
+ * to hold a struct fib_result_nl are dropped silently. Otherwise the
+ * skb is cloned, the lookup result is written into the clone's
+ * payload and the clone is unicast back to the sender.
+ */
+static void nl_fib_input(struct sk_buff *skb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	struct fib_result_nl *frn;
+	struct fib_table *tb;
+	u32 pid;
+
+	if (skb->len < NLMSG_SPACE(0))
+		return;
+	if (skb->len < nlh->nlmsg_len)
+		return;
+	if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
+		return;
+
+	skb = skb_clone(skb, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	/* From here on work on the clone only. */
+	nlh = nlmsg_hdr(skb);
+	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+
+	tb = fib_get_table(net, frn->tb_id_in);
+	nl_fib_lookup(frn, tb);
+
+	pid = NETLINK_CB(skb).pid;	/* pid of sending process */
+	NETLINK_CB(skb).pid = 0;	/* from kernel */
+	NETLINK_CB(skb).dst_group = 0;	/* unicast */
+	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
+}
+
+/*
+ * Create the per-namespace NETLINK_FIB_LOOKUP kernel socket and store
+ * it in net->ipv4.fibnl. Returns -EAFNOSUPPORT if creation fails, in
+ * which case fibnl is left untouched.
+ */
+static int __net_init nl_fib_lookup_init(struct net *net)
+{
+	struct sock *nlsk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
+						  nl_fib_input, NULL,
+						  THIS_MODULE);
+
+	if (!nlsk)
+		return -EAFNOSUPPORT;
+
+	net->ipv4.fibnl = nlsk;
+	return 0;
+}
+
+/* Release the socket created by nl_fib_lookup_init() and clear the pointer. */
+static void nl_fib_lookup_exit(struct net *net)
+{
+ netlink_kernel_release(net->ipv4.fibnl);
+ net->ipv4.fibnl = NULL;
+}
+
+/*
+ * Take IPv4 routing down on @dev: sync the FIB nexthops down (flushing
+ * the tables if that reports changes), flush the routing cache with
+ * the given @delay and tear down the device's ARP state.
+ */
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
+{
+	int changed = fib_sync_down_dev(dev, force);
+
+	if (changed)
+		fib_flush(dev_net(dev));
+
+	rt_cache_flush(dev_net(dev), delay);
+	arp_ifdown(dev);
+}
+
+/*
+ * inetaddr notifier: keep the FIB in step with addresses being added
+ * to (NETDEV_UP) or removed from (NETDEV_DOWN) an interface. Other
+ * events are ignored. Always returns NOTIFY_DONE.
+ */
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+
+	if (event == NETDEV_UP) {
+		fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(dev_net(dev), -1);
+	} else if (event == NETDEV_DOWN) {
+		fib_del_ifaddr(ifa, NULL);
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		if (!ifa->ifa_dev->ifa_list) {
+			/* Last address was deleted from this interface.
+			 * Disable IP.
+			 */
+			fib_disable_ip(dev, 1, 0);
+		} else {
+			rt_cache_flush(dev_net(dev), -1);
+		}
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * netdevice notifier: react to device state changes.
+ *
+ * NETDEV_UNREGISTER is handled even when the device has no in_device;
+ * every other event is ignored in that case. Always returns
+ * NOTIFY_DONE.
+ */
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
+	struct net *net = dev_net(dev);
+
+	if (event == NETDEV_UNREGISTER) {
+		fib_disable_ip(dev, 2, -1);
+		return NOTIFY_DONE;
+	}
+
+	if (!in_dev)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_UP) {
+		/* Re-add the routes of every address on the device. */
+		for_ifa(in_dev) {
+			fib_add_ifaddr(ifa);
+		} endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev);
+#endif
+		atomic_inc(&net->ipv4.dev_addr_genid);
+		rt_cache_flush(dev_net(dev), -1);
+	} else if (event == NETDEV_DOWN) {
+		fib_disable_ip(dev, 0, 0);
+	} else if (event == NETDEV_CHANGEMTU || event == NETDEV_CHANGE) {
+		rt_cache_flush(dev_net(dev), 0);
+	} else if (event == NETDEV_UNREGISTER_BATCH) {
+		/* The batch unregister is only called on the first
+		 * device in the list of devices being unregistered.
+		 * Therefore we should not pass dev_net(dev) in here.
+		 */
+		rt_cache_flush_batch(NULL);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Address add/remove notifications; registered in ip_fib_init(). */
+static struct notifier_block fib_inetaddr_notifier = {
+ .notifier_call = fib_inetaddr_event,
+};
+
+/* Device state notifications; registered in ip_fib_init(). */
+static struct notifier_block fib_netdev_notifier = {
+ .notifier_call = fib_netdev_event,
+};
+
+/*
+ * Allocate the per-namespace FIB table hash and run fib4_rules_init().
+ * Returns 0 on success, -ENOMEM if the hash cannot be allocated, or
+ * the fib4_rules_init() error (after freeing the hash again).
+ */
+static int __net_init ip_fib_net_init(struct net *net)
+{
+	size_t bytes = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+	int rc;
+
+	/* Avoid false sharing : Use at least a full cache line */
+	bytes = max_t(size_t, bytes, L1_CACHE_BYTES);
+
+	net->ipv4.fib_table_hash = kzalloc(bytes, GFP_KERNEL);
+	if (!net->ipv4.fib_table_hash)
+		return -ENOMEM;
+
+	rc = fib4_rules_init(net);
+	if (rc < 0) {
+		kfree(net->ipv4.fib_table_hash);
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down the per-namespace FIB state: with multiple tables
+ * configured, fib4_rules_exit() runs first; then, under the rtnl lock,
+ * every table is unhashed, flushed and freed, and finally the hash
+ * array itself is released.
+ */
+static void ip_fib_net_exit(struct net *net)
+{
+	struct hlist_node *pos, *next;
+	struct hlist_head *bucket;
+	struct fib_table *table;
+	unsigned int slot;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	fib4_rules_exit(net);
+#endif
+
+	rtnl_lock();
+	for (slot = 0; slot < FIB_TABLE_HASHSZ; slot++) {
+		bucket = &net->ipv4.fib_table_hash[slot];
+		hlist_for_each_entry_safe(table, pos, next, bucket, tb_hlist) {
+			hlist_del(pos);
+			fib_table_flush(table);
+			fib_free_table(table);
+		}
+	}
+	rtnl_unlock();
+
+	kfree(net->ipv4.fib_table_hash);
+}
+
+/*
+ * Per-namespace init: FIB tables, then the lookup netlink socket, then
+ * the proc entries. A failing step unwinds the steps before it in
+ * reverse order and its error is returned.
+ */
+static int __net_init fib_net_init(struct net *net)
+{
+	int rc;
+
+	rc = ip_fib_net_init(net);
+	if (rc < 0)
+		return rc;
+
+	rc = nl_fib_lookup_init(net);
+	if (rc < 0) {
+		ip_fib_net_exit(net);
+		return rc;
+	}
+
+	rc = fib_proc_init(net);
+	if (rc < 0) {
+		nl_fib_lookup_exit(net);
+		ip_fib_net_exit(net);
+	}
+
+	return rc;
+}
+
+/* Per-namespace teardown: undo fib_net_init() in reverse order. */
+static void __net_exit fib_net_exit(struct net *net)
+{
+ fib_proc_exit(net);
+ nl_fib_lookup_exit(net);
+ ip_fib_net_exit(net);
+}
+
+/* Per-namespace init/exit hooks; registered in ip_fib_init(). */
+static struct pernet_operations fib_net_ops = {
+ .init = fib_net_init,
+ .exit = fib_net_exit,
+};
+
+/*
+ * Boot-time setup of the IPv4 FIB frontend: register the rtnetlink
+ * handlers for route add/delete/dump, the per-namespace operations and
+ * the device and address notifiers, then initialise the trie backend.
+ * NOTE(review): the return values of the register_*() calls are not
+ * checked here.
+ */
+void __init ip_fib_init(void)
+{
+ rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+ rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+
+ register_pernet_subsys(&fib_net_ops);
+ register_netdevice_notifier(&fib_netdev_notifier);
+ register_inetaddr_notifier(&fib_inetaddr_notifier);
+
+ fib_trie_init();
+}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 00000000..af0f14ab
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,57 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include
+#include
+#include
+
+/*
+ * One alias entry on a list of routes (linked through fa_list),
+ * pointing at the fib_info it uses.
+ */
+struct fib_alias {
+ struct list_head fa_list; /* linkage in the alias list */
+ struct fib_info *fa_info; /* route information for this alias */
+ u8 fa_tos;
+ u8 fa_type;
+ u8 fa_state; /* FA_S_* flags, see fib_alias_accessed() */
+ struct rcu_head rcu; /* presumably for RCU-deferred freeing -- confirm */
+};
+
+#define FA_S_ACCESSED 0x01
+
+/* Mark the alias as accessed. The flag is written only if it is not
+ * already set, so that fa_state can stay shared between CPUs.
+ */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (fa->fa_state & FA_S_ACCESSED)
+		return;
+
+	fa->fa_state |= FA_S_ACCESSED;
+}
+
+/* Exported by fib_semantics.c */
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(struct fib_config *cfg);
+extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ u32 tb_id, u8 type, __be32 dst,
+ int dst_len, u8 tos, struct fib_info *fi,
+ unsigned int);
+extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
+ int dst_len, u32 tb_id, struct nl_info *info,
+ unsigned int nlm_flags);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+ u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort,
+ int *last_idx, int dflt);
+
+/* Store @fi in the lookup result; no reference count is taken. */
+static inline void fib_result_assign(struct fib_result *res,
+ struct fib_info *fi)
+{
+ /* we used to play games with refcounts, but we now use RCU */
+ res->fi = fi;
+}
+
+/*
+ * Entry type of the fib_props[] table declared below, which has
+ * RTN_MAX + 1 entries (presumably indexed by route type -- confirm
+ * against fib_semantics.c).
+ */
+struct fib_prop {
+ int error;
+ u8 scope;
+};
+
+extern const struct fib_prop fib_props[RTN_MAX + 1];
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 00000000..799fc790
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,309 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: policy rules.
+ *
+ * Authors: Alexey Kuznetsov,
+ * Thomas Graf
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * Rani Assaf : local_rule cannot be deleted
+ * Marc Boucher : routing by fwmark
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * IPv4 policy rule: the generic struct fib_rule extended with IPv4 match
+ * data.
+ *
+ * 'common' has to stay the first member: the callbacks in this file cast
+ * a struct fib_rule pointer straight to struct fib4_rule.
+ */
+struct fib4_rule {
+ struct fib_rule common;
+ u8 dst_len; /* destination prefix length, copied from frh->dst_len */
+ u8 src_len; /* source prefix length, copied from frh->src_len */
+ u8 tos; /* 0 matches any TOS, see fib4_rule_match() */
+ __be32 src;
+ __be32 srcmask; /* inet_make_mask(src_len) */
+ __be32 dst;
+ __be32 dstmask; /* inet_make_mask(dst_len) */
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ u32 tclassid; /* taken from the FRA_FLOW attribute when present */
+#endif
+};
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+/*
+ * Return the tclassid of the rule recorded in @res, or 0 when the
+ * result carries no rule.
+ */
+u32 fib_rules_tclass(const struct fib_result *res)
+{
+	struct fib4_rule *rule4;
+
+	if (!res->r)
+		return 0;
+
+	rule4 = (struct fib4_rule *) res->r;
+	return rule4->tclassid;
+}
+#endif
+
+/*
+ * Look up flow @flp through the IPv4 policy rules of @net.
+ *
+ * The lookup is run with FIB_LOOKUP_NOREF and fills @res; the rule
+ * reported back by fib_rules_lookup() is stored in res->r.
+ *
+ * Returns whatever fib_rules_lookup() returns.
+ */
+int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
+{
+	struct fib_lookup_arg lookup = {
+		.flags = FIB_LOOKUP_NOREF,
+		.result = res,
+	};
+	int rc;
+
+	rc = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &lookup);
+
+	/* res->r is updated regardless of the lookup outcome */
+	res->r = lookup.rule;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(fib_lookup);
+
+/*
+ * Apply the action of a matching rule.
+ *
+ * FR_ACT_TO_TBL looks the flow up in the rule's table; every other
+ * action maps to a fixed error:
+ *   FR_ACT_UNREACHABLE     -> -ENETUNREACH
+ *   FR_ACT_PROHIBIT        -> -EACCES
+ *   FR_ACT_BLACKHOLE/other -> -EINVAL
+ *
+ * Returns -EAGAIN when the table does not exist or when the table
+ * lookup reports a positive value; otherwise the table lookup result.
+ */
+static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
+			    int flags, struct fib_lookup_arg *arg)
+{
+	struct fib_table *table;
+	int rc;
+
+	switch (rule->action) {
+	case FR_ACT_TO_TBL:
+		break;
+
+	case FR_ACT_UNREACHABLE:
+		return -ENETUNREACH;
+
+	case FR_ACT_PROHIBIT:
+		return -EACCES;
+
+	case FR_ACT_BLACKHOLE:
+	default:
+		return -EINVAL;
+	}
+
+	table = fib_get_table(rule->fr_net, rule->table);
+	if (table == NULL)
+		return -EAGAIN;
+
+	rc = fib_table_lookup(table, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
+
+	/* a positive result is reported to the caller as -EAGAIN */
+	return rc > 0 ? -EAGAIN : rc;
+}
+
+
+/*
+ * Test whether flow @fl satisfies the IPv4 selectors of @rule.
+ *
+ * Source and destination addresses are compared under the rule's masks;
+ * a rule TOS of 0 matches any TOS.
+ *
+ * Returns 1 on match, 0 otherwise.
+ */
+static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+{
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+	struct flowi4 *key = &fl->u.ip4;
+
+	if ((key->saddr ^ rule4->src) & rule4->srcmask)
+		return 0;
+
+	if ((key->daddr ^ rule4->dst) & rule4->dstmask)
+		return 0;
+
+	if (rule4->tos != 0 && rule4->tos != key->flowi4_tos)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Create a table under the lowest id, starting at 1, for which
+ * fib_get_table() finds no table in @net.
+ *
+ * Returns the result of fib_new_table() for that id, or NULL when the
+ * scan up to RT_TABLE_MAX finds no free id.
+ */
+static struct fib_table *fib_empty_table(struct net *net)
+{
+	u32 id = 1;
+
+	while (id <= RT_TABLE_MAX) {
+		if (!fib_get_table(net, id))
+			return fib_new_table(net, id);
+		id++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Netlink attribute policy for IPv4 rules: the generic FRA_* policy plus
+ * FRA_FLOW, declared as a 32-bit value (it is read with nla_get_u32() in
+ * fib4_rule_configure() and fib4_rule_compare()).
+ */
+static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
+ FRA_GENERIC_POLICY,
+ [FRA_FLOW] = { .type = NLA_U32 },
+};
+
+/*
+ * Fill in the IPv4 part of @rule from rule header @frh and attributes @tb.
+ *
+ * When the rule has no table yet (RT_TABLE_UNSPEC) and its action is
+ * FR_ACT_TO_TBL, a new table is created via fib_empty_table() and its id
+ * is stored in the rule.
+ *
+ * tb[FRA_SRC] / tb[FRA_DST] are read whenever the matching prefix length
+ * is non-zero, without a NULL check here.
+ * NOTE(review): presumably the caller has validated their presence --
+ * confirm.
+ *
+ * Returns 0 on success, -EINVAL when frh->tos has bits outside
+ * IPTOS_TOS_MASK, -ENOBUFS when no table could be obtained.
+ */
+static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
+			       struct fib_rule_hdr *frh,
+			       struct nlattr **tb)
+{
+	struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+	struct net *net = sock_net(skb->sk);
+
+	if (frh->tos & ~IPTOS_TOS_MASK)
+		return -EINVAL;
+
+	if (rule->table == RT_TABLE_UNSPEC && rule->action == FR_ACT_TO_TBL) {
+		struct fib_table *fresh = fib_empty_table(net);
+
+		if (!fresh)
+			return -ENOBUFS;
+
+		rule->table = fresh->tb_id;
+	}
+
+	if (frh->src_len)
+		rule4->src = nla_get_be32(tb[FRA_SRC]);
+
+	if (frh->dst_len)
+		rule4->dst = nla_get_be32(tb[FRA_DST]);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW])
+		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
+#endif
+
+	/* prefix lengths and the masks derived from them */
+	rule4->src_len = frh->src_len;
+	rule4->dst_len = frh->dst_len;
+	rule4->srcmask = inet_make_mask(rule4->src_len);
+	rule4->dstmask = inet_make_mask(rule4->dst_len);
+
+	rule4->tos = frh->tos;
+
+	return 0;
+}
+
+/*
+ * Compare @rule with the selectors given in @frh / @tb.
+ *
+ * A selector that is zero (or an absent FRA_FLOW attribute) in the
+ * request acts as a wildcard and is not compared.
+ *
+ * Returns 1 when every given selector equals the rule's, 0 otherwise.
+ */
+static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
+			     struct nlattr **tb)
+{
+	struct fib4_rule *r4 = (struct fib4_rule *) rule;
+
+	/* cheap header fields first: prefix lengths, then TOS */
+	if (frh->src_len) {
+		if (frh->src_len != r4->src_len)
+			return 0;
+	}
+
+	if (frh->dst_len) {
+		if (frh->dst_len != r4->dst_len)
+			return 0;
+	}
+
+	if (frh->tos) {
+		if (frh->tos != r4->tos)
+			return 0;
+	}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+	if (tb[FRA_FLOW]) {
+		if (nla_get_u32(tb[FRA_FLOW]) != r4->tclassid)
+			return 0;
+	}
+#endif
+
+	/* addresses are read from the attributes only when a length is given */
+	if (frh->src_len) {
+		if (nla_get_be32(tb[FRA_SRC]) != r4->src)
+			return 0;
+	}
+
+	if (frh->dst_len) {
+		if (nla_get_be32(tb[FRA_DST]) != r4->dst)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Describe @rule in a rule message: copy the prefix lengths and TOS into
+ * @frh and append the attributes that are set to @skb.
+ *
+ * FRA_DST / FRA_SRC are emitted only for a non-zero prefix length, and
+ * FRA_FLOW (CONFIG_IP_ROUTE_CLASSID only) only for a non-zero tclassid.
+ *
+ * Returns 0 on success, or -ENOBUFS through the nla_put_failure label.
+ * NOTE(review): nothing in this function jumps to that label explicitly;
+ * the NLA_PUT_*() macros presumably do so when an attribute cannot be
+ * added -- confirm against their definition.
+ */
+static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
+ struct fib_rule_hdr *frh)
+{
+ struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+
+ frh->dst_len = rule4->dst_len;
+ frh->src_len = rule4->src_len;
+ frh->tos = rule4->tos;
+
+ if (rule4->dst_len)
+ NLA_PUT_BE32(skb, FRA_DST, rule4->dst);
+
+ if (rule4->src_len)
+ NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (rule4->tclassid)
+ NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
+#endif
+ return 0;
+
+nla_put_failure:
+ return -ENOBUFS;
+}
+
+/*
+ * Size of the IPv4-specific attributes of a rule message: three 4-byte
+ * attributes (dst, src, flow).  The size does not depend on @rule.
+ */
+static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
+{
+	size_t payload = 0;
+
+	payload += nla_total_size(4);	/* dst */
+	payload += nla_total_size(4);	/* src */
+	payload += nla_total_size(4);	/* flow */
+
+	return payload;
+}
+
+/*
+ * flush_cache callback of the IPv4 rule ops: calls rt_cache_flush() for
+ * the namespace that owns @ops.  The meaning of the -1 argument is
+ * defined by rt_cache_flush(), which is not part of this file.
+ */
+static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ rt_cache_flush(ops->fro_net, -1);
+}
+
+/*
+ * Template of the IPv4 rule operations.  fib4_rules_init() passes it to
+ * fib_rules_register() together with the target namespace and works with
+ * the ops pointer returned from there.
+ */
+static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
+ .family = AF_INET,
+ .rule_size = sizeof(struct fib4_rule), /* IPv4 rule, embeds struct fib_rule */
+ .addr_size = sizeof(u32), /* same size as the __be32 src/dst fields */
+ .action = fib4_rule_action,
+ .match = fib4_rule_match,
+ .configure = fib4_rule_configure,
+ .compare = fib4_rule_compare,
+ .fill = fib4_rule_fill,
+ .default_pref = fib_default_rule_pref,
+ .nlmsg_payload = fib4_rule_nlmsg_payload,
+ .flush_cache = fib4_rule_flush_cache,
+ .nlgroup = RTNLGRP_IPV4_RULE,
+ .policy = fib4_rule_policy,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Install the three default rules on @ops, in this order:
+ *   preference 0      -> RT_TABLE_LOCAL
+ *   preference 0x7FFE -> RT_TABLE_MAIN
+ *   preference 0x7FFF -> RT_TABLE_DEFAULT
+ *
+ * Stops at the first failure.  Returns 0 on success or the negative
+ * value returned by fib_default_rule_add().
+ */
+static int fib_default_rules_init(struct fib_rules_ops *ops)
+{
+	int rc;
+
+	rc = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+	if (rc >= 0)
+		rc = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+
+	/* non-negative results are all reported as plain success */
+	return rc < 0 ? rc : 0;
+}
+
+/*
+ * Per-namespace setup of the IPv4 policy rules: register the rule ops
+ * for @net, add the default rules, and publish the ops in
+ * net->ipv4.rules_ops.
+ *
+ * Returns 0 on success, the PTR_ERR() of a failed registration, or the
+ * error from fib_default_rules_init() (the ops are unregistered again in
+ * that case).
+ */
+int __net_init fib4_rules_init(struct net *net)
+{
+	struct fib_rules_ops *registered;
+	int rc;
+
+	registered = fib_rules_register(&fib4_rules_ops_template, net);
+	if (IS_ERR(registered))
+		return PTR_ERR(registered);
+
+	rc = fib_default_rules_init(registered);
+	if (rc < 0) {
+		/* also cleans all rules already added */
+		fib_rules_unregister(registered);
+		return rc;
+	}
+
+	net->ipv4.rules_ops = registered;
+	return 0;
+}
+
+/*
+ * Per-namespace teardown: unregister the rule ops that fib4_rules_init()
+ * stored in net->ipv4.rules_ops.
+ */
+void __net_exit fib4_rules_exit(struct net *net)
+{
+ fib_rules_unregister(net->ipv4.rules_ops);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 00000000..8861f91a
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1249 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 Forwarding Information Base: semantics.
+ *
+ * Authors: Alexey Kuznetsov,
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include