diff options
Diffstat (limited to 'net/core')
37 files changed, 33972 insertions, 0 deletions
diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 00000000..674641b1 --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,23 @@ +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ + gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ + sock_diag.o + +obj-$(CONFIG_XFRM) += flow.o +obj-y += net-sysfs.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NET_DMA) += user_dma.o +obj-$(CONFIG_FIB_RULES) += fib_rules.o +obj-$(CONFIG_TRACEPOINTS) += net-traces.o +obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o +obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 00000000..e4fbfd6e --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,780 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all + * protocols. Possibly a generic IP version on top of these would + * make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + * NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was + * shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old + * udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() + * understood + * Alan Cox : Rewrote skb_read_datagram to avoid the + * skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. + * IPX can no longer use the SO_TYPE hack + * but AX.25 now works right, and SPX is + * feasible. + * Alan Cox : Fixed write poll of non IP protocol + * crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * Pete Wyckoff : Unconnected accept() fix. + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <asm/uaccess.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/poll.h> +#include <linux/highmem.h> +#include <linux/spinlock.h> +#include <linux/slab.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> + +#include <net/checksum.h> +#include <net/sock.h> +#include <net/tcp_states.h> +#include <trace/events/skb.h> + +/* + * Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ + return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + unsigned long bits = (unsigned long)key; + + /* + * Avoid a wakeup if event not interesting for us + */ + if (bits && !(bits & (POLLIN | POLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} +/* + * Wait for a packet.. + */ +static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT_FUNC(wait, receiver_wake_function); + + prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto out; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. + * If so we report the problem + */ + error = -ENOTCONN; + if (connection_based(sk) && + !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + error = 0; + *timeo_p = schedule_timeout(*timeo_p); +out: + finish_wait(sk_sleep(sk), &wait); + return error; +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; + goto out; +out_noerr: + *err = 0; + error = 1; + goto out; +} + +/** + * __skb_recv_datagram - Receive a datagram skbuff + * @sk: socket + * @flags: MSG_ flags + * @off: an offset in bytes to peek skb from. Returns an offset + * within an skb where data actually starts + * @peeked: returns non-zero if this packet has been seen before + * @err: error code returned + * + * Get a datagram skbuff, understands the peeking, nonblocking wakeups + * and possible races. This replaces identical code in packet, raw and + * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + * the long standing peek and read race for datagram sockets. If you + * alter this routine remember it must be re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling + * skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ +struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, + int *peeked, int *off, int *err) +{ + struct sk_buff *skb; + long timeo; + /* + * Caller is allowed not to check sk->sk_err before skb_recv_datagram() + */ + int error = sock_error(sk); + + if (error) + goto no_packet; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + /* Again only user level code calls this function, so nothing + * interrupt level will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was correct in any case. 8) + */ + unsigned long cpu_flags; + struct sk_buff_head *queue = &sk->sk_receive_queue; + + spin_lock_irqsave(&queue->lock, cpu_flags); + skb_queue_walk(queue, skb) { + *peeked = skb->peeked; + if (flags & MSG_PEEK) { + if (*off >= skb->len) { + *off -= skb->len; + continue; + } + skb->peeked = 1; + atomic_inc(&skb->users); + } else + __skb_unlink(skb, queue); + + spin_unlock_irqrestore(&queue->lock, cpu_flags); + return skb; + } + spin_unlock_irqrestore(&queue->lock, cpu_flags); + + /* User doesn't want to wait */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + + } while (!wait_for_packet(sk, err, &timeo)); + + return NULL; + +no_packet: + *err = error; + return NULL; +} +EXPORT_SYMBOL(__skb_recv_datagram); + +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, + int noblock, int *err) +{ + int peeked, off = 0; + + return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + &peeked, &off, err); +} +EXPORT_SYMBOL(skb_recv_datagram); + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ + consume_skb(skb); + sk_mem_reclaim_partial(sk); +} +EXPORT_SYMBOL(skb_free_datagram); + +void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +{ + bool slow; + + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + + slow = lock_sock_fast(sk); + skb_orphan(skb); + sk_mem_reclaim_partial(sk); + unlock_sock_fast(sk, slow); + + /* skb is now orphaned, can be freed outside of locked section */ + trace_kfree_skb(skb, skb_free_datagram_locked); + __kfree_skb(skb); +} +EXPORT_SYMBOL(skb_free_datagram_locked); + +/** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + * + * It returns 0 if the packet was removed by us. + */ + +int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + int err = 0; + + if (flags & MSG_PEEK) { + err = -ENOENT; + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + err = 0; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); + atomic_inc(&sk->sk_drops); + sk_mem_reclaim_partial(sk); + + return err; +} +EXPORT_SYMBOL(skb_kill_datagram); + +/** + * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @len: amount of data to copy from buffer to iovec + * + * Note: the iovec is modified during the copy. + */ +int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, + struct iovec *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovec(to, skb->data + offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovec(to, vaddr + frag->page_offset + + offset - start, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iovec(frag_iter, + offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_iovec); + +/** + * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @to_offset: offset in the io vector to start copying to + * @len: amount of data to copy from buffer to iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, + const struct iovec *to, int to_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovecend(to, vaddr + frag->page_offset + + offset - start, to_offset, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(frag_iter, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_const_iovec); + +/** + * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying to + * @from: io vector to copy to + * @from_offset: offset in the io vector to start copying from + * @len: amount of data to copy to buffer from iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, + const struct iovec *from, int from_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_fromiovecend(skb->data + offset, from, from_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_fromiovecend(vaddr + frag->page_offset + + offset - start, + from, from_offset, copy); + kunmap(page); + if (err) + goto fault; + + if (!(len -= copy)) + return 0; + offset += copy; + from_offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_from_iovec(frag_iter, + offset - start, + from, + from_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from_offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iovec); + +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + u8 __user *to, int len, + __wsum *csump) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + int err = 0; + if (copy > len) + copy = len; + *csump = csum_and_copy_to_user(skb->data + offset, to, copy, + *csump, &err); + if (err) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + __wsum csum2; + int err = 0; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + csum2 = csum_and_copy_to_user(vaddr + + frag->page_offset + + offset - start, + to, copy, 0, &err); + kunmap(page); + if (err) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if (!(len -= copy)) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2 = 0; + if (copy > len) + copy = len; + if (skb_copy_and_csum_datagram(frag_iter, + offset - start, + to, copy, + &csum2)) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) +{ + __sum16 sum; + + sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); + if (likely(!sum)) { + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return sum; +} +EXPORT_SYMBOL(__skb_checksum_complete_head); + +__sum16 __skb_checksum_complete(struct sk_buff *skb) +{ + return __skb_checksum_complete_head(skb, skb->len); +} +EXPORT_SYMBOL(__skb_checksum_complete); + +/** + * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * @skb: skbuff + * @hlen: hardware length + * @iov: io vector + * + * Caller _must_ check that skb will fit to this iovec. + * + * Returns: 0 - success. + * -EINVAL - checksum failure. + * -EFAULT - fault during copy. Beware, in this case iovec + * can be modified! + */ +int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, + int hlen, struct iovec *iov) +{ + __wsum csum; + int chunk = skb->len - hlen; + + if (!chunk) + return 0; + + /* Skip filled elements. + * Pretty silly, look at memcpy_toiovec, though 8) + */ + while (!iov->iov_len) + iov++; + + if (iov->iov_len < chunk) { + if (__skb_checksum_complete(skb)) + goto csum_error; + if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + goto fault; + } else { + csum = csum_partial(skb->data, hlen, skb->csum); + if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + chunk, &csum)) + goto fault; + if (csum_fold(csum)) + goto csum_error; + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + iov->iov_len -= chunk; + iov->iov_base += chunk; + } + return 0; +csum_error: + return -EINVAL; +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ +unsigned int datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + sock_poll_wait(file, sk_sleep(sk), wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP | POLLIN | POLLRDNORM; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->sk_state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} +EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 00000000..fa0c2b80 --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,6554 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Mark Evans, <evansmp@uhura.aston.ac.uk> + * + * Additional Authors: + * Florian la Roche <rzsfl@rz.uni-sb.de> + * Alan Cox <gw4pts@gw4pts.ampr.org> + * David Hinds <dahinds@users.sourceforge.net> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Adam Sulmicki <adam@cfar.umd.edu> + * Pekka Riikonen <priikone@poesidon.pspt.fi> + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? : Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#include <asm/uaccess.h> +#include <linux/bitops.h> +#include <linux/capability.h> +#include <linux/cpu.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/notifier.h> +#include <linux/skbuff.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <linux/rtnetlink.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <net/dst.h> +#include <net/pkt_sched.h> +#include <net/checksum.h> +#include <net/xfrm.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/netpoll.h> +#include <linux/rcupdate.h> +#include <linux/delay.h> +#include <net/wext.h> +#include <net/iw_handler.h> +#include <asm/current.h> +#include <linux/audit.h> +#include <linux/dmaengine.h> +#include <linux/err.h> +#include <linux/ctype.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <net/ip.h> +#include <linux/ipv6.h> +#include <linux/in.h> +#include <linux/jhash.h> +#include <linux/random.h> +#include <trace/events/napi.h> +#include <trace/events/net.h> +#include <trace/events/skb.h> +#include <linux/pci.h> +#include <linux/inetdevice.h> +#include <linux/cpu_rmap.h> +#include <linux/net_tstamp.h> +#include <linux/static_key.h> +#include <net/flow_keys.h> + +#include "net-sysfs.h" + +/* Instead of increasing this, you should create a hash table. */ +#define MAX_GRO_SKBS 8 + +/* This should be increased if a protocol with a bigger head is added. */ +#define GRO_MAX_HEAD (MAX_HEADER + 128) + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * NOTE: That is no longer true with the addition of VLAN tags. Not + * sure which should go first, but I bet it won't make much + * difference if we are running VLANs. The good news is that + * this protocol won't be in the list unless compiled in, so + * the average user (w/out VLANs) will not be adversely affected. + * --BLG + * + * 0800 IP + * 8100 802.1Q VLAN + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +#define PTYPE_HASH_SIZE (16) +#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; +static struct list_head ptype_all __read_mostly; /* Taps */ + +/* + * The @dev_base_head list is protected by @dev_base_lock and the rtnl + * semaphore. + * + * Pure readers hold dev_base_lock for reading, or rcu_read_lock() + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base_head list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. + * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +DEFINE_RWLOCK(dev_base_lock); +EXPORT_SYMBOL(dev_base_lock); + +static inline void dev_base_seq_inc(struct net *net) +{ + while (++net->dev_base_seq == 0); +} + +static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +{ + unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; +} + +static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +{ + return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; +} + +static inline void rps_lock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_lock(&sd->input_pkt_queue.lock); +#endif +} + +static inline void rps_unlock(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + spin_unlock(&sd->input_pkt_queue.lock); +#endif +} + +/* Device list insertion */ +static int list_netdevice(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + ASSERT_RTNL(); + + write_lock_bh(&dev_base_lock); + list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + hlist_add_head_rcu(&dev->index_hlist, + dev_index_hash(net, dev->ifindex)); + write_unlock_bh(&dev_base_lock); + + dev_base_seq_inc(net); + + return 0; +} + +/* Device list removal + * caller must respect a RCU grace period before freeing/reusing dev + */ +static void unlist_netdevice(struct net_device *dev) +{ + ASSERT_RTNL(); + + /* Unlink dev from the device chain */ + write_lock_bh(&dev_base_lock); + list_del_rcu(&dev->dev_list); + hlist_del_rcu(&dev->name_hlist); + hlist_del_rcu(&dev->index_hlist); + write_unlock_bh(&dev_base_lock); + + dev_base_seq_inc(dev_net(dev)); +} + +/* + * Our notifier list + */ + +static RAW_NOTIFIER_HEAD(netdev_chain); + +/* + * Device drivers call our routines to queue packets here. We empty the + * queue in the local softnet handler. + */ + +DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +EXPORT_PER_CPU_SYMBOL(softnet_data); + +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = + {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, + ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, + ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = + {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", + "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", + "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} +#endif + +/******************************************************************************* + + Protocol management and registration routines + +*******************************************************************************/ + +/* + * Add a protocol ID to the list. Now that the input handler is + * smarter we can dispense with all the messy stuff that used to be + * here. + * + * BEWARE!!! Protocol handlers, mangling input packets, + * MUST BE last in hash buckets and checking protocol handlers + * MUST start from promiscuous ptype_all chain in net_bh. + * It is true now, do not change it. + * Explanation follows: if protocol handler, mangling packet, will + * be the first on list, it is not able to sense, that packet + * is cloned and should be copied-on-write, so that it will + * change it and subsequent readers will get broken packet. + * --ANK (980803) + */ + +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + +/** + * dev_add_pack - add packet handler + * @pt: packet type declaration + * + * Add a protocol handler to the networking stack. The passed &packet_type + * is linked into kernel lists and may not be freed until it has been + * removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new packet type (until the next received packet). + */ + +void dev_add_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(dev_add_pack); + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head = ptype_head(pt); + struct packet_type *pt1; + + spin_lock(&ptype_lock); + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + pr_warn("dev_remove_pack: %p not found\n", pt); +out: + spin_unlock(&ptype_lock); +} +EXPORT_SYMBOL(__dev_remove_pack); + +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_pack); + +/****************************************************************************** + + Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + * netdev_boot_setup_add - add new setup entry + * @name: name of the device + * @map: configured settings for the device + * + * Adds new setup entry to the dev_boot_setup list. The function + * returns 0 on error and 1 on success. This is a generic routine to + * all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ + struct netdev_boot_setup *s; + int i; + + s = dev_boot_setup; + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { + memset(s[i].name, 0, sizeof(s[i].name)); + strlcpy(s[i].name, name, IFNAMSIZ); + memcpy(&s[i].map, map, sizeof(s[i].map)); + break; + } + } + + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice + * + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ + struct netdev_boot_setup *s = dev_boot_setup; + int i; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && + !strcmp(dev->name, s[i].name)) { + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; + return 1; + } + } + return 0; +} +EXPORT_SYMBOL(netdev_boot_setup_check); + + +/** + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device + * + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. + */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ + const struct netdev_boot_setup *s = dev_boot_setup; + char name[IFNAMSIZ]; + int i; + + sprintf(name, "%s%d", prefix, unit); + + /* + * If device already registered then return base of 1 + * to indicate not to probe for this interface + */ + if (__dev_get_by_name(&init_net, name)) + return 1; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) + if (!strcmp(name, s[i].name)) + return s[i].map.base_addr; + return 0; +} + +/* + * Saves at boot time configured settings for any netdevice. + */ +int __init netdev_boot_setup(char *str) +{ + int ints[5]; + struct ifmap map; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (!str || !*str) + return 0; + + /* Save settings */ + memset(&map, 0, sizeof(map)); + if (ints[0] > 0) + map.irq = ints[1]; + if (ints[0] > 1) + map.base_addr = ints[2]; + if (ints[0] > 2) + map.mem_start = ints[3]; + if (ints[0] > 3) + map.mem_end = ints[4]; + + /* Add new entry to the list */ + return netdev_boot_setup_add(str, &map); +} + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + + Device Interface Subroutines + +*******************************************************************************/ + +/** + * __dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_name); + +/** + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_name_hash(net, name); + + hlist_for_each_entry_rcu(dev, p, head, name_hlist) + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_name_rcu); + +/** + * dev_get_by_name - find a device by its name + * @net: the applicable net namespace + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + +struct net_device *dev_get_by_name(struct net *net, const char *name) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_name); + +/** + * __dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_get_by_index); + +/** + * dev_get_by_index_rcu - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) +{ + struct hlist_node *p; + struct net_device *dev; + struct hlist_head *head = dev_index_hash(net, ifindex); + + hlist_for_each_entry_rcu(dev, p, head, index_hlist) + if (dev->ifindex == ifindex) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_get_by_index_rcu); + + +/** + * dev_get_by_index - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + dev_hold(dev); + rcu_read_unlock(); + return dev; +} +EXPORT_SYMBOL(dev_get_by_index); + +/** + * dev_getbyhwaddr_rcu - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. + * The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + */ + +struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + for_each_netdev_rcu(net, dev) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr_rcu); + +struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev->type == type) + return dev; + + return NULL; +} +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + +struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) +{ + struct net_device *dev, *ret = NULL; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) + if (dev->type == type) { + dev_hold(dev); + ret = dev; + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * dev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. Must be called inside + * rcu_read_lock(), and result refcount is unchanged. + */ + +struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags, + unsigned short mask) +{ + struct net_device *dev, *ret; + + ret = NULL; + for_each_netdev_rcu(net, dev) { + if (((dev->flags ^ if_flags) & mask) == 0) { + ret = dev; + break; + } + } + return ret; +} +EXPORT_SYMBOL(dev_get_by_flags_rcu); + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * to allow sysfs to work. We also disallow any kind of + * whitespace. + */ +bool dev_valid_name(const char *name) +{ + if (*name == '\0') + return false; + if (strlen(name) >= IFNAMSIZ) + return false; + if (!strcmp(name, ".") || !strcmp(name, "..")) + return false; + + while (*name) { + if (*name == '/' || isspace(*name)) + return false; + name++; + } + return true; +} +EXPORT_SYMBOL(dev_valid_name); + +/** + * __dev_alloc_name - allocate a name for a device + * @net: network namespace to allocate the device name in + * @name: name format string + * @buf: scratch buffer and result name string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +static int __dev_alloc_name(struct net *net, const char *name, char *buf) +{ + int i = 0; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + unsigned long *inuse; + struct net_device *d; + + p = strnchr(name, IFNAMSIZ-1, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. + */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for_each_netdev(net, d) { + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, IFNAMSIZ, name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + if (buf != name) + snprintf(buf, IFNAMSIZ, name, i); + if (!__dev_get_by_name(net, buf)) + return i; + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. + */ + return -ENFILE; +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. It scans list of devices to build up a free map, then chooses + * the first empty slot. The caller must hold the dev_base or rtnl lock + * while allocating the name and adding the device in order to avoid + * duplicates. + * Limited to bits_per_byte * page size devices (ie 32K on most platforms). + * Returns the number of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + char buf[IFNAMSIZ]; + struct net *net; + int ret; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + ret = __dev_alloc_name(net, name, buf); + if (ret >= 0) + strlcpy(dev->name, buf, IFNAMSIZ); + return ret; +} +EXPORT_SYMBOL(dev_alloc_name); + +static int dev_get_valid_name(struct net_device *dev, const char *name) +{ + struct net *net; + + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + + if (!dev_valid_name(name)) + return -EINVAL; + + if (strchr(name, '%')) + return dev_alloc_name(dev, name); + else if (__dev_get_by_name(net, name)) + return -EEXIST; + else if (dev->name != name) + strlcpy(dev->name, name, IFNAMSIZ); + + return 0; +} + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + char oldname[IFNAMSIZ]; + int err = 0; + int ret; + struct net *net; + + ASSERT_RTNL(); + BUG_ON(!dev_net(dev)); + + net = dev_net(dev); + if (dev->flags & IFF_UP) + return -EBUSY; + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + return 0; + + memcpy(oldname, dev->name, IFNAMSIZ); + + err = dev_get_valid_name(dev, newname); + if (err < 0) + return err; + +rollback: + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; + } + + write_lock_bh(&dev_base_lock); + hlist_del_rcu(&dev->name_hlist); + write_unlock_bh(&dev_base_lock); + + synchronize_rcu(); + + write_lock_bh(&dev_base_lock); + hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + write_unlock_bh(&dev_base_lock); + + ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); + ret = notifier_to_errno(ret); + + if (ret) { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { + err = ret; + memcpy(dev->name, oldname, IFNAMSIZ); + goto rollback; + } else { + pr_err("%s: name change rollback failed: %d\n", + dev->name, ret); + } + } + + return err; +} + +/** + * dev_set_alias - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from info + * + * Set ifalias for a device, + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + ASSERT_RTNL(); + + if (len >= IFALIASZ) + return -EINVAL; + + if (!len) { + if (dev->ifalias) { + kfree(dev->ifalias); + dev->ifalias = NULL; + } + return 0; + } + + dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); + if (!dev->ifalias) + return -ENOMEM; + + strlcpy(dev->ifalias, alias, len+1); + return len; +} + + +/** + * netdev_features_change - device changes features + * @dev: device to cause notification + * + * Called to indicate a device has changed features. + */ +void netdev_features_change(struct net_device *dev) +{ + call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); +} +EXPORT_SYMBOL(netdev_features_change); + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifiers(NETDEV_CHANGE, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + } +} +EXPORT_SYMBOL(netdev_state_change); + +int netdev_bonding_change(struct net_device *dev, unsigned long event) +{ + return call_netdevice_notifiers(event, dev); +} +EXPORT_SYMBOL(netdev_bonding_change); + +/** + * dev_load - load a network module + * @net: the applicable net namespace + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(struct net *net, const char *name) +{ + struct net_device *dev; + int no_module; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, name); + rcu_read_unlock(); + + no_module = !dev; + if (no_module && capable(CAP_NET_ADMIN)) + no_module = request_module("netdev-%s", name); + if (no_module && capable(CAP_SYS_MODULE)) { + if (!request_module("%s", name)) + pr_err("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n", + name); + } +} +EXPORT_SYMBOL(dev_load); + +static int __dev_open(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret; + + ASSERT_RTNL(); + + if (!netif_device_present(dev)) + return -ENODEV; + + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + + set_bit(__LINK_STATE_START, &dev->state); + + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); + + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); + + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + else { + dev->flags |= IFF_UP; + net_dmaengine_get(); + dev_set_rx_mode(dev); + dev_activate(dev); + } + + return ret; +} + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev) +{ + int ret; + + if (dev->flags & IFF_UP) + return 0; + + ret = __dev_open(dev); + if (ret < 0) + return ret; + + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_UP, dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +static int __dev_close_many(struct list_head *head) +{ + struct net_device *dev; + + ASSERT_RTNL(); + might_sleep(); + + list_for_each_entry(dev, head, unreg_list) { + call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, it + * can be even on different cpu. So just clear netif_running(). + * + * dev->stop() will invoke napi_disable() on all of it's + * napi_struct instances on this device. + */ + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + } + + dev_deactivate_many(head); + + list_for_each_entry(dev, head, unreg_list) { + const struct net_device_ops *ops = dev->netdev_ops; + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (ops->ndo_stop) + ops->ndo_stop(dev); + + dev->flags &= ~IFF_UP; + net_dmaengine_put(); + } + + return 0; +} + +static int __dev_close(struct net_device *dev) +{ + int retval; + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + retval = __dev_close_many(&single); + list_del(&single); + return retval; +} + +static int dev_close_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) + if (!(dev->flags & IFF_UP)) + list_move(&dev->unreg_list, &tmp_list); + + __dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + /* rollback_registered_many needs the complete original list */ + list_splice(&tmp_list, head); + return 0; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } + return 0; +} +EXPORT_SYMBOL(dev_close); + + +/** + * dev_disable_lro - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. + */ +void dev_disable_lro(struct net_device *dev) +{ + /* + * If we're trying to disable lro on a vlan device + * use the underlying physical device instead + */ + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + + dev->wanted_features &= ~NETIF_F_LRO; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_LRO)) + netdev_WARN(dev, "failed to disable LRO!\n"); +} +EXPORT_SYMBOL(dev_disable_lro); + + +static int dev_boot_phase = 1; + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + struct net_device *last; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + for_each_net(net) { + for_each_netdev(net, dev) { + err = nb->notifier_call(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + goto rollback; + + if (!(dev->flags & IFF_UP)) + continue; + + nb->notifier_call(nb, NETDEV_UP, dev); + } + } + +unlock: + rtnl_unlock(); + return err; + +rollback: + last = dev; + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev == last) + goto outroll; + + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + } + } + +outroll: + raw_notifier_chain_unregister(&netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier); + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + struct net *net; + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&netdev_chain, nb); + if (err) + goto unlock; + + for_each_net(net) { + for_each_netdev(net, dev) { + if (dev->flags & IFF_UP) { + nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); + nb->notifier_call(nb, NETDEV_DOWN, dev); + } + nb->notifier_call(nb, NETDEV_UNREGISTER, dev); + nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev); + } + } +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier); + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @dev: net_device pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for raw_notifier_call_chain(). + */ + +int call_netdevice_notifiers(unsigned long val, struct net_device *dev) +{ + ASSERT_RTNL(); + return raw_notifier_call_chain(&netdev_chain, val, dev); +} +EXPORT_SYMBOL(call_netdevice_notifiers); + +static struct static_key netstamp_needed __read_mostly; +#ifdef HAVE_JUMP_LABEL +/* We are not allowed to call static_key_slow_dec() from irq context + * If net_disable_timestamp() is called from irq context, defer the + * static_key_slow_dec() calls. + */ +static atomic_t netstamp_needed_deferred; +#endif + +void net_enable_timestamp(void) +{ +#ifdef HAVE_JUMP_LABEL + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + + if (deferred) { + while (--deferred) + static_key_slow_dec(&netstamp_needed); + return; + } +#endif + WARN_ON(in_interrupt()); + static_key_slow_inc(&netstamp_needed); +} +EXPORT_SYMBOL(net_enable_timestamp); + +void net_disable_timestamp(void) +{ +#ifdef HAVE_JUMP_LABEL + if (in_interrupt()) { + atomic_inc(&netstamp_needed_deferred); + return; + } +#endif + static_key_slow_dec(&netstamp_needed); +} +EXPORT_SYMBOL(net_disable_timestamp); + +static inline void net_timestamp_set(struct sk_buff *skb) +{ + skb->tstamp.tv64 = 0; + if (static_key_false(&netstamp_needed)) + __net_timestamp(skb); +} + +#define net_timestamp_check(COND, SKB) \ + if (static_key_false(&netstamp_needed)) { \ + if ((COND) && !(SKB)->tstamp.tv64) \ + __net_timestamp(SKB); \ + } \ + +static int net_hwtstamp_validate(struct ifreq *ifr) +{ + struct hwtstamp_config cfg; + enum hwtstamp_tx_types tx_type; + enum hwtstamp_rx_filters rx_filter; + int tx_type_valid = 0; + int rx_filter_valid = 0; + + if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) + return -EFAULT; + + if (cfg.flags) /* reserved for future extensions */ + return -EINVAL; + + tx_type = cfg.tx_type; + rx_filter = cfg.rx_filter; + + switch (tx_type) { + case HWTSTAMP_TX_OFF: + case HWTSTAMP_TX_ON: + case HWTSTAMP_TX_ONESTEP_SYNC: + tx_type_valid = 1; + break; + } + + switch (rx_filter) { + case HWTSTAMP_FILTER_NONE: + case HWTSTAMP_FILTER_ALL: + case HWTSTAMP_FILTER_SOME: + case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: + case HWTSTAMP_FILTER_PTP_V2_EVENT: + case HWTSTAMP_FILTER_PTP_V2_SYNC: + case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + rx_filter_valid = 1; + break; + } + + if (!tx_type_valid || !rx_filter_valid) + return -ERANGE; + + return 0; +} + +static inline bool is_skb_forwardable(struct net_device *dev, + struct sk_buff *skb) +{ + unsigned int len; + + if (!(dev->flags & IFF_UP)) + return false; + + len = dev->mtu + dev->hard_header_len + VLAN_HLEN; + if (skb->len <= len) + return true; + + /* if TSO is enabled, we don't care about the length as the packet + * could be forwarded without being segmented before + */ + if (skb_is_gso(skb)) + return true; + + return false; +} + +/** + * dev_forward_skb - loopback an skb to another netif + * + * @dev: destination network device + * @skb: buffer to forward + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped, but freed) + * + * dev_forward_skb can be used for injecting an skb from the + * start_xmit function of one device into the receive queue + * of another device. + * + * The receiving device may be in another namespace, so + * we have to clear all information in the skb that could + * impact namespace isolation. + */ +int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, GFP_ATOMIC)) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + } + + skb_orphan(skb); + nf_reset(skb); + + if (unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + skb->skb_iif = 0; + skb->dev = dev; + skb_dst_drop(skb); + skb->tstamp.tv64 = 0; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, dev); + skb->mark = 0; + secpath_reset(skb); + nf_reset(skb); + return netif_rx(skb); +} +EXPORT_SYMBOL_GPL(dev_forward_skb); + +static inline int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. + */ + +static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + struct sk_buff *skb2 = NULL; + struct packet_type *pt_prev = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + + net_timestamp_set(skb2); + + /* skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb2->network_header > skb2->tail) { + if (net_ratelimit()) + pr_crit("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); + } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + } + if (pt_prev) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + rcu_read_unlock(); +} + +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + +/* + * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues + * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + */ +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +{ + int rc; + + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + + rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, + txq); + if (rc) + return rc; + + if (dev->num_tc) + netif_setup_tc(dev, txq); + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); + } + + dev->real_num_tx_queues = txq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_tx_queues); + +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it always + * succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + +static inline void __netif_reschedule(struct Qdisc *q) +{ + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + q->next_sched = NULL; + *sd->output_queue_tailp = q; + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); +} + +void __netif_schedule(struct Qdisc *q) +{ + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) + __netif_reschedule(q); +} +EXPORT_SYMBOL(__netif_schedule); + +void dev_kfree_skb_irq(struct sk_buff *skb) +{ + if (atomic_dec_and_test(&skb->users)) { + struct softnet_data *sd; + unsigned long flags; + + local_irq_save(flags); + sd = &__get_cpu_var(softnet_data); + skb->next = sd->completion_queue; + sd->completion_queue = skb; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(dev_kfree_skb_irq); + +void dev_kfree_skb_any(struct sk_buff *skb) +{ + if (in_irq() || irqs_disabled()) + dev_kfree_skb_irq(skb); + else + dev_kfree_skb(skb); +} +EXPORT_SYMBOL(dev_kfree_skb_any); + + +/** + * netif_device_detach - mark device as removed + * @dev: network device + * + * Mark device as removed from system and therefore no longer available. + */ +void netif_device_detach(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_stop_all_queues(dev); + } +} +EXPORT_SYMBOL(netif_device_detach); + +/** + * netif_device_attach - mark device as attached + * @dev: network device + * + * Mark device as attached from system and restart if needed. + */ +void netif_device_attach(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && + netif_running(dev)) { + netif_tx_wake_all_queues(dev); + __netdev_watchdog_up(dev); + } +} +EXPORT_SYMBOL(netif_device_attach); + +static void skb_warn_bad_offload(const struct sk_buff *skb) +{ + static const netdev_features_t null_features = 0; + struct net_device *dev = skb->dev; + const char *driver = ""; + + if (dev && dev->dev.parent) + driver = dev_driver_string(dev->dev.parent); + + WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " + "gso_type=%d ip_summed=%d\n", + driver, dev ? &dev->features : &null_features, + skb->sk ? &skb->sk->sk_route_caps : &null_features, + skb->len, skb->data_len, skb_shinfo(skb)->gso_size, + skb_shinfo(skb)->gso_type, skb->ip_summed); +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. + */ +int skb_checksum_help(struct sk_buff *skb) +{ + __wsum csum; + int ret = 0, offset; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + goto out_set_summed; + + if (unlikely(skb_shinfo(skb)->gso_size)) { + skb_warn_bad_offload(skb); + return -EINVAL; + } + + offset = skb_checksum_start_offset(skb); + BUG_ON(offset >= skb_headlen(skb)); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + + offset += skb->csum_offset; + BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); + + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__sum16))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + + *(__sum16 *)(skb->data + offset) = csum_fold(csum); +out_set_summed: + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} +EXPORT_SYMBOL(skb_checksum_help); + +/** + * skb_gso_segment - Perform segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function segments the given skb and returns a list of segments. + * + * It may return NULL if the skb requires no segmentation. This is + * only possible when GSO is used for verifying header integrity. + */ +struct sk_buff *skb_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_type *ptype; + __be16 type = skb->protocol; + int vlan_depth = ETH_HLEN; + int err; + + while (type == htons(ETH_P_8021Q)) { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) + return ERR_PTR(-EINVAL); + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } + + skb_reset_mac_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + __skb_pull(skb, skb->mac_len); + + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + skb_warn_bad_offload(skb); + + if (skb_header_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { + err = ptype->gso_send_check(skb); + segs = ERR_PTR(err); + if (err || skb_gso_ok(skb, features)) + break; + __skb_push(skb, (skb->data - + skb_network_header(skb))); + } + segs = ptype->gso_segment(skb, features); + break; + } + } + rcu_read_unlock(); + + __skb_push(skb, skb->data - skb_mac_header(skb)); + + return segs; +} +EXPORT_SYMBOL(skb_gso_segment); + +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. + */ + +static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_HIGHMEM + int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) + return 1; + } + } + + if (PCI_DMA_BUS_IS_PHYS) { + struct device *pdev = dev->dev.parent; + + if (!pdev) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + dma_addr_t addr = page_to_phys(skb_frag_page(frag)); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) + return 1; + } + } +#endif + return 0; +} + +struct dev_gso_cb { + void (*destructor)(struct sk_buff *skb); +}; + +#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) + +static void dev_gso_skb_destructor(struct sk_buff *skb) +{ + struct dev_gso_cb *cb; + + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + kfree_skb(nskb); + } while (skb->next); + + cb = DEV_GSO_CB(skb); + if (cb->destructor) + cb->destructor(skb); +} + +/** + * dev_gso_segment - Perform emulated hardware segmentation on skb. + * @skb: buffer to segment + * @features: device features as applicable to this skb + * + * This function segments the given skb and stores the list of segments + * in skb->next. + */ +static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) +{ + struct sk_buff *segs; + + segs = skb_gso_segment(skb, features); + + /* Verifying header integrity only. */ + if (!segs) + return 0; + + if (IS_ERR(segs)) + return PTR_ERR(segs); + + skb->next = segs; + DEV_GSO_CB(skb)->destructor = skb->destructor; + skb->destructor = dev_gso_skb_destructor; + + return 0; +} + +static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} + +static netdev_features_t harmonize_features(struct sk_buff *skb, + __be16 protocol, netdev_features_t features) +{ + if (!can_checksum_protocol(features, protocol)) { + features &= ~NETIF_F_ALL_CSUM; + features &= ~NETIF_F_SG; + } else if (illegal_highdma(skb->dev, skb)) { + features &= ~NETIF_F_SG; + } + + return features; +} + +netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + __be16 protocol = skb->protocol; + netdev_features_t features = skb->dev->features; + + if (protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; + protocol = veh->h_vlan_encapsulated_proto; + } else if (!vlan_tx_tag_present(skb)) { + return harmonize_features(skb, protocol, features); + } + + features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX); + + if (protocol != htons(ETH_P_8021Q)) { + return harmonize_features(skb, protocol, features); + } else { + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | + NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX; + return harmonize_features(skb, protocol, features); + } +} +EXPORT_SYMBOL(netif_skb_features); + +/* + * Returns true if either: + * 1. skb has frag_list and the device doesn't support FRAGLIST, or + * 2. skb is fragmented and the device does not support SG, or if + * at least one of fragments is in highmem and device does not + * support DMA from it. + */ +static inline int skb_needs_linearize(struct sk_buff *skb, + int features) +{ + return skb_is_nonlinear(skb) && + ((skb_has_frag_list(skb) && + !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && + !(features & NETIF_F_SG))); +} + +int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, + struct netdev_queue *txq) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int rc = NETDEV_TX_OK; + unsigned int skb_len; + + if (likely(!skb->next)) { + netdev_features_t features; + + /* + * If device doesn't need skb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + + if (!list_empty(&ptype_all)) + dev_queue_xmit_nit(skb, dev); + + features = netif_skb_features(skb); + + if (vlan_tx_tag_present(skb) && + !(features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + + if (netif_needs_gso(skb, features)) { + if (unlikely(dev_gso_segment(skb, features))) + goto out_kfree_skb; + if (skb->next) + goto gso; + } else { + if (skb_needs_linearize(skb, features) && + __skb_linearize(skb)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not + * support checksumming for this protocol, complete + * checksumming here. + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); + if (!(features & NETIF_F_ALL_CSUM) && + skb_checksum_help(skb)) + goto out_kfree_skb; + } + } + + skb_len = skb->len; + rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc, dev, skb_len); + if (rc == NETDEV_TX_OK) + txq_trans_update(txq); + return rc; + } + +gso: + do { + struct sk_buff *nskb = skb->next; + + skb->next = nskb->next; + nskb->next = NULL; + + /* + * If device doesn't need nskb->dst, release it right now while + * its hot in this cpu cache + */ + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(nskb); + + skb_len = nskb->len; + rc = ops->ndo_start_xmit(nskb, dev); + trace_net_dev_xmit(nskb, rc, dev, skb_len); + if (unlikely(rc != NETDEV_TX_OK)) { + if (rc & ~NETDEV_TX_MASK) + goto out_kfree_gso_skb; + nskb->next = skb->next; + skb->next = nskb; + return rc; + } + txq_trans_update(txq); + if (unlikely(netif_xmit_stopped(txq) && skb->next)) + return NETDEV_TX_BUSY; + } while (skb->next); + +out_kfree_gso_skb: + if (likely(skb->next == NULL)) + skb->destructor = DEV_GSO_CB(skb)->destructor; +out_kfree_skb: + kfree_skb(skb); +out: + return rc; +} + +static u32 hashrnd __read_mostly; + +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol; + hash = jhash_1word(hash, hashrnd); + + return (u16) (((u64) hash * qcount) >> 32) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + +static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + if (net_ratelimit()) { + pr_warn("%s selects TX queue %d, but real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + } + return 0; + } + return queue_index; +} + +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[raw_smp_processor_id()]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else { + u32 hash; + if (skb->sk && skb->sk->sk_hash) + hash = skb->sk->sk_hash; + else + hash = (__force u16) skb->protocol ^ + skb->rxhash; + hash = jhash_1word(hash, hashrnd); + queue_index = map->queues[ + ((u64)hash * map->len) >> 32]; + } + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static struct netdev_queue *dev_pick_tx(struct net_device *dev, + struct sk_buff *skb) +{ + int queue_index; + const struct net_device_ops *ops = dev->netdev_ops; + + if (dev->real_num_tx_queues == 1) + queue_index = 0; + else if (ops->ndo_select_queue) { + queue_index = ops->ndo_select_queue(dev, skb); + queue_index = dev_cap_txqueue(dev, queue_index); + } else { + struct sock *sk = skb->sk; + queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int old_index = queue_index; + + queue_index = get_xps_queue(dev, skb); + if (queue_index < 0) + queue_index = skb_tx_hash(dev, skb); + + if (queue_index != old_index && sk) { + struct dst_entry *dst = + rcu_dereference_check(sk->sk_dst_cache, 1); + + if (dst && skb_dst(skb) == dst) + sk_tx_queue_set(sk, queue_index); + } + } + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + +static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, + struct netdev_queue *txq) +{ + spinlock_t *root_lock = qdisc_lock(q); + bool contended; + int rc; + + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_calculate_pkt_len(skb, q); + /* + * Heuristic to force contended enqueues to serialize on a + * separate lock before trying to get qdisc main lock. + * This permits __QDISC_STATE_RUNNING owner to get the lock more often + * and dequeue packets faster. + */ + contended = qdisc_is_running(q); + if (unlikely(contended)) + spin_lock(&q->busylock); + + spin_lock(root_lock); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + kfree_skb(skb); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + qdisc_run_begin(q)) { + /* + * This is a work-conserving queue; there are no old skbs + * waiting to be sent out; and the qdisc is not running - + * xmit the skb directly. + */ + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + skb_dst_force(skb); + + qdisc_bstats_update(q, skb); + + if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } else + qdisc_run_end(q); + + rc = NET_XMIT_SUCCESS; + } else { + skb_dst_force(skb); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; + if (qdisc_run_begin(q)) { + if (unlikely(contended)) { + spin_unlock(&q->busylock); + contended = false; + } + __qdisc_run(q); + } + } + spin_unlock(root_lock); + if (unlikely(contended)) + spin_unlock(&q->busylock); + return rc; +} + +#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) +static void skb_update_prio(struct sk_buff *skb) +{ + struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); + + if ((!skb->priority) && (skb->sk) && map) + skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx]; +} +#else +#define skb_update_prio(skb) +#endif + +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 10 + +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + * + * ----------------------------------------------------------------------------------- + * I notice this method can also return errors from the queue disciplines, + * including NET_XMIT_DROP, which is a positive value. So, errors can also + * be positive. + * + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) + * + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. + * --BLG + */ +int dev_queue_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + struct Qdisc *q; + int rc = -ENOMEM; + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. + */ + rcu_read_lock_bh(); + + skb_update_prio(skb); + + txq = dev_pick_tx(dev, skb); + q = rcu_dereference_bh(txq->qdisc); + +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); +#endif + trace_net_dev_queue(skb); + if (q->enqueue) { + rc = __dev_xmit_skb(skb, q, dev, txq); + goto out; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics + counters.) + However, it is possible, that they rely on protection + made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + if (txq->xmit_lock_owner != cpu) { + + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + + HARD_TX_LOCK(dev, txq, cpu); + + if (!netif_xmit_stopped(txq)) { + __this_cpu_inc(xmit_recursion); + rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); + if (dev_xmit_complete(rc)) { + HARD_TX_UNLOCK(dev, txq); + goto out; + } + } + HARD_TX_UNLOCK(dev, txq); + if (net_ratelimit()) + pr_crit("Virtual device %s asks to queue packet!\n", + dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately + */ +recursion_alert: + if (net_ratelimit()) + pr_crit("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + } + } + + rc = -ENETDOWN; + rcu_read_unlock_bh(); + + kfree_skb(skb); + return rc; +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(dev_queue_xmit); + + +/*======================================================================= + Receiver routines + =======================================================================*/ + +int netdev_max_backlog __read_mostly = 1000; +int netdev_tstamp_prequeue __read_mostly = 1; +int netdev_budget __read_mostly = 300; +int weight_p __read_mostly = 64; /* old backlog weight */ + +/* Called with irq disabled */ +static inline void ____napi_schedule(struct softnet_data *sd, + struct napi_struct *napi) +{ + list_add_tail(&napi->poll_list, &sd->poll_list); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); +} + +/* + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Sets rxhash in skb to non-zero hash value + * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb + * if hash is a canonical 4-tuple hash over transport ports. + */ +void __skb_get_rxhash(struct sk_buff *skb) +{ + struct flow_keys keys; + u32 hash; + + if (!skb_flow_dissect(skb, &keys)) + return; + + if (keys.ports) { + if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) + swap(keys.port16[0], keys.port16[1]); + skb->l4_rxhash = 1; + } + + /* get a consistent hash (same value on both flow directions) */ + if ((__force u32)keys.dst < (__force u32)keys.src) + swap(keys.dst, keys.src); + + hash = jhash_3words((__force u32)keys.dst, + (__force u32)keys.src, + (__force u32)keys.ports, hashrnd); + if (!hash) + hash = 1; + + skb->rxhash = hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +struct static_key rps_needed __read_mostly; + +static struct rps_dev_flow * +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow *rflow, u16 next_cpu) +{ + if (next_cpu != RPS_NO_CPU) { +#ifdef CONFIG_RFS_ACCEL + struct netdev_rx_queue *rxqueue; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *old_rflow; + u32 flow_id; + u16 rxq_index; + int rc; + + /* Should we steer this flow to a different hardware queue? */ + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + goto out; + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); + if (rxq_index == skb_get_rx_queue(skb)) + goto out; + + rxqueue = dev->_rx + rxq_index; + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (!flow_table) + goto out; + flow_id = skb->rxhash & flow_table->mask; + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, + rxq_index, flow_id); + if (rc < 0) + goto out; + old_rflow = rflow; + rflow = &flow_table->flows[flow_id]; + rflow->filter = rc; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = RPS_NO_FILTER; + out: +#endif + rflow->last_qtail = + per_cpu(softnet_data, next_cpu).input_queue_head; + } + + rflow->cpu = next_cpu; + return rflow; +} + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + map = rcu_dereference(rxqueue->rps_map); + if (map) { + if (map->len == 1 && + !rcu_access_pointer(rxqueue->rps_flow_table)) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + goto done; + } + + skb_reset_network_header(skb); + if (!skb_get_rxhash(skb)) + goto done; + + flow_table = rcu_dereference(rxqueue->rps_flow_table); + sock_flow_table = rcu_dereference(rps_sock_flow_table); + if (flow_table && sock_flow_table) { + u16 next_cpu; + struct rps_dev_flow *rflow; + + rflow = &flow_table->flows[skb->rxhash & flow_table->mask]; + tcpu = rflow->cpu; + + next_cpu = sock_flow_table->ents[skb->rxhash & + sock_flow_table->mask]; + + /* + * If the desired CPU (where last recvmsg was done) is + * different from current CPU (one in the rx-queue flow + * table entry), switch if one of the following holds: + * - Current CPU is unset (equal to RPS_NO_CPU). + * - Current CPU is offline. + * - The current CPU's queue tail has advanced beyond the + * last packet that was enqueued using this table entry. + * This guarantees that all previous packets for the flow + * have been dequeued, thus preserving in order delivery. + */ + if (unlikely(tcpu != next_cpu) && + (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || + ((int)(per_cpu(softnet_data, tcpu).input_queue_head - + rflow->last_qtail)) >= 0)) + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + + if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { + *rflowp = rflow; + cpu = tcpu; + goto done; + } + } + + if (map) { + tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; + + if (cpu_online(tcpu)) { + cpu = tcpu; + goto done; + } + } + +done: + return cpu; +} + +#ifdef CONFIG_RFS_ACCEL + +/** + * rps_may_expire_flow - check whether an RFS hardware filter may be removed + * @dev: Device on which the filter was set + * @rxq_index: RX queue index + * @flow_id: Flow ID passed to ndo_rx_flow_steer() + * @filter_id: Filter ID returned by ndo_rx_flow_steer() + * + * Drivers that implement ndo_rx_flow_steer() should periodically call + * this function for each installed filter and remove the filters for + * which it returns %true. + */ +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct rps_dev_flow_table *flow_table; + struct rps_dev_flow *rflow; + bool expire = true; + int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->rps_flow_table); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = ACCESS_ONCE(rflow->cpu); + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && + ((int)(per_cpu(softnet_data, cpu).input_queue_head - + rflow->last_qtail) < + (int)(10 * flow_table->mask))) + expire = false; + } + rcu_read_unlock(); + return expire; +} +EXPORT_SYMBOL(rps_may_expire_flow); + +#endif /* CONFIG_RFS_ACCEL */ + +/* Called from hardirq (IPI) context */ +static void rps_trigger_softirq(void *data) +{ + struct softnet_data *sd = data; + + ____napi_schedule(sd, &sd->backlog); + sd->received_rps++; +} + +#endif /* CONFIG_RPS */ + +/* + * Check if this softnet_data structure is another cpu one + * If yes, queue it to our IPI list and return 1 + * If no, return 0 + */ +static int rps_ipi_queued(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *mysd = &__get_cpu_var(softnet_data); + + if (sd != mysd) { + sd->rps_ipi_next = mysd->rps_ipi_list; + mysd->rps_ipi_list = sd; + + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + return 1; + } +#endif /* CONFIG_RPS */ + return 0; +} + +/* + * enqueue_to_backlog is called to queue an skb to a per CPU backlog + * queue (may be a remote CPU queue). + */ +static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + unsigned int *qtail) +{ + struct softnet_data *sd; + unsigned long flags; + + sd = &per_cpu(softnet_data, cpu); + + local_irq_save(flags); + + rps_lock(sd); + if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) { + if (skb_queue_len(&sd->input_pkt_queue)) { +enqueue: + __skb_queue_tail(&sd->input_pkt_queue, skb); + input_queue_tail_incr_save(sd, qtail); + rps_unlock(sd); + local_irq_restore(flags); + return NET_RX_SUCCESS; + } + + /* Schedule NAPI for backlog device + * We can use non atomic operation since we own the queue lock + */ + if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { + if (!rps_ipi_queued(sd)) + ____napi_schedule(sd, &sd->backlog); + } + goto enqueue; + } + + sd->dropped++; + rps_unlock(sd); + + local_irq_restore(flags); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; +} + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + int ret; + + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) + return NET_RX_DROP; + + net_timestamp_check(netdev_tstamp_prequeue, skb); + + trace_netif_rx(skb); +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + + preempt_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + if (cpu < 0) + cpu = smp_processor_id(); + + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); + preempt_enable(); + } else +#endif + { + unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); + put_cpu(); + } + return ret; +} +EXPORT_SYMBOL(netif_rx); + +int netif_rx_ni(struct sk_buff *skb) +{ + int err; + + preempt_disable(); + err = netif_rx(skb); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); + + return err; +} +EXPORT_SYMBOL(netif_rx_ni); + +static void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + clist = clist->next; + + WARN_ON(atomic_read(&skb->users)); + trace_kfree_skb(skb, net_tx_action); + __kfree_skb(skb); + } + } + + if (sd->output_queue) { + struct Qdisc *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; + local_irq_enable(); + + while (head) { + struct Qdisc *q = head; + spinlock_t *root_lock; + + head = head->next_sched; + + root_lock = qdisc_lock(q); + if (spin_trylock(root_lock)) { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + qdisc_run(q); + spin_unlock(root_lock); + } else { + if (!test_bit(__QDISC_STATE_DEACTIVATED, + &q->state)) { + __netif_reschedule(q); + } else { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } + } + } + } +} + +#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ + (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesn't stop any functionality; if you dont have + * the ingress scheduler, you just can't add policies on ingress. + * + */ +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) +{ + struct net_device *dev = skb->dev; + u32 ttl = G_TC_RTTL(skb->tc_verd); + int result = TC_ACT_OK; + struct Qdisc *q; + + if (unlikely(MAX_RED_LOOP < ttl++)) { + if (net_ratelimit()) + pr_warn("Redir loop detected Dropping packet (%d->%d)\n", + skb->skb_iif, dev->ifindex); + return TC_ACT_SHOT; + } + + skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + + q = rxq->qdisc; + if (q != &noop_qdisc) { + spin_lock(qdisc_lock(q)); + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) + result = qdisc_enqueue_root(skb, q); + spin_unlock(qdisc_lock(q)); + } + + return result; +} + +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) + goto out; + + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + switch (ing_filter(skb, rxq)) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NULL; + } + +out: + skb->tc_verd = 0; + return skb; +} +#endif + +/** + * netdev_rx_handler_register - register receive handler + * @dev: device to register a handler for + * @rx_handler: receive handler to register + * @rx_handler_data: data pointer that is used by rx handler + * + * Register a receive hander for a device. This handler will then be + * called from __netif_receive_skb. A negative errno code is returned + * on a failure. + * + * The caller must hold the rtnl_mutex. + * + * For a general description of rx_handler, see enum rx_handler_result. + */ +int netdev_rx_handler_register(struct net_device *dev, + rx_handler_func_t *rx_handler, + void *rx_handler_data) +{ + ASSERT_RTNL(); + + if (dev->rx_handler) + return -EBUSY; + + rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); + rcu_assign_pointer(dev->rx_handler, rx_handler); + + return 0; +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_register); + +/** + * netdev_rx_handler_unregister - unregister receive handler + * @dev: device to unregister a handler from + * + * Unregister a receive hander from a device. + * + * The caller must hold the rtnl_mutex. + */ +void netdev_rx_handler_unregister(struct net_device *dev) +{ + + ASSERT_RTNL(); + RCU_INIT_POINTER(dev->rx_handler, NULL); + RCU_INIT_POINTER(dev->rx_handler_data, NULL); +} +EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + +static int __netif_receive_skb(struct sk_buff *skb) +{ + struct packet_type *ptype, *pt_prev; + rx_handler_func_t *rx_handler; + struct net_device *orig_dev; + struct net_device *null_or_dev; + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; + + net_timestamp_check(!netdev_tstamp_prequeue, skb); + + trace_netif_receive_skb(skb); + + /* if we've gotten here through NAPI, check netpoll */ + if (netpoll_receive_skb(skb)) + return NET_RX_DROP; + + if (!skb->skb_iif) + skb->skb_iif = skb->dev->ifindex; + orig_dev = skb->dev; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_mac_len(skb); + + pt_prev = NULL; + + rcu_read_lock(); + +another_round: + + __this_cpu_inc(softnet_data.processed); + + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + goto ncls; + } +#endif + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + +#ifdef CONFIG_NET_CLS_ACT + skb = handle_ing(skb, &pt_prev, &ret, orig_dev); + if (!skb) + goto out; +ncls: +#endif + + rx_handler = rcu_dereference(skb->dev->rx_handler); + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_do_receive(&skb, !rx_handler)) + goto another_round; + else if (unlikely(!skb)) + goto out; + } + + if (rx_handler) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + switch (rx_handler(&skb)) { + case RX_HANDLER_CONSUMED: + goto out; + case RX_HANDLER_ANOTHER: + goto another_round; + case RX_HANDLER_EXACT: + deliver_exact = true; + case RX_HANDLER_PASS: + break; + default: + BUG(); + } + } + + /* deliver only exact match when indicated */ + null_or_dev = deliver_exact ? skb->dev : NULL; + + type = skb->protocol; + list_for_each_entry_rcu(ptype, + &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { + if (ptype->type == type && + (ptype->dev == null_or_dev || ptype->dev == skb->dev || + ptype->dev == orig_dev)) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + } + + if (pt_prev) { + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + } else { + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } + +out: + rcu_read_unlock(); + return ret; +} + +/** + * netif_receive_skb - process receive buffer from network + * @skb: buffer to process + * + * netif_receive_skb() is the main receive data processing function. + * It always succeeds. The buffer may be dropped during processing + * for congestion control or by the protocol layers. + * + * This function may only be called from softirq context and interrupts + * should be enabled. + * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb(struct sk_buff *skb) +{ + net_timestamp_check(netdev_tstamp_prequeue, skb); + + if (skb_defer_rx_timestamp(skb)) + return NET_RX_SUCCESS; + +#ifdef CONFIG_RPS + if (static_key_false(&rps_needed)) { + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu, ret; + + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); + + if (cpu >= 0) { + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); + } +#endif + return __netif_receive_skb(skb); +} +EXPORT_SYMBOL(netif_receive_skb); + +/* Network device is going away, flush any packets still pending + * Called with irqs disabled. + */ +static void flush_backlog(void *arg) +{ + struct net_device *dev = arg; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct sk_buff *skb, *tmp; + + rps_lock(sd); + skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->input_pkt_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } + rps_unlock(sd); + + skb_queue_walk_safe(&sd->process_queue, skb, tmp) { + if (skb->dev == dev) { + __skb_unlink(skb, &sd->process_queue); + kfree_skb(skb); + input_queue_head_incr(sd); + } + } +} + +static int napi_gro_complete(struct sk_buff *skb) +{ + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int err = -ENOENT; + + if (NAPI_GRO_CB(skb)->count == 1) { + skb_shinfo(skb)->gso_size = 0; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_complete) + continue; + + err = ptype->gro_complete(skb); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + +out: + return netif_receive_skb(skb); +} + +inline void napi_gro_flush(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + napi_gro_complete(skb); + } + + napi->gro_count = 0; + napi->gro_list = NULL; +} +EXPORT_SYMBOL(napi_gro_flush); + +enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int same_flow; + int mac_len; + enum gro_result ret; + + if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) + goto normal; + + if (skb_is_gso(skb) || skb_has_frag_list(skb)) + goto normal; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_receive) + continue; + + skb_set_network_header(skb, skb_gro_offset(skb)); + mac_len = skb->network_header - skb->mac_header; + skb->mac_len = mac_len; + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 0; + NAPI_GRO_CB(skb)->free = 0; + + pp = ptype->gro_receive(&napi->gro_list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + same_flow = NAPI_GRO_CB(skb)->same_flow; + ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; + + if (pp) { + struct sk_buff *nskb = *pp; + + *pp = nskb->next; + nskb->next = NULL; + napi_gro_complete(nskb); + napi->gro_count--; + } + + if (same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) + goto normal; + + napi->gro_count++; + NAPI_GRO_CB(skb)->count = 1; + skb_shinfo(skb)->gso_size = skb_gro_len(skb); + skb->next = napi->gro_list; + napi->gro_list = skb; + ret = GRO_HELD; + +pull: + if (skb_headlen(skb) < skb_gro_offset(skb)) { + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->tail += grow; + skb->data_len -= grow; + + skb_shinfo(skb)->frags[0].page_offset += grow; + skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow); + + if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) { + skb_frag_unref(skb, 0); + memmove(skb_shinfo(skb)->frags, + skb_shinfo(skb)->frags + 1, + --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + } + } + +ok: + return ret; + +normal: + ret = GRO_NORMAL; + goto pull; +} +EXPORT_SYMBOL(dev_gro_receive); + +static inline gro_result_t +__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; + + for (p = napi->gro_list; p; p = p->next) { + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); + NAPI_GRO_CB(p)->same_flow = !diffs; + NAPI_GRO_CB(p)->flush = 0; + } + + return dev_gro_receive(napi, skb); +} + +gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +{ + switch (ret) { + case GRO_NORMAL: + if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + case GRO_MERGED_FREE: + kfree_skb(skb); + break; + + case GRO_HELD: + case GRO_MERGED: + break; + } + + return ret; +} +EXPORT_SYMBOL(napi_skb_finish); + +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; + + if (skb->mac_header == skb->tail && + !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { + NAPI_GRO_CB(skb)->frag0 = + skb_frag_address(&skb_shinfo(skb)->frags[0]); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); + } +} +EXPORT_SYMBOL(skb_gro_reset_offset); + +gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + skb_gro_reset_offset(skb); + + return napi_skb_finish(__napi_gro_receive(napi, skb), skb); +} +EXPORT_SYMBOL(napi_gro_receive); + +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +{ + __skb_pull(skb, skb_headlen(skb)); + /* restore the reserve we had after netdev_alloc_skb_ip_align() */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; + skb->dev = napi->dev; + skb->skb_iif = 0; + + napi->skb = skb; +} + +struct sk_buff *napi_get_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + + if (!skb) { + skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); + if (skb) + napi->skb = skb; + } + return skb; +} +EXPORT_SYMBOL(napi_get_frags); + +gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, + gro_result_t ret) +{ + switch (ret) { + case GRO_NORMAL: + case GRO_HELD: + skb->protocol = eth_type_trans(skb, skb->dev); + + if (ret == GRO_HELD) + skb_gro_pull(skb, -ETH_HLEN); + else if (netif_receive_skb(skb)) + ret = GRO_DROP; + break; + + case GRO_DROP: + case GRO_MERGED_FREE: + napi_reuse_skb(napi, skb); + break; + + case GRO_MERGED: + break; + } + + return ret; +} +EXPORT_SYMBOL(napi_frags_finish); + +struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + unsigned int hlen; + unsigned int off; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. + */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +gro_result_t napi_gro_frags(struct napi_struct *napi) +{ + struct sk_buff *skb = napi_frags_skb(napi); + + if (!skb) + return GRO_DROP; + + return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); +} +EXPORT_SYMBOL(napi_gro_frags); + +/* + * net_rps_action sends any pending IPI's for rps. + * Note: called with local irq disabled, but exits with local irq enabled. + */ +static void net_rps_action_and_irq_enable(struct softnet_data *sd) +{ +#ifdef CONFIG_RPS + struct softnet_data *remsd = sd->rps_ipi_list; + + if (remsd) { + sd->rps_ipi_list = NULL; + + local_irq_enable(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + __smp_call_function_single(remsd->cpu, + &remsd->csd, 0); + remsd = next; + } + } else +#endif + local_irq_enable(); +} + +static int process_backlog(struct napi_struct *napi, int quota) +{ + int work = 0; + struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + +#ifdef CONFIG_RPS + /* Check if we have pending ipi, its better to send them now, + * not waiting net_rx_action() end. + */ + if (sd->rps_ipi_list) { + local_irq_disable(); + net_rps_action_and_irq_enable(sd); + } +#endif + napi->weight = weight_p; + local_irq_disable(); + while (work < quota) { + struct sk_buff *skb; + unsigned int qlen; + + while ((skb = __skb_dequeue(&sd->process_queue))) { + local_irq_enable(); + __netif_receive_skb(skb); + local_irq_disable(); + input_queue_head_incr(sd); + if (++work >= quota) { + local_irq_enable(); + return work; + } + } + + rps_lock(sd); + qlen = skb_queue_len(&sd->input_pkt_queue); + if (qlen) + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); + + if (qlen < quota - work) { + /* + * Inline a custom version of __napi_complete(). + * only current cpu owns and manipulates this napi, + * and NAPI_STATE_SCHED is the only possible flag set on backlog. + * we can use a plain write instead of clear_bit(), + * and we dont need an smp_mb() memory barrier. + */ + list_del(&napi->poll_list); + napi->state = 0; + + quota = work + qlen; + } + rps_unlock(sd); + } + local_irq_enable(); + + return work; +} + +/** + * __napi_schedule - schedule for receive + * @n: entry to schedule + * + * The entry's receive function will be scheduled to run + */ +void __napi_schedule(struct napi_struct *n) +{ + unsigned long flags; + + local_irq_save(flags); + ____napi_schedule(&__get_cpu_var(softnet_data), n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__napi_schedule); + +void __napi_complete(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + BUG_ON(n->gro_list); + + list_del(&n->poll_list); + smp_mb__before_clear_bit(); + clear_bit(NAPI_STATE_SCHED, &n->state); +} +EXPORT_SYMBOL(__napi_complete); + +void napi_complete(struct napi_struct *n) +{ + unsigned long flags; + + /* + * don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + return; + + napi_gro_flush(n); + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(napi_complete); + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + napi->gro_count = 0; + napi->gro_list = NULL; + napi->skb = NULL; + napi->poll = poll; + napi->weight = weight; + list_add(&napi->dev_list, &dev->napi_list); + napi->dev = dev; +#ifdef CONFIG_NETPOLL + spin_lock_init(&napi->poll_lock); + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); +} +EXPORT_SYMBOL(netif_napi_add); + +void netif_napi_del(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + list_del_init(&napi->dev_list); + napi_free_frags(napi); + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + kfree_skb(skb); + } + + napi->gro_list = NULL; + napi->gro_count = 0; +} +EXPORT_SYMBOL(netif_napi_del); + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + unsigned long time_limit = jiffies + 2; + int budget = netdev_budget; + void *have; + + local_irq_disable(); + + while (!list_empty(&sd->poll_list)) { + struct napi_struct *n; + int work, weight; + + /* If softirq window is exhuasted then punt. + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. + */ + if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) + goto softnet_break; + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); + + have = netpoll_poll_lock(n); + + weight = n->weight; + + /* This NAPI_STATE_SCHED test is for avoiding a race + * with netpoll's poll_napi(). Only the entity which + * obtains the lock and sees NAPI_STATE_SCHED set will + * actually make the ->poll() call. Therefore we avoid + * accidentally calling ->poll() when NAPI is not scheduled. + */ + work = 0; + if (test_bit(NAPI_STATE_SCHED, &n->state)) { + work = n->poll(n, weight); + trace_napi_poll(n); + } + + WARN_ON_ONCE(work > weight); + + budget -= work; + + local_irq_disable(); + + /* Drivers must not modify the NAPI state if they + * consume the entire weight. In such cases this code + * still "owns" the NAPI instance and therefore can + * move the instance around on the list at-will. + */ + if (unlikely(work == weight)) { + if (unlikely(napi_disable_pending(n))) { + local_irq_enable(); + napi_complete(n); + local_irq_disable(); + } else + list_move_tail(&n->poll_list, &sd->poll_list); + } + + netpoll_poll_unlock(have); + } +out: + net_rps_action_and_irq_enable(sd); + +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + dma_issue_pending_all(); +#endif + + return; + +softnet_break: + sd->time_squeeze++; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; +} + +static gifconf_func_t *gifconf_list[NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t *gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} +EXPORT_SYMBOL(register_gifconf); + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct net *net, struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + rcu_read_unlock(); + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(struct net *net, char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for_each_netdev(net, dev) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS + +#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) + +static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net *net = seq_file_net(seq); + struct net_device *dev; + struct hlist_node *p; + struct hlist_head *h; + unsigned int count = 0, offset = get_offset(*pos); + + h = &net->dev_name_head[get_bucket(*pos)]; + hlist_for_each_entry_rcu(dev, p, h, name_hlist) { + if (++count == offset) + return dev; + } + + return NULL; +} + +static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + unsigned int bucket; + + do { + dev = dev_from_same_bucket(seq, pos); + if (dev) + return dev; + + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < NETDEV_HASHENTRIES); + + return NULL; +} + +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +void *dev_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= NETDEV_HASHENTRIES) + return NULL; + + return dev_from_bucket(seq, pos); +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return dev_from_bucket(seq, pos); +} + +void dev_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu " + "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); +} + +/* + * Called from the PROCfs module. This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct softnet_data *softnet_get_online(loff_t *pos) +{ + struct softnet_data *sd = NULL; + + while (*pos < nr_cpu_ids) + if (cpu_online(*pos)) { + sd = &per_cpu(softnet_data, *pos); + break; + } else + ++*pos; + return sd; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct softnet_data *sd = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + sd->processed, sd->dropped, sd->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + sd->cpu_collision, sd->received_rps); + return 0; +} + +static const struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static const struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static const struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *ptype_get_idx(loff_t pos) +{ + struct packet_type *pt = NULL; + loff_t i = 0; + int t; + + list_for_each_entry_rcu(pt, &ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + for (t = 0; t < PTYPE_HASH_SIZE; t++) { + list_for_each_entry_rcu(pt, &ptype_base[t], list) { + if (i == pos) + return pt; + ++i; + } + } + return NULL; +} + +static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + rcu_read_lock(); + return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct packet_type *pt; + struct list_head *nxt; + int hash; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ptype_get_idx(0); + + pt = v; + nxt = pt->list.next; + if (pt->type == htons(ETH_P_ALL)) { + if (nxt != &ptype_all) + goto found; + hash = 0; + nxt = ptype_base[0].next; + } else + hash = ntohs(pt->type) & PTYPE_HASH_MASK; + + while (nxt == &ptype_base[hash]) { + if (++hash >= PTYPE_HASH_SIZE) + return NULL; + nxt = ptype_base[hash].next; + } +found: + return list_entry(nxt, struct packet_type, list); +} + +static void ptype_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + rcu_read_unlock(); +} + +static int ptype_seq_show(struct seq_file *seq, void *v) +{ + struct packet_type *pt = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); + else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else + seq_printf(seq, "%04x", ntohs(pt->type)); + + seq_printf(seq, " %-8s %pF\n", + pt->dev ? pt->dev->name : "", pt->func); + } + + return 0; +} + +static const struct seq_operations ptype_seq_ops = { + .start = ptype_seq_start, + .next = ptype_seq_next, + .stop = ptype_seq_stop, + .show = ptype_seq_show, +}; + +static int ptype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ptype_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations ptype_seq_fops = { + .owner = THIS_MODULE, + .open = ptype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + + +static int __net_init dev_proc_net_init(struct net *net) +{ + int rc = -ENOMEM; + + if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; + if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) + goto out_softnet; + + if (wext_proc_init(net)) + goto out_ptype; + rc = 0; +out: + return rc; +out_ptype: + proc_net_remove(net, "ptype"); +out_softnet: + proc_net_remove(net, "softnet_stat"); +out_dev: + proc_net_remove(net, "dev"); + goto out; +} + +static void __net_exit dev_proc_net_exit(struct net *net) +{ + wext_proc_exit(net); + + proc_net_remove(net, "ptype"); + proc_net_remove(net, "softnet_stat"); + proc_net_remove(net, "dev"); +} + +static struct pernet_operations __net_initdata dev_proc_ops = { + .init = dev_proc_net_init, + .exit = dev_proc_net_exit, +}; + +static int __init dev_proc_init(void) +{ + return register_pernet_subsys(&dev_proc_ops); +} +#else +#define dev_proc_init() 0 +#endif /* CONFIG_PROC_FS */ + + +/** + * netdev_set_master - set up master pointer + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success the reference counts + * are adjusted and the function returns zero. + */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ + struct net_device *old = slave->master; + + ASSERT_RTNL(); + + if (master) { + if (old) + return -EBUSY; + dev_hold(master); + } + + slave->master = master; + + if (old) + dev_put(old); + return 0; +} +EXPORT_SYMBOL(netdev_set_master); + +/** + * netdev_set_bond_master - set up bonding master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success %RTM_NEWLINK is sent + * to the routing socket and the function returns zero. + */ +int netdev_set_bond_master(struct net_device *slave, struct net_device *master) +{ + int err; + + ASSERT_RTNL(); + + err = netdev_set_master(slave, master); + if (err) + return err; + if (master) + slave->flags |= IFF_SLAVE; + else + slave->flags &= ~IFF_SLAVE; + + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + return 0; +} +EXPORT_SYMBOL(netdev_set_bond_master); + +static void dev_change_rx_flags(struct net_device *dev, int flags) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); +} + +static int __dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned int old_flags = dev->flags; + uid_t uid; + gid_t gid; + + ASSERT_RTNL(); + + dev->flags |= IFF_PROMISC; + dev->promiscuity += inc; + if (dev->promiscuity == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch promisc and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_PROMISC; + else { + dev->promiscuity -= inc; + pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", + dev->name); + return -EOVERFLOW; + } + } + if (dev->flags != old_flags) { + pr_info("device %s %s promiscuous mode\n", + dev->name, + dev->flags & IFF_PROMISC ? "entered" : "left"); + if (audit_enabled) { + current_uid_gid(&uid, &gid); + audit_log(current->audit_context, GFP_ATOMIC, + AUDIT_ANOM_PROMISCUOUS, + "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", + dev->name, (dev->flags & IFF_PROMISC), + (old_flags & IFF_PROMISC), + audit_get_loginuid(current), + uid, gid, + audit_get_sessionid(current)); + } + + dev_change_rx_flags(dev, IFF_PROMISC); + } + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + * Return 0 if successful or a negative errno code on error. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned int old_flags = dev->flags; + int err; + + err = __dev_set_promiscuity(dev, inc); + if (err < 0) + return err; + if (dev->flags != old_flags) + dev_set_rx_mode(dev); + return err; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + unsigned int old_flags = dev->flags; + + ASSERT_RTNL(); + + dev->flags |= IFF_ALLMULTI; + dev->allmulti += inc; + if (dev->allmulti == 0) { + /* + * Avoid overflow. + * If inc causes overflow, untouch allmulti and return error. + */ + if (inc < 0) + dev->flags &= ~IFF_ALLMULTI; + else { + dev->allmulti -= inc; + pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", + dev->name); + return -EOVERFLOW; + } + } + if (dev->flags ^ old_flags) { + dev_change_rx_flags(dev, IFF_ALLMULTI); + dev_set_rx_mode(dev); + } + return 0; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/* + * Upload unicast and multicast address lists to device and + * configure RX filtering. When the device doesn't support unicast + * filtering it is put in promiscuous mode while unicast addresses + * are present. + */ +void __dev_set_rx_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + /* dev_open will call this function so the list will stay sane. */ + if (!(dev->flags&IFF_UP)) + return; + + if (!netif_device_present(dev)) + return; + + if (!(dev->priv_flags & IFF_UNICAST_FLT)) { + /* Unicast addresses changes may only happen under the rtnl, + * therefore calling __dev_set_promiscuity here is safe. + */ + if (!netdev_uc_empty(dev) && !dev->uc_promisc) { + __dev_set_promiscuity(dev, 1); + dev->uc_promisc = true; + } else if (netdev_uc_empty(dev) && dev->uc_promisc) { + __dev_set_promiscuity(dev, -1); + dev->uc_promisc = false; + } + } + + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); +} + +void dev_set_rx_mode(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); +} + +/** + * dev_get_flags - get flags reported to userspace + * @dev: device + * + * Get the combination of flag bits exported through APIs to userspace. + */ +unsigned dev_get_flags(const struct net_device *dev) +{ + unsigned flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } + + return flags; +} +EXPORT_SYMBOL(dev_get_flags); + +int __dev_change_flags(struct net_device *dev, unsigned int flags) +{ + unsigned int old_flags = dev->flags; + int ret; + + ASSERT_RTNL(); + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + if ((old_flags ^ flags) & IFF_MULTICAST) + dev_change_rx_flags(dev, IFF_MULTICAST); + + dev_set_rx_mode(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); + + if (!ret) + dev_set_rx_mode(dev); + } + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? 1 : -1; + + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. + */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? 1 : -1; + + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + return ret; +} + +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +{ + unsigned int changes = dev->flags ^ old_flags; + + if (changes & IFF_UP) { + if (dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_UP, dev); + else + call_netdevice_notifiers(NETDEV_DOWN, dev); + } + + if (dev->flags & IFF_UP && + (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) + call_netdevice_notifiers(NETDEV_CHANGE, dev); +} + +/** + * dev_change_flags - change device settings + * @dev: device + * @flags: device state flags + * + * Change settings on device based state flags. The flags are + * in the userspace exported format. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags) +{ + int ret; + unsigned int changes, old_flags = dev->flags; + + ret = __dev_change_flags(dev, flags); + if (ret < 0) + return ret; + + changes = old_flags ^ dev->flags; + if (changes) + rtmsg_ifinfo(RTM_NEWLINK, dev, changes); + + __dev_notify_flags(dev, old_flags); + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_mtu - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * + * Change the maximum transfer size of the network device. + */ +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (new_mtu == dev->mtu) + return 0; + + /* MTU must be positive. */ + if (new_mtu < 0) + return -EINVAL; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = 0; + if (ops->ndo_change_mtu) + err = ops->ndo_change_mtu(dev, new_mtu); + else + dev->mtu = new_mtu; + + if (!err && dev->flags & IFF_UP) + call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); + return err; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** + * dev_set_mac_address - Change Media Access Control Address + * @dev: device + * @sa: new address + * + * Change the hardware (MAC) address of the device + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int err; + + if (!ops->ndo_set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = ops->ndo_set_mac_address(dev, sa); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_set_mac_address); + +/* + * Perform the SIOCxIFxxx calls, inside rcu_read_lock() + */ +static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = (short) dev_get_flags(dev); + return 0; + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCGIFSLAVE: + err = -EINVAL; + break; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + default: + /* dev_ioctl() should ensure this case + * is never reached + */ + WARN_ON(1); + err = -ENOTTY; + break; + + } + return err; +} + +/* + * Perform the SIOCxIFxxx calls, inside rtnl_lock() + */ +static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops; + + if (!dev) + return -ENODEV; + + ops = dev->netdev_ops; + + switch (cmd) { + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCSIFMAP: + if (ops->ndo_set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCDELMULTI: + if (!ops->ndo_set_rx_mode || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + case SIOCSHWTSTAMP: + err = net_hwtstamp_validate(ifr); + if (err) + return err; + /* fall through */ + + /* + * Unknown or private ioctl + */ + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCSHWTSTAMP || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (ops->ndo_do_ioctl) { + if (netif_device_present(dev)) + err = ops->ndo_do_ioctl(dev, ifr, cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @net: the applicable net namespace + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_lock(); + ret = dev_ifconf(net, (char __user *) arg); + rtnl_unlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname(net, (struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(net, ifr.ifr_name); + rcu_read_lock(); + ret = dev_ifsioc_locked(net, &ifr, cmd); + rcu_read_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(net, &ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + case SIOCSHWTSTAMP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -ENOTTY; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(net, ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(net, &ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) + return wext_handle_ioctl(net, &ifr, cmd, arg); + return -ENOTTY; + } +} + + +/** + * dev_new_index - allocate an ifindex + * @net: the applicable net namespace + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(struct net *net) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(net, ifindex)) + return ifindex; + } +} + +/* Delayed registration/unregisteration */ +static LIST_HEAD(net_todo_list); + +static void net_set_todo(struct net_device *dev) +{ + list_add_tail(&dev->todo_list, &net_todo_list); +} + +static void rollback_registered_many(struct list_head *head) +{ + struct net_device *dev, *tmp; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + dev_close_many(head); + + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + /* Notifier chain MUST detach us from master device. */ + WARN_ON(dev->master); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); + } + + /* Process any work delayed until the end of the batch */ + dev = list_first_entry(head, struct net_device, unreg_list); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) + dev_put(dev); +} + +static void rollback_registered(struct net_device *dev) +{ + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + rollback_registered_many(&single); + list_del(&single); +} + +static netdev_features_t netdev_fix_features(struct net_device *dev, + netdev_features_t features) +{ + /* Fix illegal checksum combinations */ + if ((features & NETIF_F_HW_CSUM) && + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_warn(dev, "mixed HW and IP checksum settings.\n"); + features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); + } + + /* Fix illegal SG+CSUM combinations. */ + if ((features & NETIF_F_SG) && + !(features & NETIF_F_ALL_CSUM)) { + netdev_dbg(dev, + "Dropping NETIF_F_SG since no checksum feature.\n"); + features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); + features &= ~NETIF_F_ALL_TSO; + } + + /* TSO ECN requires that TSO is present as well. */ + if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) + features &= ~NETIF_F_TSO_ECN; + + /* Software GSO depends on SG. */ + if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { + netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); + features &= ~NETIF_F_GSO; + } + + /* UFO needs SG and checksumming */ + if (features & NETIF_F_UFO) { + /* maybe split UFO into V4 and V6? */ + if (!((features & NETIF_F_GEN_CSUM) || + (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no checksum offload features.\n"); + features &= ~NETIF_F_UFO; + } + + if (!(features & NETIF_F_SG)) { + netdev_dbg(dev, + "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); + features &= ~NETIF_F_UFO; + } + } + + return features; +} + +int __netdev_update_features(struct net_device *dev) +{ + netdev_features_t features; + int err = 0; + + ASSERT_RTNL(); + + features = netdev_get_wanted_features(dev); + + if (dev->netdev_ops->ndo_fix_features) + features = dev->netdev_ops->ndo_fix_features(dev, features); + + /* driver might be less strict about feature dependencies */ + features = netdev_fix_features(dev, features); + + if (dev->features == features) + return 0; + + netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", + &dev->features, &features); + + if (dev->netdev_ops->ndo_set_features) + err = dev->netdev_ops->ndo_set_features(dev, features); + + if (unlikely(err < 0)) { + netdev_err(dev, + "set_features() failed (%d); wanted %pNF, left %pNF\n", + err, &features, &dev->features); + return -1; + } + + if (!err) + dev->features = features; + + return 1; +} + +/** + * netdev_update_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications if it + * has changed. Should be called after driver or hardware dependent + * conditions might have changed that influence the features. + */ +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_update_features); + +/** + * netdev_change_features - recalculate device features + * @dev: the device to check + * + * Recalculate dev->features set and send notifications even + * if they have not changed. Should be called instead of + * netdev_update_features() if also dev->vlan_features might + * have changed to allow the changes to be propagated to stacked + * VLAN devices. + */ +void netdev_change_features(struct net_device *dev) +{ + __netdev_update_features(dev); + netdev_features_change(dev); +} +EXPORT_SYMBOL(netdev_change_features); + +/** + * netif_stacked_transfer_operstate - transfer operstate + * @rootdev: the root or lower level device to transfer state from + * @dev: the device to transfer operstate to + * + * Transfer operational state from root to device. This is normally + * called when a stacking relationship exists between the root + * device and the device(a leaf device). + */ +void netif_stacked_transfer_operstate(const struct net_device *rootdev, + struct net_device *dev) +{ + if (rootdev->operstate == IF_OPER_DORMANT) + netif_dormant_on(dev); + else + netif_dormant_off(dev); + + if (netif_carrier_ok(rootdev)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} +EXPORT_SYMBOL(netif_stacked_transfer_operstate); + +#ifdef CONFIG_RPS +static int netif_alloc_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; + + BUG_ON(count < 1); + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues\n", count); + return -ENOMEM; + } + dev->_rx = rx; + + for (i = 0; i < count; i++) + rx[i].dev = dev; + return 0; +} +#endif + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, void *_unused) +{ + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; + netdev_queue_numa_node_write(queue, NUMA_NO_NODE); + queue->dev = dev; +#ifdef CONFIG_BQL + dql_init(&queue->dql, HZ); +#endif +} + +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues\n", count); + return -ENOMEM; + } + dev->_tx = tx; + + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); + + return 0; +} + +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * Callers must hold the rtnl semaphore. You may want + * register_netdev() instead of this. + * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ + int ret; + struct net *net = dev_net(dev); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + might_sleep(); + + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + BUG_ON(!net); + + spin_lock_init(&dev->addr_list_lock); + netdev_set_addr_lockdep_class(dev); + + dev->iflink = -1; + + ret = dev_get_valid_name(dev, dev->name); + if (ret < 0) + goto out; + + /* Init, if this function is available */ + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out; + } + } + + dev->ifindex = dev_new_index(net); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Transfer changeable features to wanted_features and enable + * software offloads (GSO and GRO). + */ + dev->hw_features |= NETIF_F_SOFT_FEATURES; + dev->features |= NETIF_F_SOFT_FEATURES; + dev->wanted_features = dev->features & dev->hw_features; + + /* Turn on no cache copy if HW is doing checksum */ + if (!(dev->flags & IFF_LOOPBACK)) { + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if (dev->features & NETIF_F_ALL_CSUM) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } + } + + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. + */ + dev->vlan_features |= NETIF_F_HIGHDMA; + + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); + ret = notifier_to_errno(ret); + if (ret) + goto err_uninit; + + ret = netdev_register_kobject(dev); + if (ret) + goto err_uninit; + dev->reg_state = NETREG_REGISTERED; + + __netdev_update_features(dev); + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev_init_scheduler(dev); + dev_hold(dev); + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + ret = notifier_to_errno(ret); + if (ret) { + rollback_registered(dev); + dev->reg_state = NETREG_UNREGISTERED; + } + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + +out: + return ret; + +err_uninit: + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + goto out; +} +EXPORT_SYMBOL(register_netdevice); + +/** + * init_dummy_netdev - init a dummy network device for NAPI + * @dev: device to init + * + * This takes a network device structure and initialize the minimum + * amount of fields so it can be used to schedule NAPI polls without + * registering a full blown interface. This is to be used by drivers + * that need to tie several hardware interfaces to a single NAPI + * poll scheduler due to HW limitations. + */ +int init_dummy_netdev(struct net_device *dev) +{ + /* Clear everything. Note we don't initialize spinlocks + * are they aren't supposed to be taken by any of the + * NAPI code and this dummy netdev is supposed to be + * only ever used for NAPI polls + */ + memset(dev, 0, sizeof(struct net_device)); + + /* make sure we BUG if trying to hit standard + * register/unregister code path + */ + dev->reg_state = NETREG_DUMMY; + + /* NAPI wants this */ + INIT_LIST_HEAD(&dev->napi_list); + + /* a dummy interface is started by default */ + set_bit(__LINK_STATE_PRESENT, &dev->state); + set_bit(__LINK_STATE_START, &dev->state); + + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + + return 0; +} +EXPORT_SYMBOL_GPL(init_dummy_netdev); + + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdevice that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ + int err; + + rtnl_lock(); + err = register_netdevice(dev); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. + * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ + unsigned long rebroadcast_time, warning_time; + int refcnt; + + linkwatch_forget_dev(dev); + + rebroadcast_time = warning_time = jiffies; + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_lock(); + + /* Rebroadcast unregister notification */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users + * should have already handle it the first time */ + + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + } + + __rtnl_unlock(); + + rebroadcast_time = jiffies; + } + + msleep(250); + + refcnt = netdev_refcnt_read(dev); + + if (time_after(jiffies, warning_time + 10 * HZ)) { + pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", + dev->name, refcnt); + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock(). + * This allows us to deal with problems: + * 1) We can delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + * + * We must not return until all unregister events added during + * the interval the lock was held have been completed. + */ +void netdev_run_todo(void) +{ + struct list_head list; + + /* Snapshot list, allow later requests */ + list_replace_init(&net_todo_list, &list); + + __rtnl_unlock(); + + /* Wait for rcu callbacks to finish before attempting to drain + * the device list. This usually avoids a 250ms wait. + */ + if (!list_empty(&list)) + rcu_barrier(); + + while (!list_empty(&list)) { + struct net_device *dev + = list_first_entry(&list, struct net_device, todo_list); + list_del(&dev->todo_list); + + if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { + pr_err("network todo '%s' but state %d\n", + dev->name, dev->reg_state); + dump_stack(); + continue; + } + + dev->reg_state = NETREG_UNREGISTERED; + + on_each_cpu(flush_backlog, dev, 1); + + netdev_wait_allrefs(dev); + + /* paranoia */ + BUG_ON(netdev_refcnt_read(dev)); + WARN_ON(rcu_access_pointer(dev->ip_ptr)); + WARN_ON(rcu_access_pointer(dev->ip6_ptr)); + WARN_ON(dev->dn_ptr); + + if (dev->destructor) + dev->destructor(dev); + + /* Free network device */ + kobject_put(&dev->dev.kobj); + } +} + +/* Convert net_device_stats to rtnl_link_stats64. They have the same + * fields in the same order, with only the type differing. + */ +void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, + const struct net_device_stats *netdev_stats) +{ +#if BITS_PER_LONG == 64 + BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); + memcpy(stats64, netdev_stats, sizeof(*stats64)); +#else + size_t i, n = sizeof(*stats64) / sizeof(u64); + const unsigned long *src = (const unsigned long *)netdev_stats; + u64 *dst = (u64 *)stats64; + + BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != + sizeof(*stats64) / sizeof(u64)); + for (i = 0; i < n; i++) + dst[i] = src[i]; +#endif +} +EXPORT_SYMBOL(netdev_stats_to_stats64); + +/** + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * @storage: place to store stats + * + * Get network statistics from device. Return @storage. + * The device driver may provide its own method by setting + * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; + * otherwise the internal statistics structure is used. + */ +struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_stats64) { + memset(storage, 0, sizeof(*storage)); + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { + netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); + } else { + netdev_stats_to_stats64(storage, &dev->stats); + } + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); + return storage; +} +EXPORT_SYMBOL(dev_get_stats); + +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + +/** + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subquue structs + * for each queue on the device. + */ +struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + unsigned int txqs, unsigned int rxqs) +{ + struct net_device *dev; + size_t alloc_size; + struct net_device *p; + + BUG_ON(strlen(name) >= sizeof(dev->name)); + + if (txqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); + return NULL; + } + +#ifdef CONFIG_RPS + if (rxqs < 1) { + pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); + return NULL; + } +#endif + + alloc_size = sizeof(struct net_device); + if (sizeof_priv) { + /* ensure 32-byte alignment of private area */ + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); + alloc_size += sizeof_priv; + } + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN - 1; + + p = kzalloc(alloc_size, GFP_KERNEL); + if (!p) { + pr_err("alloc_netdev: Unable to allocate device\n"); + return NULL; + } + + dev = PTR_ALIGN(p, NETDEV_ALIGN); + dev->padded = (char *)dev - (char *)p; + + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) + goto free_p; + + if (dev_addr_init(dev)) + goto free_pcpu; + + dev_mc_init(dev); + dev_uc_init(dev); + + dev_net_set(dev, &init_net); + + dev->gso_max_size = GSO_MAX_SIZE; + + INIT_LIST_HEAD(&dev->napi_list); + INIT_LIST_HEAD(&dev->unreg_list); + INIT_LIST_HEAD(&dev->link_watch_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; + setup(dev); + + dev->num_tx_queues = txqs; + dev->real_num_tx_queues = txqs; + if (netif_alloc_netdev_queues(dev)) + goto free_all; + +#ifdef CONFIG_RPS + dev->num_rx_queues = rxqs; + dev->real_num_rx_queues = rxqs; + if (netif_alloc_rx_queues(dev)) + goto free_all; +#endif + + strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; + return dev; + +free_all: + free_netdev(dev); + return NULL; + +free_pcpu: + free_percpu(dev->pcpu_refcnt); + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + +free_p: + kfree(p); + return NULL; +} +EXPORT_SYMBOL(alloc_netdev_mqs); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. + * If this is the last reference then it will be freed. + */ +void free_netdev(struct net_device *dev) +{ + struct napi_struct *p, *n; + + release_net(dev_net(dev)); + + kfree(dev->_tx); +#ifdef CONFIG_RPS + kfree(dev->_rx); +#endif + + kfree(rcu_dereference_protected(dev->ingress_queue, 1)); + + /* Flush device addresses */ + dev_addr_flush(dev); + + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + + /* Compatibility with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + kfree((char *)dev - dev->padded); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via device release */ + put_device(&dev->dev); +} +EXPORT_SYMBOL(free_netdev); + +/** + * synchronize_net - Synchronize with packet receive processing + * + * Wait for packets currently being received to be done. + * Does not block later packets from starting. + */ +void synchronize_net(void) +{ + might_sleep(); + if (rtnl_is_locked()) + synchronize_rcu_expedited(); + else + synchronize_rcu(); +} +EXPORT_SYMBOL(synchronize_net); + +/** + * unregister_netdevice_queue - remove device from the kernel + * @dev: device + * @head: list + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * If head not NULL, device is queued to be unregistered later. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) +{ + ASSERT_RTNL(); + + if (head) { + list_move_tail(&dev->unreg_list, head); + } else { + rollback_registered(dev); + /* Finish processing unregister after unlock */ + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_queue); + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + */ +void unregister_netdevice_many(struct list_head *head) +{ + struct net_device *dev; + + if (!list_empty(head)) { + rollback_registered_many(head); + list_for_each_entry(dev, head, unreg_list) + net_set_todo(dev); + } +} +EXPORT_SYMBOL(unregister_netdevice_many); + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} +EXPORT_SYMBOL(unregister_netdev); + +/** + * dev_change_net_namespace - move device to different nethost namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a netagive errno code is returned. + * + * Callers must hold the rtnl semaphore. + */ + +int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +{ + int err; + + ASSERT_RTNL(); + + /* Don't allow namespace local devices to be moved. */ + err = -EINVAL; + if (dev->features & NETIF_F_NETNS_LOCAL) + goto out; + + /* Ensure the device has been registrered */ + err = -EINVAL; + if (dev->reg_state != NETREG_REGISTERED) + goto out; + + /* Get out if there is nothing todo */ + err = 0; + if (net_eq(dev_net(dev), net)) + goto out; + + /* Pick the destination device name, and ensure + * we can use it in the destination network namespace. + */ + err = -EEXIST; + if (__dev_get_by_name(net, dev->name)) { + /* We get here if we can't use the current device name */ + if (!pat) + goto out; + if (dev_get_valid_name(dev, pat) < 0) + goto out; + } + + /* + * And now a mini version of register_netdevice unregister_netdevice. + */ + + /* If device is running close it first. */ + dev_close(dev); + + /* And unlink it from device chain */ + err = -ENODEV; + unlist_netdevice(dev); + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. + This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + /* Actually switch the network namespace */ + dev_net_set(dev, net); + + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) { + int iflink = (dev->iflink == dev->ifindex); + dev->ifindex = dev_new_index(net); + if (iflink) + dev->iflink = dev->ifindex; + } + + /* Fixup kobjects */ + err = device_rename(&dev->dev, dev->name); + WARN_ON(err); + + /* Add the device back in the hashes */ + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ + call_netdevice_notifiers(NETDEV_REGISTER, dev); + + /* + * Prevent userspace races by waiting until the network + * device is fully setup before sending notifications. + */ + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + synchronize_net(); + err = 0; +out: + return err; +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Append output queue from offline CPU. */ + if (oldsd->output_queue) { + *sd->output_queue_tailp = oldsd->output_queue; + sd->output_queue_tailp = oldsd->output_queue_tailp; + oldsd->output_queue = NULL; + oldsd->output_queue_tailp = &oldsd->output_queue; + } + /* Append NAPI poll list from offline CPU. */ + if (!list_empty(&oldsd->poll_list)) { + list_splice_init(&oldsd->poll_list, &sd->poll_list); + raise_softirq_irqoff(NET_RX_SOFTIRQ); + } + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->process_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx(skb); + input_queue_head_incr(oldsd); + } + + return NOTIFY_OK; +} + + +/** + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set + * + * Computes a new feature set after adding a device with feature set + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. + */ +netdev_features_t netdev_increment_features(netdev_features_t all, + netdev_features_t one, netdev_features_t mask) +{ + if (mask & NETIF_F_GEN_CSUM) + mask |= NETIF_F_ALL_CSUM; + mask |= NETIF_F_VLAN_CHALLENGED; + + all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; + all &= one | ~NETIF_F_ALL_FOR_ALL; + + /* If one device supports hw checksumming, set for all. */ + if (all & NETIF_F_GEN_CSUM) + all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); + + return all; +} +EXPORT_SYMBOL(netdev_increment_features); + +static struct hlist_head *netdev_create_hash(void) +{ + int i; + struct hlist_head *hash; + + hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); + + return hash; +} + +/* Initialize per network namespace state */ +static int __net_init netdev_init(struct net *net) +{ + INIT_LIST_HEAD(&net->dev_base_head); + + net->dev_name_head = netdev_create_hash(); + if (net->dev_name_head == NULL) + goto err_name; + + net->dev_index_head = netdev_create_hash(); + if (net->dev_index_head == NULL) + goto err_idx; + + return 0; + +err_idx: + kfree(net->dev_name_head); +err_name: + return -ENOMEM; +} + +/** + * netdev_drivername - network driver for the device + * @dev: network device + * + * Determine network driver for device. + */ +const char *netdev_drivername(const struct net_device *dev) +{ + const struct device_driver *driver; + const struct device *parent; + const char *empty = ""; + + parent = dev->dev.parent; + if (!parent) + return empty; + + driver = parent->driver; + if (driver && driver->name) + return driver->name; + return empty; +} + +int __netdev_printk(const char *level, const struct net_device *dev, + struct va_format *vaf) +{ + int r; + + if (dev && dev->dev.parent) + r = dev_printk(level, dev->dev.parent, "%s: %pV", + netdev_name(dev), vaf); + else if (dev) + r = printk("%s%s: %pV", level, netdev_name(dev), vaf); + else + r = printk("%s(NULL net_device): %pV", level, vaf); + + return r; +} +EXPORT_SYMBOL(__netdev_printk); + +int netdev_printk(const char *level, const struct net_device *dev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + r = __netdev_printk(level, dev, &vaf); + va_end(args); + + return r; +} +EXPORT_SYMBOL(netdev_printk); + +#define define_netdev_printk_level(func, level) \ +int func(const struct net_device *dev, const char *fmt, ...) \ +{ \ + int r; \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __netdev_printk(level, dev, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_netdev_printk_level(netdev_emerg, KERN_EMERG); +define_netdev_printk_level(netdev_alert, KERN_ALERT); +define_netdev_printk_level(netdev_crit, KERN_CRIT); +define_netdev_printk_level(netdev_err, KERN_ERR); +define_netdev_printk_level(netdev_warn, KERN_WARNING); +define_netdev_printk_level(netdev_notice, KERN_NOTICE); +define_netdev_printk_level(netdev_info, KERN_INFO); + +static void __net_exit netdev_exit(struct net *net) +{ + kfree(net->dev_name_head); + kfree(net->dev_index_head); +} + +static struct pernet_operations __net_initdata netdev_net_ops = { + .init = netdev_init, + .exit = netdev_exit, +}; + +static void __net_exit default_device_exit(struct net *net) +{ + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the + * initial network namespace + */ + rtnl_lock(); + for_each_netdev_safe(net, dev, aux) { + int err; + char fb_name[IFNAMSIZ]; + + /* Ignore unmoveable devices (i.e. loopback) */ + if (dev->features & NETIF_F_NETNS_LOCAL) + continue; + + /* Leave virtual devices for the generic cleanup */ + if (dev->rtnl_link_ops) + continue; + + /* Push remaining network devices to init_net */ + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + pr_emerg("%s: failed to move %s to init_net: %d\n", + __func__, dev->name, err); + BUG(); + } + } + rtnl_unlock(); +} + +static void __net_exit default_device_exit_batch(struct list_head *net_list) +{ + /* At exit all network devices most be removed from a network + * namespace. Do this in the reverse order of registration. + * Do this across as many network namespaces as possible to + * improve batching efficiency. + */ + struct net_device *dev; + struct net *net; + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + for_each_netdev_reverse(net, dev) { + if (dev->rtnl_link_ops) + dev->rtnl_link_ops->dellink(dev, &dev_kill_list); + else + unregister_netdevice_queue(dev, &dev_kill_list); + } + } + unregister_netdevice_many(&dev_kill_list); + list_del(&dev_kill_list); + rtnl_unlock(); +} + +static struct pernet_operations __net_initdata default_device_ops = { + .exit = default_device_exit, + .exit_batch = default_device_exit_batch, +}; + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. + */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + if (dev_proc_init()) + goto out; + + if (netdev_kobject_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < PTYPE_HASH_SIZE; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + if (register_pernet_subsys(&netdev_net_ops)) + goto out; + + /* + * Initialise the packet receive queues. + */ + + for_each_possible_cpu(i) { + struct softnet_data *sd = &per_cpu(softnet_data, i); + + memset(sd, 0, sizeof(*sd)); + skb_queue_head_init(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + sd->completion_queue = NULL; + INIT_LIST_HEAD(&sd->poll_list); + sd->output_queue = NULL; + sd->output_queue_tailp = &sd->output_queue; +#ifdef CONFIG_RPS + sd->csd.func = rps_trigger_softirq; + sd->csd.info = sd; + sd->csd.flags = 0; + sd->cpu = i; +#endif + + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; + sd->backlog.gro_list = NULL; + sd->backlog.gro_count = 0; + } + + dev_boot_phase = 0; + + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. + */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + + if (register_pernet_device(&default_device_ops)) + goto out; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); + + hotcpu_notifier(dev_cpu_callback, 0); + dst_init(); + dev_mcast_init(); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); + +static int __init initialize_hashrnd(void) +{ + get_random_bytes(&hashrnd, sizeof(hashrnd)); + return 0; +} + +late_initcall_sync(initialize_hashrnd); +EXPORT_SYMBOL(dev_change_name); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c new file mode 100644 index 00000000..626698f0 --- /dev/null +++ b/net/core/dev_addr_lists.c @@ -0,0 +1,734 @@ +/* + * net/core/dev_addr_lists.c - Functions for handling net device lists + * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> + * + * This file contains functions for working with unicast, multicast and device + * addresses lists. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/export.h> +#include <linux/list.h> +#include <linux/proc_fs.h> + +/* + * General list handling functions + */ + +static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + if (global) { + /* check if addr is already used as global */ + if (ha->global_use) + return 0; + else + ha->global_use = true; + } + ha->refcount++; + return 0; + } + } + + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + ha->refcount = 1; + ha->global_use = global; + ha->synced = false; + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + return 0; +} + +static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_add_ex(list, addr, addr_len, addr_type, false); +} + +static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, + unsigned char *addr, int addr_len, + unsigned char addr_type, bool global) +{ + struct netdev_hw_addr *ha; + + list_for_each_entry(ha, &list->list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + if (global) { + if (!ha->global_use) + break; + else + ha->global_use = false; + } + if (--ha->refcount) + return 0; + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + list->count--; + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + return __hw_addr_del_ex(list, addr, addr_len, addr_type, false); +} + +int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, &from_list->list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del(to_list, ha2->addr, addr_len, type); + } + return err; +} +EXPORT_SYMBOL(__hw_addr_add_multiple); + +void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, &from_list->list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del(to_list, ha->addr, addr_len, type); + } +} +EXPORT_SYMBOL(__hw_addr_del_multiple); + +int __hw_addr_sync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, ha->addr, addr_len, ha->type); + __hw_addr_del(from_list, ha->addr, addr_len, ha->type); + } + } + return err; +} +EXPORT_SYMBOL(__hw_addr_sync); + +void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &from_list->list, list) { + if (ha->synced) { + __hw_addr_del(to_list, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, ha->addr, + addr_len, ha->type); + } + } +} +EXPORT_SYMBOL(__hw_addr_unsync); + +void __hw_addr_flush(struct netdev_hw_addr_list *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, &list->list, list) { + list_del_rcu(&ha->list); + kfree_rcu(ha, rcu_head); + } + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_flush); + +void __hw_addr_init(struct netdev_hw_addr_list *list) +{ + INIT_LIST_HEAD(&list->list); + list->count = 0; +} +EXPORT_SYMBOL(__hw_addr_init); + +/* + * Device addresses handling functions + */ + +/** + * dev_addr_flush - Flush device address list + * @dev: device + * + * Flush device address list and reset ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addrs); + dev->dev_addr = NULL; +} +EXPORT_SYMBOL(dev_addr_flush); + +/** + * dev_addr_init - Init device address list + * @dev: device + * + * Init device address list and create the first element, + * used by ->dev_addr. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + __hw_addr_init(&dev->dev_addrs); + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} +EXPORT_SYMBOL(dev_addr_init); + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + struct netdev_hw_addr *ha; + + ASSERT_RTNL(); + + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. + */ + ha = list_first_entry(&dev->dev_addrs.list, + struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device supplying the addresses to be deleted + * @addr_type: address type - 0 means type will be used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, + to_dev->addr_len, addr_type); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* + * Unicast list handling functions + */ + +/** + * dev_uc_add - Add a secondary unicast address + * @dev: device + * @addr: address to add + * + * Add a secondary unicast address to the device or increase + * the reference count if it already exists. + */ +int dev_uc_add(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_add); + +/** + * dev_uc_del - Release secondary unicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a secondary unicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_uc_del(struct net_device *dev, unsigned char *addr) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +EXPORT_SYMBOL(dev_uc_del); + +/** + * dev_uc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the dev->set_rx_mode + * function of layered software devices. + */ +int dev_uc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_uc_sync); + +/** + * dev_uc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_uc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_uc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock_nested(to); + __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_uc_unsync); + +/** + * dev_uc_flush - Flush unicast addresses + * @dev: device + * + * Flush unicast addresses. + */ +void dev_uc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->uc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_uc_flush); + +/** + * dev_uc_flush - Init unicast address list + * @dev: device + * + * Init unicast address list. + */ +void dev_uc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->uc); +} +EXPORT_SYMBOL(dev_uc_init); + +/* + * Multicast list handling functions + */ + +static int __dev_mc_add(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} +/** + * dev_mc_add - Add a multicast address + * @dev: device + * @addr: address to add + * + * Add a multicast address to the device or increase + * the reference count if it already exists. + */ +int dev_mc_add(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_add); + +/** + * dev_mc_add_global - Add a global multicast address + * @dev: device + * @addr: address to add + * + * Add a global multicast address to the device. + */ +int dev_mc_add_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_add(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_add_global); + +static int __dev_mc_del(struct net_device *dev, unsigned char *addr, + bool global) +{ + int err; + + netif_addr_lock_bh(dev); + err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, global); + if (!err) + __dev_set_rx_mode(dev); + netif_addr_unlock_bh(dev); + return err; +} + +/** + * dev_mc_del - Delete a multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, false); +} +EXPORT_SYMBOL(dev_mc_del); + +/** + * dev_mc_del_global - Delete a global multicast address. + * @dev: device + * @addr: address to delete + * + * Release reference to a multicast address and remove it + * from the device if the reference count drops to zero. + */ +int dev_mc_del_global(struct net_device *dev, unsigned char *addr) +{ + return __dev_mc_del(dev, addr, true); +} +EXPORT_SYMBOL(dev_mc_del_global); + +/** + * dev_mc_sync - Synchronize device's unicast list to another device + * @to: destination device + * @from: source device + * + * Add newly added addresses to the destination device and release + * addresses that have no users left. The source device must be + * locked by netif_addr_lock_bh. + * + * This function is intended to be called from the ndo_set_rx_mode + * function of layered software devices. + */ +int dev_mc_sync(struct net_device *to, struct net_device *from) +{ + int err = 0; + + if (to->addr_len != from->addr_len) + return -EINVAL; + + netif_addr_lock_nested(to); + err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); + if (!err) + __dev_set_rx_mode(to); + netif_addr_unlock(to); + return err; +} +EXPORT_SYMBOL(dev_mc_sync); + +/** + * dev_mc_unsync - Remove synchronized addresses from the destination device + * @to: destination device + * @from: source device + * + * Remove all addresses that were added to the destination device by + * dev_mc_sync(). This function is intended to be called from the + * dev->stop function of layered software devices. + */ +void dev_mc_unsync(struct net_device *to, struct net_device *from) +{ + if (to->addr_len != from->addr_len) + return; + + netif_addr_lock_bh(from); + netif_addr_lock_nested(to); + __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); + __dev_set_rx_mode(to); + netif_addr_unlock(to); + netif_addr_unlock_bh(from); +} +EXPORT_SYMBOL(dev_mc_unsync); + +/** + * dev_mc_flush - Flush multicast addresses + * @dev: device + * + * Flush multicast addresses. + */ +void dev_mc_flush(struct net_device *dev) +{ + netif_addr_lock_bh(dev); + __hw_addr_flush(&dev->mc); + netif_addr_unlock_bh(dev); +} +EXPORT_SYMBOL(dev_mc_flush); + +/** + * dev_mc_flush - Init multicast address list + * @dev: device + * + * Init multicast address list. + */ +void dev_mc_init(struct net_device *dev) +{ + __hw_addr_init(&dev->mc); +} +EXPORT_SYMBOL(dev_mc_init); + +#ifdef CONFIG_PROC_FS +#include <linux/seq_file.h> + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct netdev_hw_addr *ha; + struct net_device *dev = v; + + if (v == SEQ_START_TOKEN) + return 0; + + netif_addr_lock_bh(dev); + netdev_for_each_mc_addr(ha, dev) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, ha->refcount, ha->global_use); + + for (i = 0; i < dev->addr_len; i++) + seq_printf(seq, "%02x", ha->addr[i]); + + seq_putc(seq, '\n'); + } + netif_addr_unlock_bh(dev); + return 0; +} + +static const struct seq_operations dev_mc_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &dev_mc_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +#endif + +static int __net_init dev_mc_net_init(struct net *net) +{ + if (!proc_net_fops_create(net, "dev_mcast", 0, &dev_mc_seq_fops)) + return -ENOMEM; + return 0; +} + +static void __net_exit dev_mc_net_exit(struct net *net) +{ + proc_net_remove(net, "dev_mcast"); +} + +static struct pernet_operations __net_initdata dev_mc_net_ops = { + .init = dev_mc_net_init, + .exit = dev_mc_net_exit, +}; + +void __init dev_mcast_init(void) +{ + register_pernet_subsys(&dev_mc_net_ops); +} + diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c new file mode 100644 index 00000000..b856f87e --- /dev/null +++ b/net/core/drop_monitor.c @@ -0,0 +1,390 @@ +/* + * Monitoring code for network dropped packet alerts + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <linux/percpu.h> +#include <linux/timer.h> +#include <linux/bitops.h> +#include <linux/slab.h> +#include <net/genetlink.h> +#include <net/netevent.h> + +#include <trace/events/skb.h> +#include <trace/events/napi.h> + +#include <asm/unaligned.h> + +#define TRACE_ON 1 +#define TRACE_OFF 0 + +/* + * Globals, our netlink socket pointer + * and the work handle that will send up + * netlink alerts + */ +static int trace_state = TRACE_OFF; +static DEFINE_MUTEX(trace_state_mutex); + +struct per_cpu_dm_data { + spinlock_t lock; + struct sk_buff *skb; + struct work_struct dm_alert_work; + struct timer_list send_timer; +}; + +struct dm_hw_stat_delta { + struct net_device *dev; + unsigned long last_rx; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + +static struct genl_family net_drop_monitor_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .maxattr = NET_DM_CMD_MAX, +}; + +static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); + +static int dm_hit_limit = 64; +static int dm_delay = 1; +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); + +static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) +{ + size_t al; + struct net_dm_alert_msg *msg; + struct nlattr *nla; + struct sk_buff *skb; + unsigned long flags; + + al = sizeof(struct net_dm_alert_msg); + al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + + skb = genlmsg_new(al, GFP_KERNEL); + + if (skb) { + genlmsg_put(skb, 0, 0, &net_drop_monitor_family, + 0, NET_DM_CMD_ALERT); + nla = nla_reserve(skb, NLA_UNSPEC, + sizeof(struct net_dm_alert_msg)); + msg = nla_data(nla); + memset(msg, 0, al); + } else { + mod_timer(&data->send_timer, jiffies + HZ / 10); + } + + spin_lock_irqsave(&data->lock, flags); + swap(data->skb, skb); + spin_unlock_irqrestore(&data->lock, flags); + + return skb; +} + +static void send_dm_alert(struct work_struct *work) +{ + struct sk_buff *skb; + struct per_cpu_dm_data *data; + + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); + + skb = reset_per_cpu_data(data); + + if (skb) + genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); +} + +/* + * This is the timer function to delay the sending of an alert + * in the event that more drops will arrive during the + * hysteresis period. + */ +static void sched_send_work(unsigned long _data) +{ + struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data; + + schedule_work(&data->dm_alert_work); +} + +static void trace_drop_common(struct sk_buff *skb, void *location) +{ + struct net_dm_alert_msg *msg; + struct nlmsghdr *nlh; + struct nlattr *nla; + int i; + struct sk_buff *dskb; + struct per_cpu_dm_data *data; + unsigned long flags; + + local_irq_save(flags); + data = &__get_cpu_var(dm_cpu_data); + spin_lock(&data->lock); + dskb = data->skb; + + if (!dskb) + goto out; + + nlh = (struct nlmsghdr *)dskb->data; + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); + for (i = 0; i < msg->entries; i++) { + if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { + msg->points[i].count++; + goto out; + } + } + if (msg->entries == dm_hit_limit) + goto out; + /* + * We need to create a new entry + */ + __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); + memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); + msg->points[msg->entries].count = 1; + msg->entries++; + + if (!timer_pending(&data->send_timer)) { + data->send_timer.expires = jiffies + dm_delay * HZ; + add_timer(&data->send_timer); + } + +out: + spin_unlock_irqrestore(&data->lock, flags); +} + +static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Don't check napi structures with no associated device + */ + if (!napi->dev) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + /* + * only add a note to our monitor buffer if: + * 1) this is the dev we received on + * 2) its after the last_rx delta + * 3) our rx_dropped count has gone up + */ + if ((new_stat->dev == napi->dev) && + (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + new_stat->last_rx = jiffies; + break; + } + } + rcu_read_unlock(); +} + +static int set_all_monitor_traces(int state) +{ + int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + mutex_lock(&trace_state_mutex); + + if (state == trace_state) { + rc = -EAGAIN; + goto out_unlock; + } + + switch (state) { + case TRACE_ON: + rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL); + break; + case TRACE_OFF: + rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL); + + tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + kfree_rcu(new_stat, rcu); + } + } + break; + default: + rc = 1; + break; + } + + if (!rc) + trace_state = state; + else + rc = -EINPROGRESS; + +out_unlock: + mutex_unlock(&trace_state_mutex); + + return rc; +} + + +static int net_dm_cmd_config(struct sk_buff *skb, + struct genl_info *info) +{ + return -ENOTSUPP; +} + +static int net_dm_cmd_trace(struct sk_buff *skb, + struct genl_info *info) +{ + switch (info->genlhdr->cmd) { + case NET_DM_CMD_START: + return set_all_monitor_traces(TRACE_ON); + break; + case NET_DM_CMD_STOP: + return set_all_monitor_traces(TRACE_OFF); + break; + } + + return -ENOTSUPP; +} + +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + new_stat->last_rx = jiffies; + mutex_lock(&trace_state_mutex); + list_add_rcu(&new_stat->list, &hw_stats_list); + mutex_unlock(&trace_state_mutex); + break; + case NETDEV_UNREGISTER: + mutex_lock(&trace_state_mutex); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + kfree_rcu(new_stat, rcu); + break; + } + } + } + mutex_unlock(&trace_state_mutex); + break; + } +out: + return NOTIFY_DONE; +} + +static struct genl_ops dropmon_ops[] = { + { + .cmd = NET_DM_CMD_CONFIG, + .doit = net_dm_cmd_config, + }, + { + .cmd = NET_DM_CMD_START, + .doit = net_dm_cmd_trace, + }, + { + .cmd = NET_DM_CMD_STOP, + .doit = net_dm_cmd_trace, + }, +}; + +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + +static int __init init_net_drop_monitor(void) +{ + struct per_cpu_dm_data *data; + int cpu, rc; + + printk(KERN_INFO "Initializing network drop monitor service\n"); + + if (sizeof(void *) > 8) { + printk(KERN_ERR "Unable to store program counters on this arch, Drop monitor failed\n"); + return -ENOSPC; + } + + rc = genl_register_family_with_ops(&net_drop_monitor_family, + dropmon_ops, + ARRAY_SIZE(dropmon_ops)); + if (rc) { + printk(KERN_ERR "Could not create drop monitor netlink family\n"); + return rc; + } + + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + + rc = 0; + + for_each_present_cpu(cpu) { + data = &per_cpu(dm_cpu_data, cpu); + INIT_WORK(&data->dm_alert_work, send_dm_alert); + init_timer(&data->send_timer); + data->send_timer.data = (unsigned long)data; + data->send_timer.function = sched_send_work; + spin_lock_init(&data->lock); + reset_per_cpu_data(data); + } + + + goto out; + +out_unreg: + genl_unregister_family(&net_drop_monitor_family); +out: + return rc; +} + +late_initcall(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 00000000..43d94ced --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,419 @@ +/* + * net/core/dst.c Protocol independent destination cache. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/string.h> +#include <linux/types.h> +#include <net/net_namespace.h> +#include <linux/sched.h> +#include <linux/prefetch.h> + +#include <net/dst.h> + +/* + * Theory of operations: + * 1) We use a list, protected by a spinlock, to add + * new entries from both BH and non-BH context. + * 2) In order to keep spinlock held for a small delay, + * we use a second list where are stored long lived + * entries, that are handled by the garbage collect thread + * fired by a workqueue. + * 3) This list is guarded by a mutex, + * so that the gc_task and dst_dev_event() can be synchronized. + */ + +/* + * We want to keep lock & list close together + * to dirty as few cache lines as possible in __dst_free(). + * As this is not a very strong hint, we dont force an alignment on SMP. + */ +static struct { + spinlock_t lock; + struct dst_entry *list; + unsigned long timer_inc; + unsigned long timer_expires; +} dst_garbage = { + .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), + .timer_inc = DST_GC_MAX, +}; +static void dst_gc_task(struct work_struct *work); +static void ___dst_free(struct dst_entry *dst); + +static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); + +static DEFINE_MUTEX(dst_gc_mutex); +/* + * long lived entries are maintained in this list, guarded by dst_gc_mutex + */ +static struct dst_entry *dst_busy_list; + +static void dst_gc_task(struct work_struct *work) +{ + int delayed = 0; + int work_performed = 0; + unsigned long expires = ~0L; + struct dst_entry *dst, *next, head; + struct dst_entry *last = &head; + + mutex_lock(&dst_gc_mutex); + next = dst_busy_list; + +loop: + while ((dst = next) != NULL) { + next = dst->next; + prefetch(&next->next); + cond_resched(); + if (likely(atomic_read(&dst->__refcnt))) { + last->next = dst; + last = dst; + delayed++; + continue; + } + work_performed++; + + dst = dst_destroy(dst); + if (dst) { + /* NOHASH and still referenced. Unless it is already + * on gc list, invalidate it and add to gc list. + * + * Note: this is temporary. Actually, NOHASH dst's + * must be obsoleted when parent is obsoleted. + * But we do not have state "obsoleted, but + * referenced by parent", so it is right. + */ + if (dst->obsolete > 1) + continue; + + ___dst_free(dst); + dst->next = next; + next = dst; + } + } + + spin_lock_bh(&dst_garbage.lock); + next = dst_garbage.list; + if (next) { + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + goto loop; + } + last->next = NULL; + dst_busy_list = head.next; + if (!dst_busy_list) + dst_garbage.timer_inc = DST_GC_MAX; + else { + /* + * if we freed less than 1/10 of delayed entries, + * we can sleep longer. + */ + if (work_performed <= delayed/10) { + dst_garbage.timer_expires += dst_garbage.timer_inc; + if (dst_garbage.timer_expires > DST_GC_MAX) + dst_garbage.timer_expires = DST_GC_MAX; + dst_garbage.timer_inc += DST_GC_INC; + } else { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + } + expires = dst_garbage.timer_expires; + /* + * if the next desired timer is more than 4 seconds in the + * future then round the timer to whole seconds + */ + if (expires > 4*HZ) + expires = round_jiffies_relative(expires); + schedule_delayed_work(&dst_gc_work, expires); + } + + spin_unlock_bh(&dst_garbage.lock); + mutex_unlock(&dst_gc_mutex); +} + +int dst_discard(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(dst_discard); + +const u32 dst_default_metrics[RTAX_MAX]; + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, int flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + dst->child = NULL; + dst->dev = dev; + if (dev) + dev_hold(dev); + dst->ops = ops; + dst_init_metrics(dst, dst_default_metrics, true); + dst->expires = 0UL; + dst->path = dst; + RCU_INIT_POINTER(dst->_neighbour, NULL); +#ifdef CONFIG_XFRM + dst->xfrm = NULL; +#endif + dst->input = dst_discard; + dst->output = dst_discard; + dst->error = 0; + dst->obsolete = initial_obsolete; + dst->header_len = 0; + dst->trailer_len = 0; +#ifdef CONFIG_IP_ROUTE_CLASSID + dst->tclassid = 0; +#endif + atomic_set(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; + dst->next = NULL; + if (!(flags & DST_NOCOUNT)) + dst_entries_add(ops, 1); + return dst; +} +EXPORT_SYMBOL(dst_alloc); + +static void ___dst_free(struct dst_entry *dst) +{ + /* The first case (dev==NULL) is required, when + protocol module is unloaded. + */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) + dst->input = dst->output = dst_discard; + dst->obsolete = 2; +} + +void __dst_free(struct dst_entry *dst) +{ + spin_lock_bh(&dst_garbage.lock); + ___dst_free(dst); + dst->next = dst_garbage.list; + dst_garbage.list = dst; + if (dst_garbage.timer_inc > DST_GC_INC) { + dst_garbage.timer_inc = DST_GC_INC; + dst_garbage.timer_expires = DST_GC_MIN; + cancel_delayed_work(&dst_gc_work); + schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires); + } + spin_unlock_bh(&dst_garbage.lock); +} +EXPORT_SYMBOL(__dst_free); + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ + struct dst_entry *child; + struct neighbour *neigh; + + smp_rmb(); + +again: + neigh = rcu_dereference_protected(dst->_neighbour, 1); + child = dst->child; + + if (neigh) { + RCU_INIT_POINTER(dst->_neighbour, NULL); + neigh_release(neigh); + } + + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + if (dst->dev) + dev_put(dst->dev); + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) { + int nohash = dst->flags & DST_NOHASH; + + if (atomic_dec_and_test(&dst->__refcnt)) { + /* We were real parent of this dst, so kill child. */ + if (nohash) + goto again; + } else { + /* Child is still referenced, return it for freeing. */ + if (nohash) + return dst; + /* Child is still in his hash table */ + } + } + return NULL; +} +EXPORT_SYMBOL(dst_destroy); + +void dst_release(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + WARN_ON(newrefcnt < 0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } + } +} +EXPORT_SYMBOL(dst_release); + +u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + + if (p) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + kfree(p); + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } + } + return p; +} +EXPORT_SYMBOL(dst_cow_metrics_generic); + +/* Caller asserts that dst_metrics_read_only(dst) is false. */ +void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) +{ + unsigned long prev, new; + + new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + prev = cmpxchg(&dst->_metrics, old, new); + if (prev == old) + kfree(__DST_METRICS_PTR(old)); +} +EXPORT_SYMBOL(__dst_destroy_metrics_generic); + +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + /* If dst not in cache, we must take a reference, because + * dst_release() will destroy dst as soon as its refcount becomes zero + */ + if (unlikely(dst->flags & DST_NOCACHE)) { + dst_hold(dst); + skb_dst_set(skb, dst); + } else { + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + } +} +EXPORT_SYMBOL(skb_dst_set_noref); + +/* Dirty hack. We did it in 2.2 (in __dst_free), + * we have _very_ good reasons not to repeat + * this mistake in 2.3, but we have no choice + * now. _It_ _is_ _explicit_ _deliberate_ + * _race_ _condition_. + * + * Commented and originally written by Alexey. + */ +static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, unregister); + + if (dev != dst->dev) + return; + + if (!unregister) { + dst->input = dst->output = dst_discard; + } else { + struct neighbour *neigh; + + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); + rcu_read_lock(); + neigh = dst_get_neighbour_noref(dst); + if (neigh && neigh->dev == dev) { + neigh->dev = dst->dev; + dev_hold(dst->dev); + dev_put(dev); + } + rcu_read_unlock(); + } +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct dst_entry *dst, *last = NULL; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + mutex_lock(&dst_gc_mutex); + for (dst = dst_busy_list; dst; dst = dst->next) { + last = dst; + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } + + spin_lock_bh(&dst_garbage.lock); + dst = dst_garbage.list; + dst_garbage.list = NULL; + spin_unlock_bh(&dst_garbage.lock); + + if (last) + last->next = dst; + else + dst_busy_list = dst; + for (; dst; dst = dst->next) + dst_ifdown(dst, dev, event != NETDEV_DOWN); + mutex_unlock(&dst_gc_mutex); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block dst_dev_notifier = { + .notifier_call = dst_dev_event, + .priority = -10, /* must be called after other network notifiers */ +}; + +void __init dst_init(void) +{ + register_netdevice_notifier(&dst_dev_notifier); +} diff --git a/net/core/ethtool.c b/net/core/ethtool.c new file mode 100644 index 00000000..6d6d7d25 --- /dev/null +++ b/net/core/ethtool.c @@ -0,0 +1,1510 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/ethtool.h> +#include <linux/netdevice.h> +#include <linux/bitops.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/rtnetlink.h> +#include <linux/sched.h> + +/* + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 1 : 0; +} +EXPORT_SYMBOL(ethtool_op_get_link); + +/* Handlers for each ethtool command */ + +#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) + +static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { + [NETIF_F_SG_BIT] = "tx-scatter-gather", + [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", + [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", + [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", + [NETIF_F_HIGHDMA_BIT] = "highdma", + [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", + [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert", + + [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse", + [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter", + [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", + [NETIF_F_GSO_BIT] = "tx-generic-segmentation", + [NETIF_F_LLTX_BIT] = "tx-lockless", + [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", + [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_LRO_BIT] = "rx-lro", + + [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", + [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", + [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", + [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", + [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", + + [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", + [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", + [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", + [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", + [NETIF_F_RXHASH_BIT] = "rx-hashing", + [NETIF_F_RXCSUM_BIT] = "rx-checksum", + [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", + [NETIF_F_LOOPBACK_BIT] = "loopback", + [NETIF_F_RXFCS_BIT] = "rx-fcs", + [NETIF_F_RXALL_BIT] = "rx-all", +}; + +static int ethtool_get_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gfeatures cmd = { + .cmd = ETHTOOL_GFEATURES, + .size = ETHTOOL_DEV_FEATURE_WORDS, + }; + struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + u32 __user *sizeaddr; + u32 copy_size; + int i; + + /* in case feature bits run out again */ + BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + features[i].available = (u32)(dev->hw_features >> (32 * i)); + features[i].requested = (u32)(dev->wanted_features >> (32 * i)); + features[i].active = (u32)(dev->features >> (32 * i)); + features[i].never_changed = + (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); + } + + sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); + if (get_user(copy_size, sizeaddr)) + return -EFAULT; + + if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) + copy_size = ETHTOOL_DEV_FEATURE_WORDS; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_features(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_sfeatures cmd; + struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; + netdev_features_t wanted = 0, valid = 0; + int i, ret = 0; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + useraddr += sizeof(cmd); + + if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) + return -EINVAL; + + if (copy_from_user(features, useraddr, sizeof(features))) + return -EFAULT; + + for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { + valid |= (netdev_features_t)features[i].valid << (32 * i); + wanted |= (netdev_features_t)features[i].requested << (32 * i); + } + + if (valid & ~NETIF_F_ETHTOOL_BITS) + return -EINVAL; + + if (valid & ~dev->hw_features) { + valid &= dev->hw_features; + ret |= ETHTOOL_F_UNSUPPORTED; + } + + dev->wanted_features &= ~valid; + dev->wanted_features |= wanted & valid; + __netdev_update_features(dev); + + if ((dev->wanted_features ^ dev->features) & valid) + ret |= ETHTOOL_F_WISH; + + return ret; +} + +static int __ethtool_get_sset_count(struct net_device *dev, int sset) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (sset == ETH_SS_FEATURES) + return ARRAY_SIZE(netdev_features_strings); + + if (ops && ops->get_sset_count && ops->get_strings) + return ops->get_sset_count(dev, sset); + else + return -EOPNOTSUPP; +} + +static void __ethtool_get_strings(struct net_device *dev, + u32 stringset, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (stringset == ETH_SS_FEATURES) + memcpy(data, netdev_features_strings, + sizeof(netdev_features_strings)); + else + /* ops->get_strings is valid because checked earlier */ + ops->get_strings(dev, stringset, data); +} + +static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) +{ + /* feature masks of legacy discrete ethtool ops */ + + switch (eth_cmd) { + case ETHTOOL_GTXCSUM: + case ETHTOOL_STXCSUM: + return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM; + case ETHTOOL_GRXCSUM: + case ETHTOOL_SRXCSUM: + return NETIF_F_RXCSUM; + case ETHTOOL_GSG: + case ETHTOOL_SSG: + return NETIF_F_SG; + case ETHTOOL_GTSO: + case ETHTOOL_STSO: + return NETIF_F_ALL_TSO; + case ETHTOOL_GUFO: + case ETHTOOL_SUFO: + return NETIF_F_UFO; + case ETHTOOL_GGSO: + case ETHTOOL_SGSO: + return NETIF_F_GSO; + case ETHTOOL_GGRO: + case ETHTOOL_SGRO: + return NETIF_F_GRO; + default: + BUG(); + } +} + +static int ethtool_get_one_feature(struct net_device *dev, + char __user *useraddr, u32 ethcmd) +{ + netdev_features_t mask = ethtool_get_feature_mask(ethcmd); + struct ethtool_value edata = { + .cmd = ethcmd, + .data = !!(dev->features & mask), + }; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_one_feature(struct net_device *dev, + void __user *useraddr, u32 ethcmd) +{ + struct ethtool_value edata; + netdev_features_t mask; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + mask = ethtool_get_feature_mask(ethcmd); + mask &= dev->hw_features; + if (!mask) + return -EOPNOTSUPP; + + if (edata.data) + dev->wanted_features |= mask; + else + dev->wanted_features &= ~mask; + + __netdev_update_features(dev); + + return 0; +} + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \ + NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH) + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX; + if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX; + if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + ASSERT_RTNL(); + + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + memset(cmd, 0, sizeof(struct ethtool_cmd)); + cmd->cmd = ETHTOOL_GSET; + return dev->ethtool_ops->get_settings(dev, cmd); +} +EXPORT_SYMBOL(__ethtool_get_settings); + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + int err; + struct ethtool_cmd cmd; + + err = __ethtool_get_settings(dev, &cmd); + if (err < 0) + return err; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_drvinfo info; + const struct ethtool_ops *ops = dev->ethtool_ops; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + if (ops && ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); + } else { + return -EOPNOTSUPP; + } + + /* + * this method of obtaining string set info is deprecated; + * Use ETHTOOL_GSSET_INFO instead. + */ + if (ops && ops->get_sset_count) { + int rc; + + rc = ops->get_sset_count(dev, ETH_SS_TEST); + if (rc >= 0) + info.testinfo_len = rc; + rc = ops->get_sset_count(dev, ETH_SS_STATS); + if (rc >= 0) + info.n_stats = rc; + rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); + if (rc >= 0) + info.n_priv_flags = rc; + } + if (ops && ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); + if (ops && ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_sset_info info; + u64 sset_mask; + int i, idx = 0, n_bits = 0, ret, rc; + u32 *info_buf = NULL; + + if (copy_from_user(&info, useraddr, sizeof(info))) + return -EFAULT; + + /* store copy of mask, because we zero struct later on */ + sset_mask = info.sset_mask; + if (!sset_mask) + return 0; + + /* calculate size of return buffer */ + n_bits = hweight64(sset_mask); + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GSSET_INFO; + + info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); + if (!info_buf) + return -ENOMEM; + + /* + * fill return buffer based on input bitmask and successful + * get_sset_count return + */ + for (i = 0; i < 64; i++) { + if (!(sset_mask & (1ULL << i))) + continue; + + rc = __ethtool_get_sset_count(dev, i); + if (rc >= 0) { + info.sset_mask |= (1ULL << i); + info_buf[idx++] = rc; + } + } + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, sizeof(info))) + goto out; + + useraddr += offsetof(struct ethtool_sset_info, data); + if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + goto out; + + ret = 0; + +out: + kfree(info_buf); + return ret; +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS && + copy_to_user(useraddr, &info, info_size)) + return -EFAULT; + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + /* struct ethtool_rxnfc was originally defined for + * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data + * members. User-space might still be using that + * definition. */ + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kzalloc(info.rule_cnt * sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &info, info_size)) + goto err_out; + + if (rule_buf) { + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + if (copy_to_user(useraddr, rule_buf, + info.rule_cnt * sizeof(u32))) + goto err_out; + } + ret = 0; + +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + u32 user_size, dev_size; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->get_rxfh_indir) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&user_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(user_size))) + return -EFAULT; + + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), + &dev_size, sizeof(dev_size))) + return -EFAULT; + + /* If the user buffer size is 0, this is just a query for the + * device table size. Otherwise, if it's smaller than the + * device table size it's an error. + */ + if (user_size < dev_size) + return user_size == 0 ? 0 : -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + ret = dev->ethtool_ops->get_rxfh_indir(dev, indir); + if (ret) + goto out; + + if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh_indir, ring_index[0]), + indir, dev_size * sizeof(indir[0]))) + ret = -EFAULT; + +out: + kfree(indir); + return ret; +} + +static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_rxnfc rx_rings; + u32 user_size, dev_size, i; + u32 *indir; + int ret; + + if (!dev->ethtool_ops->get_rxfh_indir_size || + !dev->ethtool_ops->set_rxfh_indir || + !dev->ethtool_ops->get_rxnfc) + return -EOPNOTSUPP; + dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); + if (dev_size == 0) + return -EOPNOTSUPP; + + if (copy_from_user(&user_size, + useraddr + offsetof(struct ethtool_rxfh_indir, size), + sizeof(user_size))) + return -EFAULT; + + if (user_size != 0 && user_size != dev_size) + return -EINVAL; + + indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); + if (!indir) + return -ENOMEM; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret) + goto out; + + if (user_size == 0) { + for (i = 0; i < dev_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + if (copy_from_user(indir, + useraddr + + offsetof(struct ethtool_rxfh_indir, + ring_index[0]), + dev_size * sizeof(indir[0]))) { + ret = -EFAULT; + goto out; + } + + /* Validate ring indices */ + for (i = 0; i < dev_size; i++) { + if (indir[i] >= rx_rings.data) { + ret = -EINVAL; + goto out; + } + } + } + + ret = dev->ethtool_ops->set_rxfh_indir(dev, indir); + +out: + kfree(indir); + return ret; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_regs regs; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(®s, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (regs.len > reglen) + regs.len = reglen; + + regbuf = vzalloc(reglen); + if (reglen && !regbuf) + return -ENOMEM; + + ops->get_regs(dev, ®s, regbuf); + + ret = -EFAULT; + if (copy_to_user(useraddr, ®s, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (regbuf && copy_to_user(useraddr, regbuf, regs.len)) + goto out; + ret = 0; + + out: + vfree(regbuf); + return ret; +} + +static int ethtool_reset(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value reset; + int ret; + + if (!dev->ethtool_ops->reset) + return -EOPNOTSUPP; + + if (copy_from_user(&reset, useraddr, sizeof(reset))) + return -EFAULT; + + ret = dev->ethtool_ops->reset(dev, &reset.data); + if (ret) + return ret; + + if (copy_to_user(useraddr, &reset, sizeof(reset))) + return -EFAULT; + return 0; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + return dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + const struct ethtool_ops *ops = dev->ethtool_ops; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (!ops->get_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + ret = ops->get_eeprom(dev, &eeprom, data); + if (ret) + break; + if (copy_to_user(userbuf, data, eeprom.len)) { + ret = -EFAULT; + break; + } + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + eeprom.len = userbuf - (useraddr + sizeof(eeprom)); + eeprom.offset -= eeprom.len; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + ret = -EFAULT; + + kfree(data); + return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + const struct ethtool_ops *ops = dev->ethtool_ops; + void __user *userbuf = useraddr + sizeof(eeprom); + u32 bytes_remaining; + u8 *data; + int ret = 0; + + if (!ops->set_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(PAGE_SIZE, GFP_USER); + if (!data) + return -ENOMEM; + + bytes_remaining = eeprom.len; + while (bytes_remaining > 0) { + eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); + + if (copy_from_user(data, userbuf, eeprom.len)) { + ret = -EFAULT; + break; + } + ret = ops->set_eeprom(dev, &eeprom, data); + if (ret) + break; + userbuf += eeprom.len; + eeprom.offset += eeprom.len; + bytes_remaining -= eeprom.len; + } + + kfree(data); + return ret; +} + +static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->set_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam; + + if (!dev->ethtool_ops->set_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + if (copy_to_user(useraddr, &channels, sizeof(channels))) + return -EFAULT; + return 0; +} + +static noinline_for_stack int ethtool_set_channels(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_channels channels; + + if (!dev->ethtool_ops->set_channels) + return -EOPNOTSUPP; + + if (copy_from_user(&channels, useraddr, sizeof(channels))) + return -EFAULT; + + return dev->ethtool_ops->set_channels(dev, &channels); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->set_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_test test; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, test_len; + + if (!ops->self_test || !ops->get_sset_count) + return -EOPNOTSUPP; + + test_len = ops->get_sset_count(dev, ETH_SS_TEST); + if (test_len < 0) + return test_len; + WARN_ON(test_len == 0); + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = test_len; + data = kmalloc(test_len * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + u8 *data; + int ret; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + ret = __ethtool_get_sset_count(dev, gstrings.string_set); + if (ret < 0) + return ret; + + gstrings.len = ret; + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + __ethtool_get_strings(dev, gstrings.string_set, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value id; + static bool busy; + int rc; + + if (!dev->ethtool_ops->set_phys_id) + return -EOPNOTSUPP; + + if (busy) + return -EBUSY; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + if (rc < 0) + return rc; + + /* Drop the RTNL lock while waiting, but prevent reentry or + * removal of the device. + */ + busy = true; + dev_hold(dev); + rtnl_unlock(); + + if (rc == 0) { + /* Driver will handle this itself */ + schedule_timeout_interruptible( + id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); + } else { + /* Driver expects to be called at twice the frequency in rc */ + int n = rc * 2, i, interval = HZ / n; + + /* Count down seconds */ + do { + /* Count down iterations per second */ + i = n; + do { + rtnl_lock(); + rc = dev->ethtool_ops->set_phys_id(dev, + (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(interval); + } while (!signal_pending(current) && --i != 0); + } while (!signal_pending(current) && + (id.data == 0 || --id.data != 0)); + } + + rtnl_lock(); + dev_put(dev); + busy = false; + + (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + return rc; +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + const struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret, n_stats; + + if (!ops->get_ethtool_stats || !ops->get_sset_count) + return -EOPNOTSUPP; + + n_stats = ops->get_sset_count(dev, ETH_SS_STATS); + if (n_stats < 0) + return n_stats; + WARN_ON(n_stats == 0); + + if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = n_stats; + data = kmalloc(n_stats * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_ethtool_stats(dev, &stats, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_perm_addr epaddr; + + if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) + return -EFAULT; + + if (epaddr.size < dev->addr_len) + return -ETOOSMALL; + epaddr.size = dev->addr_len; + + if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) + return -EFAULT; + useraddr += sizeof(epaddr); + if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) + return -EFAULT; + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char __user *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, + void (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + actor(dev, edata.data); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char __user *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return actor(dev, edata.data); +} + +static noinline_for_stack int ethtool_flash_device(struct net_device *dev, + char __user *useraddr) +{ + struct ethtool_flash efl; + + if (copy_from_user(&efl, useraddr, sizeof(efl))) + return -EFAULT; + + if (!dev->ethtool_ops->flash_device) + return -EOPNOTSUPP; + + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + + return dev->ethtool_ops->flash_device(dev, &efl); +} + +static int ethtool_set_dump(struct net_device *dev, + void __user *useraddr) +{ + struct ethtool_dump dump; + + if (!dev->ethtool_ops->set_dump) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + return dev->ethtool_ops->set_dump(dev, &dump); +} + +static int ethtool_get_dump_flag(struct net_device *dev, + void __user *useraddr) +{ + int ret; + struct ethtool_dump dump; + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + ret = ops->get_dump_flag(dev, &dump); + if (ret) + return ret; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) + return -EFAULT; + return 0; +} + +static int ethtool_get_dump_data(struct net_device *dev, + void __user *useraddr) +{ + int ret; + __u32 len; + struct ethtool_dump dump, tmp; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data = NULL; + + if (!dev->ethtool_ops->get_dump_data || + !dev->ethtool_ops->get_dump_flag) + return -EOPNOTSUPP; + + if (copy_from_user(&dump, useraddr, sizeof(dump))) + return -EFAULT; + + memset(&tmp, 0, sizeof(tmp)); + tmp.cmd = ETHTOOL_GET_DUMP_FLAG; + ret = ops->get_dump_flag(dev, &tmp); + if (ret) + return ret; + + len = (tmp.len > dump.len) ? dump.len : tmp.len; + if (!len) + return -EFAULT; + + data = vzalloc(tmp.len); + if (!data) + return -ENOMEM; + ret = ops->get_dump_data(dev, &dump, data); + if (ret) + goto out; + + if (copy_to_user(useraddr, &dump, sizeof(dump))) { + ret = -EFAULT; + goto out; + } + useraddr += offsetof(struct ethtool_dump, data); + if (copy_to_user(useraddr, data, len)) + ret = -EFAULT; +out: + vfree(data); + return ret; +} + +/* The main entry point in this file. Called from net/core/dev.c */ + +int dev_ethtool(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void __user *useraddr = ifr->ifr_data; + u32 ethcmd; + int rc; + u32 old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (!dev->ethtool_ops) { + /* ETHTOOL_GDRVINFO does not require any driver support. + * It is also unprivileged and does not change anything, + * so we can take a shortcut to it. */ + if (ethcmd == ETHTOOL_GDRVINFO) + return ethtool_get_drvinfo(dev, useraddr); + else + return -EOPNOTSUPP; + } + + /* Allow some commands to be done by anyone */ + switch (ethcmd) { + case ETHTOOL_GSET: + case ETHTOOL_GDRVINFO: + case ETHTOOL_GMSGLVL: + case ETHTOOL_GCOALESCE: + case ETHTOOL_GRINGPARAM: + case ETHTOOL_GPAUSEPARAM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GTXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GSSET_INFO: + case ETHTOOL_GSTRINGS: + case ETHTOOL_GTSO: + case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + case ETHTOOL_GFLAGS: + case ETHTOOL_GPFLAGS: + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GFEATURES: + break; + default: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + } + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GSET: + rc = ethtool_get_settings(dev, useraddr); + break; + case ETHTOOL_SSET: + rc = ethtool_set_settings(dev, useraddr); + break; + case ETHTOOL_GDRVINFO: + rc = ethtool_get_drvinfo(dev, useraddr); + break; + case ETHTOOL_GREGS: + rc = ethtool_get_regs(dev, useraddr); + break; + case ETHTOOL_GWOL: + rc = ethtool_get_wol(dev, useraddr); + break; + case ETHTOOL_SWOL: + rc = ethtool_set_wol(dev, useraddr); + break; + case ETHTOOL_GMSGLVL: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_msglevel); + break; + case ETHTOOL_SMSGLVL: + rc = ethtool_set_value_void(dev, useraddr, + dev->ethtool_ops->set_msglevel); + break; + case ETHTOOL_NWAY_RST: + rc = ethtool_nway_reset(dev); + break; + case ETHTOOL_GLINK: + rc = ethtool_get_link(dev, useraddr); + break; + case ETHTOOL_GEEPROM: + rc = ethtool_get_eeprom(dev, useraddr); + break; + case ETHTOOL_SEEPROM: + rc = ethtool_set_eeprom(dev, useraddr); + break; + case ETHTOOL_GCOALESCE: + rc = ethtool_get_coalesce(dev, useraddr); + break; + case ETHTOOL_SCOALESCE: + rc = ethtool_set_coalesce(dev, useraddr); + break; + case ETHTOOL_GRINGPARAM: + rc = ethtool_get_ringparam(dev, useraddr); + break; + case ETHTOOL_SRINGPARAM: + rc = ethtool_set_ringparam(dev, useraddr); + break; + case ETHTOOL_GPAUSEPARAM: + rc = ethtool_get_pauseparam(dev, useraddr); + break; + case ETHTOOL_SPAUSEPARAM: + rc = ethtool_set_pauseparam(dev, useraddr); + break; + case ETHTOOL_TEST: + rc = ethtool_self_test(dev, useraddr); + break; + case ETHTOOL_GSTRINGS: + rc = ethtool_get_strings(dev, useraddr); + break; + case ETHTOOL_PHYS_ID: + rc = ethtool_phys_id(dev, useraddr); + break; + case ETHTOOL_GSTATS: + rc = ethtool_get_stats(dev, useraddr); + break; + case ETHTOOL_GPERMADDR: + rc = ethtool_get_perm_addr(dev, useraddr); + break; + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GPFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + dev->ethtool_ops->get_priv_flags); + break; + case ETHTOOL_SPFLAGS: + rc = ethtool_set_value(dev, useraddr, + dev->ethtool_ops->set_priv_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_FLASHDEV: + rc = ethtool_flash_device(dev, useraddr); + break; + case ETHTOOL_RESET: + rc = ethtool_reset(dev, useraddr); + break; + case ETHTOOL_GSSET_INFO: + rc = ethtool_get_sset_info(dev, useraddr); + break; + case ETHTOOL_GRXFHINDIR: + rc = ethtool_get_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_SRXFHINDIR: + rc = ethtool_set_rxfh_indir(dev, useraddr); + break; + case ETHTOOL_GFEATURES: + rc = ethtool_get_features(dev, useraddr); + break; + case ETHTOOL_SFEATURES: + rc = ethtool_set_features(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + case ETHTOOL_GRXCSUM: + case ETHTOOL_GSG: + case ETHTOOL_GTSO: + case ETHTOOL_GUFO: + case ETHTOOL_GGSO: + case ETHTOOL_GGRO: + rc = ethtool_get_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_STXCSUM: + case ETHTOOL_SRXCSUM: + case ETHTOOL_SSG: + case ETHTOOL_STSO: + case ETHTOOL_SUFO: + case ETHTOOL_SGSO: + case ETHTOOL_SGRO: + rc = ethtool_set_one_feature(dev, useraddr, ethcmd); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + case ETHTOOL_SCHANNELS: + rc = ethtool_set_channels(dev, useraddr); + break; + case ETHTOOL_SET_DUMP: + rc = ethtool_set_dump(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_FLAG: + rc = ethtool_get_dump_flag(dev, useraddr); + break; + case ETHTOOL_GET_DUMP_DATA: + rc = ethtool_get_dump_data(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c new file mode 100644 index 00000000..c02e63c9 --- /dev/null +++ b/net/core/fib_rules.c @@ -0,0 +1,770 @@ +/* + * net/core/fib_rules.c Generic Routing Rules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/module.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <net/fib_rules.h> + +int fib_default_rule_add(struct fib_rules_ops *ops, + u32 pref, u32 table, u32 flags) +{ + struct fib_rule *r; + + r = kzalloc(ops->rule_size, GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + atomic_set(&r->refcnt, 1); + r->action = FR_ACT_TO_TBL; + r->pref = pref; + r->table = table; + r->flags = flags; + r->fr_net = hold_net(ops->fro_net); + + /* The lock is not required here, the list in unreacheable + * at the moment this function is called */ + list_add_tail(&r->list, &ops->rules_list); + return 0; +} +EXPORT_SYMBOL(fib_default_rule_add); + +u32 fib_default_rule_pref(struct fib_rules_ops *ops) +{ + struct list_head *pos; + struct fib_rule *rule; + + if (!list_empty(&ops->rules_list)) { + pos = ops->rules_list.next; + if (pos->next != &ops->rules_list) { + rule = list_entry(pos->next, struct fib_rule, list); + if (rule->pref) + return rule->pref - 1; + } + } + + return 0; +} +EXPORT_SYMBOL(fib_default_rule_pref); + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid); + +static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +{ + struct fib_rules_ops *ops; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (ops->family == family) { + if (!try_module_get(ops->owner)) + ops = NULL; + rcu_read_unlock(); + return ops; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void rules_ops_put(struct fib_rules_ops *ops) +{ + if (ops) + module_put(ops->owner); +} + +static void flush_route_cache(struct fib_rules_ops *ops) +{ + if (ops->flush_cache) + ops->flush_cache(ops); +} + +static int __fib_rules_register(struct fib_rules_ops *ops) +{ + int err = -EEXIST; + struct fib_rules_ops *o; + struct net *net; + + net = ops->fro_net; + + if (ops->rule_size < sizeof(struct fib_rule)) + return -EINVAL; + + if (ops->match == NULL || ops->configure == NULL || + ops->compare == NULL || ops->fill == NULL || + ops->action == NULL) + return -EINVAL; + + spin_lock(&net->rules_mod_lock); + list_for_each_entry(o, &net->rules_ops, list) + if (ops->family == o->family) + goto errout; + + hold_net(net); + list_add_tail_rcu(&ops->list, &net->rules_ops); + err = 0; +errout: + spin_unlock(&net->rules_mod_lock); + + return err; +} + +struct fib_rules_ops * +fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) +{ + struct fib_rules_ops *ops; + int err; + + ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL); + if (ops == NULL) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ops->rules_list); + ops->fro_net = net; + + err = __fib_rules_register(ops); + if (err) { + kfree(ops); + ops = ERR_PTR(err); + } + + return ops; +} +EXPORT_SYMBOL_GPL(fib_rules_register); + +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +{ + struct fib_rule *rule, *tmp; + + list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { + list_del_rcu(&rule->list); + fib_rule_put(rule); + } +} + +static void fib_rules_put_rcu(struct rcu_head *head) +{ + struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu); + struct net *net = ops->fro_net; + + release_net(net); + kfree(ops); +} + +void fib_rules_unregister(struct fib_rules_ops *ops) +{ + struct net *net = ops->fro_net; + + spin_lock(&net->rules_mod_lock); + list_del_rcu(&ops->list); + fib_rules_cleanup_ops(ops); + spin_unlock(&net->rules_mod_lock); + + call_rcu(&ops->rcu, fib_rules_put_rcu); +} +EXPORT_SYMBOL_GPL(fib_rules_unregister); + +static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, + struct flowi *fl, int flags) +{ + int ret = 0; + + if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + goto out; + + if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + goto out; + + if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) + goto out; + + ret = ops->match(rule, fl, flags); +out: + return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; +} + +int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl, + int flags, struct fib_lookup_arg *arg) +{ + struct fib_rule *rule; + int err; + + rcu_read_lock(); + + list_for_each_entry_rcu(rule, &ops->rules_list, list) { +jumped: + if (!fib_rule_match(rule, ops, fl, flags)) + continue; + + if (rule->action == FR_ACT_GOTO) { + struct fib_rule *target; + + target = rcu_dereference(rule->ctarget); + if (target == NULL) { + continue; + } else { + rule = target; + goto jumped; + } + } else if (rule->action == FR_ACT_NOP) + continue; + else + err = ops->action(rule, fl, flags, arg); + + if (err != -EAGAIN) { + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(atomic_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; + } + } + + err = -ESRCH; +out: + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(fib_rules_lookup); + +static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, + struct fib_rules_ops *ops) +{ + int err = -EINVAL; + + if (frh->src_len) + if (tb[FRA_SRC] == NULL || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) + goto errout; + + if (frh->dst_len) + if (tb[FRA_DST] == NULL || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) + goto errout; + + err = 0; +errout: + return err; +} + +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule, *r, *last = NULL; + struct nlattr *tb[FRA_MAX+1]; + int err = -EINVAL, unresolved = 0; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + goto errout; + + ops = lookup_rules_ops(net, frh->family); + if (ops == NULL) { + err = -EAFNOSUPPORT; + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + if (err < 0) + goto errout; + + err = validate_rulemsg(frh, tb, ops); + if (err < 0) + goto errout; + + rule = kzalloc(ops->rule_size, GFP_KERNEL); + if (rule == NULL) { + err = -ENOMEM; + goto errout; + } + rule->fr_net = hold_net(net); + + if (tb[FRA_PRIORITY]) + rule->pref = nla_get_u32(tb[FRA_PRIORITY]); + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + rule->iifindex = -1; + nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->iifname); + if (dev) + rule->iifindex = dev->ifindex; + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + rule->oifindex = -1; + nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, rule->oifname); + if (dev) + rule->oifindex = dev->ifindex; + } + + if (tb[FRA_FWMARK]) { + rule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (rule->mark) + /* compatibility: if the mark value is non-zero all bits + * are compared unless a mask is explicitly specified. + */ + rule->mark_mask = 0xFFFFFFFF; + } + + if (tb[FRA_FWMASK]) + rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + + rule->action = frh->action; + rule->flags = frh->flags; + rule->table = frh_get_table(frh, tb); + + if (!tb[FRA_PRIORITY] && ops->default_pref) + rule->pref = ops->default_pref(ops); + + err = -EINVAL; + if (tb[FRA_GOTO]) { + if (rule->action != FR_ACT_GOTO) + goto errout_free; + + rule->target = nla_get_u32(tb[FRA_GOTO]); + /* Backward jumps are prohibited to avoid endless loops */ + if (rule->target <= rule->pref) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + } else if (rule->action == FR_ACT_GOTO) + goto errout_free; + + err = ops->configure(rule, skb, frh, tb); + if (err < 0) + goto errout_free; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref > rule->pref) + break; + last = r; + } + + fib_rule_get(rule); + + if (last) + list_add_rcu(&rule->list, &last->list); + else + list_add_rcu(&rule->list, &ops->rules_list); + + if (ops->unresolved_rules) { + /* + * There are unresolved goto rules in the list, check if + * any of them are pointing to this new rule. + */ + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action == FR_ACT_GOTO && + r->target == rule->pref && + rtnl_dereference(r->ctarget) == NULL) { + rcu_assign_pointer(r->ctarget, rule); + if (--ops->unresolved_rules == 0) + break; + } + } + } + + if (rule->action == FR_ACT_GOTO) + ops->nr_goto_rules++; + + if (unresolved) + ops->unresolved_rules++; + + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + flush_route_cache(ops); + rules_ops_put(ops); + return 0; + +errout_free: + release_net(rule->fr_net); + kfree(rule); +errout: + rules_ops_put(ops); + return err; +} + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule, *tmp; + struct nlattr *tb[FRA_MAX+1]; + int err = -EINVAL; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + goto errout; + + ops = lookup_rules_ops(net, frh->family); + if (ops == NULL) { + err = -EAFNOSUPPORT; + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + if (err < 0) + goto errout; + + err = validate_rulemsg(frh, tb, ops); + if (err < 0) + goto errout; + + list_for_each_entry(rule, &ops->rules_list, list) { + if (frh->action && (frh->action != rule->action)) + continue; + + if (frh->table && (frh_get_table(frh, tb) != rule->table)) + continue; + + if (tb[FRA_PRIORITY] && + (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) + continue; + + if (tb[FRA_IIFNAME] && + nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) + continue; + + if (tb[FRA_OIFNAME] && + nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) + continue; + + if (tb[FRA_FWMARK] && + (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) + continue; + + if (tb[FRA_FWMASK] && + (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) + continue; + + if (!ops->compare(rule, frh, tb)) + continue; + + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout; + } + + list_del_rcu(&rule->list); + + if (rule->action == FR_ACT_GOTO) { + ops->nr_goto_rules--; + if (rtnl_dereference(rule->ctarget) == NULL) + ops->unresolved_rules--; + } + + /* + * Check if this rule is a target to any of them. If so, + * disable them. As this operation is eventually very + * expensive, it is only performed if goto rules have + * actually been added. + */ + if (ops->nr_goto_rules > 0) { + list_for_each_entry(tmp, &ops->rules_list, list) { + if (rtnl_dereference(tmp->ctarget) == rule) { + RCU_INIT_POINTER(tmp->ctarget, NULL); + ops->unresolved_rules++; + } + } + } + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, + NETLINK_CB(skb).pid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + return 0; + } + + err = -ENOENT; +errout: + rules_ops_put(ops); + return err; +} + +static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + struct fib_rule *rule) +{ + size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)) + + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */ + + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */ + + nla_total_size(4) /* FRA_PRIORITY */ + + nla_total_size(4) /* FRA_TABLE */ + + nla_total_size(4) /* FRA_FWMARK */ + + nla_total_size(4); /* FRA_FWMASK */ + + if (ops->nlmsg_payload) + payload += ops->nlmsg_payload(rule); + + return payload; +} + +static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, + u32 pid, u32 seq, int type, int flags, + struct fib_rules_ops *ops) +{ + struct nlmsghdr *nlh; + struct fib_rule_hdr *frh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags); + if (nlh == NULL) + return -EMSGSIZE; + + frh = nlmsg_data(nlh); + frh->family = ops->family; + frh->table = rule->table; + NLA_PUT_U32(skb, FRA_TABLE, rule->table); + frh->res1 = 0; + frh->res2 = 0; + frh->action = rule->action; + frh->flags = rule->flags; + + if (rule->action == FR_ACT_GOTO && + rcu_access_pointer(rule->ctarget) == NULL) + frh->flags |= FIB_RULE_UNRESOLVED; + + if (rule->iifname[0]) { + NLA_PUT_STRING(skb, FRA_IIFNAME, rule->iifname); + + if (rule->iifindex == -1) + frh->flags |= FIB_RULE_IIF_DETACHED; + } + + if (rule->oifname[0]) { + NLA_PUT_STRING(skb, FRA_OIFNAME, rule->oifname); + + if (rule->oifindex == -1) + frh->flags |= FIB_RULE_OIF_DETACHED; + } + + if (rule->pref) + NLA_PUT_U32(skb, FRA_PRIORITY, rule->pref); + + if (rule->mark) + NLA_PUT_U32(skb, FRA_FWMARK, rule->mark); + + if (rule->mark_mask || rule->mark) + NLA_PUT_U32(skb, FRA_FWMASK, rule->mark_mask); + + if (rule->target) + NLA_PUT_U32(skb, FRA_GOTO, rule->target); + + if (ops->fill(rule, skb, frh) < 0) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_rules_ops *ops) +{ + int idx = 0; + struct fib_rule *rule; + + rcu_read_lock(); + list_for_each_entry_rcu(rule, &ops->rules_list, list) { + if (idx < cb->args[1]) + goto skip; + + if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWRULE, + NLM_F_MULTI, ops) < 0) + break; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[1] = idx; + rules_ops_put(ops); + + return skb->len; +} + +static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct fib_rules_ops *ops; + int idx = 0, family; + + family = rtnl_msg_family(cb->nlh); + if (family != AF_UNSPEC) { + /* Protocol specific dump request */ + ops = lookup_rules_ops(net, family); + if (ops == NULL) + return -EAFNOSUPPORT; + + return dump_rules(skb, cb, ops); + } + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &net->rules_ops, list) { + if (idx < cb->args[0] || !try_module_get(ops->owner)) + goto skip; + + if (dump_rules(skb, cb, ops) < 0) + break; + + cb->args[1] = 0; +skip: + idx++; + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +static void notify_rule_change(int event, struct fib_rule *rule, + struct fib_rules_ops *ops, struct nlmsghdr *nlh, + u32 pid) +{ + struct net *net; + struct sk_buff *skb; + int err = -ENOBUFS; + + net = ops->fro_net; + skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, ops->nlgroup, err); +} + +static void attach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == -1 && + strcmp(dev->name, rule->iifname) == 0) + rule->iifindex = dev->ifindex; + if (rule->oifindex == -1 && + strcmp(dev->name, rule->oifname) == 0) + rule->oifindex = dev->ifindex; + } +} + +static void detach_rules(struct list_head *rules, struct net_device *dev) +{ + struct fib_rule *rule; + + list_for_each_entry(rule, rules, list) { + if (rule->iifindex == dev->ifindex) + rule->iifindex = -1; + if (rule->oifindex == dev->ifindex) + rule->oifindex = -1; + } +} + + +static int fib_rules_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct fib_rules_ops *ops; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_REGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + attach_rules(&ops->rules_list, dev); + break; + + case NETDEV_UNREGISTER: + list_for_each_entry(ops, &net->rules_ops, list) + detach_rules(&ops->rules_list, dev); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block fib_rules_notifier = { + .notifier_call = fib_rules_event, +}; + +static int __net_init fib_rules_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->rules_ops); + spin_lock_init(&net->rules_mod_lock); + return 0; +} + +static struct pernet_operations fib_rules_net_ops = { + .init = fib_rules_net_init, +}; + +static int __init fib_rules_init(void) +{ + int err; + rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL); + + err = register_pernet_subsys(&fib_rules_net_ops); + if (err < 0) + goto fail; + + err = register_netdevice_notifier(&fib_rules_notifier); + if (err < 0) + goto fail_unregister; + + return 0; + +fail_unregister: + unregister_pernet_subsys(&fib_rules_net_ops); +fail: + rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); + rtnl_unregister(PF_UNSPEC, RTM_DELRULE); + rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + return err; +} + +subsys_initcall(fib_rules_init); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 00000000..6f755cca --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,656 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist <jschlst@samba.org> + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fcntl.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/if_packet.h> +#include <linux/gfp.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/netlink.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/errno.h> +#include <linux/timer.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> +#include <linux/filter.h> +#include <linux/reciprocal_div.h> +#include <linux/ratelimit.h> + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + return NULL; +} + +static inline void *load_pointer(const struct sk_buff *skb, int k, + unsigned int size, void *buffer) +{ + if (k >= 0) + return skb_header_pointer(skb, k, size, buffer); + return bpf_internal_load_pointer_neg_helper(skb, k, size); +} + +/** + * sk_filter - run a packet through a socket filter + * @sk: sock associated with &sk_buff + * @skb: buffer to filter + * + * Run the filter code and then cut skb->data to correct size returned by + * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller + * than pkt_len we keep whole skb->data. This is the socket level + * wrapper to sk_run_filter. It returns 0 if the packet should + * be accepted or -EPERM if the packet should be tossed. + * + */ +int sk_filter(struct sock *sk, struct sk_buff *skb) +{ + int err; + struct sk_filter *filter; + + err = security_sock_rcv_skb(sk, skb); + if (err) + return err; + + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); + if (filter) { + unsigned int pkt_len = SK_RUN_FILTER(filter, skb); + + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + } + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL(sk_filter); + +/** + * sk_run_filter - run a filter on a socket + * @skb: buffer to run the filter on + * @fentry: filter to apply + * + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. @skb is the data we are + * filtering, @filter is the array of filter instructions. + * Because all jumps are guaranteed to be before last instruction, + * and last instruction guaranteed to be a RET, we dont need to check + * flen. (We used to pass to this function the length of filter) + */ +unsigned int sk_run_filter(const struct sk_buff *skb, + const struct sock_filter *fentry) +{ + void *ptr; + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + u32 tmp; + int k; + + /* + * Process array of filter instructions. + */ + for (;; fentry++) { +#if defined(CONFIG_X86_32) +#define K (fentry->k) +#else + const u32 K = fentry->k; +#endif + + switch (fentry->code) { + case BPF_S_ALU_ADD_X: + A += X; + continue; + case BPF_S_ALU_ADD_K: + A += K; + continue; + case BPF_S_ALU_SUB_X: + A -= X; + continue; + case BPF_S_ALU_SUB_K: + A -= K; + continue; + case BPF_S_ALU_MUL_X: + A *= X; + continue; + case BPF_S_ALU_MUL_K: + A *= K; + continue; + case BPF_S_ALU_DIV_X: + if (X == 0) + return 0; + A /= X; + continue; + case BPF_S_ALU_DIV_K: + A = reciprocal_divide(A, K); + continue; + case BPF_S_ALU_AND_X: + A &= X; + continue; + case BPF_S_ALU_AND_K: + A &= K; + continue; + case BPF_S_ALU_OR_X: + A |= X; + continue; + case BPF_S_ALU_OR_K: + A |= K; + continue; + case BPF_S_ALU_LSH_X: + A <<= X; + continue; + case BPF_S_ALU_LSH_K: + A <<= K; + continue; + case BPF_S_ALU_RSH_X: + A >>= X; + continue; + case BPF_S_ALU_RSH_K: + A >>= K; + continue; + case BPF_S_ALU_NEG: + A = -A; + continue; + case BPF_S_JMP_JA: + fentry += K; + continue; + case BPF_S_JMP_JGT_K: + fentry += (A > K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGE_K: + fentry += (A >= K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JEQ_K: + fentry += (A == K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JSET_K: + fentry += (A & K) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGT_X: + fentry += (A > X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JGE_X: + fentry += (A >= X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JEQ_X: + fentry += (A == X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_JMP_JSET_X: + fentry += (A & X) ? fentry->jt : fentry->jf; + continue; + case BPF_S_LD_W_ABS: + k = K; +load_w: + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { + A = get_unaligned_be32(ptr); + continue; + } + return 0; + case BPF_S_LD_H_ABS: + k = K; +load_h: + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { + A = get_unaligned_be16(ptr); + continue; + } + return 0; + case BPF_S_LD_B_ABS: + k = K; +load_b: + ptr = load_pointer(skb, k, 1, &tmp); + if (ptr != NULL) { + A = *(u8 *)ptr; + continue; + } + return 0; + case BPF_S_LD_W_LEN: + A = skb->len; + continue; + case BPF_S_LDX_W_LEN: + X = skb->len; + continue; + case BPF_S_LD_W_IND: + k = X + K; + goto load_w; + case BPF_S_LD_H_IND: + k = X + K; + goto load_h; + case BPF_S_LD_B_IND: + k = X + K; + goto load_b; + case BPF_S_LDX_B_MSH: + ptr = load_pointer(skb, K, 1, &tmp); + if (ptr != NULL) { + X = (*(u8 *)ptr & 0xf) << 2; + continue; + } + return 0; + case BPF_S_LD_IMM: + A = K; + continue; + case BPF_S_LDX_IMM: + X = K; + continue; + case BPF_S_LD_MEM: + A = mem[K]; + continue; + case BPF_S_LDX_MEM: + X = mem[K]; + continue; + case BPF_S_MISC_TAX: + X = A; + continue; + case BPF_S_MISC_TXA: + A = X; + continue; + case BPF_S_RET_K: + return K; + case BPF_S_RET_A: + return A; + case BPF_S_ST: + mem[K] = A; + continue; + case BPF_S_STX: + mem[K] = X; + continue; + case BPF_S_ANC_PROTOCOL: + A = ntohs(skb->protocol); + continue; + case BPF_S_ANC_PKTTYPE: + A = skb->pkt_type; + continue; + case BPF_S_ANC_IFINDEX: + if (!skb->dev) + return 0; + A = skb->dev->ifindex; + continue; + case BPF_S_ANC_MARK: + A = skb->mark; + continue; + case BPF_S_ANC_QUEUE: + A = skb->queue_mapping; + continue; + case BPF_S_ANC_HATYPE: + if (!skb->dev) + return 0; + A = skb->dev->type; + continue; + case BPF_S_ANC_RXHASH: + A = skb->rxhash; + continue; + case BPF_S_ANC_CPU: + A = raw_smp_processor_id(); + continue; + case BPF_S_ANC_NLATTR: { + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *)&skb->data[A], + skb->len - A, X); + if (nla) + A = (void *)nla - (void *)skb->data; + else + A = 0; + continue; + } + case BPF_S_ANC_NLATTR_NEST: { + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *)&skb->data[A]; + if (nla->nla_len > A - skb->len) + return 0; + + nla = nla_find_nested(nla, X); + if (nla) + A = (void *)nla - (void *)skb->data; + else + A = 0; + continue; + } + default: + WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", + fentry->code, fentry->jt, + fentry->jf, fentry->k); + return 0; + } + } + + return 0; +} +EXPORT_SYMBOL(sk_run_filter); + +/* + * Security : + * A BPF program is able to use 16 cells of memory to store intermediate + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * As we dont want to clear mem[] array for each packet going through + * sk_run_filter(), we check that filter loaded by user never try to read + * a cell if not previously written, and we check all branches to be sure + * a malicious user doesn't try to abuse us. + */ +static int check_load_and_stores(struct sock_filter *filter, int flen) +{ + u16 *masks, memvalid = 0; /* one bit per cell, 16 cells */ + int pc, ret = 0; + + BUILD_BUG_ON(BPF_MEMWORDS > 16); + masks = kmalloc(flen * sizeof(*masks), GFP_KERNEL); + if (!masks) + return -ENOMEM; + memset(masks, 0xff, flen * sizeof(*masks)); + + for (pc = 0; pc < flen; pc++) { + memvalid &= masks[pc]; + + switch (filter[pc].code) { + case BPF_S_ST: + case BPF_S_STX: + memvalid |= (1 << filter[pc].k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + if (!(memvalid & (1 << filter[pc].k))) { + ret = -EINVAL; + goto error; + } + break; + case BPF_S_JMP_JA: + /* a jump must set masks on target */ + masks[pc + 1 + filter[pc].k] &= memvalid; + memvalid = ~0; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* a jump must set masks on targets */ + masks[pc + 1 + filter[pc].jt] &= memvalid; + masks[pc + 1 + filter[pc].jf] &= memvalid; + memvalid = ~0; + break; + } + } +error: + kfree(masks); + return ret; +} + +/** + * sk_chk_filter - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. + * + * All jumps are forward as they are not signed. + * + * Returns 0 if the rule set is legal or -EINVAL if not. + */ +int sk_chk_filter(struct sock_filter *filter, unsigned int flen) +{ + /* + * Valid instructions are initialized to non-0. + * Invalid instructions are initialized to 0. + */ + static const u8 codes[] = { + [BPF_ALU|BPF_ADD|BPF_K] = BPF_S_ALU_ADD_K, + [BPF_ALU|BPF_ADD|BPF_X] = BPF_S_ALU_ADD_X, + [BPF_ALU|BPF_SUB|BPF_K] = BPF_S_ALU_SUB_K, + [BPF_ALU|BPF_SUB|BPF_X] = BPF_S_ALU_SUB_X, + [BPF_ALU|BPF_MUL|BPF_K] = BPF_S_ALU_MUL_K, + [BPF_ALU|BPF_MUL|BPF_X] = BPF_S_ALU_MUL_X, + [BPF_ALU|BPF_DIV|BPF_X] = BPF_S_ALU_DIV_X, + [BPF_ALU|BPF_AND|BPF_K] = BPF_S_ALU_AND_K, + [BPF_ALU|BPF_AND|BPF_X] = BPF_S_ALU_AND_X, + [BPF_ALU|BPF_OR|BPF_K] = BPF_S_ALU_OR_K, + [BPF_ALU|BPF_OR|BPF_X] = BPF_S_ALU_OR_X, + [BPF_ALU|BPF_LSH|BPF_K] = BPF_S_ALU_LSH_K, + [BPF_ALU|BPF_LSH|BPF_X] = BPF_S_ALU_LSH_X, + [BPF_ALU|BPF_RSH|BPF_K] = BPF_S_ALU_RSH_K, + [BPF_ALU|BPF_RSH|BPF_X] = BPF_S_ALU_RSH_X, + [BPF_ALU|BPF_NEG] = BPF_S_ALU_NEG, + [BPF_LD|BPF_W|BPF_ABS] = BPF_S_LD_W_ABS, + [BPF_LD|BPF_H|BPF_ABS] = BPF_S_LD_H_ABS, + [BPF_LD|BPF_B|BPF_ABS] = BPF_S_LD_B_ABS, + [BPF_LD|BPF_W|BPF_LEN] = BPF_S_LD_W_LEN, + [BPF_LD|BPF_W|BPF_IND] = BPF_S_LD_W_IND, + [BPF_LD|BPF_H|BPF_IND] = BPF_S_LD_H_IND, + [BPF_LD|BPF_B|BPF_IND] = BPF_S_LD_B_IND, + [BPF_LD|BPF_IMM] = BPF_S_LD_IMM, + [BPF_LDX|BPF_W|BPF_LEN] = BPF_S_LDX_W_LEN, + [BPF_LDX|BPF_B|BPF_MSH] = BPF_S_LDX_B_MSH, + [BPF_LDX|BPF_IMM] = BPF_S_LDX_IMM, + [BPF_MISC|BPF_TAX] = BPF_S_MISC_TAX, + [BPF_MISC|BPF_TXA] = BPF_S_MISC_TXA, + [BPF_RET|BPF_K] = BPF_S_RET_K, + [BPF_RET|BPF_A] = BPF_S_RET_A, + [BPF_ALU|BPF_DIV|BPF_K] = BPF_S_ALU_DIV_K, + [BPF_LD|BPF_MEM] = BPF_S_LD_MEM, + [BPF_LDX|BPF_MEM] = BPF_S_LDX_MEM, + [BPF_ST] = BPF_S_ST, + [BPF_STX] = BPF_S_STX, + [BPF_JMP|BPF_JA] = BPF_S_JMP_JA, + [BPF_JMP|BPF_JEQ|BPF_K] = BPF_S_JMP_JEQ_K, + [BPF_JMP|BPF_JEQ|BPF_X] = BPF_S_JMP_JEQ_X, + [BPF_JMP|BPF_JGE|BPF_K] = BPF_S_JMP_JGE_K, + [BPF_JMP|BPF_JGE|BPF_X] = BPF_S_JMP_JGE_X, + [BPF_JMP|BPF_JGT|BPF_K] = BPF_S_JMP_JGT_K, + [BPF_JMP|BPF_JGT|BPF_X] = BPF_S_JMP_JGT_X, + [BPF_JMP|BPF_JSET|BPF_K] = BPF_S_JMP_JSET_K, + [BPF_JMP|BPF_JSET|BPF_X] = BPF_S_JMP_JSET_X, + }; + int pc; + + if (flen == 0 || flen > BPF_MAXINSNS) + return -EINVAL; + + /* check the filter code now */ + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + + if (code >= ARRAY_SIZE(codes)) + return -EINVAL; + code = codes[code]; + if (!code) + return -EINVAL; + /* Some instructions need special checks */ + switch (code) { + case BPF_S_ALU_DIV_K: + /* check for division by zero */ + if (ftest->k == 0) + return -EINVAL; + ftest->k = reciprocal_value(ftest->k); + break; + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + case BPF_S_JMP_JA: + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_X: + case BPF_S_JMP_JSET_K: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: +#define ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ + code = BPF_S_ANC_##CODE; \ + break + switch (ftest->k) { + ANCILLARY(PROTOCOL); + ANCILLARY(PKTTYPE); + ANCILLARY(IFINDEX); + ANCILLARY(NLATTR); + ANCILLARY(NLATTR_NEST); + ANCILLARY(MARK); + ANCILLARY(QUEUE); + ANCILLARY(HATYPE); + ANCILLARY(RXHASH); + ANCILLARY(CPU); + } + } + ftest->code = code; + } + + /* last instruction must be a RET code */ + switch (filter[flen - 1].code) { + case BPF_S_RET_K: + case BPF_S_RET_A: + return check_load_and_stores(filter, flen); + } + return -EINVAL; +} +EXPORT_SYMBOL(sk_chk_filter); + +/** + * sk_filter_release_rcu - Release a socket filter by rcu_head + * @rcu: rcu_head that contains the sk_filter to free + */ +void sk_filter_release_rcu(struct rcu_head *rcu) +{ + struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); + + bpf_jit_free(fp); + kfree(fp); +} +EXPORT_SYMBOL(sk_filter_release_rcu); + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + fp->bpf_func = sk_run_filter; + + err = sk_chk_filter(fp->insns, fp->len); + if (err) { + sk_filter_uncharge(sk, fp); + return err; + } + + bpf_jit_compile(fp); + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + return 0; +} +EXPORT_SYMBOL_GPL(sk_attach_filter); + +int sk_detach_filter(struct sock *sk) +{ + int ret = -ENOENT; + struct sk_filter *filter; + + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + if (filter) { + RCU_INIT_POINTER(sk->sk_filter, NULL); + sk_filter_uncharge(sk, filter); + ret = 0; + } + return ret; +} +EXPORT_SYMBOL_GPL(sk_detach_filter); diff --git a/net/core/flow.c b/net/core/flow.c new file mode 100644 index 00000000..e318c7e9 --- /dev/null +++ b/net/core/flow.c @@ -0,0 +1,464 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. Miller (davem@redhat.com) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/interrupt.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/completion.h> +#include <linux/percpu.h> +#include <linux/bitops.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/mutex.h> +#include <net/flow.h> +#include <linux/atomic.h> +#include <linux/security.h> + +struct flow_cache_entry { + union { + struct hlist_node hlist; + struct list_head gc_list; + } u; + struct net *net; + u16 family; + u8 dir; + u32 genid; + struct flowi key; + struct flow_cache_object *object; +}; + +struct flow_cache_percpu { + struct hlist_head *hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; +}; + +struct flow_flush_info { + struct flow_cache *cache; + atomic_t cpuleft; + struct completion completion; +}; + +struct flow_cache { + u32 hash_shift; + struct flow_cache_percpu __percpu *percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); +EXPORT_SYMBOL(flow_cache_genid); +static struct flow_cache flow_cache_global; +static struct kmem_cache *flow_cachep __read_mostly; + +static DEFINE_SPINLOCK(flow_cache_gc_lock); +static LIST_HEAD(flow_cache_gc_list); + +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +static void flow_cache_new_hashrnd(unsigned long arg) +{ + struct flow_cache *fc = (void *) arg; + int i; + + for_each_possible_cpu(i) + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; + + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); +} + +static int flow_entry_valid(struct flow_cache_entry *fle) +{ + if (atomic_read(&flow_cache_genid) != fle->genid) + return 0; + if (fle->object && !fle->object->ops->check(fle->object)) + return 0; + return 1; +} + +static void flow_entry_kill(struct flow_cache_entry *fle) +{ + if (fle->object) + fle->object->ops->delete(fle->object); + kmem_cache_free(flow_cachep, fle); +} + +static void flow_cache_gc_task(struct work_struct *work) +{ + struct list_head gc_list; + struct flow_cache_entry *fce, *n; + + INIT_LIST_HEAD(&gc_list); + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail_init(&flow_cache_gc_list, &gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + flow_entry_kill(fce); +} +static DECLARE_WORK(flow_cache_gc_work, flow_cache_gc_task); + +static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, + int deleted, struct list_head *gc_list) +{ + if (deleted) { + fcp->hash_count -= deleted; + spin_lock_bh(&flow_cache_gc_lock); + list_splice_tail(gc_list, &flow_cache_gc_list); + spin_unlock_bh(&flow_cache_gc_lock); + schedule_work(&flow_cache_gc_work); + } +} + +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) +{ + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + for (i = 0; i < flow_cache_hash_size(fc); i++) { + int saved = 0; + + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (saved < shrink_to && + flow_entry_valid(fle)) { + saved++; + } else { + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); + } + } + } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); +} + +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); + + __flow_cache_shrink(fc, fcp, shrink_to); +} + +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); +} + +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + const struct flowi *key, + size_t keysize) +{ + const u32 *k = (const u32 *) key; + const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); + + return jhash2(k, length, fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1); +} + +/* I hear what you're saying, use memcmp. But memcmp cannot make + * important assumptions that we can here, such as alignment. + */ +static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, + size_t keysize) +{ + const flow_compare_t *k1, *k1_lim, *k2; + + k1 = (const flow_compare_t *) key1; + k1_lim = k1 + keysize; + + k2 = (const flow_compare_t *) key2; + + do { + if (*k1++ != *k2++) + return 1; + } while (k1 < k1_lim); + + return 0; +} + +struct flow_cache_object * +flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver, void *ctx) +{ + struct flow_cache *fc = &flow_cache_global; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle, *tfle; + struct hlist_node *entry; + struct flow_cache_object *flo; + size_t keysize; + unsigned int hash; + + local_bh_disable(); + fcp = this_cpu_ptr(fc->percpu); + + fle = NULL; + flo = NULL; + + keysize = flow_key_size(family); + if (!keysize) + goto nocache; + + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. --RR */ + if (!fcp->hash_table) + goto nocache; + + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + + hash = flow_hash_code(fc, fcp, key, keysize); + hlist_for_each_entry(tfle, entry, &fcp->hash_table[hash], u.hlist) { + if (tfle->net == net && + tfle->family == family && + tfle->dir == dir && + flow_key_compare(key, &tfle->key, keysize) == 0) { + fle = tfle; + break; + } + } + + if (unlikely(!fle)) { + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); + + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); + if (fle) { + fle->net = net; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, keysize * sizeof(flow_compare_t)); + fle->object = NULL; + hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]); + fcp->hash_count++; + } + } else if (likely(fle->genid == atomic_read(&flow_cache_genid))) { + flo = fle->object; + if (!flo) + goto ret_object; + flo = flo->ops->get(flo); + if (flo) + goto ret_object; + } else if (fle->object) { + flo = fle->object; + flo->ops->delete(flo); + fle->object = NULL; + } + +nocache: + flo = NULL; + if (fle) { + flo = fle->object; + fle->object = NULL; + } + flo = resolver(net, key, family, dir, flo, ctx); + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + if (!IS_ERR(flo)) + fle->object = flo; + else + fle->genid--; + } else { + if (flo && !IS_ERR(flo)) + flo->ops->delete(flo); + } +ret_object: + local_bh_enable(); + return flo; +} +EXPORT_SYMBOL(flow_cache_lookup); + +static void flow_cache_flush_tasklet(unsigned long data) +{ + struct flow_flush_info *info = (void *)data; + struct flow_cache *fc = info->cache; + struct flow_cache_percpu *fcp; + struct flow_cache_entry *fle; + struct hlist_node *entry, *tmp; + LIST_HEAD(gc_list); + int i, deleted = 0; + + fcp = this_cpu_ptr(fc->percpu); + for (i = 0; i < flow_cache_hash_size(fc); i++) { + hlist_for_each_entry_safe(fle, entry, tmp, + &fcp->hash_table[i], u.hlist) { + if (flow_entry_valid(fle)) + continue; + + deleted++; + hlist_del(&fle->u.hlist); + list_add_tail(&fle->u.gc_list, &gc_list); + } + } + + flow_cache_queue_garbage(fcp, deleted, &gc_list); + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *data) +{ + struct flow_flush_info *info = data; + int cpu; + struct tasklet_struct *tasklet; + + cpu = smp_processor_id(); + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet->data = (unsigned long)info; + tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ + struct flow_flush_info info; + static DEFINE_MUTEX(flow_flush_sem); + + /* Don't want cpus going down or up during this. */ + get_online_cpus(); + mutex_lock(&flow_flush_sem); + info.cache = &flow_cache_global; + atomic_set(&info.cpuleft, num_online_cpus()); + init_completion(&info.completion); + + local_bh_disable(); + smp_call_function(flow_cache_flush_per_cpu, &info, 0); + flow_cache_flush_tasklet((unsigned long)&info); + local_bh_enable(); + + wait_for_completion(&info.completion); + mutex_unlock(&flow_flush_sem); + put_online_cpus(); +} + +static void flow_cache_flush_task(struct work_struct *work) +{ + flow_cache_flush(); +} + +static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task); + +void flow_cache_flush_deferred(void) +{ + schedule_work(&flow_cache_flush_work); +} + +static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) +{ + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); + + if (!fcp->hash_table) { + fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); + if (!fcp->hash_table) { + pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + return -ENOMEM; + } + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + } + return 0; +} + +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int res, cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + res = flow_cache_cpu_prepare(fc, cpu); + if (res) + return notifier_from_errno(res); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __flow_cache_shrink(fc, fcp, 0); + break; + } + return NOTIFY_OK; +} + +static int __init flow_cache_init(struct flow_cache *fc) +{ + int i; + + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + + fc->percpu = alloc_percpu(struct flow_cache_percpu); + if (!fc->percpu) + return -ENOMEM; + + for_each_online_cpu(i) { + if (flow_cache_cpu_prepare(fc, i)) + goto err; + } + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); + + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); + + return 0; + +err: + for_each_possible_cpu(i) { + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i); + kfree(fcp->hash_table); + fcp->hash_table = NULL; + } + + free_percpu(fc->percpu); + fc->percpu = NULL; + + return -ENOMEM; +} + +static int __init flow_cache_init_global(void) +{ + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_PANIC, NULL); + + return flow_cache_init(&flow_cache_global); +} + +module_init(flow_cache_init_global); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c new file mode 100644 index 00000000..a225089d --- /dev/null +++ b/net/core/flow_dissector.c @@ -0,0 +1,144 @@ +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/if_vlan.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> +#include <net/flow_keys.h> + +/* copy saddr & daddr, possibly using 64bit load/store + * Equivalent to : flow->src = iph->saddr; + * flow->dst = iph->daddr; + */ +static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(*flow), dst) != + offsetof(typeof(*flow), src) + sizeof(flow->src)); + memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); +} + +bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) +{ + int poff, nhoff = skb_network_offset(skb); + u8 ip_proto; + __be16 proto = skb->protocol; + + memset(flow, 0, sizeof(*flow)); + +again: + switch (proto) { + case __constant_htons(ETH_P_IP): { + const struct iphdr *iph; + struct iphdr _iph; +ip: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + if (ip_is_fragment(iph)) + ip_proto = 0; + else + ip_proto = iph->protocol; + iph_to_flow_copy_addrs(flow, iph); + nhoff += iph->ihl * 4; + break; + } + case __constant_htons(ETH_P_IPV6): { + const struct ipv6hdr *iph; + struct ipv6hdr _iph; +ipv6: + iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return false; + + ip_proto = iph->nexthdr; + flow->src = iph->saddr.s6_addr32[3]; + flow->dst = iph->daddr.s6_addr32[3]; + nhoff += sizeof(struct ipv6hdr); + break; + } + case __constant_htons(ETH_P_8021Q): { + const struct vlan_hdr *vlan; + struct vlan_hdr _vlan; + + vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan); + if (!vlan) + return false; + + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + goto again; + } + case __constant_htons(ETH_P_PPP_SES): { + struct { + struct pppoe_hdr hdr; + __be16 proto; + } *hdr, _hdr; + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + proto = hdr->proto; + nhoff += PPPOE_SES_HLEN; + switch (proto) { + case __constant_htons(PPP_IP): + goto ip; + case __constant_htons(PPP_IPV6): + goto ipv6; + default: + return false; + } + } + default: + return false; + } + + switch (ip_proto) { + case IPPROTO_GRE: { + struct gre_hdr { + __be16 flags; + __be16 proto; + } *hdr, _hdr; + + hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr); + if (!hdr) + return false; + /* + * Only look inside GRE if version zero and no + * routing + */ + if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) + nhoff += 4; + if (hdr->flags & GRE_KEY) + nhoff += 4; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + goto again; + } + break; + } + case IPPROTO_IPIP: + goto again; + default: + break; + } + + flow->ip_proto = ip_proto; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + __be32 *ports, _ports; + + nhoff += poff; + ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports); + if (ports) + flow->ports = *ports; + } + + return true; +} +EXPORT_SYMBOL(skb_flow_dissect); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 00000000..d9d198aa --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,321 @@ +/* + * net/sched/gen_estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Jamal Hadi Salim - moved it to net/core and reshulfed + * names to make it usable in general net subsystem. + */ + +#include <asm/uaccess.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <net/gen_stats.h> + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<<interval) seconds and evaluate EWMA: + + avrate = avrate*(1-W) + rate*W + + where W is chosen as negative power of 2: W = 2^(-ewma_log) + + The resulting time constant is: + + T = A/(-ln(1-W)) + + + NOTES. + + * avbps is scaled by 2^5, avpps is scaled by 2^10. + * both values are reported as 32 bit unsigned values. bps can + overflow for fast links : max speed being 34360Mbit/sec + * Minimal interval is HZ/4=250msec (it is the greatest common divisor + for HZ=100 and HZ=1024 8)), maximal interval + is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals + are too expensive, longer ones can be implemented + at user level painlessly. + */ + +#define EST_MAX_INTERVAL 5 + +struct gen_estimator +{ + struct list_head list; + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_rate_est *rate_est; + spinlock_t *stats_lock; + int ewma_log; + u64 last_bytes; + u64 avbps; + u32 last_packets; + u32 avpps; + struct rcu_head e_rcu; + struct rb_node node; +}; + +struct gen_estimator_head +{ + struct timer_list timer; + struct list_head list; +}; + +static struct gen_estimator_head elist[EST_MAX_INTERVAL+1]; + +/* Protects against NULL dereference */ +static DEFINE_RWLOCK(est_lock); + +/* Protects against soft lockup during large deletion */ +static struct rb_root est_root = RB_ROOT; +static DEFINE_SPINLOCK(est_tree_lock); + +static void est_timer(unsigned long arg) +{ + int idx = (int)arg; + struct gen_estimator *e; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &elist[idx].list, list) { + u64 nbytes; + u64 brate; + u32 npackets; + u32 rate; + + spin_lock(e->stats_lock); + read_lock(&est_lock); + if (e->bstats == NULL) + goto skip; + + nbytes = e->bstats->bytes; + npackets = e->bstats->packets; + brate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); + e->rate_est->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); + e->rate_est->pps = (e->avpps+0x1FF)>>10; +skip: + read_unlock(&est_lock); + spin_unlock(e->stats_lock); + } + + if (!list_empty(&elist[idx].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); + rcu_read_unlock(); +} + +static void gen_add_node(struct gen_estimator *est) +{ + struct rb_node **p = &est_root.rb_node, *parent = NULL; + + while (*p) { + struct gen_estimator *e; + + parent = *p; + e = rb_entry(parent, struct gen_estimator, node); + + if (est->bstats > e->bstats) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&est->node, parent, p); + rb_insert_color(&est->node, &est_root); +} + +static +struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, + const struct gnet_stats_rate_est *rate_est) +{ + struct rb_node *p = est_root.rb_node; + + while (p) { + struct gen_estimator *e; + + e = rb_entry(p, struct gen_estimator, node); + + if (bstats > e->bstats) + p = p->rb_right; + else if (bstats < e->bstats || rate_est != e->rate_est) + p = p->rb_left; + else + return e; + } + return NULL; +} + +/** + * gen_new_estimator - create a new rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est + * as destination. A new timer with the interval specified in the + * configuration TLV is created. Upon each interval, the latest statistics + * will be read from &bstats and the estimated rate will be stored in + * &rate_est with the statistics lock grabed during this period. + * + * Returns 0 on success or a negative error code. + * + */ +int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est, + spinlock_t *stats_lock, + struct nlattr *opt) +{ + struct gen_estimator *est; + struct gnet_estimator *parm = nla_data(opt); + int idx; + + if (nla_len(opt) < sizeof(*parm)) + return -EINVAL; + + if (parm->interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kzalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + idx = parm->interval + 2; + est->bstats = bstats; + est->rate_est = rate_est; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = bstats->bytes; + est->avbps = rate_est->bps<<5; + est->last_packets = bstats->packets; + est->avpps = rate_est->pps<<10; + + spin_lock_bh(&est_tree_lock); + if (!elist[idx].timer.function) { + INIT_LIST_HEAD(&elist[idx].list); + setup_timer(&elist[idx].timer, est_timer, idx); + } + + if (list_empty(&elist[idx].list)) + mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx)); + + list_add_rcu(&est->list, &elist[idx].list); + gen_add_node(est); + spin_unlock_bh(&est_tree_lock); + + return 0; +} +EXPORT_SYMBOL(gen_new_estimator); + +/** + * gen_kill_estimator - remove a rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Removes the rate estimator specified by &bstats and &rate_est. + * + * Note : Caller should respect an RCU grace period before freeing stats_lock + */ +void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est) +{ + struct gen_estimator *e; + + spin_lock_bh(&est_tree_lock); + while ((e = gen_find_node(bstats, rate_est))) { + rb_erase(&e->node, &est_root); + + write_lock(&est_lock); + e->bstats = NULL; + write_unlock(&est_lock); + + list_del_rcu(&e->list); + kfree_rcu(e, e_rcu); + } + spin_unlock_bh(&est_tree_lock); +} +EXPORT_SYMBOL(gen_kill_estimator); + +/** + * gen_replace_estimator - replace rate estimator configuration + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + * + * Returns 0 on success or a negative error code. + */ +int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_rate_est *rate_est, + spinlock_t *stats_lock, struct nlattr *opt) +{ + gen_kill_estimator(bstats, rate_est); + return gen_new_estimator(bstats, rate_est, stats_lock, opt); +} +EXPORT_SYMBOL(gen_replace_estimator); + +/** + * gen_estimator_active - test if estimator is currently in use + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Returns true if estimator is active, and false if not. + */ +bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, + const struct gnet_stats_rate_est *rate_est) +{ + bool res; + + ASSERT_RTNL(); + + spin_lock_bh(&est_tree_lock); + res = gen_find_node(bstats, rate_est) != NULL; + spin_unlock_bh(&est_tree_lock); + + return res; +} +EXPORT_SYMBOL(gen_estimator_active); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 00000000..0452eb27 --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,250 @@ +/* + * net/core/gen_stats.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + * Jamal Hadi Salim + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * See Documentation/networking/gen_stats.txt + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/rtnetlink.h> +#include <linux/gen_stats.h> +#include <net/netlink.h> +#include <net/gen_stats.h> + + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +{ + NLA_PUT(d->skb, type, size, buf); + return 0; + +nla_put_failure: + spin_unlock_bh(d->lock); + return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, + int xstats_type, spinlock_t *lock, struct gnet_dump *d) + __acquires(lock) +{ + memset(d, 0, sizeof(*d)); + + spin_lock_bh(lock); + d->lock = lock; + if (type) + d->tail = (struct nlattr *)skb_tail_pointer(skb); + d->skb = skb; + d->compat_tc_stats = tc_stats_type; + d->compat_xstats = xstats_type; + + if (d->tail) + return gnet_stats_copy(d, type, NULL, 0); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_start_copy_compat); + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d) +{ + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); +} +EXPORT_SYMBOL(gnet_stats_start_copy); + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) +{ + if (d->compat_tc_stats) { + d->tc_stats.bytes = b->bytes; + d->tc_stats.packets = b->packets; + } + + if (d->tail) { + struct gnet_stats_basic sb; + + memset(&sb, 0, sizeof(sb)); + sb.bytes = b->bytes; + sb.packets = b->packets; + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + } + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_basic); + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @b: basic statistics + * @r: rate estimator statistics + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, + const struct gnet_stats_basic_packed *b, + struct gnet_stats_rate_est *r) +{ + if (b && !gen_estimator_active(b, r)) + return 0; + + if (d->compat_tc_stats) { + d->tc_stats.bps = r->bps; + d->tc_stats.pps = r->pps; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_rate_est); + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @q: queue statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +{ + if (d->compat_tc_stats) { + d->tc_stats.drops = q->drops; + d->tc_stats.qlen = q->qlen; + d->tc_stats.backlog = q->backlog; + d->tc_stats.overlimits = q->overlimits; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_queue); + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application sepecific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ + if (d->compat_xstats) { + d->xstats = st; + d->xstats_len = len; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_APP, st, len); + + return 0; +} +EXPORT_SYMBOL(gnet_stats_copy_app); + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ + if (d->tail) + d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail; + + if (d->compat_tc_stats) + if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, + sizeof(d->tc_stats)) < 0) + return -1; + + if (d->compat_xstats && d->xstats) { + if (gnet_stats_copy(d, d->compat_xstats, d->xstats, + d->xstats_len) < 0) + return -1; + } + + spin_unlock_bh(d->lock); + return 0; +} +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 00000000..7e7aeb01 --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,264 @@ +/* + * iovec manipulation routines. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + * Alexey Kuznetsov: 2.1 optimisations + * Andi Kleen : Fix csum*fromiovecend for IPv6. + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <net/checksum.h> +#include <net/sock.h> + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing access_ok. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode) +{ + int size, ct, err; + + if (m->msg_namelen) { + if (mode == VERIFY_READ) { + void __user *namep; + namep = (void __user __force *) m->msg_name; + err = move_addr_to_kernel(namep, m->msg_namelen, + address); + if (err < 0) + return err; + } + m->msg_name = address; + } else { + m->msg_name = NULL; + } + + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) + return -EFAULT; + + m->msg_iov = iov; + err = 0; + + for (ct = 0; ct < m->msg_iovlen; ct++) { + size_t len = iov[ct].iov_len; + + if (len > INT_MAX - err) { + len = INT_MAX - err; + iov[ct].iov_len = len; + } + err += len; + } + + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_toiovec); + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + if (copy_to_user(iov->iov_base + offset, kdata, copy)) + return -EFAULT; + offset = 0; + kdata += copy; + len -= copy; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_toiovecend); + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_fromiovec); + +/* + * Copy iovec from kernel. Returns -EFAULT on error. + */ + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) +{ + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov++; + } + + return 0; +} +EXPORT_SYMBOL(memcpy_fromiovecend); + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, __wsum *csump) +{ + __wsum csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + + /* There is a remnant from previous iov. */ + if (partial_cnt) { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... */ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) { + partial_cnt = copy % 4; + if (partial_cnt) { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} +EXPORT_SYMBOL(csum_partial_copy_fromiovecend); diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h new file mode 100644 index 00000000..52d0a445 --- /dev/null +++ b/net/core/kmap_skb.h @@ -0,0 +1,19 @@ +#include <linux/highmem.h> + +static inline void *kmap_skb_frag(const skb_frag_t *frag) +{ +#ifdef CONFIG_HIGHMEM + BUG_ON(in_irq()); + + local_bh_disable(); +#endif + return kmap_atomic(skb_frag_page(frag)); +} + +static inline void kunmap_skb_frag(void *vaddr) +{ + kunmap_atomic(vaddr); +#ifdef CONFIG_HIGHMEM + local_bh_enable(); +#endif +} diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 00000000..c3519c6d --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,251 @@ +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf <sux@loplof.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <linux/rtnetlink.h> +#include <linux/jiffies.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/bitops.h> +#include <asm/types.h> + + +enum lw_bits { + LW_URGENT = 0, +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +static unsigned char default_operstate(const struct net_device *dev) +{ + if (!netif_carrier_ok(dev)) + return (dev->ifindex != dev->iflink ? + IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + + if (netif_dormant(dev)) + return IF_OPER_DORMANT; + + return IF_OPER_UP; +} + + +static void rfc2863_policy(struct net_device *dev) +{ + unsigned char operstate = default_operstate(dev); + + if (operstate == dev->operstate) + return; + + write_lock_bh(&dev_base_lock); + + switch(dev->link_mode) { + case IF_LINK_MODE_DORMANT: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_DORMANT; + break; + + case IF_LINK_MODE_DEFAULT: + default: + break; + } + + dev->operstate = operstate; + + write_unlock_bh(&dev_base_lock); +} + + +static bool linkwatch_urgent_event(struct net_device *dev) +{ + if (!netif_running(dev)) + return false; + + if (dev->ifindex != dev->iflink) + return true; + + return netif_carrier_ok(dev) && qdisc_tx_changing(dev); +} + + +static void linkwatch_add_event(struct net_device *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (list_empty(&dev->link_watch_list)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + dev_hold(dev); + } + spin_unlock_irqrestore(&lweventlist_lock, flags); +} + + +static void linkwatch_schedule_work(int urgent) +{ + unsigned long delay = linkwatch_nextevent - jiffies; + + if (test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* Minimise down-time: drop delay for up event. */ + if (urgent) { + if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) + return; + delay = 0; + } + + /* If we wrap around we'll delay it by at most HZ. */ + if (delay > HZ) + delay = 0; + + /* + * This is true if we've scheduled it immeditately or if we don't + * need an immediate execution and it's already pending. + */ + if (schedule_delayed_work(&linkwatch_work, delay) == !delay) + return; + + /* Don't bother if there is nothing urgent. */ + if (!test_bit(LW_URGENT, &linkwatch_flags)) + return; + + /* It's already running which is good enough. */ + if (!__cancel_delayed_work(&linkwatch_work)) + return; + + /* Otherwise we reschedule it again for immediate execution. */ + schedule_delayed_work(&linkwatch_work, 0); +} + + +static void linkwatch_do_dev(struct net_device *dev) +{ + /* + * Make sure the above read is complete since it can be + * rewritten as soon as we clear the bit below. + */ + smp_mb__before_clear_bit(); + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + rfc2863_policy(dev); + if (dev->flags & IFF_UP) { + if (netif_carrier_ok(dev)) + dev_activate(dev); + else + dev_deactivate(dev); + + netdev_state_change(dev); + } + dev_put(dev); +} + +static void __linkwatch_run_queue(int urgent_only) +{ + struct net_device *dev; + LIST_HEAD(wrk); + + /* + * Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket. This limit does not apply to up events + * while the device qdisc is down. + */ + if (!urgent_only) + linkwatch_nextevent = jiffies + HZ; + /* Limit wrap-around effect on delay. */ + else if (time_after(linkwatch_nextevent, jiffies + HZ)) + linkwatch_nextevent = jiffies; + + clear_bit(LW_URGENT, &linkwatch_flags); + + spin_lock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &wrk); + + while (!list_empty(&wrk)) { + + dev = list_first_entry(&wrk, struct net_device, link_watch_list); + list_del_init(&dev->link_watch_list); + + if (urgent_only && !linkwatch_urgent_event(dev)) { + list_add_tail(&dev->link_watch_list, &lweventlist); + continue; + } + spin_unlock_irq(&lweventlist_lock); + linkwatch_do_dev(dev); + spin_lock_irq(&lweventlist_lock); + } + + if (!list_empty(&lweventlist)) + linkwatch_schedule_work(0); + spin_unlock_irq(&lweventlist_lock); +} + +void linkwatch_forget_dev(struct net_device *dev) +{ + unsigned long flags; + int clean = 0; + + spin_lock_irqsave(&lweventlist_lock, flags); + if (!list_empty(&dev->link_watch_list)) { + list_del_init(&dev->link_watch_list); + clean = 1; + } + spin_unlock_irqrestore(&lweventlist_lock, flags); + if (clean) + linkwatch_do_dev(dev); +} + + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + __linkwatch_run_queue(0); +} + + +static void linkwatch_event(struct work_struct *dummy) +{ + rtnl_lock(); + __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); + rtnl_unlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ + bool urgent = linkwatch_urgent_event(dev); + + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + linkwatch_add_event(dev); + } else if (!urgent) + return; + + linkwatch_schedule_work(urgent); +} +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 00000000..73b90351 --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,3037 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + * Harald Welte Add neighbour cache statistics like rtstat + */ + +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/proc_fs.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/times.h> +#include <net/net_namespace.h> +#include <net/neighbour.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/netevent.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/log2.h> + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +#define PNEIGH_HASHMASK 0xF + +static void neigh_timer_handler(unsigned long arg); +static void __neigh_notify(struct neighbour *n, int type, int flags); +static void neigh_update_notify(struct neighbour *neigh); +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); + +static struct neigh_table *neigh_tables; +#ifdef CONFIG_PROC_FS +static const struct file_operations neigh_stat_seq_fops; +#endif + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. + - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. + It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + + The last lock is neigh_tbl_lock. It is pure SMP lock, protecting + list of neighbour tables. This list is used only in process context, + */ + +static DEFINE_RWLOCK(neigh_tbl_lock); + +static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +static void neigh_cleanup_and_release(struct neighbour *neigh) +{ + if (neigh->parms->neigh_cleanup) + neigh->parms->neigh_cleanup(neigh); + + __neigh_notify(neigh, RTM_DELNEIGH, 0); + neigh_release(neigh); +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return base ? (net_random() % base) + (base >> 1) : 0; +} +EXPORT_SYMBOL(neigh_rand_reach_time); + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct neighbour *n; + struct neighbour __rcu **np; + + np = &nht->hash_buckets[i]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + /* Neighbour record may be discarded if: + * - nobody refers to it. + * - it is not permanent + */ + write_lock(&n->lock); + if (atomic_read(&n->refcnt) == 1 && + !(n->nud_state & NUD_PERMANENT)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + n->dead = 1; + shrunk = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } + write_unlock(&n->lock); + np = &n->next; + } + } + + tbl->last_flush = jiffies; + + write_unlock_bh(&tbl->lock); + + return shrunk; +} + +static void neigh_add_timer(struct neighbour *n, unsigned long when) +{ + neigh_hold(n); + if (unlikely(mod_timer(&n->timer, when))) { + printk("NEIGH: BUG, double timer add, state is %x\n", + n->nud_state); + dump_stack(); + } +} + +static int neigh_del_timer(struct neighbour *n) +{ + if ((n->nud_state & NUD_IN_TIMER) && + del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + return 0; +} + +static void pneigh_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(list)) != NULL) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + write_lock(&n->lock); + neigh_del_timer(n); + n->dead = 1; + + if (atomic_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. + */ + skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + n->output = neigh_blackhole; + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + } + } +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev); + write_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_changeaddr); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + write_lock_bh(&tbl->lock); + neigh_flush_dev(tbl, dev); + pneigh_ifdown(tbl, dev); + write_unlock_bh(&tbl->lock); + + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + return 0; +} +EXPORT_SYMBOL(neigh_ifdown); + +static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) +{ + struct neighbour *n = NULL; + unsigned long now = jiffies; + int entries; + + entries = atomic_inc_return(&tbl->entries) - 1; + if (entries >= tbl->gc_thresh3 || + (entries >= tbl->gc_thresh2 && + time_after(now, tbl->last_flush + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && + entries >= tbl->gc_thresh3) + goto out_entries; + } + + if (tbl->entry_size) + n = kzalloc(tbl->entry_size, GFP_ATOMIC); + else { + int sz = sizeof(*n) + tbl->key_len; + + sz = ALIGN(sz, NEIGH_PRIV_ALIGN); + sz += dev->neigh_priv_len; + n = kzalloc(sz, GFP_ATOMIC); + } + if (!n) + goto out_entries; + + skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + seqlock_init(&n->hh.hh_lock); + n->parms = neigh_parms_clone(&tbl->parms); + setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); + + NEIGH_CACHE_STAT_INC(tbl, allocs); + n->tbl = tbl; + atomic_set(&n->refcnt, 1); + n->dead = 1; +out: + return n; + +out_entries: + atomic_dec(&tbl->entries); + goto out; +} + +static void neigh_get_hash_rnd(u32 *x) +{ + get_random_bytes(x, sizeof(*x)); + *x |= 1; +} + +static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) +{ + size_t size = (1 << shift) * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour __rcu **buckets; + int i; + + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour __rcu **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; + } + ret->hash_buckets = buckets; + ret->hash_shift = shift; + for (i = 0; i < NEIGH_NUM_HASH_RND; i++) + neigh_get_hash_rnd(&ret->hash_rnd[i]); + return ret; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); + struct neighbour __rcu **buckets = nht->hash_buckets; + + if (size <= PAGE_SIZE) + kfree(buckets); + else + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); +} + +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_shift) +{ + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; + + NEIGH_CACHE_STAT_INC(tbl, hash_grows); + + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_shift); + if (!new_nht) + return old_nht; + + for (i = 0; i < (1 << old_nht->hash_shift); i++) { + struct neighbour *n, *next; + + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); + + hash >>= (32 - new_nht->hash_shift); + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); + } + } + + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + + rcu_read_unlock_bh(); + return n; +} +EXPORT_SYMBOL(neigh_lookup); + +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, + const void *pkey) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift); + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!memcmp(n->primary_key, pkey, key_len) && + net_eq(dev_net(n->dev), net)) { + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + + rcu_read_unlock_bh(); + return n; +} +EXPORT_SYMBOL(neigh_lookup_nodev); + +struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + u32 hash_val; + int key_len = tbl->key_len; + int error; + struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); + struct neigh_hash_table *nht; + + if (!n) { + rc = ERR_PTR(-ENOBUFS); + goto out; + } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + dev_hold(dev); + + /* Protocol specific setup. */ + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + if (dev->netdev_ops->ndo_neigh_construct) { + error = dev->netdev_ops->ndo_neigh_construct(n); + if (error < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + } + + /* Device specific setup. */ + if (n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) + nht = neigh_hash_grow(tbl, nht->hash_shift + 1); + + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); + + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { + if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + neigh_hold(n1); + rc = n1; + goto out_tbl_unlock; + } + } + + n->dead = 0; + neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK2("neigh %p is created.\n", n); + rc = n; +out: + return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); +out_neigh_release: + neigh_release(n); + goto out; +} +EXPORT_SYMBOL(neigh_create); + +static u32 pneigh_hash(const void *pkey, int key_len) +{ + u32 hash_val = *(u32 *)(pkey + key_len - 4); + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + return hash_val; +} + +static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, + struct net *net, + const void *pkey, + int key_len, + struct net_device *dev) +{ + while (n) { + if (!memcmp(n->key, pkey, key_len) && + net_eq(pneigh_net(n), net) && + (n->dev == dev || !n->dev)) + return n; + n = n->next; + } + return NULL; +} + +struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, struct net_device *dev) +{ + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); +} +EXPORT_SYMBOL_GPL(__pneigh_lookup); + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + if (n || !creat) + goto out; + + ASSERT_RTNL(); + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) + goto out; + + write_pnet(&n->net, hold_net(net)); + memcpy(n->key, pkey, key_len); + n->dev = dev; + if (dev) + dev_hold(dev); + + if (tbl->pconstructor && tbl->pconstructor(n)) { + if (dev) + dev_put(dev); + release_net(net); + kfree(n); + n = NULL; + goto out; + } + + write_lock_bh(&tbl->lock); + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); +out: + return n; +} +EXPORT_SYMBOL(pneigh_lookup); + + +int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + int key_len = tbl->key_len; + u32 hash_val = pneigh_hash(pkey, key_len); + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev && + net_eq(pneigh_net(n), net)) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + release_net(pneigh_net(n)); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h = 0; h <= PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n = *np) != NULL) { + if (!dev || n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + release_net(pneigh_net(n)); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + +static void neigh_parms_destroy(struct neigh_parms *parms); + +static inline void neigh_parms_put(struct neigh_parms *parms) +{ + if (atomic_dec_and_test(&parms->refcnt)) + neigh_parms_destroy(parms); +} + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct net_device *dev = neigh->dev; + + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + + if (!neigh->dead) { + printk(KERN_WARNING + "Destroying alive neighbour %p\n", neigh); + dump_stack(); + return; + } + + if (neigh_del_timer(neigh)) + printk(KERN_WARNING "Impossible event.\n"); + + skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + + if (dev->netdev_ops->ndo_neigh_destroy) + dev->netdev_ops->ndo_neigh_destroy(neigh); + + dev_put(dev); + neigh_parms_put(neigh->parms); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + atomic_dec(&neigh->tbl->entries); + kfree_rcu(neigh, rcu); +} +EXPORT_SYMBOL(neigh_destroy); + +/* Neighbour state is suspicious; + disable fast path. + + Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + + neigh->output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. + + Called with write_locked neigh. + */ +static void neigh_connect(struct neighbour *neigh) +{ + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; +} + +static void neigh_periodic_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); + struct neighbour *n; + struct neighbour __rcu **np; + unsigned int i; + struct neigh_hash_table *nht; + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + /* + * periodically recompute ReachableTime from random function + */ + + if (time_after(jiffies, tbl->last_rand + 300 * HZ)) { + struct neigh_parms *p; + tbl->last_rand = jiffies; + for (p = &tbl->parms; p; p = p->next) + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + } + + for (i = 0 ; i < (1 << nht->hash_shift); i++) { + np = &nht->hash_buckets[i]; + + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + unsigned int state; + + write_lock(&n->lock); + + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } + + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(jiffies, n->used + n->parms->gc_staletime))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_cleanup_and_release(n); + continue; + } + write_unlock(&n->lock); + +next_elt: + np = &n->next; + } + /* + * It's fine to release lock here, even if hash table + * grows while we are preempted. + */ + write_unlock_bh(&tbl->lock); + cond_resched(); + write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + } + /* Cycle through all hash buckets every base_reachable_time/2 ticks. + * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 + * base_reachable_time. + */ + schedule_delayed_work(&tbl->gc_work, + tbl->parms.base_reachable_time >> 1); + write_unlock_bh(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return (n->nud_state & NUD_PROBE) ? + p->ucast_probes : + p->ucast_probes + p->app_probes + p->mcast_probes; +} + +static void neigh_invalidate(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + neigh->updated = jiffies; + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + +static void neigh_probe(struct neighbour *neigh) + __releases(neigh->lock) +{ + struct sk_buff *skb = skb_peek(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb = skb_copy(skb, GFP_ATOMIC); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + kfree_skb(skb); +} + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now, next; + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; + + write_lock(&neigh->lock); + + state = neigh->nud_state; + now = jiffies; + next = now + HZ; + + if (!(state & NUD_IN_TIMER)) + goto out; + + if (state & NUD_REACHABLE) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->reachable_time)) { + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else if (time_before_eq(now, + neigh->used + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->updated = jiffies; + neigh_suspect(neigh); + next = now + neigh->parms->delay_probe_time; + } else { + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh->nud_state = NUD_STALE; + neigh->updated = jiffies; + neigh_suspect(neigh); + notify = 1; + } + } else if (state & NUD_DELAY) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh->nud_state = NUD_REACHABLE; + neigh->updated = jiffies; + neigh_connect(neigh); + notify = 1; + next = neigh->confirmed + neigh->parms->reachable_time; + } else { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + neigh->updated = jiffies; + atomic_set(&neigh->probes, 0); + next = now + neigh->parms->retrans_time; + } + } else { + /* NUD_PROBE|NUD_INCOMPLETE */ + next = now + neigh->parms->retrans_time; + } + + if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && + atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { + neigh->nud_state = NUD_FAILED; + notify = 1; + neigh_invalidate(neigh); + } + + if (neigh->nud_state & NUD_IN_TIMER) { + if (time_before(next, jiffies + HZ/2)) + next = jiffies + HZ/2; + if (!mod_timer(&neigh->timer, next)) + neigh_hold(neigh); + } + if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { + neigh_probe(neigh); + } else { +out: + write_unlock(&neigh->lock); + } + + if (notify) + neigh_update_notify(neigh); + + neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + int rc; + bool immediate_probe = false; + + write_lock_bh(&neigh->lock); + + rc = 0; + if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) + goto out_unlock_bh; + + if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + unsigned long next, now = jiffies; + + atomic_set(&neigh->probes, neigh->parms->ucast_probes); + neigh->nud_state = NUD_INCOMPLETE; + neigh->updated = now; + next = now + max(neigh->parms->retrans_time, HZ/2); + neigh_add_timer(neigh, next); + immediate_probe = true; + } else { + neigh->nud_state = NUD_FAILED; + neigh->updated = jiffies; + write_unlock_bh(&neigh->lock); + + kfree_skb(skb); + return 1; + } + } else if (neigh->nud_state & NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh->updated = jiffies; + neigh_add_timer(neigh, + jiffies + neigh->parms->delay_probe_time); + } + + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + while (neigh->arp_queue_len_bytes + skb->truesize > + neigh->parms->queue_len_bytes) { + struct sk_buff *buff; + + buff = __skb_dequeue(&neigh->arp_queue); + if (!buff) + break; + neigh->arp_queue_len_bytes -= buff->truesize; + kfree_skb(buff); + NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); + } + skb_dst_force(skb); + __skb_queue_tail(&neigh->arp_queue, skb); + neigh->arp_queue_len_bytes += skb->truesize; + } + rc = 1; + } +out_unlock_bh: + if (immediate_probe) + neigh_probe(neigh); + else + write_unlock(&neigh->lock); + local_bh_enable(); + return rc; +} +EXPORT_SYMBOL(__neigh_event_send); + +static void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; + + if (update) { + hh = &neigh->hh; + if (hh->hh_len) { + write_seqlock_bh(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_sequnlock_bh(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- flags + NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, + if it is different. + NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" + lladdr instead of overriding it + if it is different. + It also allows to retain current state + if lladdr is unchanged. + NEIGH_UPDATE_F_ADMIN means that the change is administrative. + + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NTF_ROUTER flag. + NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as + a router. + + Caller MUST hold reference count on the entry. + */ + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags) +{ + u8 old; + int err; + int notify = 0; + struct net_device *dev; + int update_isrouter = 0; + + write_lock_bh(&neigh->lock); + + dev = neigh->dev; + old = neigh->nud_state; + err = -EPERM; + + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; + + if (!(new & NUD_VALID)) { + neigh_del_timer(neigh); + if (old & NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + err = 0; + notify = old & NUD_VALID; + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + (new & NUD_FAILED)) { + neigh_invalidate(neigh); + notify = 1; + } + goto out; + } + + /* Compare new lladdr with cached one */ + if (!dev->addr_len) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if ((old & NUD_VALID) && + !memcmp(lladdr, neigh->ha, dev->addr_len)) + lladdr = neigh->ha; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + err = -EINVAL; + if (!(old & NUD_VALID)) + goto out; + lladdr = neigh->ha; + } + + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + err = 0; + update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + if (old & NUD_VALID) { + if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { + update_isrouter = 0; + if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && + (old & NUD_CONNECTED)) { + lladdr = neigh->ha; + new = NUD_STALE; + } else + goto out; + } else { + if (lladdr == neigh->ha && new == NUD_STALE && + ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || + (old & NUD_CONNECTED)) + ) + new = old; + } + } + + if (new != old) { + neigh_del_timer(neigh); + if (new & NUD_IN_TIMER) + neigh_add_timer(neigh, (jiffies + + ((new & NUD_REACHABLE) ? + neigh->parms->reachable_time : + 0))); + neigh->nud_state = new; + } + + if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); + memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); + neigh_update_hhs(neigh); + if (!(new & NUD_CONNECTED)) + neigh->confirmed = jiffies - + (neigh->parms->base_reachable_time << 1); + notify = 1; + } + if (new == old) + goto out; + if (new & NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old & NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) + n1 = n2; + n1->output(n1, skb); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; + } +out: + if (update_isrouter) { + neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? + (neigh->flags | NTF_ROUTER) : + (neigh->flags & ~NTF_ROUTER); + } + write_unlock_bh(&neigh->lock); + + if (notify) + neigh_update_notify(neigh); + + return err; +} +EXPORT_SYMBOL(neigh_update); + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct net_device *dev) +{ + struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, + lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_OVERRIDE); + return neigh; +} +EXPORT_SYMBOL(neigh_event_ns); + +/* called with read_lock_bh(&n->lock); */ +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + __be16 prot = dst->ops->protocol; + struct hh_cache *hh = &n->hh; + + write_lock_bh(&n->lock); + + /* Only one thread can come in here and initialize the + * hh_cache entry. + */ + if (!hh->hh_len) + dev->header_ops->cache(n, hh, prot); + + write_unlock_bh(&n->lock); +} + +/* This function can be used in contexts, where only old dev_queue_xmit + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. + */ + +int neigh_compat_output(struct neighbour *neigh, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + __skb_pull(skb, skb_network_offset(skb)); + + if (dev_hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev->header_ops->rebuild(skb)) + return 0; + + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_compat_output); + +/* Slow and careful. */ + +int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + int rc = 0; + + if (!dst) + goto discard; + + __skb_pull(skb, skb_network_offset(skb)); + + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; + unsigned int seq; + + if (dev->header_ops->cache && !neigh->hh.hh_len) + neigh_hh_init(neigh, dst); + + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + rc = dev_queue_xmit(skb); + else + goto out_kfree_skb; + } +out: + return rc; +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", + dst, neigh); +out_kfree_skb: + rc = -EINVAL; + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(neigh_resolve_output); + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) +{ + struct net_device *dev = neigh->dev; + unsigned int seq; + int err; + + __skb_pull(skb, skb_network_offset(skb)); + + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + + if (err >= 0) + err = dev_queue_xmit(skb); + else { + err = -EINVAL; + kfree_skb(skb); + } + return err; +} +EXPORT_SYMBOL(neigh_connected_output); + +int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) +{ + return dev_queue_xmit(skb); +} +EXPORT_SYMBOL(neigh_direct_output); + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb, *n; + + spin_lock(&tbl->proxy_queue.lock); + + skb_queue_walk_safe(&tbl->proxy_queue, skb, n) { + long tdif = NEIGH_CB(skb)->sched_next - now; + + if (tdif <= 0) { + struct net_device *dev = skb->dev; + + __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { + rcu_read_lock(); + tbl->proxy_redo(skb); + rcu_read_unlock(); + } else { + kfree_skb(skb); + } + + dev_put(dev); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + unsigned long sched_next = now + (net_random() % p->proxy_delay); + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + + NEIGH_CB(skb)->sched_next = sched_next; + NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; + + spin_lock(&tbl->proxy_queue.lock); + if (del_timer(&tbl->proxy_timer)) { + if (time_before(tbl->proxy_timer.expires, sched_next)) + sched_next = tbl->proxy_timer.expires; + } + skb_dst_drop(skb); + dev_hold(skb->dev); + __skb_queue_tail(&tbl->proxy_queue, skb); + mod_timer(&tbl->proxy_timer, sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} +EXPORT_SYMBOL(pneigh_enqueue); + +static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl, + struct net *net, int ifindex) +{ + struct neigh_parms *p; + + for (p = &tbl->parms; p; p = p->next) { + if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) || + (!p->dev && !ifindex)) + return p; + } + + return NULL; +} + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl) +{ + struct neigh_parms *p, *ref; + struct net *net = dev_net(dev); + const struct net_device_ops *ops = dev->netdev_ops; + + ref = lookup_neigh_parms(tbl, net, 0); + if (!ref) + return NULL; + + p = kmemdup(ref, sizeof(*p), GFP_KERNEL); + if (p) { + p->tbl = tbl; + atomic_set(&p->refcnt, 1); + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + + if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + + dev_hold(dev); + p->dev = dev; + write_pnet(&p->net, hold_net(net)); + p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); + p->next = tbl->parms.next; + tbl->parms.next = p; + write_unlock_bh(&tbl->lock); + } + return p; +} +EXPORT_SYMBOL(neigh_parms_alloc); + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ + struct neigh_parms *parms = + container_of(head, struct neigh_parms, rcu_head); + + neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (!parms || parms == &tbl->parms) + return; + write_lock_bh(&tbl->lock); + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + parms->dead = 1; + write_unlock_bh(&tbl->lock); + if (parms->dev) + dev_put(parms->dev); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); + return; + } + } + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK1("neigh_parms_release: not found\n"); +} +EXPORT_SYMBOL(neigh_parms_release); + +static void neigh_parms_destroy(struct neigh_parms *parms) +{ + release_net(neigh_parms_net(parms)); + kfree(parms); +} + +static struct lock_class_key neigh_table_proxy_queue_class; + +void neigh_table_init_no_netlink(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + unsigned long phsize; + + write_pnet(&tbl->parms.net, &init_net); + atomic_set(&tbl->parms.refcnt, 1); + tbl->parms.reachable_time = + neigh_rand_reach_time(tbl->parms.base_reachable_time); + + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) + panic("cannot create neighbour cache statistics"); + +#ifdef CONFIG_PROC_FS + if (!proc_create_data(tbl->id, 0, init_net.proc_net_stat, + &neigh_stat_seq_fops, tbl)) + panic("cannot create neighbour proc dir entry"); +#endif + + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(3)); + + phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); + tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); + + if (!tbl->nht || !tbl->phash_buckets) + panic("cannot allocate neighbour cache hashes"); + + rwlock_init(&tbl->lock); + INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); + schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); + setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl); + skb_queue_head_init_class(&tbl->proxy_queue, + &neigh_table_proxy_queue_class); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time * 20; +} +EXPORT_SYMBOL(neigh_table_init_no_netlink); + +void neigh_table_init(struct neigh_table *tbl) +{ + struct neigh_table *tmp; + + neigh_table_init_no_netlink(tbl); + write_lock(&neigh_tbl_lock); + for (tmp = neigh_tables; tmp; tmp = tmp->next) { + if (tmp->family == tbl->family) + break; + } + tbl->next = neigh_tables; + neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); + + if (unlikely(tmp)) { + printk(KERN_ERR "NEIGH: Registering multiple tables for " + "family %d\n", tbl->family); + dump_stack(); + } +} +EXPORT_SYMBOL(neigh_table_init); + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + /* It is not clean... Fix it to unload IPv6 module safely */ + cancel_delayed_work_sync(&tbl->gc_work); + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + printk(KERN_CRIT "neighbour leakage\n"); + write_lock(&neigh_tbl_lock); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } + write_unlock(&neigh_tbl_lock); + + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); + tbl->nht = NULL; + + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + + remove_proc_entry(tbl->id, init_net.proc_net_stat); + + free_percpu(tbl->stats); + tbl->stats = NULL; + + return 0; +} +EXPORT_SYMBOL(neigh_table_clear); + +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *dst_attr; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -EINVAL; + + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) + goto out; + + dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST); + if (dst_attr == NULL) + goto out; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + } + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct neighbour *neigh; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + if (nla_len(dst_attr) < tbl->key_len) + goto out; + + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); + goto out; + } + + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); + if (neigh == NULL) { + err = -ENOENT; + goto out; + } + + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE | + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + goto out; + } + read_unlock(&neigh_tbl_lock); + err = -EAFNOSUPPORT; + +out: + return err; +} + +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + goto out; + + err = -EINVAL; + if (tb[NDA_DST] == NULL) + goto out; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex) { + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + err = -ENODEV; + goto out; + } + + if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) + goto out; + } + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; + struct neighbour *neigh; + void *dst, *lladdr; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + if (nla_len(tb[NDA_DST]) < tbl->key_len) + goto out; + dst = nla_data(tb[NDA_DST]); + lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; + + if (ndm->ndm_flags & NTF_PROXY) { + struct pneigh_entry *pn; + + err = -ENOBUFS; + pn = pneigh_lookup(tbl, net, dst, dev, 1); + if (pn) { + pn->flags = ndm->ndm_flags; + err = 0; + } + goto out; + } + + if (dev == NULL) + goto out; + + neigh = neigh_lookup(tbl, dst, dev); + if (neigh == NULL) { + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out; + } + + neigh = __neigh_lookup_errno(tbl, dst, dev); + if (IS_ERR(neigh)) { + err = PTR_ERR(neigh); + goto out; + } + } else { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(neigh); + goto out; + } + + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) + flags &= ~NEIGH_UPDATE_F_OVERRIDE; + } + + if (ndm->ndm_flags & NTF_USE) { + neigh_event_send(neigh, NULL); + err = 0; + } else + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + neigh_release(neigh); + goto out; + } + + read_unlock(&neigh_tbl_lock); + err = -EAFNOSUPPORT; +out: + return err; +} + +static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, NDTA_PARMS); + if (nest == NULL) + return -ENOBUFS; + + if (parms->dev) + NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); + + NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); + NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes); + /* approximative value for deprecated QUEUE_LEN (in packets) */ + NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, + DIV_ROUND_UP(parms->queue_len_bytes, + SKB_TRUESIZE(ETH_FRAME_LEN))); + NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); + NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); + NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); + NLA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes); + NLA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time); + NLA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME, + parms->base_reachable_time); + NLA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime); + NLA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time); + NLA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time); + NLA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay); + NLA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay); + NLA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime); + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, + u32 pid, u32 seq, int type, int flags) +{ + struct nlmsghdr *nlh; + struct ndtmsg *ndtmsg; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + NLA_PUT_STRING(skb, NDTA_NAME, tbl->id); + NLA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval); + NLA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1); + NLA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2); + NLA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3); + + { + unsigned long now = jiffies; + unsigned int flush_delta = now - tbl->last_flush; + unsigned int rand_delta = now - tbl->last_rand; + struct neigh_hash_table *nht; + struct ndt_config ndc = { + .ndtc_key_len = tbl->key_len, + .ndtc_entry_size = tbl->entry_size, + .ndtc_entries = atomic_read(&tbl->entries), + .ndtc_last_flush = jiffies_to_msecs(flush_delta), + .ndtc_last_rand = jiffies_to_msecs(rand_delta), + .ndtc_proxy_qlen = tbl->proxy_queue.qlen, + }; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd[0]; + ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); + rcu_read_unlock_bh(); + + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); + } + + { + int cpu; + struct ndt_stats ndst; + + memset(&ndst, 0, sizeof(ndst)); + + for_each_possible_cpu(cpu) { + struct neigh_statistics *st; + + st = per_cpu_ptr(tbl->stats, cpu); + ndst.ndts_allocs += st->allocs; + ndst.ndts_destroys += st->destroys; + ndst.ndts_hash_grows += st->hash_grows; + ndst.ndts_res_failed += st->res_failed; + ndst.ndts_lookups += st->lookups; + ndst.ndts_hits += st->hits; + ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast; + ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; + ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; + ndst.ndts_forced_gc_runs += st->forced_gc_runs; + } + + NLA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst); + } + + BUG_ON(tbl->parms.dev); + if (neightbl_fill_parms(skb, &tbl->parms) < 0) + goto nla_put_failure; + + read_unlock_bh(&tbl->lock); + return nlmsg_end(skb, nlh); + +nla_put_failure: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int neightbl_fill_param_info(struct sk_buff *skb, + struct neigh_table *tbl, + struct neigh_parms *parms, + u32 pid, u32 seq, int type, + unsigned int flags) +{ + struct ndtmsg *ndtmsg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndtmsg), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndtmsg = nlmsg_data(nlh); + + read_lock_bh(&tbl->lock); + ndtmsg->ndtm_family = tbl->family; + ndtmsg->ndtm_pad1 = 0; + ndtmsg->ndtm_pad2 = 0; + + if (nla_put_string(skb, NDTA_NAME, tbl->id) < 0 || + neightbl_fill_parms(skb, parms) < 0) + goto errout; + + read_unlock_bh(&tbl->lock); + return nlmsg_end(skb, nlh); +errout: + read_unlock_bh(&tbl->lock); + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { + [NDTA_NAME] = { .type = NLA_STRING }, + [NDTA_THRESH1] = { .type = NLA_U32 }, + [NDTA_THRESH2] = { .type = NLA_U32 }, + [NDTA_THRESH3] = { .type = NLA_U32 }, + [NDTA_GC_INTERVAL] = { .type = NLA_U64 }, + [NDTA_PARMS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { + [NDTPA_IFINDEX] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, + [NDTPA_APP_PROBES] = { .type = NLA_U32 }, + [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_MCAST_PROBES] = { .type = NLA_U32 }, + [NDTPA_BASE_REACHABLE_TIME] = { .type = NLA_U64 }, + [NDTPA_GC_STALETIME] = { .type = NLA_U64 }, + [NDTPA_DELAY_PROBE_TIME] = { .type = NLA_U64 }, + [NDTPA_RETRANS_TIME] = { .type = NLA_U64 }, + [NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 }, + [NDTPA_PROXY_DELAY] = { .type = NLA_U64 }, + [NDTPA_LOCKTIME] = { .type = NLA_U64 }, +}; + +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct neigh_table *tbl; + struct ndtmsg *ndtmsg; + struct nlattr *tb[NDTA_MAX+1]; + int err; + + err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, + nl_neightbl_policy); + if (err < 0) + goto errout; + + if (tb[NDTA_NAME] == NULL) { + err = -EINVAL; + goto errout; + } + + ndtmsg = nlmsg_data(nlh); + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) + continue; + + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) + break; + } + + if (tbl == NULL) { + err = -ENOENT; + goto errout_locked; + } + + /* + * We acquire tbl->lock to be nice to the periodic timers and + * make sure they always see a consistent set of values. + */ + write_lock_bh(&tbl->lock); + + if (tb[NDTA_PARMS]) { + struct nlattr *tbp[NDTPA_MAX+1]; + struct neigh_parms *p; + int i, ifindex = 0; + + err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], + nl_ntbl_parm_policy); + if (err < 0) + goto errout_tbl_lock; + + if (tbp[NDTPA_IFINDEX]) + ifindex = nla_get_u32(tbp[NDTPA_IFINDEX]); + + p = lookup_neigh_parms(tbl, net, ifindex); + if (p == NULL) { + err = -ENOENT; + goto errout_tbl_lock; + } + + for (i = 1; i <= NDTPA_MAX; i++) { + if (tbp[i] == NULL) + continue; + + switch (i) { + case NDTPA_QUEUE_LEN: + p->queue_len_bytes = nla_get_u32(tbp[i]) * + SKB_TRUESIZE(ETH_FRAME_LEN); + break; + case NDTPA_QUEUE_LENBYTES: + p->queue_len_bytes = nla_get_u32(tbp[i]); + break; + case NDTPA_PROXY_QLEN: + p->proxy_qlen = nla_get_u32(tbp[i]); + break; + case NDTPA_APP_PROBES: + p->app_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_UCAST_PROBES: + p->ucast_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_MCAST_PROBES: + p->mcast_probes = nla_get_u32(tbp[i]); + break; + case NDTPA_BASE_REACHABLE_TIME: + p->base_reachable_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_GC_STALETIME: + p->gc_staletime = nla_get_msecs(tbp[i]); + break; + case NDTPA_DELAY_PROBE_TIME: + p->delay_probe_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_RETRANS_TIME: + p->retrans_time = nla_get_msecs(tbp[i]); + break; + case NDTPA_ANYCAST_DELAY: + p->anycast_delay = nla_get_msecs(tbp[i]); + break; + case NDTPA_PROXY_DELAY: + p->proxy_delay = nla_get_msecs(tbp[i]); + break; + case NDTPA_LOCKTIME: + p->locktime = nla_get_msecs(tbp[i]); + break; + } + } + } + + if (tb[NDTA_THRESH1]) + tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]); + + if (tb[NDTA_THRESH2]) + tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]); + + if (tb[NDTA_THRESH3]) + tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]); + + if (tb[NDTA_GC_INTERVAL]) + tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]); + + err = 0; + +errout_tbl_lock: + write_unlock_bh(&tbl->lock); +errout_locked: + read_unlock(&neigh_tbl_lock); +errout: + return err; +} + +static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int family, tidx, nidx = 0; + int tbl_skip = cb->args[0]; + int neigh_skip = cb->args[1]; + struct neigh_table *tbl; + + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) { + struct neigh_parms *p; + + if (tidx < tbl_skip || (family && tbl->family != family)) + continue; + + if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) + break; + + for (nidx = 0, p = tbl->parms.next; p; p = p->next) { + if (!net_eq(neigh_parms_net(p), net)) + continue; + + if (nidx < neigh_skip) + goto next; + + if (neightbl_fill_param_info(skb, tbl, p, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGHTBL, + NLM_F_MULTI) <= 0) + goto out; + next: + nidx++; + } + + neigh_skip = 0; + } +out: + read_unlock(&neigh_tbl_lock); + cb->args[0] = tidx; + cb->args[1] = nidx; + + return skb->len; +} + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) +{ + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = neigh->ops->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = neigh->flags; + ndm->ndm_type = neigh->type; + ndm->ndm_ifindex = neigh->dev->ifindex; + + NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key); + + read_lock_bh(&neigh->lock); + ndm->ndm_state = neigh->nud_state; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } + } + + ci.ndm_used = jiffies_to_clock_t(now - neigh->used); + ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); + ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); + ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; + read_unlock_bh(&neigh->lock); + + NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes)); + NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, + u32 pid, u32 seq, int type, unsigned int flags, + struct neigh_table *tbl) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = tbl->family; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = pn->flags | NTF_PROXY; + ndm->ndm_type = NDA_DST; + ndm->ndm_ifindex = pn->dev->ifindex; + ndm->ndm_state = NUD_NONE; + + NLA_PUT(skb, NDA_DST, tbl->key_len, pn->key); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void neigh_update_notify(struct neighbour *neigh) +{ + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + __neigh_notify(neigh, RTM_NEWNEIGH, 0); +} + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct neighbour *n; + int rc, h, s_h = cb->args[1]; + int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + for (h = s_h; h < (1 << nht->hash_shift); h++) { + if (h > s_h) + s_idx = 0; + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { + if (!net_eq(dev_net(n->dev), net)) + continue; + if (idx < s_idx) + goto next; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI) <= 0) { + rc = -1; + goto out; + } +next: + idx++; + } + } + rc = skb->len; +out: + rcu_read_unlock_bh(); + cb->args[1] = h; + cb->args[2] = idx; + return rc; +} + +static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct pneigh_entry *n; + struct net *net = sock_net(skb->sk); + int rc, h, s_h = cb->args[3]; + int idx, s_idx = idx = cb->args[4]; + + read_lock_bh(&tbl->lock); + + for (h = s_h; h <= PNEIGH_HASHMASK; h++) { + if (h > s_h) + s_idx = 0; + for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + if (dev_net(n->dev) != net) + continue; + if (idx < s_idx) + goto next; + if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, tbl) <= 0) { + read_unlock_bh(&tbl->lock); + rc = -1; + goto out; + } + next: + idx++; + } + } + + read_unlock_bh(&tbl->lock); + rc = skb->len; +out: + cb->args[3] = h; + cb->args[4] = idx; + return rc; + +} + +static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neigh_table *tbl; + int t, family, s_t; + int proxy = 0; + int err; + + read_lock(&neigh_tbl_lock); + family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + + /* check for full ndmsg structure presence, family member is + * the same for both structures + */ + if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) && + ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY) + proxy = 1; + + s_t = cb->args[0]; + + for (tbl = neigh_tables, t = 0; tbl; + tbl = tbl->next, t++) { + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args) - + sizeof(cb->args[0])); + if (proxy) + err = pneigh_dump_table(tbl, skb, cb); + else + err = neigh_dump_table(tbl, skb, cb); + if (err < 0) + break; + } + read_unlock(&neigh_tbl_lock); + + cb->args[0] = t; + return skb->len; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ + int chain; + struct neigh_hash_table *nht; + + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); /* avoid resizes */ + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct neighbour *n; + + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) + cb(n, cookie); + } + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. */ +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)) +{ + int chain; + struct neigh_hash_table *nht; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain < (1 << nht->hash_shift); chain++) { + struct neighbour *n; + struct neighbour __rcu **np; + + np = &nht->hash_buckets[chain]; + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { + int release; + + write_lock(&n->lock); + release = cb(n); + if (release) { + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); + n->dead = 1; + } else + np = &n->next; + write_unlock(&n->lock); + if (release) + neigh_cleanup_and_release(n); + } + } +} +EXPORT_SYMBOL(__neigh_for_each_release); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + struct neighbour *n = NULL; + int bucket = state->bucket; + + state->flags &= ~NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { + n = rcu_dereference_bh(nht->hash_buckets[bucket]); + + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, &fakep); + if (!v) + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + if (n->nud_state & ~NUD_NOARP) + break; +next: + n = rcu_dereference_bh(n->next); + } + + if (n) + break; + } + state->bucket = bucket; + + return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_hash_table *nht = state->nht; + + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + } + n = rcu_dereference_bh(n->next); + + while (1) { + while (n) { + if (!net_eq(dev_net(n->dev), net)) + goto next; + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + + if (n->nud_state & ~NUD_NOARP) + break; +next: + n = rcu_dereference_bh(n->next); + } + + if (n) + break; + + if (++state->bucket >= (1 << nht->hash_shift)) + break; + + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); + } + + if (n && pos) + --(*pos); + return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct neighbour *n = neigh_get_first(seq); + + if (n) { + --(*pos); + while (*pos) { + n = neigh_get_next(seq, n, pos); + if (!n) + break; + } + } + return *pos ? NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + struct pneigh_entry *pn = NULL; + int bucket = state->bucket; + + state->flags |= NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { + pn = tbl->phash_buckets[bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + state->bucket = bucket; + + return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, + struct pneigh_entry *pn, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct net *net = seq_file_net(seq); + struct neigh_table *tbl = state->tbl; + + do { + pn = pn->next; + } while (pn && !net_eq(pneigh_net(pn), net)); + + while (!pn) { + if (++state->bucket > PNEIGH_HASHMASK) + break; + pn = tbl->phash_buckets[state->bucket]; + while (pn && !net_eq(pneigh_net(pn), net)) + pn = pn->next; + if (pn) + break; + } + + if (pn && pos) + --(*pos); + + return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct pneigh_entry *pn = pneigh_get_first(seq); + + if (pn) { + --(*pos); + while (*pos) { + pn = pneigh_get_next(seq, pn, pos); + if (!pn) + break; + } + } + return *pos ? NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + void *rc; + loff_t idxpos = *pos; + + rc = neigh_get_idx(seq, &idxpos); + if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_idx(seq, &idxpos); + + return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(rcu_bh) +{ + struct neigh_seq_state *state = seq->private; + + state->tbl = tbl; + state->bucket = 0; + state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); + + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_seq_state *state; + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = neigh_get_first(seq); + goto out; + } + + state = seq->private; + if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { + rc = neigh_get_next(seq, v, NULL); + if (rc) + goto out; + if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_first(seq); + } else { + BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); + rc = pneigh_get_next(seq, v, NULL); + } +out: + ++(*pos); + return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(rcu_bh) +{ + rcu_read_unlock_bh(); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct neigh_table *tbl = seq->private; + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_table *tbl = seq->private; + int cpu; + + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ + struct neigh_table *tbl = seq->private; + struct neigh_statistics *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + return 0; + } + + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx %08lx %08lx\n", + atomic_read(&tbl->entries), + + st->allocs, + st->destroys, + st->hash_grows, + + st->lookups, + st->hits, + + st->res_failed, + + st->rcv_probes_mcast, + st->rcv_probes_ucast, + + st->periodic_gc_runs, + st->forced_gc_runs, + st->unres_discards + ); + + return 0; +} + +static const struct seq_operations neigh_stat_seq_ops = { + .start = neigh_stat_seq_start, + .next = neigh_stat_seq_next, + .stop = neigh_stat_seq_stop, + .show = neigh_stat_seq_show, +}; + +static int neigh_stat_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &neigh_stat_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode)->data; + } + return ret; +}; + +static const struct file_operations neigh_stat_seq_fops = { + .owner = THIS_MODULE, + .open = neigh_stat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static inline size_t neigh_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(4); /* NDA_PROBES */ +} + +static void __neigh_notify(struct neighbour *n, int type, int flags) +{ + struct net *net = dev_net(n->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = neigh_fill_info(skb, n, 0, 0, type, flags); + if (err < 0) { + /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); +} +EXPORT_SYMBOL(neigh_app_ns); +#endif /* CONFIG_ARPD */ + +#ifdef CONFIG_SYSCTL + +static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int size, ret; + ctl_table tmp = *ctl; + + tmp.data = &size; + size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (write && !ret) + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); + return ret; +} + +enum { + NEIGH_VAR_MCAST_PROBE, + NEIGH_VAR_UCAST_PROBE, + NEIGH_VAR_APP_PROBE, + NEIGH_VAR_RETRANS_TIME, + NEIGH_VAR_BASE_REACHABLE_TIME, + NEIGH_VAR_DELAY_PROBE_TIME, + NEIGH_VAR_GC_STALETIME, + NEIGH_VAR_QUEUE_LEN, + NEIGH_VAR_QUEUE_LEN_BYTES, + NEIGH_VAR_PROXY_QLEN, + NEIGH_VAR_ANYCAST_DELAY, + NEIGH_VAR_PROXY_DELAY, + NEIGH_VAR_LOCKTIME, + NEIGH_VAR_RETRANS_TIME_MS, + NEIGH_VAR_BASE_REACHABLE_TIME_MS, + NEIGH_VAR_GC_INTERVAL, + NEIGH_VAR_GC_THRESH1, + NEIGH_VAR_GC_THRESH2, + NEIGH_VAR_GC_THRESH3, + NEIGH_VAR_MAX +}; + +static struct neigh_sysctl_table { + struct ctl_table_header *sysctl_header; + struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; + char *dev_name; +} neigh_sysctl_template __read_mostly = { + .neigh_vars = { + [NEIGH_VAR_MCAST_PROBE] = { + .procname = "mcast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_UCAST_PROBE] = { + .procname = "ucast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_APP_PROBE] = { + .procname = "app_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_RETRANS_TIME] = { + .procname = "retrans_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + [NEIGH_VAR_BASE_REACHABLE_TIME] = { + .procname = "base_reachable_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NEIGH_VAR_DELAY_PROBE_TIME] = { + .procname = "delay_first_probe_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NEIGH_VAR_GC_STALETIME] = { + .procname = "gc_stale_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NEIGH_VAR_QUEUE_LEN] = { + .procname = "unres_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_unres_qlen, + }, + [NEIGH_VAR_QUEUE_LEN_BYTES] = { + .procname = "unres_qlen_bytes", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_PROXY_QLEN] = { + .procname = "proxy_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_ANYCAST_DELAY] = { + .procname = "anycast_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + [NEIGH_VAR_PROXY_DELAY] = { + .procname = "proxy_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + [NEIGH_VAR_LOCKTIME] = { + .procname = "locktime", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_userhz_jiffies, + }, + [NEIGH_VAR_RETRANS_TIME_MS] = { + .procname = "retrans_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { + .procname = "base_reachable_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + [NEIGH_VAR_GC_INTERVAL] = { + .procname = "gc_interval", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + [NEIGH_VAR_GC_THRESH1] = { + .procname = "gc_thresh1", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_GC_THRESH2] = { + .procname = "gc_thresh2", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + [NEIGH_VAR_GC_THRESH3] = { + .procname = "gc_thresh3", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {}, + }, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + char *p_name, proc_handler *handler) +{ + struct neigh_sysctl_table *t; + const char *dev_name_source = NULL; + +#define NEIGH_CTL_PATH_ROOT 0 +#define NEIGH_CTL_PATH_PROTO 1 +#define NEIGH_CTL_PATH_NEIGH 2 +#define NEIGH_CTL_PATH_DEV 3 + + struct ctl_path neigh_path[] = { + { .procname = "net", }, + { .procname = "proto", }, + { .procname = "neigh", }, + { .procname = "default", }, + { }, + }; + + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + if (!t) + goto err; + + t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; + t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; + t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; + t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; + t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; + t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; + t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; + t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; + t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; + + if (dev) { + dev_name_source = dev->name; + /* Terminate the table early */ + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, + sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); + } else { + dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; + } + + + if (handler) { + /* RetransTime */ + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; + /* ReachableTime */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; + /* RetransTime (in milliseconds)*/ + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; + } + + t->dev_name = kstrdup(dev_name_source, GFP_KERNEL); + if (!t->dev_name) + goto free; + + neigh_path[NEIGH_CTL_PATH_DEV].procname = t->dev_name; + neigh_path[NEIGH_CTL_PATH_PROTO].procname = p_name; + + t->sysctl_header = + register_net_sysctl_table(neigh_parms_net(p), neigh_path, t->neigh_vars); + if (!t->sysctl_header) + goto free_procname; + + p->sysctl_table = t; + return 0; + +free_procname: + kfree(t->dev_name); +free: + kfree(t); +err: + return -ENOBUFS; +} +EXPORT_SYMBOL(neigh_sysctl_register); + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->dev_name); + kfree(t); + } +} +EXPORT_SYMBOL(neigh_sysctl_unregister); + +#endif /* CONFIG_SYSCTL */ + +static int __init neigh_init(void) +{ + rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, + NULL); + rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, NULL); + + return 0; +} + +subsys_initcall(neigh_init); + diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 00000000..49558623 --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,1506 @@ +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> +#include <net/sock.h> +#include <net/net_namespace.h> +#include <linux/rtnetlink.h> +#include <linux/wireless.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/jiffies.h> +#include <net/wext.h> + +#include "net-sysfs.h" + +#ifdef CONFIG_SYSFS +static const char fmt_hex[] = "%#x\n"; +static const char fmt_long_hex[] = "%#lx\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_udec[] = "%u\n"; +static const char fmt_ulong[] = "%lu\n"; +static const char fmt_u64[] = "%llu\n"; + +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state <= NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct device *dev, + struct device_attribute *attr, char *buf, + ssize_t (*format)(const struct net_device *, char *)) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = (*format)(net, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string) \ +static ssize_t format_##field(const struct net_device *net, char *buf) \ +{ \ + return sprintf(buf, format_string, net->field); \ +} \ +static ssize_t show_##field(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netdev_show(dev, attr, buf, format_##field); \ +} + + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *net = to_net_dev(dev); + char *endp; + unsigned long new; + int ret = -EINVAL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + new = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto err; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(net)) { + if ((ret = (*set)(net, new)) == 0) + ret = len; + } + rtnl_unlock(); + err: + return ret; +} + +NETDEVICE_SHOW(dev_id, fmt_hex); +NETDEVICE_SHOW(addr_assign_type, fmt_dec); +NETDEVICE_SHOW(addr_len, fmt_dec); +NETDEVICE_SHOW(iflink, fmt_dec); +NETDEVICE_SHOW(ifindex, fmt_dec); +NETDEVICE_SHOW(type, fmt_dec); +NETDEVICE_SHOW(link_mode, fmt_dec); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t show_address(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len); + read_unlock(&dev_base_lock); + return ret; +} + +static ssize_t show_broadcast(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *net = to_net_dev(dev); + if (dev_isalive(net)) + return sysfs_format_mac(buf, net->broadcast, net->addr_len); + return -EINVAL; +} + +static ssize_t show_carrier(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + if (netif_running(netdev)) { + return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + return -EINVAL; +} + +static ssize_t show_speed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!__ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_duplex(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + int ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (netif_running(netdev)) { + struct ethtool_cmd cmd; + if (!__ethtool_get_settings(netdev, &cmd)) + ret = sprintf(buf, "%s\n", + cmd.duplex ? "full" : "half"); + } + rtnl_unlock(); + return ret; +} + +static ssize_t show_dormant(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + + return -EINVAL; +} + +static const char *const operstates[] = { + "unknown", + "notpresent", /* currently unused */ + "down", + "lowerlayerdown", + "testing", /* currently unused */ + "dormant", + "up" +}; + +static ssize_t show_operstate(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + unsigned char operstate; + + read_lock(&dev_base_lock); + operstate = netdev->operstate; + if (!netif_running(netdev)) + operstate = IF_OPER_DOWN; + read_unlock(&dev_base_lock); + + if (operstate >= ARRAY_SIZE(operstates)) + return -EINVAL; /* should not happen */ + + return sprintf(buf, "%s\n", operstates[operstate]); +} + +/* read-write attributes */ +NETDEVICE_SHOW(mtu, fmt_dec); + +static int change_mtu(struct net_device *net, unsigned long new_mtu) +{ + return dev_set_mtu(net, (int) new_mtu); +} + +static ssize_t store_mtu(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_mtu); +} + +NETDEVICE_SHOW(flags, fmt_hex); + +static int change_flags(struct net_device *net, unsigned long new_flags) +{ + return dev_change_flags(net, (unsigned) new_flags); +} + +static ssize_t store_flags(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_flags); +} + +NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + +static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +{ + net->tx_queue_len = new_len; + return 0; +} + +static ssize_t store_tx_queue_len(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_tx_queue_len); +} + +static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct net_device *netdev = to_net_dev(dev); + size_t count = len; + ssize_t ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* ignore trailing newline */ + if (len > 0 && buf[len - 1] == '\n') + --count; + + if (!rtnl_trylock()) + return restart_syscall(); + ret = dev_set_alias(netdev, buf, count); + rtnl_unlock(); + + return ret < 0 ? ret : len; +} + +static ssize_t show_ifalias(struct device *dev, + struct device_attribute *attr, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + ssize_t ret = 0; + + if (!rtnl_trylock()) + return restart_syscall(); + if (netdev->ifalias) + ret = sprintf(buf, "%s\n", netdev->ifalias); + rtnl_unlock(); + return ret; +} + +NETDEVICE_SHOW(group, fmt_dec); + +static int change_group(struct net_device *net, unsigned long new_group) +{ + dev_set_group(net, (int) new_group); + return 0; +} + +static ssize_t store_group(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_group); +} + +static struct device_attribute net_class_attributes[] = { + __ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL), + __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), + __ATTR(dev_id, S_IRUGO, show_dev_id, NULL), + __ATTR(ifalias, S_IRUGO | S_IWUSR, show_ifalias, store_ifalias), + __ATTR(iflink, S_IRUGO, show_iflink, NULL), + __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), + __ATTR(type, S_IRUGO, show_type, NULL), + __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), + __ATTR(address, S_IRUGO, show_address, NULL), + __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), + __ATTR(carrier, S_IRUGO, show_carrier, NULL), + __ATTR(speed, S_IRUGO, show_speed, NULL), + __ATTR(duplex, S_IRUGO, show_duplex, NULL), + __ATTR(dormant, S_IRUGO, show_dormant, NULL), + __ATTR(operstate, S_IRUGO, show_operstate, NULL), + __ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu), + __ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags), + __ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, + store_tx_queue_len), + __ATTR(netdev_group, S_IRUGO | S_IWUSR, show_group, store_group), + {} +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct device *d, + struct device_attribute *attr, char *buf, + unsigned long offset) +{ + struct net_device *dev = to_net_dev(d); + ssize_t ret = -EINVAL; + + WARN_ON(offset > sizeof(struct rtnl_link_stats64) || + offset % sizeof(u64) != 0); + + read_lock(&dev_base_lock); + if (dev_isalive(dev)) { + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); + + ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *) stats) + offset)); + } + read_unlock(&dev_base_lock); + return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name) \ +static ssize_t show_##name(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + return netstat_show(d, attr, buf, \ + offsetof(struct rtnl_link_stats64, name)); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +NETSTAT_ENTRY(rx_packets); +NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); + +static struct attribute *netstat_attrs[] = { + &dev_attr_rx_packets.attr, + &dev_attr_tx_packets.attr, + &dev_attr_rx_bytes.attr, + &dev_attr_tx_bytes.attr, + &dev_attr_rx_errors.attr, + &dev_attr_tx_errors.attr, + &dev_attr_rx_dropped.attr, + &dev_attr_tx_dropped.attr, + &dev_attr_multicast.attr, + &dev_attr_collisions.attr, + &dev_attr_rx_length_errors.attr, + &dev_attr_rx_over_errors.attr, + &dev_attr_rx_crc_errors.attr, + &dev_attr_rx_frame_errors.attr, + &dev_attr_rx_fifo_errors.attr, + &dev_attr_rx_missed_errors.attr, + &dev_attr_tx_aborted_errors.attr, + &dev_attr_tx_carrier_errors.attr, + &dev_attr_tx_fifo_errors.attr, + &dev_attr_tx_heartbeat_errors.attr, + &dev_attr_tx_window_errors.attr, + &dev_attr_rx_compressed.attr, + &dev_attr_tx_compressed.attr, + NULL +}; + + +static struct attribute_group netstat_group = { + .name = "statistics", + .attrs = netstat_attrs, +}; + +#ifdef CONFIG_WIRELESS_EXT_SYSFS +/* helper function that does all the locking etc for wireless stats */ +static ssize_t wireless_show(struct device *d, char *buf, + ssize_t (*format)(const struct iw_statistics *, + char *)) +{ + struct net_device *dev = to_net_dev(d); + const struct iw_statistics *iw; + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + if (dev_isalive(dev)) { + iw = get_wireless_stats(dev); + if (iw) + ret = (*format)(iw, buf); + } + rtnl_unlock(); + + return ret; +} + +/* show function template for wireless fields */ +#define WIRELESS_SHOW(name, field, format_string) \ +static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ +{ \ + return sprintf(buf, format_string, iw->field); \ +} \ +static ssize_t show_iw_##name(struct device *d, \ + struct device_attribute *attr, char *buf) \ +{ \ + return wireless_show(d, buf, format_iw_##name); \ +} \ +static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) + +WIRELESS_SHOW(status, status, fmt_hex); +WIRELESS_SHOW(link, qual.qual, fmt_dec); +WIRELESS_SHOW(level, qual.level, fmt_dec); +WIRELESS_SHOW(noise, qual.noise, fmt_dec); +WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); +WIRELESS_SHOW(crypt, discard.code, fmt_dec); +WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); +WIRELESS_SHOW(misc, discard.misc, fmt_dec); +WIRELESS_SHOW(retries, discard.retries, fmt_dec); +WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); + +static struct attribute *wireless_attrs[] = { + &dev_attr_status.attr, + &dev_attr_link.attr, + &dev_attr_level.attr, + &dev_attr_noise.attr, + &dev_attr_nwid.attr, + &dev_attr_crypt.attr, + &dev_attr_fragment.attr, + &dev_attr_retries.attr, + &dev_attr_misc.attr, + &dev_attr_beacon.attr, + NULL +}; + +static struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; +#endif +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_RPS +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_rx_queue_attr(_attr) container_of(_attr, \ + struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +static ssize_t show_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + rcu_read_unlock(); + free_cpumask_var(mask); + return -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static ssize_t store_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_SPINLOCK(rps_map_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + map = kzalloc(max_t(unsigned, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) + map->len = i; + else { + kfree(map); + map = NULL; + } + + spin_lock(&rps_map_lock); + old_map = rcu_dereference_protected(queue->rps_map, + lockdep_is_held(&rps_map_lock)); + rcu_assign_pointer(queue->rps_map, map); + spin_unlock(&rps_map_lock); + + if (map) + static_key_slow_inc(&rps_needed); + if (old_map) { + kfree_rcu(old_map, rcu); + static_key_slow_dec(&rps_needed); + } + free_cpumask_var(mask); + return len; +} + +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned long val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = (unsigned long)flow_table->mask + 1; + rcu_read_unlock(); + + return sprintf(buf, "%lu\n", val); +} + +static void rps_dev_flow_table_release_work(struct work_struct *work) +{ + struct rps_dev_flow_table *table = container_of(work, + struct rps_dev_flow_table, free_work); + + vfree(table); +} + +static void rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + + INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); + schedule_work(&table->free_work); +} + +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + const char *buf, size_t len) +{ + unsigned long mask, count; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + int rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + rc = kstrtoul(buf, 0, &count); + if (rc < 0) + return rc; + + if (count) { + mask = count - 1; + /* mask = roundup_pow_of_two(count) - 1; + * without overflows... + */ + while ((mask | (mask >> 1)) != mask) + mask |= (mask >> 1); + /* On 64 bit arches, must check mask fits in table->mask (u32), + * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1) + * doesnt overflow. + */ +#if BITS_PER_LONG > 32 + if (mask > (unsigned long)(u32)mask) + return -EINVAL; +#else + if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) + / sizeof(struct rps_dev_flow)) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } +#endif + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); + if (!table) + return -ENOMEM; + + table->mask = mask; + for (count = 0; count <= mask; count++) + table->flows[count].cpu = RPS_NO_CPU; + } else + table = NULL; + + spin_lock(&rps_dev_flow_lock); + old_table = rcu_dereference_protected(queue->rps_flow_table, + lockdep_is_held(&rps_dev_flow_lock)); + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute = + __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = + __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); + +static struct attribute *rx_queue_default_attrs[] = { + &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, + NULL +}; + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + + + map = rcu_dereference_protected(queue->rps_map, 1); + if (map) { + RCU_INIT_POINTER(queue->rps_map, NULL); + kfree_rcu(map, rcu); + } + + flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); + if (flow_table) { + RCU_INIT_POINTER(queue->rps_flow_table, NULL); + call_rcu(&flow_table->rcu, rps_dev_flow_table_release); + } + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type rx_queue_ktype = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_attrs = rx_queue_default_attrs, +}; + +static int rx_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_rx_queue *queue = net->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return error; +} +#endif /* CONFIG_RPS */ + +int +net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_RPS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = rx_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) + kobject_put(&net->_rx[i].kobj); + + return error; +#else + return 0; +#endif +} + +#ifdef CONFIG_SYSFS +/* + * netdev_queue sysfs structures and functions. + */ +struct netdev_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_netdev_queue_attr(_attr) container_of(_attr, \ + struct netdev_queue_attribute, attr) + +#define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) + +static ssize_t netdev_queue_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t netdev_queue_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); + struct netdev_queue *queue = to_netdev_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static const struct sysfs_ops netdev_queue_sysfs_ops = { + .show = netdev_queue_attr_show, + .store = netdev_queue_attr_store, +}; + +static ssize_t show_trans_timeout(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + char *buf) +{ + unsigned long trans_timeout; + + spin_lock_irq(&queue->_xmit_lock); + trans_timeout = queue->trans_timeout; + spin_unlock_irq(&queue->_xmit_lock); + + return sprintf(buf, "%lu", trans_timeout); +} + +static struct netdev_queue_attribute queue_trans_timeout = + __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL); + +#ifdef CONFIG_BQL +/* + * Byte queue limits sysfs structures and functions. + */ +static ssize_t bql_show(char *buf, unsigned int value) +{ + return sprintf(buf, "%u\n", value); +} + +static ssize_t bql_set(const char *buf, const size_t count, + unsigned int *pvalue) +{ + unsigned int value; + int err; + + if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) + value = DQL_MAX_LIMIT; + else { + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + if (value > DQL_MAX_LIMIT) + return -EINVAL; + } + + *pvalue = value; + + return count; +} + +static ssize_t bql_show_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); +} + +static ssize_t bql_set_hold_time(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct dql *dql = &queue->dql; + unsigned value; + int err; + + err = kstrtouint(buf, 10, &value); + if (err < 0) + return err; + + dql->slack_hold_time = msecs_to_jiffies(value); + + return len; +} + +static struct netdev_queue_attribute bql_hold_time_attribute = + __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time, + bql_set_hold_time); + +static ssize_t bql_show_inflight(struct netdev_queue *queue, + struct netdev_queue_attribute *attr, + char *buf) +{ + struct dql *dql = &queue->dql; + + return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); +} + +static struct netdev_queue_attribute bql_inflight_attribute = + __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); + +#define BQL_ATTR(NAME, FIELD) \ +static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + char *buf) \ +{ \ + return bql_show(buf, queue->dql.FIELD); \ +} \ + \ +static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ + struct netdev_queue_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return bql_set(buf, len, &queue->dql.FIELD); \ +} \ + \ +static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \ + __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \ + bql_set_ ## NAME); + +BQL_ATTR(limit, limit) +BQL_ATTR(limit_max, max_limit) +BQL_ATTR(limit_min, min_limit) + +static struct attribute *dql_attrs[] = { + &bql_limit_attribute.attr, + &bql_limit_max_attribute.attr, + &bql_limit_min_attribute.attr, + &bql_hold_time_attribute.attr, + &bql_inflight_attribute.attr, + NULL +}; + +static struct attribute_group dql_group = { + .name = "byte_queue_limits", + .attrs = dql_attrs, +}; +#endif /* CONFIG_BQL */ + +#ifdef CONFIG_XPS +static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) + if (queue == &dev->_tx[i]) + break; + + BUG_ON(i >= dev->num_tx_queues); + + return i; +} + + +static ssize_t show_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, char *buf) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + cpumask_var_t mask; + unsigned long index; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + for_each_possible_cpu(i) { + struct xps_map *map = + rcu_dereference(dev_maps->cpu_map[i]); + if (map) { + int j; + for (j = 0; j < map->len; j++) { + if (map->queues[j] == index) { + cpumask_set_cpu(i, mask); + break; + } + } + } + } + } + rcu_read_unlock(); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + free_cpumask_var(mask); + return -EINVAL; + } + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static DEFINE_MUTEX(xps_map_mutex); +#define xmap_dereference(P) \ + rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) + +static void xps_queue_release(struct netdev_queue *queue) +{ + struct net_device *dev = queue->dev; + struct xps_dev_maps *dev_maps; + struct xps_map *map; + unsigned long index; + int i, pos, nonempty = 0; + + index = get_netdev_queue_index(queue); + + mutex_lock(&xps_map_mutex); + dev_maps = xmap_dereference(dev->xps_maps); + + if (dev_maps) { + for_each_possible_cpu(i) { + map = xmap_dereference(dev_maps->cpu_map[i]); + if (!map) + continue; + + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + + if (pos < map->len) { + if (map->len > 1) + map->queues[pos] = + map->queues[--map->len]; + else { + RCU_INIT_POINTER(dev_maps->cpu_map[i], + NULL); + kfree_rcu(map, rcu); + map = NULL; + } + } + if (map) + nonempty = 1; + } + + if (!nonempty) { + RCU_INIT_POINTER(dev->xps_maps, NULL); + kfree_rcu(dev_maps, rcu); + } + } + mutex_unlock(&xps_map_mutex); +} + +static ssize_t store_xps_map(struct netdev_queue *queue, + struct netdev_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct net_device *dev = queue->dev; + cpumask_var_t mask; + int err, i, cpu, pos, map_len, alloc_len, need_set; + unsigned long index; + struct xps_map *map, *new_map; + struct xps_dev_maps *dev_maps, *new_dev_maps; + int nonempty = 0; + int numa_node_id = -2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + index = get_netdev_queue_index(queue); + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + new_dev_maps = kzalloc(max_t(unsigned, + XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES), GFP_KERNEL); + if (!new_dev_maps) { + free_cpumask_var(mask); + return -ENOMEM; + } + + mutex_lock(&xps_map_mutex); + + dev_maps = xmap_dereference(dev->xps_maps); + + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + new_map = map; + if (map) { + for (pos = 0; pos < map->len; pos++) + if (map->queues[pos] == index) + break; + map_len = map->len; + alloc_len = map->alloc_len; + } else + pos = map_len = alloc_len = 0; + + need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu); +#ifdef CONFIG_NUMA + if (need_set) { + if (numa_node_id == -2) + numa_node_id = cpu_to_node(cpu); + else if (numa_node_id != cpu_to_node(cpu)) + numa_node_id = -1; + } +#endif + if (need_set && pos >= map_len) { + /* Need to add queue to this CPU's map */ + if (map_len >= alloc_len) { + alloc_len = alloc_len ? + 2 * alloc_len : XPS_MIN_MAP_ALLOC; + new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), + GFP_KERNEL, + cpu_to_node(cpu)); + if (!new_map) + goto error; + new_map->alloc_len = alloc_len; + for (i = 0; i < map_len; i++) + new_map->queues[i] = map->queues[i]; + new_map->len = map_len; + } + new_map->queues[new_map->len++] = index; + } else if (!need_set && pos < map_len) { + /* Need to remove queue from this CPU's map */ + if (map_len > 1) + new_map->queues[pos] = + new_map->queues[--new_map->len]; + else + new_map = NULL; + } + RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], new_map); + } + + /* Cleanup old maps */ + for_each_possible_cpu(cpu) { + map = dev_maps ? + xmap_dereference(dev_maps->cpu_map[cpu]) : NULL; + if (map && xmap_dereference(new_dev_maps->cpu_map[cpu]) != map) + kfree_rcu(map, rcu); + if (new_dev_maps->cpu_map[cpu]) + nonempty = 1; + } + + if (nonempty) { + rcu_assign_pointer(dev->xps_maps, new_dev_maps); + } else { + kfree(new_dev_maps); + RCU_INIT_POINTER(dev->xps_maps, NULL); + } + + if (dev_maps) + kfree_rcu(dev_maps, rcu); + + netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id : + NUMA_NO_NODE); + + mutex_unlock(&xps_map_mutex); + + free_cpumask_var(mask); + return len; + +error: + mutex_unlock(&xps_map_mutex); + + if (new_dev_maps) + for_each_possible_cpu(i) + kfree(rcu_dereference_protected( + new_dev_maps->cpu_map[i], + 1)); + kfree(new_dev_maps); + free_cpumask_var(mask); + return -ENOMEM; +} + +static struct netdev_queue_attribute xps_cpus_attribute = + __ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map); +#endif /* CONFIG_XPS */ + +static struct attribute *netdev_queue_default_attrs[] = { + &queue_trans_timeout.attr, +#ifdef CONFIG_XPS + &xps_cpus_attribute.attr, +#endif + NULL +}; + +static void netdev_queue_release(struct kobject *kobj) +{ + struct netdev_queue *queue = to_netdev_queue(kobj); + +#ifdef CONFIG_XPS + xps_queue_release(queue); +#endif + + memset(kobj, 0, sizeof(*kobj)); + dev_put(queue->dev); +} + +static struct kobj_type netdev_queue_ktype = { + .sysfs_ops = &netdev_queue_sysfs_ops, + .release = netdev_queue_release, + .default_attrs = netdev_queue_default_attrs, +}; + +static int netdev_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_queue *queue = net->_tx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, + "tx-%u", index); + if (error) + goto exit; + +#ifdef CONFIG_BQL + error = sysfs_create_group(kobj, &dql_group); + if (error) + goto exit; +#endif + + kobject_uevent(kobj, KOBJ_ADD); + dev_hold(queue->dev); + + return 0; +exit: + kobject_put(kobj); + return error; +} +#endif /* CONFIG_SYSFS */ + +int +netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num) +{ +#ifdef CONFIG_SYSFS + int i; + int error = 0; + + for (i = old_num; i < new_num; i++) { + error = netdev_queue_add_kobject(net, i); + if (error) { + new_num = old_num; + break; + } + } + + while (--i >= new_num) { + struct netdev_queue *queue = net->_tx + i; + +#ifdef CONFIG_BQL + sysfs_remove_group(&queue->kobj, &dql_group); +#endif + kobject_put(&queue->kobj); + } + + return error; +#else + return 0; +#endif /* CONFIG_SYSFS */ +} + +static int register_queue_kobjects(struct net_device *net) +{ + int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; + +#ifdef CONFIG_SYSFS + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; +#endif + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + error = net_rx_queue_update_kobjects(net, 0, real_rx); + if (error) + goto error; + rxq = real_rx; + + error = netdev_queue_update_kobjects(net, 0, real_tx); + if (error) + goto error; + txq = real_tx; + + return 0; + +error: + netdev_queue_update_kobjects(net, txq, 0); + net_rx_queue_update_kobjects(net, rxq, 0); + return error; +} + +static void remove_queue_kobjects(struct net_device *net) +{ + int real_rx = 0, real_tx = 0; + +#ifdef CONFIG_RPS + real_rx = net->real_num_rx_queues; +#endif + real_tx = net->real_num_tx_queues; + + net_rx_queue_update_kobjects(net, real_rx, 0); + netdev_queue_update_kobjects(net, real_tx, 0); +#ifdef CONFIG_SYSFS + kset_unregister(net->queues_kset); +#endif +} + +static void *net_grab_current_ns(void) +{ + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + atomic_inc(&ns->passive); +#endif + return ns; +} + +static const void *net_initial_ns(void) +{ + return &init_net; +} + +static const void *net_netlink_ns(struct sock *sk) +{ + return sock_net(sk); +} + +struct kobj_ns_type_operations net_ns_type_operations = { + .type = KOBJ_NS_TYPE_NET, + .grab_current_ns = net_grab_current_ns, + .netlink_ns = net_netlink_ns, + .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, +}; +EXPORT_SYMBOL_GPL(net_ns_type_operations); + +#ifdef CONFIG_HOTPLUG +static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) +{ + struct net_device *dev = to_net_dev(d); + int retval; + + /* pass interface to uevent. */ + retval = add_uevent_var(env, "INTERFACE=%s", dev->name); + if (retval) + goto exit; + + /* pass ifindex to uevent. + * ifindex is useful as it won't change (interface name may change) + * and is what RtNetlink uses natively. */ + retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); + +exit: + return retval; +} +#endif + +/* + * netdev_release -- destroy and free a dead device. + * Called when last reference to device kobject is gone. + */ +static void netdev_release(struct device *d) +{ + struct net_device *dev = to_net_dev(d); + + BUG_ON(dev->reg_state != NETREG_RELEASED); + + kfree(dev->ifalias); + kfree((char *)dev - dev->padded); +} + +static const void *net_namespace(struct device *d) +{ + struct net_device *dev; + dev = container_of(d, struct net_device, dev); + return dev_net(dev); +} + +static struct class net_class = { + .name = "net", + .dev_release = netdev_release, +#ifdef CONFIG_SYSFS + .dev_attrs = net_class_attributes, +#endif /* CONFIG_SYSFS */ +#ifdef CONFIG_HOTPLUG + .dev_uevent = netdev_uevent, +#endif + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, +}; + +/* Delete sysfs entries but hold kobject reference until after all + * netdev references are gone. + */ +void netdev_unregister_kobject(struct net_device * net) +{ + struct device *dev = &(net->dev); + + kobject_get(&dev->kobj); + + remove_queue_kobjects(net); + + device_del(dev); +} + +/* Create sysfs entries for network device. */ +int netdev_register_kobject(struct net_device *net) +{ + struct device *dev = &(net->dev); + const struct attribute_group **groups = net->sysfs_groups; + int error = 0; + + device_initialize(dev); + dev->class = &net_class; + dev->platform_data = net; + dev->groups = groups; + + dev_set_name(dev, "%s", net->name); + +#ifdef CONFIG_SYSFS + /* Allow for a device specific group */ + if (*groups) + groups++; + + *groups++ = &netstat_group; +#ifdef CONFIG_WIRELESS_EXT_SYSFS + if (net->ieee80211_ptr) + *groups++ = &wireless_group; +#ifdef CONFIG_WIRELESS_EXT + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif +#endif +#endif /* CONFIG_SYSFS */ + + error = device_add(dev); + if (error) + return error; + + error = register_queue_kobjects(net); + if (error) { + device_del(dev); + return error; + } + + return error; +} + +int netdev_class_create_file(struct class_attribute *class_attr) +{ + return class_create_file(&net_class, class_attr); +} +EXPORT_SYMBOL(netdev_class_create_file); + +void netdev_class_remove_file(struct class_attribute *class_attr) +{ + class_remove_file(&net_class, class_attr); +} +EXPORT_SYMBOL(netdev_class_remove_file); + +int netdev_kobject_init(void) +{ + kobj_ns_type_register(&net_ns_type_operations); + return class_register(&net_class); +} diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h new file mode 100644 index 00000000..bd7751ec --- /dev/null +++ b/net/core/net-sysfs.h @@ -0,0 +1,11 @@ +#ifndef __NET_SYSFS_H__ +#define __NET_SYSFS_H__ + +int netdev_kobject_init(void); +int netdev_register_kobject(struct net_device *); +void netdev_unregister_kobject(struct net_device *); +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +int netdev_queue_update_kobjects(struct net_device *net, + int old_num, int new_num); + +#endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c new file mode 100644 index 00000000..ba3c0120 --- /dev/null +++ b/net/core/net-traces.c @@ -0,0 +1,37 @@ +/* + * consolidates trace point definitions + * + * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/export.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/netlink.h> +#include <linux/net_dropmon.h> +#include <linux/slab.h> + +#include <asm/unaligned.h> +#include <asm/bitops.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/skb.h> +#include <trace/events/net.h> +#include <trace/events/napi.h> +#include <trace/events/sock.h> +#include <trace/events/udp.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c new file mode 100644 index 00000000..31a5ae51 --- /dev/null +++ b/net/core/net_namespace.c @@ -0,0 +1,640 @@ +#include <linux/workqueue.h> +#include <linux/rtnetlink.h> +#include <linux/cache.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/idr.h> +#include <linux/rculist.h> +#include <linux/nsproxy.h> +#include <linux/proc_fs.h> +#include <linux/file.h> +#include <linux/export.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +/* + * Our network namespace constructor/destructor lists + */ + +static LIST_HEAD(pernet_list); +static struct list_head *first_device = &pernet_list; +static DEFINE_MUTEX(net_mutex); + +LIST_HEAD(net_namespace_list); +EXPORT_SYMBOL_GPL(net_namespace_list); + +struct net init_net; +EXPORT_SYMBOL(init_net); + +#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ + +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + +static int net_assign_generic(struct net *net, int id, void *data) +{ + struct net_generic *ng, *old_ng; + + BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(id == 0); + + old_ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&net_mutex)); + ng = old_ng; + if (old_ng->len >= id) + goto assign; + + ng = net_alloc_generic(); + if (ng == NULL) + return -ENOMEM; + + /* + * Some synchronisation notes: + * + * The net_generic explores the net->gen array inside rcu + * read section. Besides once set the net->gen->ptr[x] + * pointer never changes (see rules in netns/generic.h). + * + * That said, we simply duplicate this array and schedule + * the old copy for kfree after a grace period. + */ + + memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); + + rcu_assign_pointer(net->gen, ng); + kfree_rcu(old_ng, rcu); +assign: + ng->ptr[id - 1] = data; + return 0; +} + +static int ops_init(const struct pernet_operations *ops, struct net *net) +{ + int err = -ENOMEM; + void *data = NULL; + + if (ops->id && ops->size) { + data = kzalloc(ops->size, GFP_KERNEL); + if (!data) + goto out; + + err = net_assign_generic(net, *ops->id, data); + if (err) + goto cleanup; + } + err = 0; + if (ops->init) + err = ops->init(net); + if (!err) + return 0; + +cleanup: + kfree(data); + +out: + return err; +} + +static void ops_free(const struct pernet_operations *ops, struct net *net) +{ + if (ops->id && ops->size) { + int id = *ops->id; + kfree(net_generic(net, id)); + } +} + +static void ops_exit_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->exit) { + list_for_each_entry(net, net_exit_list, exit_list) + ops->exit(net); + } + if (ops->exit_batch) + ops->exit_batch(net_exit_list); +} + +static void ops_free_list(const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + struct net *net; + if (ops->size && ops->id) { + list_for_each_entry(net, net_exit_list, exit_list) + ops_free(ops, net); + } +} + +/* + * setup_net runs the initializers for the network namespace object. + */ +static __net_init int setup_net(struct net *net) +{ + /* Must be called with net_mutex held */ + const struct pernet_operations *ops, *saved_ops; + int error = 0; + LIST_HEAD(net_exit_list); + + atomic_set(&net->count, 1); + atomic_set(&net->passive, 1); + net->dev_base_seq = 1; + +#ifdef NETNS_REFCNT_DEBUG + atomic_set(&net->use_count, 0); +#endif + + list_for_each_entry(ops, &pernet_list, list) { + error = ops_init(ops, net); + if (error < 0) + goto out_undo; + } +out: + return error; + +out_undo: + /* Walk through the list backwards calling the exit functions + * for the pernet modules whose init functions did not fail. + */ + list_add(&net->exit_list, &net_exit_list); + saved_ops = ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + rcu_barrier(); + goto out; +} + + +#ifdef CONFIG_NET_NS +static struct kmem_cache *net_cachep; +static struct workqueue_struct *netns_wq; + +static struct net *net_alloc(void) +{ + struct net *net = NULL; + struct net_generic *ng; + + ng = net_alloc_generic(); + if (!ng) + goto out; + + net = kmem_cache_zalloc(net_cachep, GFP_KERNEL); + if (!net) + goto out_free; + + rcu_assign_pointer(net->gen, ng); +out: + return net; + +out_free: + kfree(ng); + goto out; +} + +static void net_free(struct net *net) +{ +#ifdef NETNS_REFCNT_DEBUG + if (unlikely(atomic_read(&net->use_count) != 0)) { + printk(KERN_EMERG "network namespace not free! Usage: %d\n", + atomic_read(&net->use_count)); + return; + } +#endif + kfree(net->gen); + kmem_cache_free(net_cachep, net); +} + +void net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && atomic_dec_and_test(&ns->passive)) + net_free(ns); +} + +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + struct net *net; + int rv; + + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + + net = net_alloc(); + if (!net) + return ERR_PTR(-ENOMEM); + mutex_lock(&net_mutex); + rv = setup_net(net); + if (rv == 0) { + rtnl_lock(); + list_add_tail_rcu(&net->list, &net_namespace_list); + rtnl_unlock(); + } + mutex_unlock(&net_mutex); + if (rv < 0) { + net_drop_ns(net); + return ERR_PTR(rv); + } + return net; +} + +static DEFINE_SPINLOCK(cleanup_list_lock); +static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ + +static void cleanup_net(struct work_struct *work) +{ + const struct pernet_operations *ops; + struct net *net, *tmp; + LIST_HEAD(net_kill_list); + LIST_HEAD(net_exit_list); + + /* Atomically snapshot the list of namespaces to cleanup */ + spin_lock_irq(&cleanup_list_lock); + list_replace_init(&cleanup_list, &net_kill_list); + spin_unlock_irq(&cleanup_list_lock); + + mutex_lock(&net_mutex); + + /* Don't let anyone else find us. */ + rtnl_lock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_del_rcu(&net->list); + list_add_tail(&net->exit_list, &net_exit_list); + } + rtnl_unlock(); + + /* + * Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so + * the rcu_barrier() below isn't sufficient alone. + */ + synchronize_rcu(); + + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_exit_list(ops, &net_exit_list); + + /* Free the net generic variables */ + list_for_each_entry_reverse(ops, &pernet_list, list) + ops_free_list(ops, &net_exit_list); + + mutex_unlock(&net_mutex); + + /* Ensure there are no outstanding rcu callbacks using this + * network namespace. + */ + rcu_barrier(); + + /* Finally it is safe to free my network namespace structure */ + list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + list_del_init(&net->exit_list); + net_drop_ns(net); + } +} +static DECLARE_WORK(net_cleanup_work, cleanup_net); + +void __put_net(struct net *net) +{ + /* Cleanup the network namespace in process context */ + unsigned long flags; + + spin_lock_irqsave(&cleanup_list_lock, flags); + list_add(&net->cleanup_list, &cleanup_list); + spin_unlock_irqrestore(&cleanup_list_lock, flags); + + queue_work(netns_wq, &net_cleanup_work); +} +EXPORT_SYMBOL_GPL(__put_net); + +struct net *get_net_ns_by_fd(int fd) +{ + struct proc_inode *ei; + struct file *file; + struct net *net; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return ERR_CAST(file); + + ei = PROC_I(file->f_dentry->d_inode); + if (ei->ns_ops == &netns_operations) + net = get_net(ei->ns); + else + net = ERR_PTR(-EINVAL); + + fput(file); + return net; +} + +#else +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + if (flags & CLONE_NEWNET) + return ERR_PTR(-EINVAL); + return old_net; +} + +struct net *get_net_ns_by_fd(int fd) +{ + return ERR_PTR(-EINVAL); +} +#endif + +struct net *get_net_ns_by_pid(pid_t pid) +{ + struct task_struct *tsk; + struct net *net; + + /* Lookup the network namespace */ + net = ERR_PTR(-ESRCH); + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) { + struct nsproxy *nsproxy; + nsproxy = task_nsproxy(tsk); + if (nsproxy) + net = get_net(nsproxy->net_ns); + } + rcu_read_unlock(); + return net; +} +EXPORT_SYMBOL_GPL(get_net_ns_by_pid); + +static int __init net_ns_init(void) +{ + struct net_generic *ng; + +#ifdef CONFIG_NET_NS + net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), + SMP_CACHE_BYTES, + SLAB_PANIC, NULL); + + /* Create workqueue for cleanup */ + netns_wq = create_singlethread_workqueue("netns"); + if (!netns_wq) + panic("Could not create netns workq"); +#endif + + ng = net_alloc_generic(); + if (!ng) + panic("Could not allocate generic netns"); + + rcu_assign_pointer(init_net.gen, ng); + + mutex_lock(&net_mutex); + if (setup_net(&init_net)) + panic("Could not setup the initial network namespace"); + + rtnl_lock(); + list_add_tail_rcu(&init_net.list, &net_namespace_list); + rtnl_unlock(); + + mutex_unlock(&net_mutex); + + return 0; +} + +pure_initcall(net_ns_init); + +#ifdef CONFIG_NET_NS +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + struct net *net; + int error; + LIST_HEAD(net_exit_list); + + list_add_tail(&ops->list, list); + if (ops->init || (ops->id && ops->size)) { + for_each_net(net) { + error = ops_init(ops, net); + if (error) + goto out_undo; + list_add_tail(&net->exit_list, &net_exit_list); + } + } + return 0; + +out_undo: + /* If I have an error cleanup all namespaces I initialized */ + list_del(&ops->list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); + return error; +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + struct net *net; + LIST_HEAD(net_exit_list); + + list_del(&ops->list); + for_each_net(net) + list_add_tail(&net->exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); +} + +#else + +static int __register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + return ops_init(ops, &init_net); +} + +static void __unregister_pernet_operations(struct pernet_operations *ops) +{ + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); +} + +#endif /* CONFIG_NET_NS */ + +static DEFINE_IDA(net_generic_ids); + +static int register_pernet_operations(struct list_head *list, + struct pernet_operations *ops) +{ + int error; + + if (ops->id) { +again: + error = ida_get_new_above(&net_generic_ids, 1, ops->id); + if (error < 0) { + if (error == -EAGAIN) { + ida_pre_get(&net_generic_ids, GFP_KERNEL); + goto again; + } + return error; + } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + } + error = __register_pernet_operations(list, ops); + if (error) { + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); + } + + return error; +} + +static void unregister_pernet_operations(struct pernet_operations *ops) +{ + + __unregister_pernet_operations(ops); + rcu_barrier(); + if (ops->id) + ida_remove(&net_generic_ids, *ops->id); +} + +/** + * register_pernet_subsys - register a network namespace subsystem + * @ops: pernet operations structure for the subsystem + * + * Register a subsystem which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_subsys(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(first_device, ops); + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_subsys); + +/** + * unregister_pernet_subsys - unregister a network namespace subsystem + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_subsys(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_subsys); + +/** + * register_pernet_device - register a network namespace device + * @ops: pernet operations structure for the subsystem + * + * Register a device which has init and exit functions + * that are called when network namespaces are created and + * destroyed respectively. + * + * When registered all network namespace init functions are + * called for every existing network namespace. Allowing kernel + * modules to have a race free view of the set of network namespaces. + * + * When a new network namespace is created all of the init + * methods are called in the order in which they were registered. + * + * When a network namespace is destroyed all of the exit methods + * are called in the reverse of the order with which they were + * registered. + */ +int register_pernet_device(struct pernet_operations *ops) +{ + int error; + mutex_lock(&net_mutex); + error = register_pernet_operations(&pernet_list, ops); + if (!error && (first_device == &pernet_list)) + first_device = &ops->list; + mutex_unlock(&net_mutex); + return error; +} +EXPORT_SYMBOL_GPL(register_pernet_device); + +/** + * unregister_pernet_device - unregister a network namespace netdevice + * @ops: pernet operations structure to manipulate + * + * Remove the pernet operations structure from the list to be + * used when network namespaces are created or destroyed. In + * addition run the exit method for all existing network + * namespaces. + */ +void unregister_pernet_device(struct pernet_operations *ops) +{ + mutex_lock(&net_mutex); + if (&ops->list == first_device) + first_device = first_device->next; + unregister_pernet_operations(ops); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL_GPL(unregister_pernet_device); + +#ifdef CONFIG_NET_NS +static void *netns_get(struct task_struct *task) +{ + struct net *net = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) + net = get_net(nsproxy->net_ns); + rcu_read_unlock(); + + return net; +} + +static void netns_put(void *ns) +{ + put_net(ns); +} + +static int netns_install(struct nsproxy *nsproxy, void *ns) +{ + put_net(nsproxy->net_ns); + nsproxy->net_ns = get_net(ns); + return 0; +} + +const struct proc_ns_operations netns_operations = { + .name = "net", + .type = CLONE_NEWNET, + .get = netns_get, + .put = netns_put, + .install = netns_install, +}; +#endif diff --git a/net/core/netevent.c b/net/core/netevent.c new file mode 100644 index 00000000..f17ccd29 --- /dev/null +++ b/net/core/netevent.c @@ -0,0 +1,70 @@ +/* + * Network event notifiers + * + * Authors: + * Tom Tucker <tom@opengridcomputing.com> + * Steve Wise <swise@opengridcomputing.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + */ + +#include <linux/rtnetlink.h> +#include <linux/notifier.h> +#include <linux/export.h> +#include <net/netevent.h> + +static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); + +/** + * register_netevent_notifier - register a netevent notifier block + * @nb: notifier + * + * Register a notifier to be called when a netevent occurs. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + */ +int register_netevent_notifier(struct notifier_block *nb) +{ + int err; + + err = atomic_notifier_chain_register(&netevent_notif_chain, nb); + return err; +} +EXPORT_SYMBOL_GPL(register_netevent_notifier); + +/** + * netevent_unregister_notifier - unregister a netevent notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_neigh_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netevent_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_netevent_notifier); + +/** + * call_netevent_notifiers - call all netevent notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all neighbour notifier blocks. Parameters and return value + * are as for notifier_call_chain(). + */ + +int call_netevent_notifiers(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&netevent_notif_chain, val, v); +} +EXPORT_SYMBOL_GPL(call_netevent_notifiers); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 00000000..f9f40b93 --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,948 @@ +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003 Matt Mackall <mpm@selenic.com> + * + * based on the netconsole code from: + * + * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Red Hat, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/moduleparam.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/string.h> +#include <linux/if_arp.h> +#include <linux/inetdevice.h> +#include <linux/inet.h> +#include <linux/interrupt.h> +#include <linux/netpoll.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <asm/unaligned.h> +#include <trace/events/napi.h> + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. + */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 + +static struct sk_buff_head skb_pool; + +static atomic_t trapped; + +#define USEC_PER_POLL 50 +#define NETPOLL_RX_ENABLED 1 +#define NETPOLL_RX_DROP 2 + +#define MAX_SKB_SIZE \ + (sizeof(struct ethhdr) + \ + sizeof(struct iphdr) + \ + sizeof(struct udphdr) + \ + MAX_UDP_CHUNK) + +static void zap_completion_queue(void); +static void arp_reply(struct sk_buff *skb); + +static unsigned int carrier_timeout = 4; +module_param(carrier_timeout, uint, 0644); + +#define np_info(np, fmt, ...) \ + pr_info("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_err(np, fmt, ...) \ + pr_err("%s: " fmt, np->name, ##__VA_ARGS__) +#define np_notice(np, fmt, ...) \ + pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) + +static void queue_process(struct work_struct *work) +{ + struct netpoll_info *npinfo = + container_of(work, struct netpoll_info, tx_work.work); + struct sk_buff *skb; + unsigned long flags; + + while ((skb = skb_dequeue(&npinfo->txq))) { + struct net_device *dev = skb->dev; + const struct net_device_ops *ops = dev->netdev_ops; + struct netdev_queue *txq; + + if (!netif_device_present(dev) || !netif_running(dev)) { + __kfree_skb(skb); + continue; + } + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + local_irq_save(flags); + __netif_tx_lock(txq, smp_processor_id()); + if (netif_xmit_frozen_or_stopped(txq) || + ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) { + skb_queue_head(&npinfo->txq, skb); + __netif_tx_unlock(txq); + local_irq_restore(flags); + + schedule_delayed_work(&npinfo->tx_work, HZ/10); + return; + } + __netif_tx_unlock(txq); + local_irq_restore(flags); + } +} + +static __sum16 checksum_udp(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, __be32 saddr, __be32 daddr) +{ + __wsum psum; + + if (uh->check == 0 || skb_csum_unnecessary(skb)) + return 0; + + psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + if (skb->ip_summed == CHECKSUM_COMPLETE && + !csum_fold(csum_add(psum, skb->csum))) + return 0; + + skb->csum = psum; + + return __skb_checksum_complete(skb); +} + +/* + * Check whether delayed processing was scheduled for our NIC. If so, + * we attempt to grab the poll lock and use ->poll() to pump the card. + * If this fails, either we've recursed in ->poll() or it's already + * running on another CPU. + * + * Note: we don't mask interrupts with this lock because we're using + * trylock here and interrupts are already disabled in the softirq + * case. Further, we test the poll_owner to avoid recursion on UP + * systems where the lock doesn't exist. + * + * In cases where there is bi-directional communications, reading only + * one message at a time can lead to packets being dropped by the + * network adapter, forcing superfluous retries and possibly timeouts. + * Thus, we set our budget to greater than 1. + */ +static int poll_one_napi(struct netpoll_info *npinfo, + struct napi_struct *napi, int budget) +{ + int work; + + /* net_rx_action's ->poll() invocations and our's are + * synchronized by this test which is only made while + * holding the napi->poll_lock. + */ + if (!test_bit(NAPI_STATE_SCHED, &napi->state)) + return budget; + + npinfo->rx_flags |= NETPOLL_RX_DROP; + atomic_inc(&trapped); + set_bit(NAPI_STATE_NPSVC, &napi->state); + + work = napi->poll(napi, budget); + trace_napi_poll(napi); + + clear_bit(NAPI_STATE_NPSVC, &napi->state); + atomic_dec(&trapped); + npinfo->rx_flags &= ~NETPOLL_RX_DROP; + + return budget - work; +} + +static void poll_napi(struct net_device *dev) +{ + struct napi_struct *napi; + int budget = 16; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner != smp_processor_id() && + spin_trylock(&napi->poll_lock)) { + budget = poll_one_napi(dev->npinfo, napi, budget); + spin_unlock(&napi->poll_lock); + + if (!budget) + break; + } + } +} + +static void service_arp_queue(struct netpoll_info *npi) +{ + if (npi) { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&npi->arp_tx))) + arp_reply(skb); + } +} + +static void netpoll_poll_dev(struct net_device *dev) +{ + const struct net_device_ops *ops; + + if (!dev || !netif_running(dev)) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_poll_controller) + return; + + /* Process pending work on NIC */ + ops->ndo_poll_controller(dev); + + poll_napi(dev); + + if (dev->flags & IFF_SLAVE) { + if (dev->npinfo) { + struct net_device *bond_dev = dev->master; + struct sk_buff *skb; + while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + skb->dev = bond_dev; + skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + } + } + } + + service_arp_queue(dev->npinfo); + + zap_completion_queue(); +} + +static void refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&skb_pool.lock, flags); + while (skb_pool.qlen < MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + + __skb_queue_tail(&skb_pool, skb); + } + spin_unlock_irqrestore(&skb_pool.lock, flags); +} + +static void zap_completion_queue(void) +{ + unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if (skb->destructor) { + atomic_inc(&skb->users); + dev_kfree_skb_any(skb); /* put this one back */ + } else { + __kfree_skb(skb); + } + } + } + + put_cpu_var(softnet_data); +} + +static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) +{ + int count = 0; + struct sk_buff *skb; + + zap_completion_queue(); + refill_skbs(); +repeat: + + skb = alloc_skb(len, GFP_ATOMIC); + if (!skb) + skb = skb_dequeue(&skb_pool); + + if (!skb) { + if (++count < 10) { + netpoll_poll_dev(np->dev); + goto repeat; + } + return NULL; + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, reserve); + return skb; +} + +static int netpoll_owner_active(struct net_device *dev) +{ + struct napi_struct *napi; + + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (napi->poll_owner == smp_processor_id()) + return 1; + } + return 0; +} + +void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, + struct net_device *dev) +{ + int status = NETDEV_TX_BUSY; + unsigned long tries; + const struct net_device_ops *ops = dev->netdev_ops; + /* It is up to the caller to keep npinfo alive. */ + struct netpoll_info *npinfo = np->dev->npinfo; + + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { + __kfree_skb(skb); + return; + } + + /* don't get messages out of order, and no recursion */ + if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { + struct netdev_queue *txq; + unsigned long flags; + + txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); + + local_irq_save(flags); + /* try until next clock tick */ + for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; + tries > 0; --tries) { + if (__netif_tx_trylock(txq)) { + if (!netif_xmit_stopped(txq)) { + status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + } + __netif_tx_unlock(txq); + + if (status == NETDEV_TX_OK) + break; + + } + + /* tickle device maybe there is some cleanup */ + netpoll_poll_dev(np->dev); + + udelay(USEC_PER_POLL); + } + + WARN_ONCE(!irqs_disabled(), + "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + dev->name, ops->ndo_start_xmit); + + local_irq_restore(flags); + } + + if (status != NETDEV_TX_OK) { + skb_queue_tail(&npinfo->txq, skb); + schedule_delayed_work(&npinfo->tx_work,0); + } +} +EXPORT_SYMBOL(netpoll_send_skb_on_dev); + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = len + sizeof(*udph); + ip_len = udp_len + sizeof(*iph); + total_len = ip_len + LL_RESERVED_SPACE(np->dev); + + skb = find_skb(np, total_len + np->dev->needed_tailroom, + total_len - len); + if (!skb) + return; + + skb_copy_to_linear_data(skb, msg, len); + skb_put(skb, len); + + skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + udph->check = 0; + udph->check = csum_tcpudp_magic(np->local_ip, + np->remote_ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip, &(iph->saddr)); + put_unaligned(np->remote_ip, &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb->protocol = eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, np->dev->dev_addr, ETH_ALEN); + memcpy(eth->h_dest, np->remote_mac, ETH_ALEN); + + skb->dev = np->dev; + + netpoll_send_skb(np, skb); +} +EXPORT_SYMBOL(netpoll_send_udp); + +static void arp_reply(struct sk_buff *skb) +{ + struct netpoll_info *npinfo = skb->dev->npinfo; + struct arphdr *arp; + unsigned char *arp_ptr; + int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; + __be32 sip, tip; + unsigned char *sha; + struct sk_buff *send_skb; + struct netpoll *np, *tmp; + unsigned long flags; + int hlen, tlen; + int hits = 0; + + if (list_empty(&npinfo->rx_np)) + return; + + /* Before checking the packet, we do some early + inspection whether this is interesting at all */ + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->dev == skb->dev) + hits++; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + + /* No netpoll struct is using this dev */ + if (!hits) + return; + + /* No arp on this interface */ + if (skb->dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) + return; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + arp = arp_hdr(skb); + + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp+1); + /* save the location of the src hw addr */ + sha = arp_ptr; + arp_ptr += skb->dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + /* If we actually cared about dst hw addr, + it would get copied here */ + arp_ptr += skb->dev->addr_len; + memcpy(&tip, arp_ptr, 4); + + /* Should we ignore arp? */ + if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) + return; + + size = arp_hdr_len(skb->dev); + + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (tip != np->local_ip) + continue; + + hlen = LL_RESERVED_SPACE(np->dev); + tlen = np->dev->needed_tailroom; + send_skb = find_skb(np, size + hlen + tlen, hlen); + if (!send_skb) + continue; + + skb_reset_network_header(send_skb); + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); + + /* Fill the device header for the ARP frame */ + if (dev_hard_header(send_skb, skb->dev, ptype, + sha, np->dev->dev_addr, + send_skb->len) < 0) { + kfree_skb(send_skb); + continue; + } + + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should + * always equal 1 (Ethernet). + */ + + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr = (unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, sha, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); + + /* If there are several rx_hooks for the same address, + we're fine by sending a single reply */ + break; + } + spin_unlock_irqrestore(&npinfo->rx_lock, flags); +} + +int __netpoll_rx(struct sk_buff *skb) +{ + int proto, len, ulen; + int hits = 0; + const struct iphdr *iph; + struct udphdr *uh; + struct netpoll_info *npinfo = skb->dev->npinfo; + struct netpoll *np, *tmp; + + if (list_empty(&npinfo->rx_np)) + goto out; + + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + /* check if netpoll clients need ARP */ + if (skb->protocol == htons(ETH_P_ARP) && + atomic_read(&trapped)) { + skb_queue_tail(&npinfo->arp_tx, skb); + return 1; + } + + proto = ntohs(eth_hdr(skb)->h_proto); + if (proto != ETH_P_IP) + goto out; + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + if (skb_shared(skb)) + goto out; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + iph = (struct iphdr *)skb->data; + if (iph->ihl < 5 || iph->version != 4) + goto out; + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + iph = (struct iphdr *)skb->data; + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + /* + * Our transport medium may have padded the buffer out. + * Now We trim to the true length of the frame. + */ + if (pskb_trim_rcsum(skb, len)) + goto out; + + iph = (struct iphdr *)skb->data; + if (iph->protocol != IPPROTO_UDP) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len) + goto out; + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr)) + goto out; + + list_for_each_entry_safe(np, tmp, &npinfo->rx_np, rx) { + if (np->local_ip && np->local_ip != iph->daddr) + continue; + if (np->remote_ip && np->remote_ip != iph->saddr) + continue; + if (np->local_port && np->local_port != ntohs(uh->dest)) + continue; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + hits++; + } + + if (!hits) + goto out; + + kfree_skb(skb); + return 1; + +out: + if (atomic_read(&trapped)) { + kfree_skb(skb); + return 1; + } + + return 0; +} + +void netpoll_print_options(struct netpoll *np) +{ + np_info(np, "local port %d\n", np->local_port); + np_info(np, "local IP %pI4\n", &np->local_ip); + np_info(np, "interface '%s'\n", np->dev_name); + np_info(np, "remote port %d\n", np->remote_port); + np_info(np, "remote IP %pI4\n", &np->remote_ip); + np_info(np, "remote ethernet address %pM\n", np->remote_mac); +} +EXPORT_SYMBOL(netpoll_print_options); + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ + char *cur=opt, *delim; + + if (*cur != '@') { + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + np->local_port = simple_strtol(cur, NULL, 10); + cur = delim; + } + cur++; + + if (*cur != '/') { + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + np->local_ip = in_aton(cur); + cur = delim; + } + cur++; + + if (*cur != ',') { + /* parse out dev name */ + if ((delim = strchr(cur, ',')) == NULL) + goto parse_failed; + *delim = 0; + strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + cur = delim; + } + cur++; + + if (*cur != '@') { + /* dst port */ + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim = 0; + if (*cur == ' ' || *cur == '\t') + np_info(np, "warning: whitespace is not allowed\n"); + np->remote_port = simple_strtol(cur, NULL, 10); + cur = delim; + } + cur++; + + /* dst ip */ + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim = 0; + np->remote_ip = in_aton(cur); + cur = delim + 1; + + if (*cur != 0) { + /* MAC address */ + if (!mac_pton(cur, np->remote_mac)) + goto parse_failed; + } + + netpoll_print_options(np); + + return 0; + + parse_failed: + np_info(np, "couldn't parse config at '%s'!\n", cur); + return -1; +} +EXPORT_SYMBOL(netpoll_parse_options); + +int __netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = np->dev; + struct netpoll_info *npinfo; + const struct net_device_ops *ops; + unsigned long flags; + int err; + + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || + !ndev->netdev_ops->ndo_poll_controller) { + np_err(np, "%s doesn't support polling, aborting\n", + np->dev_name); + err = -ENOTSUPP; + goto out; + } + + if (!ndev->npinfo) { + npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + if (!npinfo) { + err = -ENOMEM; + goto out; + } + + npinfo->rx_flags = 0; + INIT_LIST_HEAD(&npinfo->rx_np); + + spin_lock_init(&npinfo->rx_lock); + skb_queue_head_init(&npinfo->arp_tx); + skb_queue_head_init(&npinfo->txq); + INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); + + atomic_set(&npinfo->refcnt, 1); + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_setup) { + err = ops->ndo_netpoll_setup(ndev, npinfo); + if (err) + goto free_npinfo; + } + } else { + npinfo = ndev->npinfo; + atomic_inc(&npinfo->refcnt); + } + + npinfo->netpoll = np; + + if (np->rx_hook) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + npinfo->rx_flags |= NETPOLL_RX_ENABLED; + list_add_tail(&np->rx, &npinfo->rx_np); + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + + /* last thing to do is link it to the net device structure */ + rcu_assign_pointer(ndev->npinfo, npinfo); + + return 0; + +free_npinfo: + kfree(npinfo); +out: + return err; +} +EXPORT_SYMBOL_GPL(__netpoll_setup); + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + int err; + + if (np->dev_name) + ndev = dev_get_by_name(&init_net, np->dev_name); + if (!ndev) { + np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + return -ENODEV; + } + + if (ndev->master) { + np_err(np, "%s is a slave device, aborting\n", np->dev_name); + err = -EBUSY; + goto put; + } + + if (!netif_running(ndev)) { + unsigned long atmost, atleast; + + np_info(np, "device %s not up yet, forcing it\n", np->dev_name); + + rtnl_lock(); + err = dev_open(ndev); + rtnl_unlock(); + + if (err) { + np_err(np, "failed to open %s\n", ndev->name); + goto put; + } + + atleast = jiffies + HZ/10; + atmost = jiffies + carrier_timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } + + /* If carrier appears to come up instantly, we don't + * trust it and pause so that we don't pump all our + * queued console messages into the bitbucket. + */ + + if (time_before(jiffies, atleast)) { + np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n"); + msleep(4000); + } + } + + if (!np->local_ip) { + rcu_read_lock(); + in_dev = __in_dev_get_rcu(ndev); + + if (!in_dev || !in_dev->ifa_list) { + rcu_read_unlock(); + np_err(np, "no IP address for %s, aborting\n", + np->dev_name); + err = -EDESTADDRREQ; + goto put; + } + + np->local_ip = in_dev->ifa_list->ifa_local; + rcu_read_unlock(); + np_info(np, "local IP %pI4\n", &np->local_ip); + } + + np->dev = ndev; + + /* fill up the skb queue */ + refill_skbs(); + + rtnl_lock(); + err = __netpoll_setup(np); + rtnl_unlock(); + + if (err) + goto put; + + return 0; + +put: + dev_put(ndev); + return err; +} +EXPORT_SYMBOL(netpoll_setup); + +static int __init netpoll_init(void) +{ + skb_queue_head_init(&skb_pool); + return 0; +} +core_initcall(netpoll_init); + +void __netpoll_cleanup(struct netpoll *np) +{ + struct netpoll_info *npinfo; + unsigned long flags; + + npinfo = np->dev->npinfo; + if (!npinfo) + return; + + if (!list_empty(&npinfo->rx_np)) { + spin_lock_irqsave(&npinfo->rx_lock, flags); + list_del(&np->rx); + if (list_empty(&npinfo->rx_np)) + npinfo->rx_flags &= ~NETPOLL_RX_ENABLED; + spin_unlock_irqrestore(&npinfo->rx_lock, flags); + } + + if (atomic_dec_and_test(&npinfo->refcnt)) { + const struct net_device_ops *ops; + + ops = np->dev->netdev_ops; + if (ops->ndo_netpoll_cleanup) + ops->ndo_netpoll_cleanup(np->dev); + + RCU_INIT_POINTER(np->dev->npinfo, NULL); + + /* avoid racing with NAPI reading npinfo */ + synchronize_rcu_bh(); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + cancel_delayed_work_sync(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + kfree(npinfo); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); + +void netpoll_cleanup(struct netpoll *np) +{ + if (!np->dev) + return; + + rtnl_lock(); + __netpoll_cleanup(np); + rtnl_unlock(); + + dev_put(np->dev); + np->dev = NULL; +} +EXPORT_SYMBOL(netpoll_cleanup); + +int netpoll_trap(void) +{ + return atomic_read(&trapped); +} +EXPORT_SYMBOL(netpoll_trap); + +void netpoll_set_trap(int trap) +{ + if (trap) + atomic_inc(&trapped); + else + atomic_dec(&trapped); +} +EXPORT_SYMBOL(netpoll_set_trap); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c new file mode 100644 index 00000000..ba6900f7 --- /dev/null +++ b/net/core/netprio_cgroup.c @@ -0,0 +1,337 @@ +/* + * net/core/netprio_cgroup.c Priority Control Group + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Neil Horman <nhorman@tuxdriver.com> + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/cgroup.h> +#include <linux/rcupdate.h> +#include <linux/atomic.h> +#include <net/rtnetlink.h> +#include <net/pkt_cls.h> +#include <net/sock.h> +#include <net/netprio_cgroup.h> + +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); + +struct cgroup_subsys net_prio_subsys = { + .name = "net_prio", + .create = cgrp_create, + .destroy = cgrp_destroy, + .populate = cgrp_populate, +#ifdef CONFIG_NETPRIO_CGROUP + .subsys_id = net_prio_subsys_id, +#endif + .module = THIS_MODULE +}; + +#define PRIOIDX_SZ 128 + +static unsigned long prioidx_map[PRIOIDX_SZ]; +static DEFINE_SPINLOCK(prioidx_map_lock); +static atomic_t max_prioidx = ATOMIC_INIT(0); + +static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + struct cgroup_netprio_state, css); +} + +static int get_prioidx(u32 *prio) +{ + unsigned long flags; + u32 prioidx; + + spin_lock_irqsave(&prioidx_map_lock, flags); + prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { + spin_unlock_irqrestore(&prioidx_map_lock, flags); + return -ENOSPC; + } + set_bit(prioidx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); + atomic_set(&max_prioidx, prioidx); + *prio = prioidx; + return 0; +} + +static void put_prioidx(u32 idx) +{ + unsigned long flags; + + spin_lock_irqsave(&prioidx_map_lock, flags); + clear_bit(idx, prioidx_map); + spin_unlock_irqrestore(&prioidx_map_lock, flags); +} + +static void extend_netdev_table(struct net_device *dev, u32 new_len) +{ + size_t new_size = sizeof(struct netprio_map) + + ((sizeof(u32) * new_len)); + struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); + struct netprio_map *old_priomap; + int i; + + old_priomap = rtnl_dereference(dev->priomap); + + if (!new_priomap) { + printk(KERN_WARNING "Unable to alloc new priomap!\n"); + return; + } + + for (i = 0; + old_priomap && (i < old_priomap->priomap_len); + i++) + new_priomap->priomap[i] = old_priomap->priomap[i]; + + new_priomap->priomap_len = new_len; + + rcu_assign_pointer(dev->priomap, new_priomap); + if (old_priomap) + kfree_rcu(old_priomap, rcu); +} + +static void update_netdev_tables(void) +{ + struct net_device *dev; + u32 max_len = atomic_read(&max_prioidx) + 1; + struct netprio_map *map; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if ((!map) || + (map->priomap_len < max_len)) + extend_netdev_table(dev, max_len); + } + rtnl_unlock(); +} + +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + int ret; + + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) + return ERR_PTR(-ENOMEM); + + if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) { + kfree(cs); + return ERR_PTR(-EINVAL); + } + + ret = get_prioidx(&cs->prioidx); + if (ret != 0) { + printk(KERN_WARNING "No space in priority index array\n"); + kfree(cs); + return ERR_PTR(ret); + } + + return &cs->css; +} + +static void cgrp_destroy(struct cgroup *cgrp) +{ + struct cgroup_netprio_state *cs; + struct net_device *dev; + struct netprio_map *map; + + cs = cgrp_netprio_state(cgrp); + rtnl_lock(); + for_each_netdev(&init_net, dev) { + map = rtnl_dereference(dev->priomap); + if (map) + map->priomap[cs->prioidx] = 0; + } + rtnl_unlock(); + put_prioidx(cs->prioidx); + kfree(cs); +} + +static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +{ + return (u64)cgrp_netprio_state(cgrp)->prioidx; +} + +static int read_priomap(struct cgroup *cont, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct net_device *dev; + u32 prioidx = cgrp_netprio_state(cont)->prioidx; + u32 priority; + struct netprio_map *map; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, dev) { + map = rcu_dereference(dev->priomap); + priority = map ? map->priomap[prioidx] : 0; + cb->fill(cb, dev->name, priority); + } + rcu_read_unlock(); + return 0; +} + +static int write_priomap(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + char *devname = kstrdup(buffer, GFP_KERNEL); + int ret = -EINVAL; + u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; + unsigned long priority; + char *priostr; + struct net_device *dev; + struct netprio_map *map; + + if (!devname) + return -ENOMEM; + + /* + * Minimally sized valid priomap string + */ + if (strlen(devname) < 3) + goto out_free_devname; + + priostr = strstr(devname, " "); + if (!priostr) + goto out_free_devname; + + /* + *Separate the devname from the associated priority + *and advance the priostr poitner to the priority value + */ + *priostr = '\0'; + priostr++; + + /* + * If the priostr points to NULL, we're at the end of the passed + * in string, and its not a valid write + */ + if (*priostr == '\0') + goto out_free_devname; + + ret = kstrtoul(priostr, 10, &priority); + if (ret < 0) + goto out_free_devname; + + ret = -ENODEV; + + dev = dev_get_by_name(&init_net, devname); + if (!dev) + goto out_free_devname; + + update_netdev_tables(); + ret = 0; + rcu_read_lock(); + map = rcu_dereference(dev->priomap); + if (map) + map->priomap[prioidx] = priority; + rcu_read_unlock(); + dev_put(dev); + +out_free_devname: + kfree(devname); + return ret; +} + +static struct cftype ss_files[] = { + { + .name = "prioidx", + .read_u64 = read_prioidx, + }, + { + .name = "ifpriomap", + .read_map = read_priomap, + .write_string = write_priomap, + }, +}; + +static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); +} + +static int netprio_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct netprio_map *old; + + /* + * Note this is called with rtnl_lock held so we have update side + * protection on our rcu assignments + */ + + switch (event) { + case NETDEV_UNREGISTER: + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block netprio_device_notifier = { + .notifier_call = netprio_device_event +}; + +static int __init init_cgroup_netprio(void) +{ + int ret; + + ret = cgroup_load_subsys(&net_prio_subsys); + if (ret) + goto out; +#ifndef CONFIG_NETPRIO_CGROUP + smp_wmb(); + net_prio_subsys_id = net_prio_subsys.subsys_id; +#endif + + register_netdevice_notifier(&netprio_device_notifier); + +out: + return ret; +} + +static void __exit exit_cgroup_netprio(void) +{ + struct netprio_map *old; + struct net_device *dev; + + unregister_netdevice_notifier(&netprio_device_notifier); + + cgroup_unload_subsys(&net_prio_subsys); + +#ifndef CONFIG_NETPRIO_CGROUP + net_prio_subsys_id = -1; + synchronize_rcu(); +#endif + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + old = rtnl_dereference(dev->priomap); + RCU_INIT_POINTER(dev->priomap, NULL); + if (old) + kfree_rcu(old, rcu); + } + rtnl_unlock(); +} + +module_init(init_cgroup_netprio); +module_exit(exit_cgroup_netprio); +MODULE_LICENSE("GPL v2"); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 00000000..b81369b6 --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,3796 @@ +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se> + * Uppsala University and + * Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * Ben Greear <greearb@candelatech.com> + * Jens Låås <jens.laas@data.slu.se> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module. Parameters are output + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated. 020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 020417--sigurdur@linpro.no + * Significant re-work of the module: + * * Convert to threaded model to more efficiently be able to transmit + * and receive on multiple interfaces at once. + * * Converted many counters to __u64 to allow longer runs. + * * Allow configuration of ranges, like min/max IP address, MACs, + * and UDP-ports, for both source and destination, and can + * set to use a random distribution or sequentially walk the range. + * * Can now change most values after starting. + * * Place 12-byte packet in UDP payload with magic number, + * sequence number, and timestamp. + * * Add receiver code that detects dropped pkts, re-ordered pkts, and + * latencies (with micro-second) precision. + * * Add IOCTL interface to easily get counters & configuration. + * --Ben Greear <greearb@candelatech.com> + * + * Renamed multiskb to clone_skb and cleaned up sending core for two distinct + * skb modes. A clone_skb=0 mode for Ben "ranges" work and a clone_skb != 0 + * as a "fastpath" with a configurable number of clones after alloc's. + * clone_skb=0 means all packets are allocated this also means ranges time + * stamps etc can be used. clone_skb=100 means 1 malloc is followed by 100 + * clones. + * + * Also moved to /proc/net/pktgen/ + * --ro + * + * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever + * mistakes. Also merged in DaveM's patch in the -pre6 patch. + * --Ben Greear <greearb@candelatech.com> + * + * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) + * + * + * 021124 Finished major redesign and rewrite for new functionality. + * See Documentation/networking/pktgen.txt for how to use this. + * + * The new operation: + * For each CPU one thread/process is created at start. This process checks + * for running devices in the if_list and sends packets until count is 0 it + * also the thread checks the thread->control which is used for inter-process + * communication. controlling process "posts" operations to the threads this + * way. The if_lock should be possible to remove when add/rem_device is merged + * into this too. + * + * By design there should only be *one* "controlling" process. In practice + * multiple write accesses gives unpredictable result. Understood by "write" + * to /proc gives result code thats should be read be the "writer". + * For practical use this should be no problem. + * + * Note when adding devices to a specific CPU there good idea to also assign + * /proc/irq/XX/smp_affinity so TX-interrupts gets bound to the same CPU. + * --ro + * + * Fix refcount off by one if first packet fails, potential null deref, + * memleak 030710- KJP + * + * First "ranges" functionality for ipv6 030726 --ro + * + * Included flow support. 030802 ANK. + * + * Fixed unaligned access on IA-64 Grant Grundler <grundler@parisc-linux.org> + * + * Remove if fix from added Harald Welte <laforge@netfilter.org> 040419 + * ia64 compilation fix from Aron Griffis <aron@hp.com> 040604 + * + * New xmit() return, do_div and misc clean up by Stephen Hemminger + * <shemminger@osdl.org> 040923 + * + * Randy Dunlap fixed u64 printk compiler waring + * + * Remove FCS from BW calculation. Lennert Buytenhek <buytenh@wantstofly.org> + * New time handling. Lennert Buytenhek <buytenh@wantstofly.org> 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com) + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan <nacc@us.ibm.com> + * 050103 + * + * MPLS support by Steven Whitehouse <steve@chygwyn.com> + * + * 802.1Q/Q-in-Q support by Francesco Fondelli (FF) <francesco.fondelli@gmail.com> + * + * Fixed src_mac command to set source mac of packet to value specified in + * command by Adit Ranadive <adit.262@gmail.com> + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sys.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/unistd.h> +#include <linux/string.h> +#include <linux/ptrace.h> +#include <linux/errno.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/hrtimer.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/inet.h> +#include <linux/inetdevice.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/wait.h> +#include <linux/etherdevice.h> +#include <linux/kthread.h> +#include <linux/prefetch.h> +#include <net/net_namespace.h> +#include <net/checksum.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#ifdef CONFIG_XFRM +#include <net/xfrm.h> +#endif +#include <asm/byteorder.h> +#include <linux/rcupdate.h> +#include <linux/bitops.h> +#include <linux/io.h> +#include <linux/timex.h> +#include <linux/uaccess.h> +#include <asm/dma.h> +#include <asm/div64.h> /* do_div */ + +#define VERSION "2.74" +#define IP_NAME_SZ 32 +#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ +#define MPLS_STACK_BOTTOM htonl(0x00000100) + +#define func_enter() pr_debug("entering %s\n", __func__); + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ +#define F_MPLS_RND (1<<8) /* Random MPLS labels */ +#define F_VID_RND (1<<9) /* Random VLAN ID */ +#define F_SVID_RND (1<<10) /* Random SVLAN ID */ +#define F_FLOW_SEQ (1<<11) /* Sequential flows */ +#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ +#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ +#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ +#define F_NODE (1<<15) /* Node memory alloc*/ + +/* Thread control flag bits */ +#define T_STOP (1<<0) /* Stop run */ +#define T_RUN (1<<1) /* Start run */ +#define T_REMDEVALL (1<<2) /* Remove all devs */ +#define T_REMDEV (1<<3) /* Remove one dev */ + +/* If lock -- can be removed after some work */ +#define if_lock(t) spin_lock(&(t->if_lock)); +#define if_unlock(t) spin_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" +#define PGCTRL "pgctrl" +static struct proc_dir_entry *pg_proc_dir; + +#define MAX_CFLOWS 65536 + +#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) +#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) + +struct flow_state { + __be32 cur_daddr; + int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif + __u32 flags; +}; + +/* flow flag bits */ +#define F_INIT (1<<0) /* flow has been initialized */ + +struct pktgen_dev { + /* + * Try to keep frequent/infrequent used vars. separated. + */ + struct proc_dir_entry *entry; /* proc file */ + struct pktgen_thread *pg_thread;/* the owner */ + struct list_head list; /* chaining in the thread's run-queue */ + + int running; /* if false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + + int min_pkt_size; /* = ETH_ZLEN; */ + int max_pkt_size; /* = ETH_ZLEN; */ + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ + int nfrags; + struct page *page; + u64 delay; /* nano-seconds */ + + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, */ + + /* runtime counters relating to clone_skb */ + + __u64 allocated_skbs; + __u32 clone_count; + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? + * This will keep sequence numbers in order + */ + ktime_t next_tx; + ktime_t started_at; + ktime_t stopped_at; + u64 idle_acc; /* nano-seconds */ + + __u32 seq_num; + + int clone_skb; /* + * Use multiple SKBs during packet gen. + * If this number is greater than 1, then + * that many copies of the same packet will be + * sent before a new packet is allocated. + * If you want to send 1024 identical packets + * before creating a new packet, + * set clone_skb to 1024. + */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + __be32 saddr_min; /* inclusive, source IP address */ + __be32 saddr_max; /* exclusive, source IP address */ + __be32 daddr_min; /* inclusive, dest IP address */ + __be32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + /* DSCP + ECN */ + __u8 tos; /* six MSB of (former) IPv4 TOS + are for dscp codepoint */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + (see RFC 3260, sec. 4) */ + + /* MPLS */ + unsigned nr_labels; /* Depth of stack, 0 = no MPLS */ + __be32 labels[MAX_MPLS_LABELS]; + + /* VLAN/SVLAN (802.1Q/Q-in-Q) */ + __u8 vlan_p; + __u8 vlan_cfi; + __u16 vlan_id; /* 0xffff means no vlan tag */ + + __u8 svlan_p; + __u8 svlan_cfi; + __u16 svlan_id; /* 0xffff means no svlan tag */ + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[ETH_ALEN]; + unsigned char src_mac[ETH_ALEN]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __be32 cur_saddr; + __be32 cur_daddr; + __u16 ip_id; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u16 cur_queue_map; + __u32 cur_pkt_size; + __u32 last_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff *skb; /* skb we are to transmit next, used for when we + * are transmitting the same one multiple times + */ + struct net_device *odev; /* The out-going device. + * Note that the device should have it's + * pg_info pointer pointing back to this + * device. + * Set when the user specifies the out-going + * device name (not when the inject is + * started as it used to do.) + */ + char odevname[32]; + struct flow_state *flows; + unsigned cflows; /* Concurrent flows (config) */ + unsigned lflow; /* Flow length (config) */ + unsigned nflows; /* accumulated flows (stats) */ + unsigned curfl; /* current sequenced flow (state)*/ + + u16 queue_map_min; + u16 queue_map_max; + __u32 skb_priority; /* skb priority field */ + int node; /* Memory node */ + +#ifdef CONFIG_XFRM + __u8 ipsmode; /* IPSEC mode (config) */ + __u8 ipsproto; /* IPSEC type (config) */ +#endif + char result[512]; +}; + +struct pktgen_hdr { + __be32 pgh_magic; + __be32 seq_num; + __be32 tv_sec; + __be32 tv_usec; +}; + +static bool pktgen_exiting __read_mostly; + +struct pktgen_thread { + spinlock_t if_lock; /* for list of devices */ + struct list_head if_list; /* All device here */ + struct list_head th_list; + struct task_struct *tsk; + char result[512]; + + /* Field for thread to receive "posted" events terminate, + stop ifs etc. */ + + u32 control; + int cpu; + + wait_queue_head_t queue; + struct completion start_done; +}; + +#define REMOVE 1 +#define FIND 0 + +static inline ktime_t ktime_now(void) +{ + struct timespec ts; + ktime_get_ts(&ts); + + return timespec_to_ktime(ts); +} + +/* This works even if 32 bit because of careful byte order choice */ +static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 < cmp2.tv64; +} + +static const char version[] = + "Packet Generator for packet performance testing. " + "Version: " VERSION "\n"; + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(void); +static void pktgen_reset_all_threads(void); +static void pktgen_stop_all_threads_ifs(void); + +static void pktgen_stop(struct pktgen_thread *t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); + +static unsigned int scan_ip6(const char *s, char ip[16]); + +/* Module parameters, defaults. */ +static int pg_count_d __read_mostly = 1000; +static int pg_delay_d __read_mostly; +static int pg_clone_skb_d __read_mostly; +static int debug __read_mostly; + +static DEFINE_MUTEX(pktgen_thread_lock); +static LIST_HEAD(pktgen_threads); + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + +/* + * /proc handling functions + * + */ + +static int pgctrl_show(struct seq_file *seq, void *v) +{ + seq_puts(seq, version); + return 0; +} + +static ssize_t pgctrl_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int err = 0; + char data[128]; + + if (!capable(CAP_NET_ADMIN)) { + err = -EPERM; + goto out; + } + + if (count > sizeof(data)) + count = sizeof(data); + + if (copy_from_user(data, buf, count)) { + err = -EFAULT; + goto out; + } + data[count - 1] = 0; /* Make string */ + + if (!strcmp(data, "stop")) + pktgen_stop_all_threads_ifs(); + + else if (!strcmp(data, "start")) + pktgen_run_all_threads(); + + else if (!strcmp(data, "reset")) + pktgen_reset_all_threads(); + + else + pr_warning("Unknown command: %s\n", data); + + err = count; + +out: + return err; +} + +static int pgctrl_open(struct inode *inode, struct file *file) +{ + return single_open(file, pgctrl_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_fops = { + .owner = THIS_MODULE, + .open = pgctrl_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pgctrl_write, + .release = single_release, +}; + +static int pktgen_if_show(struct seq_file *seq, void *v) +{ + const struct pktgen_dev *pkt_dev = seq->private; + ktime_t stopped; + u64 idle; + + seq_printf(seq, + "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, + pkt_dev->max_pkt_size); + + seq_printf(seq, + " frags: %d delay: %llu clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, + pkt_dev->clone_skb, pkt_dev->odevname); + + seq_printf(seq, " flows: %u flowlen: %u\n", pkt_dev->cflows, + pkt_dev->lflow); + + seq_printf(seq, + " queue_map_min: %u queue_map_max: %u\n", + pkt_dev->queue_map_min, + pkt_dev->queue_map_max); + + if (pkt_dev->skb_priority) + seq_printf(seq, " skb_priority: %u\n", + pkt_dev->skb_priority); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, + " saddr: %pI6c min_saddr: %pI6c max_saddr: %pI6c\n" + " daddr: %pI6c min_daddr: %pI6c max_daddr: %pI6c\n", + &pkt_dev->in6_saddr, + &pkt_dev->min_in6_saddr, &pkt_dev->max_in6_saddr, + &pkt_dev->in6_daddr, + &pkt_dev->min_in6_daddr, &pkt_dev->max_in6_daddr); + } else { + seq_printf(seq, + " dst_min: %s dst_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max); + seq_printf(seq, + " src_min: %s src_max: %s\n", + pkt_dev->src_min, pkt_dev->src_max); + } + + seq_puts(seq, " src_mac: "); + + seq_printf(seq, "%pM ", + is_zero_ether_addr(pkt_dev->src_mac) ? + pkt_dev->odev->dev_addr : pkt_dev->src_mac); + + seq_printf(seq, "dst_mac: "); + seq_printf(seq, "%pM\n", pkt_dev->dst_mac); + + seq_printf(seq, + " udp_src_min: %d udp_src_max: %d" + " udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, + pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); + + seq_printf(seq, + " src_mac_count: %d dst_mac_count: %d\n", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + if (pkt_dev->nr_labels) { + unsigned i; + seq_printf(seq, " mpls: "); + for (i = 0; i < pkt_dev->nr_labels; i++) + seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), + i == pkt_dev->nr_labels-1 ? "\n" : ", "); + } + + if (pkt_dev->vlan_id != 0xffff) + seq_printf(seq, " vlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->vlan_id, pkt_dev->vlan_p, + pkt_dev->vlan_cfi); + + if (pkt_dev->svlan_id != 0xffff) + seq_printf(seq, " svlan_id: %u vlan_p: %u vlan_cfi: %u\n", + pkt_dev->svlan_id, pkt_dev->svlan_p, + pkt_dev->svlan_cfi); + + if (pkt_dev->tos) + seq_printf(seq, " tos: 0x%02x\n", pkt_dev->tos); + + if (pkt_dev->traffic_class) + seq_printf(seq, " traffic_class: 0x%02x\n", pkt_dev->traffic_class); + + if (pkt_dev->node >= 0) + seq_printf(seq, " node: %d\n", pkt_dev->node); + + seq_printf(seq, " Flags: "); + + if (pkt_dev->flags & F_IPV6) + seq_printf(seq, "IPV6 "); + + if (pkt_dev->flags & F_IPSRC_RND) + seq_printf(seq, "IPSRC_RND "); + + if (pkt_dev->flags & F_IPDST_RND) + seq_printf(seq, "IPDST_RND "); + + if (pkt_dev->flags & F_TXSIZE_RND) + seq_printf(seq, "TXSIZE_RND "); + + if (pkt_dev->flags & F_UDPSRC_RND) + seq_printf(seq, "UDPSRC_RND "); + + if (pkt_dev->flags & F_UDPDST_RND) + seq_printf(seq, "UDPDST_RND "); + + if (pkt_dev->flags & F_MPLS_RND) + seq_printf(seq, "MPLS_RND "); + + if (pkt_dev->flags & F_QUEUE_MAP_RND) + seq_printf(seq, "QUEUE_MAP_RND "); + + if (pkt_dev->flags & F_QUEUE_MAP_CPU) + seq_printf(seq, "QUEUE_MAP_CPU "); + + if (pkt_dev->cflows) { + if (pkt_dev->flags & F_FLOW_SEQ) + seq_printf(seq, "FLOW_SEQ "); /*in sequence flows*/ + else + seq_printf(seq, "FLOW_RND "); + } + +#ifdef CONFIG_XFRM + if (pkt_dev->flags & F_IPSEC_ON) + seq_printf(seq, "IPSEC "); +#endif + + if (pkt_dev->flags & F_MACSRC_RND) + seq_printf(seq, "MACSRC_RND "); + + if (pkt_dev->flags & F_MACDST_RND) + seq_printf(seq, "MACDST_RND "); + + if (pkt_dev->flags & F_VID_RND) + seq_printf(seq, "VID_RND "); + + if (pkt_dev->flags & F_SVID_RND) + seq_printf(seq, "SVID_RND "); + + if (pkt_dev->flags & F_NODE) + seq_printf(seq, "NODE_ALLOC "); + + seq_puts(seq, "\n"); + + /* not really stopped, more like last-running-at */ + stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; + idle = pkt_dev->idle_acc; + do_div(idle, NSEC_PER_USEC); + + seq_printf(seq, + "Current:\n pkts-sofar: %llu errors: %llu\n", + (unsigned long long)pkt_dev->sofar, + (unsigned long long)pkt_dev->errors); + + seq_printf(seq, + " started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) ktime_to_us(pkt_dev->started_at), + (unsigned long long) ktime_to_us(stopped), + (unsigned long long) idle); + + seq_printf(seq, + " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, + pkt_dev->cur_src_mac_offset); + + if (pkt_dev->flags & F_IPV6) { + seq_printf(seq, " cur_saddr: %pI6c cur_daddr: %pI6c\n", + &pkt_dev->cur_in6_saddr, + &pkt_dev->cur_in6_daddr); + } else + seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", + pkt_dev->cur_saddr, pkt_dev->cur_daddr); + + seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + + seq_printf(seq, " cur_queue_map: %u\n", pkt_dev->cur_queue_map); + + seq_printf(seq, " flows: %u\n", pkt_dev->nflows); + + if (pkt_dev->result[0]) + seq_printf(seq, "Result: %s\n", pkt_dev->result); + else + seq_printf(seq, "Result: Idle\n"); + + return 0; +} + + +static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, + __u32 *num) +{ + int i = 0; + *num = 0; + + for (; i < maxlen; i++) { + int value; + char c; + *num <<= 4; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; + else + break; + } + return i; +} + +static int count_trail_chars(const char __user * user_buffer, + unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + case '=': + break; + default: + goto done; + } + } +done: + return i; +} + +static long num_arg(const char __user *user_buffer, unsigned long maxlen, + unsigned long *num) +{ + int i; + *num = 0; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + if ((c >= '0') && (c <= '9')) { + *num *= 10; + *num += c - '0'; + } else + break; + } + return i; +} + +static int strn_len(const char __user * user_buffer, unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + goto done_str; + break; + default: + break; + } + } +done_str: + return i; +} + +static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +{ + unsigned n = 0; + char c; + ssize_t i = 0; + int len; + + pkt_dev->nr_labels = 0; + do { + __u32 tmp; + len = hex32_arg(&buffer[i], 8, &tmp); + if (len <= 0) + return len; + pkt_dev->labels[n] = htonl(tmp); + if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) + pkt_dev->flags |= F_MPLS_RND; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + i++; + n++; + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + } while (c == ','); + + pkt_dev->nr_labels = n; + return i; +} + +static ssize_t pktgen_if_write(struct file *file, + const char __user * user_buffer, size_t count, + loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_dev *pkt_dev = seq->private; + int i, max, len; + char name[16], valstr[32]; + unsigned long value = 0; + char *pg_result = NULL; + int tmp = 0; + char buf[128]; + + pg_result = &(pkt_dev->result[0]); + + if (count < 1) { + pr_warning("wrong command format\n"); + return -EINVAL; + } + + max = count; + tmp = count_trail_chars(user_buffer, max); + if (tmp < 0) { + pr_warning("illegal format\n"); + return tmp; + } + i = tmp; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) { + size_t copy = min_t(size_t, count, 1023); + char tb[copy + 1]; + if (copy_from_user(tb, user_buffer, copy)) + return -EFAULT; + tb[copy] = 0; + printk(KERN_DEBUG "pktgen: %s,%lu buffer -:%s:-\n", name, + (unsigned long)count, tb); + } + + if (!strcmp(name, "min_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: min_pkt_size=%u", + pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "max_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->max_pkt_size) { + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: max_pkt_size=%u", + pkt_dev->max_pkt_size); + return count; + } + + /* Shortcut for min = max */ + + if (!strcmp(name, "pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value < 14 + 20 + 8) + value = 14 + 20 + 8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "debug")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + debug = value; + sprintf(pg_result, "OK: debug=%u", debug); + return count; + } + + if (!strcmp(name, "frags")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->nfrags = value; + sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + return count; + } + if (!strcmp(name, "delay")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value == 0x7FFFFFFF) + pkt_dev->delay = ULLONG_MAX; + else + pkt_dev->delay = (u64)value; + + sprintf(pg_result, "OK: delay=%llu", + (unsigned long long) pkt_dev->delay); + return count; + } + if (!strcmp(name, "rate")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "ratep")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (!value) + return len; + pkt_dev->delay = NSEC_PER_SEC/value; + if (debug) + pr_info("Delay set at: %llu ns\n", pkt_dev->delay); + + sprintf(pg_result, "OK: rate=%lu", value); + return count; + } + if (!strcmp(name, "udp_src_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_src_min) { + pkt_dev->udp_src_min = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); + return count; + } + if (!strcmp(name, "udp_dst_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_min) { + pkt_dev->udp_dst_min = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); + return count; + } + if (!strcmp(name, "udp_src_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_src_max) { + pkt_dev->udp_src_max = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); + return count; + } + if (!strcmp(name, "udp_dst_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value != pkt_dev->udp_dst_max) { + pkt_dev->udp_dst_max = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); + return count; + } + if (!strcmp(name, "clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + if ((value > 0) && + (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + return -ENOTSUPP; + i += len; + pkt_dev->clone_skb = value; + + sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); + return count; + } + if (!strcmp(name, "count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->count = value; + sprintf(pg_result, "OK: count=%llu", + (unsigned long long)pkt_dev->count); + return count; + } + if (!strcmp(name, "src_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->src_mac_count != value) { + pkt_dev->src_mac_count = value; + pkt_dev->cur_src_mac_offset = 0; + } + sprintf(pg_result, "OK: src_mac_count=%d", + pkt_dev->src_mac_count); + return count; + } + if (!strcmp(name, "dst_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (pkt_dev->dst_mac_count != value) { + pkt_dev->dst_mac_count = value; + pkt_dev->cur_dst_mac_offset = 0; + } + sprintf(pg_result, "OK: dst_mac_count=%d", + pkt_dev->dst_mac_count); + return count; + } + if (!strcmp(name, "node")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + + if (node_possible(value)) { + pkt_dev->node = value; + sprintf(pg_result, "OK: node=%d", pkt_dev->node); + if (pkt_dev->page) { + put_page(pkt_dev->page); + pkt_dev->page = NULL; + } + } + else + sprintf(pg_result, "ERROR: node not possible"); + return count; + } + if (!strcmp(name, "flag")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + if (strcmp(f, "IPSRC_RND") == 0) + pkt_dev->flags |= F_IPSRC_RND; + + else if (strcmp(f, "!IPSRC_RND") == 0) + pkt_dev->flags &= ~F_IPSRC_RND; + + else if (strcmp(f, "TXSIZE_RND") == 0) + pkt_dev->flags |= F_TXSIZE_RND; + + else if (strcmp(f, "!TXSIZE_RND") == 0) + pkt_dev->flags &= ~F_TXSIZE_RND; + + else if (strcmp(f, "IPDST_RND") == 0) + pkt_dev->flags |= F_IPDST_RND; + + else if (strcmp(f, "!IPDST_RND") == 0) + pkt_dev->flags &= ~F_IPDST_RND; + + else if (strcmp(f, "UDPSRC_RND") == 0) + pkt_dev->flags |= F_UDPSRC_RND; + + else if (strcmp(f, "!UDPSRC_RND") == 0) + pkt_dev->flags &= ~F_UDPSRC_RND; + + else if (strcmp(f, "UDPDST_RND") == 0) + pkt_dev->flags |= F_UDPDST_RND; + + else if (strcmp(f, "!UDPDST_RND") == 0) + pkt_dev->flags &= ~F_UDPDST_RND; + + else if (strcmp(f, "MACSRC_RND") == 0) + pkt_dev->flags |= F_MACSRC_RND; + + else if (strcmp(f, "!MACSRC_RND") == 0) + pkt_dev->flags &= ~F_MACSRC_RND; + + else if (strcmp(f, "MACDST_RND") == 0) + pkt_dev->flags |= F_MACDST_RND; + + else if (strcmp(f, "!MACDST_RND") == 0) + pkt_dev->flags &= ~F_MACDST_RND; + + else if (strcmp(f, "MPLS_RND") == 0) + pkt_dev->flags |= F_MPLS_RND; + + else if (strcmp(f, "!MPLS_RND") == 0) + pkt_dev->flags &= ~F_MPLS_RND; + + else if (strcmp(f, "VID_RND") == 0) + pkt_dev->flags |= F_VID_RND; + + else if (strcmp(f, "!VID_RND") == 0) + pkt_dev->flags &= ~F_VID_RND; + + else if (strcmp(f, "SVID_RND") == 0) + pkt_dev->flags |= F_SVID_RND; + + else if (strcmp(f, "!SVID_RND") == 0) + pkt_dev->flags &= ~F_SVID_RND; + + else if (strcmp(f, "FLOW_SEQ") == 0) + pkt_dev->flags |= F_FLOW_SEQ; + + else if (strcmp(f, "QUEUE_MAP_RND") == 0) + pkt_dev->flags |= F_QUEUE_MAP_RND; + + else if (strcmp(f, "!QUEUE_MAP_RND") == 0) + pkt_dev->flags &= ~F_QUEUE_MAP_RND; + + else if (strcmp(f, "QUEUE_MAP_CPU") == 0) + pkt_dev->flags |= F_QUEUE_MAP_CPU; + + else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) + pkt_dev->flags &= ~F_QUEUE_MAP_CPU; +#ifdef CONFIG_XFRM + else if (strcmp(f, "IPSEC") == 0) + pkt_dev->flags |= F_IPSEC_ON; +#endif + + else if (strcmp(f, "!IPV6") == 0) + pkt_dev->flags &= ~F_IPV6; + + else if (strcmp(f, "NODE_ALLOC") == 0) + pkt_dev->flags |= F_NODE; + + else if (strcmp(f, "!NODE_ALLOC") == 0) + pkt_dev->flags &= ~F_NODE; + + else { + sprintf(pg_result, + "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", + f, + "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, " + "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC, NODE_ALLOC\n"); + return count; + } + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } + if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_min) != 0) { + memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); + strncpy(pkt_dev->dst_min, buf, len); + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->cur_daddr = pkt_dev->daddr_min; + } + if (debug) + printk(KERN_DEBUG "pktgen: dst_min set to: %s\n", + pkt_dev->dst_min); + i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); + return count; + } + if (!strcmp(name, "dst_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + if (len < 0) + return len; + + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_max) != 0) { + memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); + strncpy(pkt_dev->dst_max, buf, len); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + pkt_dev->cur_daddr = pkt_dev->daddr_max; + } + if (debug) + printk(KERN_DEBUG "pktgen: dst_max set to: %s\n", + pkt_dev->dst_max); + i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); + return count; + } + if (!strcmp(name, "dst6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); + + pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; + + if (debug) + printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6=%s", buf); + return count; + } + if (!strcmp(name, "dst6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); + + pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; + if (debug) + printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_min=%s", buf); + return count; + } + if (!strcmp(name, "dst6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); + + if (debug) + printk(KERN_DEBUG "pktgen: dst6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_max=%s", buf); + return count; + } + if (!strcmp(name, "src6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) + return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); + + pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; + + if (debug) + printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6=%s", buf); + return count; + } + if (!strcmp(name, "src_min")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_min) != 0) { + memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); + strncpy(pkt_dev->src_min, buf, len); + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->cur_saddr = pkt_dev->saddr_min; + } + if (debug) + printk(KERN_DEBUG "pktgen: src_min set to: %s\n", + pkt_dev->src_min); + i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); + return count; + } + if (!strcmp(name, "src_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + if (len < 0) + return len; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_max) != 0) { + memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); + strncpy(pkt_dev->src_max, buf, len); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + pkt_dev->cur_saddr = pkt_dev->saddr_max; + } + if (debug) + printk(KERN_DEBUG "pktgen: src_max set to: %s\n", + pkt_dev->src_max); + i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); + return count; + } + if (!strcmp(name, "dst_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->dst_mac)) + return -EINVAL; + /* Set up Dest MAC */ + memcpy(&pkt_dev->hh[0], pkt_dev->dst_mac, ETH_ALEN); + + sprintf(pg_result, "OK: dstmac %pM", pkt_dev->dst_mac); + return count; + } + if (!strcmp(name, "src_mac")) { + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) + return len; + + memset(valstr, 0, sizeof(valstr)); + if (copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + + if (!mac_pton(valstr, pkt_dev->src_mac)) + return -EINVAL; + /* Set up Src MAC */ + memcpy(&pkt_dev->hh[6], pkt_dev->src_mac, ETH_ALEN); + + sprintf(pg_result, "OK: srcmac %pM", pkt_dev->src_mac); + return count; + } + + if (!strcmp(name, "clear_counters")) { + pktgen_clear_counters(pkt_dev); + sprintf(pg_result, "OK: Clearing counters.\n"); + return count; + } + + if (!strcmp(name, "flows")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + if (value > MAX_CFLOWS) + value = MAX_CFLOWS; + + pkt_dev->cflows = value; + sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); + return count; + } + + if (!strcmp(name, "flowlen")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->lflow = value; + sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); + return count; + } + + if (!strcmp(name, "queue_map_min")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_min = value; + sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); + return count; + } + + if (!strcmp(name, "queue_map_max")) { + len = num_arg(&user_buffer[i], 5, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->queue_map_max = value; + sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); + return count; + } + + if (!strcmp(name, "mpls")) { + unsigned n, cnt; + + len = get_labels(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + i += len; + cnt = sprintf(pg_result, "OK: mpls="); + for (n = 0; n < pkt_dev->nr_labels; n++) + cnt += sprintf(pg_result + cnt, + "%08x%s", ntohl(pkt_dev->labels[n]), + n == pkt_dev->nr_labels-1 ? "" : ","); + + if (pkt_dev->nr_labels && pkt_dev->vlan_id != 0xffff) { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN auto turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if (value <= 4095) { + pkt_dev->vlan_id = value; /* turn on VLAN */ + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: vlan_id=%u", pkt_dev->vlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "vlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_p = value; + sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); + } else { + sprintf(pg_result, "ERROR: vlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "vlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_cfi = value; + sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); + } else { + sprintf(pg_result, "ERROR: vlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "svlan_id")) { + len = num_arg(&user_buffer[i], 4, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { + pkt_dev->svlan_id = value; /* turn on SVLAN */ + + if (debug) + printk(KERN_DEBUG "pktgen: SVLAN turned on\n"); + + if (debug && pkt_dev->nr_labels) + printk(KERN_DEBUG "pktgen: MPLS auto turned off\n"); + + pkt_dev->nr_labels = 0; /* turn off MPLS */ + sprintf(pg_result, "OK: svlan_id=%u", pkt_dev->svlan_id); + } else { + pkt_dev->vlan_id = 0xffff; /* turn off VLAN/SVLAN */ + pkt_dev->svlan_id = 0xffff; + + if (debug) + printk(KERN_DEBUG "pktgen: VLAN/SVLAN turned off\n"); + } + return count; + } + + if (!strcmp(name, "svlan_p")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_p = value; + sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); + } else { + sprintf(pg_result, "ERROR: svlan_p must be 0-7"); + } + return count; + } + + if (!strcmp(name, "svlan_cfi")) { + len = num_arg(&user_buffer[i], 1, &value); + if (len < 0) + return len; + + i += len; + if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_cfi = value; + sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); + } else { + sprintf(pg_result, "ERROR: svlan_cfi must be 0-1"); + } + return count; + } + + if (!strcmp(name, "tos")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->tos = tmp_value; + sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); + } else { + sprintf(pg_result, "ERROR: tos must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "traffic_class")) { + __u32 tmp_value = 0; + len = hex32_arg(&user_buffer[i], 2, &tmp_value); + if (len < 0) + return len; + + i += len; + if (len == 2) { + pkt_dev->traffic_class = tmp_value; + sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); + } else { + sprintf(pg_result, "ERROR: traffic_class must be 00-ff"); + } + return count; + } + + if (!strcmp(name, "skb_priority")) { + len = num_arg(&user_buffer[i], 9, &value); + if (len < 0) + return len; + + i += len; + pkt_dev->skb_priority = value; + sprintf(pg_result, "OK: skb_priority=%i", + pkt_dev->skb_priority); + return count; + } + + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); + return -EINVAL; +} + +static int pktgen_if_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_if_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_if_fops = { + .owner = THIS_MODULE, + .open = pktgen_if_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_if_write, + .release = single_release, +}; + +static int pktgen_thread_show(struct seq_file *seq, void *v) +{ + struct pktgen_thread *t = seq->private; + const struct pktgen_dev *pkt_dev; + + BUG_ON(!t); + + seq_printf(seq, "Running: "); + + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) + if (pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + seq_printf(seq, "\nStopped: "); + + list_for_each_entry(pkt_dev, &t->if_list, list) + if (!pkt_dev->running) + seq_printf(seq, "%s ", pkt_dev->odevname); + + if (t->result[0]) + seq_printf(seq, "\nResult: %s\n", t->result); + else + seq_printf(seq, "\nResult: NA\n"); + + if_unlock(t); + + return 0; +} + +static ssize_t pktgen_thread_write(struct file *file, + const char __user * user_buffer, + size_t count, loff_t * offset) +{ + struct seq_file *seq = file->private_data; + struct pktgen_thread *t = seq->private; + int i, max, len, ret; + char name[40]; + char *pg_result; + + if (count < 1) { + // sprintf(pg_result, "Wrong command format"); + return -EINVAL; + } + + max = count; + len = count_trail_chars(user_buffer, max); + if (len < 0) + return len; + + i = len; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) + printk(KERN_DEBUG "pktgen: t=%s, count=%lu\n", + name, (unsigned long)count); + + if (!t) { + pr_err("ERROR: No thread\n"); + ret = -EINVAL; + goto out; + } + + pg_result = &(t->result[0]); + + if (!strcmp(name, "add_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + mutex_lock(&pktgen_thread_lock); + pktgen_add_device(t, f); + mutex_unlock(&pktgen_thread_lock); + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + goto out; + } + + if (!strcmp(name, "rem_device_all")) { + mutex_lock(&pktgen_thread_lock); + t->control |= T_REMDEVALL; + mutex_unlock(&pktgen_thread_lock); + schedule_timeout_interruptible(msecs_to_jiffies(125)); /* Propagate thread->control */ + ret = count; + sprintf(pg_result, "OK: rem_device_all"); + goto out; + } + + if (!strcmp(name, "max_before_softirq")) { + sprintf(pg_result, "OK: Note! max_before_softirq is obsoleted -- Do not use"); + ret = count; + goto out; + } + + ret = -EINVAL; +out: + return ret; +} + +static int pktgen_thread_open(struct inode *inode, struct file *file) +{ + return single_open(file, pktgen_thread_show, PDE(inode)->data); +} + +static const struct file_operations pktgen_thread_fops = { + .owner = THIS_MODULE, + .open = pktgen_thread_open, + .read = seq_read, + .llseek = seq_lseek, + .write = pktgen_thread_write, + .release = single_release, +}; + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const char *ifname, int remove) +{ + struct pktgen_thread *t; + struct pktgen_dev *pkt_dev = NULL; + bool exact = (remove == FIND); + + list_for_each_entry(t, &pktgen_threads, th_list) { + pkt_dev = pktgen_find_dev(t, ifname, exact); + if (pkt_dev) { + if (remove) { + if_lock(t); + pkt_dev->removal_mark = 1; + t->control |= T_REMDEV; + if_unlock(t); + } + break; + } + } + return pkt_dev; +} + +/* + * mark a device for removal + */ +static void pktgen_mark_device(const char *ifname) +{ + struct pktgen_dev *pkt_dev = NULL; + const int max_tries = 10, msec_per_try = 125; + int i = 0; + + mutex_lock(&pktgen_thread_lock); + pr_debug("%s: marking %s for removal\n", __func__, ifname); + + while (1) { + + pkt_dev = __pktgen_NN_threads(ifname, REMOVE); + if (pkt_dev == NULL) + break; /* success */ + + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); + schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { + pr_err("%s: timed out after waiting %d msec for device %s to be removed\n", + __func__, msec_per_try * i, ifname); + break; + } + + } + + mutex_unlock(&pktgen_thread_lock); +} + +static void pktgen_change_name(struct net_device *dev) +{ + struct pktgen_thread *t; + + list_for_each_entry(t, &pktgen_threads, th_list) { + struct pktgen_dev *pkt_dev; + + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (pkt_dev->odev != dev) + continue; + + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + + pkt_dev->entry = proc_create_data(dev->name, 0600, + pg_proc_dir, + &pktgen_if_fops, + pkt_dev); + if (!pkt_dev->entry) + pr_err("can't move proc entry for '%s'\n", + dev->name); + break; + } + } +} + +static int pktgen_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (!net_eq(dev_net(dev), &init_net) || pktgen_exiting) + return NOTIFY_DONE; + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGENAME: + pktgen_change_name(dev); + break; + + case NETDEV_UNREGISTER: + pktgen_mark_device(dev->name); + break; + } + + return NOTIFY_DONE; +} + +static struct net_device *pktgen_dev_get_by_name(struct pktgen_dev *pkt_dev, + const char *ifname) +{ + char b[IFNAMSIZ+5]; + int i; + + for (i = 0; ifname[i] != '@'; i++) { + if (i == IFNAMSIZ) + break; + + b[i] = ifname[i]; + } + b[i] = 0; + + return dev_get_by_name(&init_net, b); +} + + +/* Associate pktgen_dev with a device. */ + +static int pktgen_setup_dev(struct pktgen_dev *pkt_dev, const char *ifname) +{ + struct net_device *odev; + int err; + + /* Clean old setups */ + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + odev = pktgen_dev_get_by_name(pkt_dev, ifname); + if (!odev) { + pr_err("no such netdevice: \"%s\"\n", ifname); + return -ENODEV; + } + + if (odev->type != ARPHRD_ETHER) { + pr_err("not an ethernet device: \"%s\"\n", ifname); + err = -EINVAL; + } else if (!netif_running(odev)) { + pr_err("device is down: \"%s\"\n", ifname); + err = -ENETDOWN; + } else { + pkt_dev->odev = odev; + return 0; + } + + dev_put(odev); + return err; +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ + int ntxq; + + if (!pkt_dev->odev) { + pr_err("ERROR: pkt_dev->odev == NULL in setup_inject\n"); + sprintf(pkt_dev->result, + "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + return; + } + + /* make sure that we don't pick a non-existing transmit queue */ + ntxq = pkt_dev->odev->real_num_tx_queues; + + if (ntxq <= pkt_dev->queue_map_min) { + pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_min = (ntxq ?: 1) - 1; + } + if (pkt_dev->queue_map_max >= ntxq) { + pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n", + pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq, + pkt_dev->odevname); + pkt_dev->queue_map_max = (ntxq ?: 1) - 1; + } + + /* Default to the interface's mac if not explicitly set. */ + + if (is_zero_ether_addr(pkt_dev->src_mac)) + memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, ETH_ALEN); + + /* Set up Dest MAC */ + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); + + /* Set up pkt size */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + + if (pkt_dev->flags & F_IPV6) { + /* + * Skip this automatic address setting until locks or functions + * gets exported + */ + +#ifdef NOTNOW + int i, set = 0, err = 1; + struct inet6_dev *idev; + + for (i = 0; i < IN6_ADDR_HSIZE; i++) + if (pkt_dev->cur_in6_saddr.s6_addr[i]) { + set = 1; + break; + } + + if (!set) { + + /* + * Use linklevel address if unconfigured. + * + * use ipv6_get_lladdr if/when it's get exported + */ + + rcu_read_lock(); + idev = __in6_dev_get(pkt_dev->odev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; + ifp = ifp->if_next) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & IFA_F_TENTATIVE)) { + pkt_dev->cur_in6_saddr = ifp->addr; + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + if (err) + pr_err("ERROR: IPv6 link address not available\n"); + } +#endif + } else { + pkt_dev->saddr_min = 0; + pkt_dev->saddr_max = 0; + if (strlen(pkt_dev->src_min) == 0) { + + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(pkt_dev->odev); + if (in_dev) { + if (in_dev->ifa_list) { + pkt_dev->saddr_min = + in_dev->ifa_list->ifa_address; + pkt_dev->saddr_max = pkt_dev->saddr_min; + } + } + rcu_read_unlock(); + } else { + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + } + + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + } + /* Initialize current values. */ + pkt_dev->cur_dst_mac_offset = 0; + pkt_dev->cur_src_mac_offset = 0; + pkt_dev->cur_saddr = pkt_dev->saddr_min; + pkt_dev->cur_daddr = pkt_dev->daddr_min; + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + pkt_dev->nflows = 0; +} + + +static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) +{ + ktime_t start_time, end_time; + s64 remaining; + struct hrtimer_sleeper t; + + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_set_expires(&t.timer, spin_until); + + remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); + if (remaining <= 0) { + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + return; + } + + start_time = ktime_now(); + if (remaining < 100000) { + /* for small delays (<100us), just loop until limit is reached */ + do { + end_time = ktime_now(); + } while (ktime_lt(end_time, spin_until)); + } else { + /* see do_nanosleep */ + hrtimer_init_sleeper(&t, current); + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + } while (t.task && pkt_dev->running && !signal_pending(current)); + __set_current_state(TASK_RUNNING); + end_time = ktime_now(); + } + + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); + pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); +} + +static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) +{ + pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); + pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); + pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +} + +static inline int f_seen(const struct pktgen_dev *pkt_dev, int flow) +{ + return !!(pkt_dev->flows[flow].flags & F_INIT); +} + +static inline int f_pick(struct pktgen_dev *pkt_dev) +{ + int flow = pkt_dev->curfl; + + if (pkt_dev->flags & F_FLOW_SEQ) { + if (pkt_dev->flows[flow].count >= pkt_dev->lflow) { + /* reset time */ + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + pkt_dev->curfl += 1; + if (pkt_dev->curfl >= pkt_dev->cflows) + pkt_dev->curfl = 0; /*reset */ + } + } else { + flow = random32() % pkt_dev->cflows; + pkt_dev->curfl = flow; + + if (pkt_dev->flows[flow].count > pkt_dev->lflow) { + pkt_dev->flows[flow].count = 0; + pkt_dev->flows[flow].flags = 0; + } + } + + return pkt_dev->curfl; +} + + +#ifdef CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +#define DUMMY_MARK 0 +static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ + struct xfrm_state *x = pkt_dev->flows[flow].x; + if (!x) { + /*slow path: we dont already have xfrm_state*/ + x = xfrm_stateonly_find(&init_net, DUMMY_MARK, + (xfrm_address_t *)&pkt_dev->cur_daddr, + (xfrm_address_t *)&pkt_dev->cur_saddr, + AF_INET, + pkt_dev->ipsmode, + pkt_dev->ipsproto, 0); + if (x) { + pkt_dev->flows[flow].x = x; + set_pkt_overhead(pkt_dev); + pkt_dev->pkt_overhead += x->props.header_len; + } + + } +} +#endif +static void set_cur_queue_map(struct pktgen_dev *pkt_dev) +{ + + if (pkt_dev->flags & F_QUEUE_MAP_CPU) + pkt_dev->cur_queue_map = smp_processor_id(); + + else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { + t = random32() % + (pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; + } else { + t = pkt_dev->cur_queue_map + 1; + if (t > pkt_dev->queue_map_max) + t = pkt_dev->queue_map_min; + } + pkt_dev->cur_queue_map = t; + } + pkt_dev->cur_queue_map = pkt_dev->cur_queue_map % pkt_dev->odev->real_num_tx_queues; +} + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) +{ + __u32 imn; + __u32 imx; + int flow = 0; + + if (pkt_dev->cflows) + flow = f_pick(pkt_dev); + + /* Deal with source MAC */ + if (pkt_dev->src_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACSRC_RND) + mc = random32() % pkt_dev->src_mac_count; + else { + mc = pkt_dev->cur_src_mac_offset++; + if (pkt_dev->cur_src_mac_offset >= + pkt_dev->src_mac_count) + pkt_dev->cur_src_mac_offset = 0; + } + + tmp = pkt_dev->src_mac[5] + (mc & 0xFF); + pkt_dev->hh[11] = tmp; + tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[10] = tmp; + tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[9] = tmp; + tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[8] = tmp; + tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); + pkt_dev->hh[7] = tmp; + } + + /* Deal with Destination MAC */ + if (pkt_dev->dst_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACDST_RND) + mc = random32() % pkt_dev->dst_mac_count; + + else { + mc = pkt_dev->cur_dst_mac_offset++; + if (pkt_dev->cur_dst_mac_offset >= + pkt_dev->dst_mac_count) { + pkt_dev->cur_dst_mac_offset = 0; + } + } + + tmp = pkt_dev->dst_mac[5] + (mc & 0xFF); + pkt_dev->hh[5] = tmp; + tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[4] = tmp; + tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[3] = tmp; + tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[2] = tmp; + tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); + pkt_dev->hh[1] = tmp; + } + + if (pkt_dev->flags & F_MPLS_RND) { + unsigned i; + for (i = 0; i < pkt_dev->nr_labels; i++) + if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) + pkt_dev->labels[i] = MPLS_STACK_BOTTOM | + ((__force __be32)random32() & + htonl(0x000fffff)); + } + + if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { + pkt_dev->vlan_id = random32() & (4096-1); + } + + if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { + pkt_dev->svlan_id = random32() & (4096 - 1); + } + + if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { + if (pkt_dev->flags & F_UDPSRC_RND) + pkt_dev->cur_udp_src = random32() % + (pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; + + else { + pkt_dev->cur_udp_src++; + if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + } + } + + if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { + if (pkt_dev->flags & F_UDPDST_RND) { + pkt_dev->cur_udp_dst = random32() % + (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; + } else { + pkt_dev->cur_udp_dst++; + if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + } + } + + if (!(pkt_dev->flags & F_IPV6)) { + + imn = ntohl(pkt_dev->saddr_min); + imx = ntohl(pkt_dev->saddr_max); + if (imn < imx) { + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) + t = random32() % (imx - imn) + imn; + else { + t = ntohl(pkt_dev->cur_saddr); + t++; + if (t > imx) + t = imn; + + } + pkt_dev->cur_saddr = htonl(t); + } + + if (pkt_dev->cflows && f_seen(pkt_dev, flow)) { + pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; + } else { + imn = ntohl(pkt_dev->daddr_min); + imx = ntohl(pkt_dev->daddr_max); + if (imn < imx) { + __u32 t; + __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { + + t = random32() % (imx - imn) + imn; + s = htonl(t); + + while (ipv4_is_loopback(s) || + ipv4_is_multicast(s) || + ipv4_is_lbcast(s) || + ipv4_is_zeronet(s) || + ipv4_is_local_multicast(s)) { + t = random32() % (imx - imn) + imn; + s = htonl(t); + } + pkt_dev->cur_daddr = s; + } else { + t = ntohl(pkt_dev->cur_daddr); + t++; + if (t > imx) { + t = imn; + } + pkt_dev->cur_daddr = htonl(t); + } + } + if (pkt_dev->cflows) { + pkt_dev->flows[flow].flags |= F_INIT; + pkt_dev->flows[flow].cur_daddr = + pkt_dev->cur_daddr; +#ifdef CONFIG_XFRM + if (pkt_dev->flags & F_IPSEC_ON) + get_ipsec_sa(pkt_dev, flow); +#endif + pkt_dev->nflows++; + } + } + } else { /* IPV6 * */ + + if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; + else { + int i; + + /* Only random destinations yet */ + + for (i = 0; i < 4; i++) { + pkt_dev->cur_in6_daddr.s6_addr32[i] = + (((__force __be32)random32() | + pkt_dev->min_in6_daddr.s6_addr32[i]) & + pkt_dev->max_in6_daddr.s6_addr32[i]); + } + } + } + + if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { + t = random32() % + (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; + } else { + t = pkt_dev->cur_pkt_size + 1; + if (t > pkt_dev->max_pkt_size) + t = pkt_dev->min_pkt_size; + } + pkt_dev->cur_pkt_size = t; + } + + set_cur_queue_map(pkt_dev); + + pkt_dev->flows[flow].count++; +} + + +#ifdef CONFIG_XFRM +static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) +{ + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int err = 0; + + if (!x) + return 0; + /* XXX: we dont support tunnel mode for now until + * we resolve the dst issue */ + if (x->props.mode != XFRM_MODE_TRANSPORT) + return 0; + + spin_lock(&x->lock); + + err = x->outer_mode->output(x, skb); + if (err) + goto error; + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; +error: + spin_unlock(&x->lock); + return err; +} + +static void free_SAs(struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->cflows) { + /* let go of the SAs if we have them */ + int i; + for (i = 0; i < pkt_dev->cflows; i++) { + struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { + xfrm_state_put(x); + pkt_dev->flows[i].x = NULL; + } + } + } +} + +static int process_ipsec(struct pktgen_dev *pkt_dev, + struct sk_buff *skb, __be16 protocol) +{ + if (pkt_dev->flags & F_IPSEC_ON) { + struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; + int nhead = 0; + if (x) { + int ret; + __u8 *eth; + nhead = x->props.header_len - skb_headroom(skb); + if (nhead > 0) { + ret = pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); + if (ret < 0) { + pr_err("Error expanding ipsec packet %d\n", + ret); + goto err; + } + } + + /* ipsec is not expecting ll header */ + skb_pull(skb, ETH_HLEN); + ret = pktgen_output_ipsec(skb, pkt_dev); + if (ret) { + pr_err("Error creating ipsec packet %d\n", ret); + goto err; + } + /* restore ll */ + eth = (__u8 *) skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 12); + *(u16 *) ð[12] = protocol; + } + } + return 1; +err: + kfree_skb(skb); + return 0; +} +#endif + +static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) +{ + unsigned i; + for (i = 0; i < pkt_dev->nr_labels; i++) + *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; + + mpls--; + *mpls |= MPLS_STACK_BOTTOM; +} + +static inline __be16 build_tci(unsigned int id, unsigned int cfi, + unsigned int prio) +{ + return htons(id | (cfi << 12) | (prio << 13)); +} + +static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, + int datalen) +{ + struct timeval timestamp; + struct pktgen_hdr *pgh; + + pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); + datalen -= sizeof(*pgh); + + if (pkt_dev->nfrags <= 0) { + memset(skb_put(skb, datalen), 0, datalen); + } else { + int frags = pkt_dev->nfrags; + int i, len; + int frag_len; + + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + len = datalen - frags * PAGE_SIZE; + if (len > 0) { + memset(skb_put(skb, len), 0, len); + datalen = frags * PAGE_SIZE; + } + + i = 0; + frag_len = (datalen/frags) < PAGE_SIZE ? + (datalen/frags) : PAGE_SIZE; + while (datalen > 0) { + if (unlikely(!pkt_dev->page)) { + int node = numa_node_id(); + + if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE)) + node = pkt_dev->node; + pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!pkt_dev->page) + break; + } + get_page(pkt_dev->page); + skb_frag_set_page(skb, i, pkt_dev->page); + skb_shinfo(skb)->frags[i].page_offset = 0; + /*last fragment, fill rest of data*/ + if (i == (frags - 1)) + skb_frag_size_set(&skb_shinfo(skb)->frags[i], + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE)); + else + skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len); + datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]); + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, + * convert them to network byte order + */ + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); +} + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, iplen; + struct iphdr *iph; + __be16 protocol = htons(ETH_P_IP); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + datalen = (odev->hard_header_len + 16) & ~0xf; + + if (pkt_dev->flags & F_NODE) { + int node; + + if (pkt_dev->node >= 0) + node = pkt_dev->node; + else + node = numa_node_id(); + + skb = __alloc_skb(NET_SKB_PAD + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT, 0, node); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = odev; + } + } + else + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + datalen + pkt_dev->pkt_overhead, GFP_NOWAIT); + + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + prefetchw(skb->data); + + skb_reserve(skb, datalen); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IP); + } + + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct iphdr); + skb_put(skb, sizeof(struct iphdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + + iph = ip_hdr(skb); + udph = udp_hdr(skb); + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) & eth[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - + pkt_dev->pkt_overhead; + if (datalen < sizeof(struct pktgen_hdr)) + datalen = sizeof(struct pktgen_hdr); + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + 8); /* DATA + udphdr */ + udph->check = 0; /* No checksum */ + + iph->ihl = 5; + iph->version = 4; + iph->ttl = 32; + iph->tos = pkt_dev->tos; + iph->protocol = IPPROTO_UDP; /* UDP */ + iph->saddr = pkt_dev->cur_saddr; + iph->daddr = pkt_dev->cur_daddr; + iph->id = htons(pkt_dev->ip_id); + pkt_dev->ip_id++; + iph->frag_off = 0; + iplen = 20 + 8 + datalen; + iph->tot_len = htons(iplen); + iph->check = 0; + iph->check = ip_fast_csum((void *)iph, iph->ihl); + skb->protocol = protocol; + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->pkt_overhead); + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + pktgen_finalize_skb(pkt_dev, skb, datalen); + +#ifdef CONFIG_XFRM + if (!process_ipsec(pkt_dev, skb, protocol)) + return NULL; +#endif + + return skb; +} + +/* + * scan_ip6, fmt_ip taken from dietlibc-0.21 + * Author Felix von Leitner <felix-dietlibc@fefe.de> + * + * Slightly modified for kernel. + * Should be candidate for net/ipv4/utils.c + * --ro + */ + +static unsigned int scan_ip6(const char *s, char ip[16]) +{ + unsigned int i; + unsigned int len = 0; + unsigned long u; + char suffix[16]; + unsigned int prefixlen = 0; + unsigned int suffixlen = 0; + __be32 tmp; + char *pos; + + for (i = 0; i < 16; i++) + ip[i] = 0; + + for (;;) { + if (*s == ':') { + len++; + if (s[1] == ':') { /* Found "::", skip to part 2 */ + s += 2; + len++; + break; + } + s++; + } + + u = simple_strtoul(s, &pos, 16); + i = pos - s; + if (!i) + return 0; + if (prefixlen == 12 && s[i] == '.') { + + /* the last 4 bytes may be written as IPv4 address */ + + tmp = in_aton(s); + memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp)); + return i + len; + } + ip[prefixlen++] = (u >> 8); + ip[prefixlen++] = (u & 255); + s += i; + len += i; + if (prefixlen == 16) + return len; + } + +/* part 2, after "::" */ + for (;;) { + if (*s == ':') { + if (suffixlen == 0) + break; + s++; + len++; + } else if (suffixlen != 0) + break; + + u = simple_strtol(s, &pos, 16); + i = pos - s; + if (!i) { + if (*s) + len--; + break; + } + if (suffixlen + prefixlen <= 12 && s[i] == '.') { + tmp = in_aton(s); + memcpy((struct in_addr *)(suffix + suffixlen), &tmp, + sizeof(tmp)); + suffixlen += 4; + len += strlen(s); + break; + } + suffix[suffixlen++] = (u >> 8); + suffix[suffixlen++] = (u & 255); + s += i; + len += i; + if (prefixlen + suffixlen == 16) + break; + } + for (i = 0; i < suffixlen; i++) + ip[16 - suffixlen + i] = suffix[i]; + return len; +} + +static struct sk_buff *fill_packet_ipv6(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen; + struct ipv6hdr *iph; + __be16 protocol = htons(ETH_P_IPV6); + __be32 *mpls; + __be16 *vlan_tci = NULL; /* Encapsulates priority and VLAN ID */ + __be16 *vlan_encapsulated_proto = NULL; /* packet type ID field (or len) for VLAN tag */ + __be16 *svlan_tci = NULL; /* Encapsulates priority and SVLAN ID */ + __be16 *svlan_encapsulated_proto = NULL; /* packet type ID field (or len) for SVLAN tag */ + u16 queue_map; + + if (pkt_dev->nr_labels) + protocol = htons(ETH_P_MPLS_UC); + + if (pkt_dev->vlan_id != 0xffff) + protocol = htons(ETH_P_8021Q); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + queue_map = pkt_dev->cur_queue_map; + + skb = __netdev_alloc_skb(odev, + pkt_dev->cur_pkt_size + 64 + + 16 + pkt_dev->pkt_overhead, GFP_NOWAIT); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + prefetchw(skb->data); + + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + if (pkt_dev->nr_labels) + mpls_push(mpls, pkt_dev); + + if (pkt_dev->vlan_id != 0xffff) { + if (pkt_dev->svlan_id != 0xffff) { + svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_tci = build_tci(pkt_dev->svlan_id, + pkt_dev->svlan_cfi, + pkt_dev->svlan_p); + svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *svlan_encapsulated_proto = htons(ETH_P_8021Q); + } + vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_tci = build_tci(pkt_dev->vlan_id, + pkt_dev->vlan_cfi, + pkt_dev->vlan_p); + vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + *vlan_encapsulated_proto = htons(ETH_P_IPV6); + } + + skb->network_header = skb->tail; + skb->transport_header = skb->network_header + sizeof(struct ipv6hdr); + skb_put(skb, sizeof(struct ipv6hdr) + sizeof(struct udphdr)); + skb_set_queue_mapping(skb, queue_map); + skb->priority = pkt_dev->skb_priority; + iph = ipv6_hdr(skb); + udph = udp_hdr(skb); + + memcpy(eth, pkt_dev->hh, 12); + *(__be16 *) ð[12] = protocol; + + /* Eth + IPh + UDPh + mpls */ + datalen = pkt_dev->cur_pkt_size - 14 - + sizeof(struct ipv6hdr) - sizeof(struct udphdr) - + pkt_dev->pkt_overhead; + + if (datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + if (net_ratelimit()) + pr_info("increased datalen to %d\n", datalen); + } + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + sizeof(struct udphdr)); + udph->check = 0; /* No checksum */ + + *(__be32 *) iph = htonl(0x60000000); /* Version + flow */ + + if (pkt_dev->traffic_class) { + /* Version + traffic class + flow (0) */ + *(__be32 *)iph |= htonl(0x60000000 | (pkt_dev->traffic_class << 20)); + } + + iph->hop_limit = 32; + + iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->nexthdr = IPPROTO_UDP; + + iph->daddr = pkt_dev->cur_in6_daddr; + iph->saddr = pkt_dev->cur_in6_saddr; + + skb->mac_header = (skb->network_header - ETH_HLEN - + pkt_dev->pkt_overhead); + skb->protocol = protocol; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + pktgen_finalize_skb(pkt_dev, skb, datalen); + + return skb; +} + +static struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + if (pkt_dev->flags & F_IPV6) + return fill_packet_ipv6(odev, pkt_dev); + else + return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) +{ + pkt_dev->seq_num = 1; + pkt_dev->idle_acc = 0; + pkt_dev->sofar = 0; + pkt_dev->tx_bytes = 0; + pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + int started = 0; + + func_enter(); + + if_lock(t); + list_for_each_entry(pkt_dev, &t->if_list, list) { + + /* + * setup odev and create initial packet. + */ + pktgen_setup_inject(pkt_dev); + + if (pkt_dev->odev) { + pktgen_clear_counters(pkt_dev); + pkt_dev->running = 1; /* Cranke yeself! */ + pkt_dev->skb = NULL; + pkt_dev->started_at = + pkt_dev->next_tx = ktime_now(); + + set_pkt_overhead(pkt_dev); + + strcpy(pkt_dev->result, "Starting"); + started++; + } else + strcpy(pkt_dev->result, "Error starting"); + } + if_unlock(t); + if (started) + t->control &= ~(T_STOP); +} + +static void pktgen_stop_all_threads_ifs(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= T_STOP; + + mutex_unlock(&pktgen_thread_lock); +} + +static int thread_is_running(const struct pktgen_thread *t) +{ + const struct pktgen_dev *pkt_dev; + + list_for_each_entry(pkt_dev, &t->if_list, list) + if (pkt_dev->running) + return 1; + return 0; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t) +{ + if_lock(t); + + while (thread_is_running(t)) { + + if_unlock(t); + + msleep_interruptible(100); + + if (signal_pending(current)) + goto signal; + if_lock(t); + } + if_unlock(t); + return 1; +signal: + return 0; +} + +static int pktgen_wait_all_threads_run(void) +{ + struct pktgen_thread *t; + int sig = 1; + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) { + sig = pktgen_wait_thread_run(t); + if (sig == 0) + break; + } + + if (sig == 0) + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_STOP); + + mutex_unlock(&pktgen_thread_lock); + return sig; +} + +static void pktgen_run_all_threads(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_RUN); + + mutex_unlock(&pktgen_thread_lock); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(); +} + +static void pktgen_reset_all_threads(void) +{ + struct pktgen_thread *t; + + func_enter(); + + mutex_lock(&pktgen_thread_lock); + + list_for_each_entry(t, &pktgen_threads, th_list) + t->control |= (T_REMDEVALL); + + mutex_unlock(&pktgen_thread_lock); + + /* Propagate thread->control */ + schedule_timeout_interruptible(msecs_to_jiffies(125)); + + pktgen_wait_all_threads_run(); +} + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ + __u64 bps, mbps, pps; + char *p = pkt_dev->result; + ktime_t elapsed = ktime_sub(pkt_dev->stopped_at, + pkt_dev->started_at); + ktime_t idle = ns_to_ktime(pkt_dev->idle_acc); + + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", + (unsigned long long)ktime_to_us(elapsed), + (unsigned long long)ktime_to_us(ktime_sub(elapsed, idle)), + (unsigned long long)ktime_to_us(idle), + (unsigned long long)pkt_dev->sofar, + pkt_dev->cur_pkt_size, nr_frags); + + pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, + ktime_to_ns(elapsed)); + + bps = pps * 8 * pkt_dev->cur_pkt_size; + + mbps = bps; + do_div(mbps, 1000000); + p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu", + (unsigned long long)pps, + (unsigned long long)mbps, + (unsigned long long)bps, + (unsigned long long)pkt_dev->errors); +} + +/* Set stopped-at timer, remove from running list, do counters & statistics */ +static int pktgen_stop_device(struct pktgen_dev *pkt_dev) +{ + int nr_frags = pkt_dev->skb ? skb_shinfo(pkt_dev->skb)->nr_frags : -1; + + if (!pkt_dev->running) { + pr_warning("interface: %s is already stopped\n", + pkt_dev->odevname); + return -EINVAL; + } + + kfree_skb(pkt_dev->skb); + pkt_dev->skb = NULL; + pkt_dev->stopped_at = ktime_now(); + pkt_dev->running = 0; + + show_results(pkt_dev, nr_frags); + + return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev, *best = NULL; + + if_lock(t); + + list_for_each_entry(pkt_dev, &t->if_list, list) { + if (!pkt_dev->running) + continue; + if (best == NULL) + best = pkt_dev; + else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) + best = pkt_dev; + } + if_unlock(t); + return best; +} + +static void pktgen_stop(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev; + + func_enter(); + + if_lock(t); + + list_for_each_entry(pkt_dev, &t->if_list, list) { + pktgen_stop_device(pkt_dev); + } + + if_unlock(t); +} + +/* + * one of our devices needs to be removed - find it + * and remove it + */ +static void pktgen_rem_one_if(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + if_lock(t); + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + if (!cur->removal_mark) + continue; + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + + break; + } + + if_unlock(t); +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t) +{ + struct list_head *q, *n; + struct pktgen_dev *cur; + + func_enter(); + + /* Remove all devices, free mem */ + + if_lock(t); + + list_for_each_safe(q, n, &t->if_list) { + cur = list_entry(q, struct pktgen_dev, list); + + kfree_skb(cur->skb); + cur->skb = NULL; + + pktgen_remove_device(t, cur); + } + + if_unlock(t); +} + +static void pktgen_rem_thread(struct pktgen_thread *t) +{ + /* Remove from the thread list */ + + remove_proc_entry(t->tsk->comm, pg_proc_dir); + +} + +static void pktgen_resched(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + schedule(); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +} + +static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) +{ + ktime_t idle_start = ktime_now(); + + while (atomic_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) + break; + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); +} + +static void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ + struct net_device *odev = pkt_dev->odev; + netdev_tx_t (*xmit)(struct sk_buff *, struct net_device *) + = odev->netdev_ops->ndo_start_xmit; + struct netdev_queue *txq; + u16 queue_map; + int ret; + + /* If device is offline, then don't send */ + if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) { + pktgen_stop_device(pkt_dev); + return; + } + + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (unlikely(pkt_dev->delay == ULLONG_MAX)) { + pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); + return; + } + + /* If no skb or clone count exhausted then get new one */ + if (!pkt_dev->skb || (pkt_dev->last_ok && + ++pkt_dev->clone_count >= pkt_dev->clone_skb)) { + /* build a new pkt */ + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + pr_err("ERROR: couldn't allocate skb in fill_packet\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + return; + } + pkt_dev->last_pkt_size = pkt_dev->skb->len; + pkt_dev->allocated_skbs++; + pkt_dev->clone_count = 0; /* reset counter */ + } + + if (pkt_dev->delay && pkt_dev->last_ok) + spin(pkt_dev, pkt_dev->next_tx); + + queue_map = skb_get_queue_mapping(pkt_dev->skb); + txq = netdev_get_tx_queue(odev, queue_map); + + __netif_tx_lock_bh(txq); + + if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + ret = NETDEV_TX_BUSY; + pkt_dev->last_ok = 0; + goto unlock; + } + atomic_inc(&(pkt_dev->skb->users)); + ret = (*xmit)(pkt_dev->skb, odev); + + switch (ret) { + case NETDEV_TX_OK: + txq_trans_update(txq); + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->last_pkt_size; + break; + case NET_XMIT_DROP: + case NET_XMIT_CN: + case NET_XMIT_POLICED: + /* skb has been consumed */ + pkt_dev->errors++; + break; + default: /* Drivers are not supposed to return other values! */ + if (net_ratelimit()) + pr_info("%s xmit error: %d\n", pkt_dev->odevname, ret); + pkt_dev->errors++; + /* fallthru */ + case NETDEV_TX_LOCKED: + case NETDEV_TX_BUSY: + /* Retry it next time */ + atomic_dec(&(pkt_dev->skb->users)); + pkt_dev->last_ok = 0; + } +unlock: + __netif_tx_unlock_bh(txq); + + /* If pkt_dev->count is zero, then run forever */ + if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { + pktgen_wait_for_skb(pkt_dev); + + /* Done with this */ + pktgen_stop_device(pkt_dev); + } +} + +/* + * Main loop of the thread goes here + */ + +static int pktgen_thread_worker(void *arg) +{ + DEFINE_WAIT(wait); + struct pktgen_thread *t = arg; + struct pktgen_dev *pkt_dev = NULL; + int cpu = t->cpu; + + BUG_ON(smp_processor_id() != cpu); + + init_waitqueue_head(&t->queue); + complete(&t->start_done); + + pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); + + set_current_state(TASK_INTERRUPTIBLE); + + set_freezable(); + + while (!kthread_should_stop()) { + pkt_dev = next_to_run(t); + + if (unlikely(!pkt_dev && t->control == 0)) { + if (pktgen_exiting) + break; + wait_event_interruptible_timeout(t->queue, + t->control != 0, + HZ/10); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + if (likely(pkt_dev)) { + pktgen_xmit(pkt_dev); + + if (need_resched()) + pktgen_resched(pkt_dev); + else + cpu_relax(); + } + + if (t->control & T_STOP) { + pktgen_stop(t); + t->control &= ~(T_STOP); + } + + if (t->control & T_RUN) { + pktgen_run(t); + t->control &= ~(T_RUN); + } + + if (t->control & T_REMDEVALL) { + pktgen_rem_all_ifs(t); + t->control &= ~(T_REMDEVALL); + } + + if (t->control & T_REMDEV) { + pktgen_rem_one_if(t); + t->control &= ~(T_REMDEV); + } + + try_to_freeze(); + + set_current_state(TASK_INTERRUPTIBLE); + } + + pr_debug("%s stopping all device\n", t->tsk->comm); + pktgen_stop(t); + + pr_debug("%s removing all device\n", t->tsk->comm); + pktgen_rem_all_ifs(t); + + pr_debug("%s removing thread\n", t->tsk->comm); + pktgen_rem_thread(t); + + /* Wait for kthread_stop */ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, + const char *ifname, bool exact) +{ + struct pktgen_dev *p, *pkt_dev = NULL; + size_t len = strlen(ifname); + + if_lock(t); + list_for_each_entry(p, &t->if_list, list) + if (strncmp(p->odevname, ifname, len) == 0) { + if (p->odevname[len]) { + if (exact || p->odevname[len] != '@') + continue; + } + pkt_dev = p; + break; + } + + if_unlock(t); + pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev); + return pkt_dev; +} + +/* + * Adds a dev at front of if_list. + */ + +static int add_dev_to_thread(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + int rv = 0; + + if_lock(t); + + if (pkt_dev->pg_thread) { + pr_err("ERROR: already assigned to a thread\n"); + rv = -EBUSY; + goto out; + } + + list_add(&pkt_dev->list, &t->if_list); + pkt_dev->pg_thread = t; + pkt_dev->running = 0; + +out: + if_unlock(t); + return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) +{ + struct pktgen_dev *pkt_dev; + int err; + int node = cpu_to_node(t->cpu); + + /* We don't allow a device to be on several threads */ + + pkt_dev = __pktgen_NN_threads(ifname, FIND); + if (pkt_dev) { + pr_err("ERROR: interface already used\n"); + return -EBUSY; + } + + pkt_dev = kzalloc_node(sizeof(struct pktgen_dev), GFP_KERNEL, node); + if (!pkt_dev) + return -ENOMEM; + + strcpy(pkt_dev->odevname, ifname); + pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + node); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + + pkt_dev->removal_mark = 0; + pkt_dev->min_pkt_size = ETH_ZLEN; + pkt_dev->max_pkt_size = ETH_ZLEN; + pkt_dev->nfrags = 0; + pkt_dev->delay = pg_delay_d; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + pkt_dev->vlan_p = 0; + pkt_dev->vlan_cfi = 0; + pkt_dev->vlan_id = 0xffff; + pkt_dev->svlan_p = 0; + pkt_dev->svlan_cfi = 0; + pkt_dev->svlan_id = 0xffff; + pkt_dev->node = -1; + + err = pktgen_setup_dev(pkt_dev, ifname); + if (err) + goto out1; + if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING) + pkt_dev->clone_skb = pg_clone_skb_d; + + pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir, + &pktgen_if_fops, pkt_dev); + if (!pkt_dev->entry) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, ifname); + err = -EINVAL; + goto out2; + } +#ifdef CONFIG_XFRM + pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; + pkt_dev->ipsproto = IPPROTO_ESP; +#endif + + return add_dev_to_thread(t, pkt_dev); +out2: + dev_put(pkt_dev->odev); +out1: +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + kfree(pkt_dev); + return err; +} + +static int __init pktgen_create_thread(int cpu) +{ + struct pktgen_thread *t; + struct proc_dir_entry *pe; + struct task_struct *p; + + t = kzalloc_node(sizeof(struct pktgen_thread), GFP_KERNEL, + cpu_to_node(cpu)); + if (!t) { + pr_err("ERROR: out of memory, can't create new thread\n"); + return -ENOMEM; + } + + spin_lock_init(&t->if_lock); + t->cpu = cpu; + + INIT_LIST_HEAD(&t->if_list); + + list_add_tail(&t->th_list, &pktgen_threads); + init_completion(&t->start_done); + + p = kthread_create_on_node(pktgen_thread_worker, + t, + cpu_to_node(cpu), + "kpktgend_%d", cpu); + if (IS_ERR(p)) { + pr_err("kernel_thread() failed for cpu %d\n", t->cpu); + list_del(&t->th_list); + kfree(t); + return PTR_ERR(p); + } + kthread_bind(p, cpu); + t->tsk = p; + + pe = proc_create_data(t->tsk->comm, 0600, pg_proc_dir, + &pktgen_thread_fops, t); + if (!pe) { + pr_err("cannot create %s/%s procfs entry\n", + PG_PROC_DIR, t->tsk->comm); + kthread_stop(p); + list_del(&t->th_list); + kfree(t); + return -EINVAL; + } + + wake_up_process(p); + wait_for_completion(&t->start_done); + + return 0; +} + +/* + * Removes a device from the thread if_list. + */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + struct list_head *q, *n; + struct pktgen_dev *p; + + list_for_each_safe(q, n, &t->if_list) { + p = list_entry(q, struct pktgen_dev, list); + if (p == pkt_dev) + list_del(&p->list); + } +} + +static int pktgen_remove_device(struct pktgen_thread *t, + struct pktgen_dev *pkt_dev) +{ + + pr_debug("remove_device pkt_dev=%p\n", pkt_dev); + + if (pkt_dev->running) { + pr_warning("WARNING: trying to remove a running interface, stopping it now\n"); + pktgen_stop_device(pkt_dev); + } + + /* Dis-associate from the interface */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + /* And update the thread if_list */ + + _rem_dev_from_if_list(t, pkt_dev); + + if (pkt_dev->entry) + remove_proc_entry(pkt_dev->entry->name, pg_proc_dir); + +#ifdef CONFIG_XFRM + free_SAs(pkt_dev); +#endif + vfree(pkt_dev->flows); + if (pkt_dev->page) + put_page(pkt_dev->page); + kfree(pkt_dev); + return 0; +} + +static int __init pg_init(void) +{ + int cpu; + struct proc_dir_entry *pe; + int ret = 0; + + pr_info("%s", version); + + pg_proc_dir = proc_mkdir(PG_PROC_DIR, init_net.proc_net); + if (!pg_proc_dir) + return -ENODEV; + + pe = proc_create(PGCTRL, 0600, pg_proc_dir, &pktgen_fops); + if (pe == NULL) { + pr_err("ERROR: cannot create %s procfs entry\n", PGCTRL); + ret = -EINVAL; + goto remove_dir; + } + + register_netdevice_notifier(&pktgen_notifier_block); + + for_each_online_cpu(cpu) { + int err; + + err = pktgen_create_thread(cpu); + if (err) + pr_warning("WARNING: Cannot create thread for cpu %d (%d)\n", + cpu, err); + } + + if (list_empty(&pktgen_threads)) { + pr_err("ERROR: Initialization failed for all threads\n"); + ret = -ENODEV; + goto unregister; + } + + return 0; + + unregister: + unregister_netdevice_notifier(&pktgen_notifier_block); + remove_proc_entry(PGCTRL, pg_proc_dir); + remove_dir: + proc_net_remove(&init_net, PG_PROC_DIR); + return ret; +} + +static void __exit pg_cleanup(void) +{ + struct pktgen_thread *t; + struct list_head *q, *n; + LIST_HEAD(list); + + /* Stop all interfaces & threads */ + pktgen_exiting = true; + + mutex_lock(&pktgen_thread_lock); + list_splice_init(&pktgen_threads, &list); + mutex_unlock(&pktgen_thread_lock); + + list_for_each_safe(q, n, &list) { + t = list_entry(q, struct pktgen_thread, th_list); + list_del(&t->th_list); + kthread_stop(t->tsk); + kfree(t); + } + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&pktgen_notifier_block); + + /* Clean up proc file system */ + remove_proc_entry(PGCTRL, pg_proc_dir); + proc_net_remove(&init_net, PG_PROC_DIR); +} + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson <robert.olsson@its.uu.se>"); +MODULE_DESCRIPTION("Packet Generator tool"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(VERSION); +module_param(pg_count_d, int, 0); +MODULE_PARM_DESC(pg_count_d, "Default number of packets to inject"); +module_param(pg_delay_d, int, 0); +MODULE_PARM_DESC(pg_delay_d, "Default delay between packets (nanoseconds)"); +module_param(pg_clone_skb_d, int, 0); +MODULE_PARM_DESC(pg_clone_skb_d, "Default number of copies of the same packet"); +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Enable debugging of pktgen module"); diff --git a/net/core/request_sock.c b/net/core/request_sock.c new file mode 100644 index 00000000..9b570a6a --- /dev/null +++ b/net/core/request_sock.c @@ -0,0 +1,132 @@ +/* + * NET Generic infrastructure for Network protocols. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * + * From code originally in include/net/tcp.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/vmalloc.h> + +#include <net/request_sock.h> + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * The minimum value of it is 128. Experiments with real servers show that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. + * This value is adjusted to 128 for low memory machines, + * and it will increase in proportion to the memory of machine. + * Note : Dont forget somaxconn that may limit backlog too. + */ +int sysctl_max_syn_backlog = 256; +EXPORT_SYMBOL(sysctl_max_syn_backlog); + +int reqsk_queue_alloc(struct request_sock_queue *queue, + unsigned int nr_table_entries) +{ + size_t lopt_size = sizeof(struct listen_sock); + struct listen_sock *lopt; + + nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog); + nr_table_entries = max_t(u32, nr_table_entries, 8); + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1); + lopt_size += nr_table_entries * sizeof(struct request_sock *); + if (lopt_size > PAGE_SIZE) + lopt = vzalloc(lopt_size); + else + lopt = kzalloc(lopt_size, GFP_KERNEL); + if (lopt == NULL) + return -ENOMEM; + + for (lopt->max_qlen_log = 3; + (1 << lopt->max_qlen_log) < nr_table_entries; + lopt->max_qlen_log++); + + get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); + rwlock_init(&queue->syn_wait_lock); + queue->rskq_accept_head = NULL; + lopt->nr_table_entries = nr_table_entries; + + write_lock_bh(&queue->syn_wait_lock); + queue->listen_opt = lopt; + write_unlock_bh(&queue->syn_wait_lock); + + return 0; +} + +void __reqsk_queue_destroy(struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + size_t lopt_size; + + /* + * this is an error recovery path only + * no locking needed and the lopt is not NULL + */ + + lopt = queue->listen_opt; + lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + +static inline struct listen_sock *reqsk_queue_yank_listen_sk( + struct request_sock_queue *queue) +{ + struct listen_sock *lopt; + + write_lock_bh(&queue->syn_wait_lock); + lopt = queue->listen_opt; + queue->listen_opt = NULL; + write_unlock_bh(&queue->syn_wait_lock); + + return lopt; +} + +void reqsk_queue_destroy(struct request_sock_queue *queue) +{ + /* make all the listen_opt local to us */ + struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue); + size_t lopt_size = sizeof(struct listen_sock) + + lopt->nr_table_entries * sizeof(struct request_sock *); + + if (lopt->qlen != 0) { + unsigned int i; + + for (i = 0; i < lopt->nr_table_entries; i++) { + struct request_sock *req; + + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + reqsk_free(req); + } + } + } + + WARN_ON(lopt->qlen != 0); + if (lopt_size > PAGE_SIZE) + vfree(lopt); + else + kfree(lopt); +} + diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c new file mode 100644 index 00000000..90430b77 --- /dev/null +++ b/net/core/rtnetlink.c @@ -0,0 +1,2148 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Routing netlink socket interface: protocol independent part. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. + */ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/if_addr.h> +#include <linux/pci.h> + +#include <asm/uaccess.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/protocol.h> +#include <net/arp.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/sock.h> +#include <net/pkt_sched.h> +#include <net/fib_rules.h> +#include <net/rtnetlink.h> +#include <net/net_namespace.h> + +struct rtnl_link { + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; + rtnl_calcit_func calcit; +}; + +static DEFINE_MUTEX(rtnl_mutex); + +void rtnl_lock(void) +{ + mutex_lock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock); + +void __rtnl_unlock(void) +{ + mutex_unlock(&rtnl_mutex); +} + +void rtnl_unlock(void) +{ + /* This fellow will unlock it for us. */ + netdev_run_todo(); +} +EXPORT_SYMBOL(rtnl_unlock); + +int rtnl_trylock(void) +{ + return mutex_trylock(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_trylock); + +int rtnl_is_locked(void) +{ + return mutex_is_locked(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_is_locked); + +#ifdef CONFIG_PROVE_LOCKING +int lockdep_rtnl_is_held(void) +{ + return lockdep_is_held(&rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_is_held); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ + +static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; + +static inline int rtm_msgindex(int msgtype) +{ + int msgindex = msgtype - RTM_BASE; + + /* + * msgindex < 0 implies someone tried to register a netlink + * control code. msgindex >= RTM_NR_MSGTYPES may indicate that + * the message type has not been added to linux/rtnetlink.h + */ + BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); + + return msgindex; +} + +static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + + if (tab == NULL || tab[msgindex].doit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].doit : NULL; +} + +static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + + if (tab == NULL || tab[msgindex].dumpit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].dumpit : NULL; +} + +static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) +{ + struct rtnl_link *tab; + + if (protocol <= RTNL_FAMILY_MAX) + tab = rtnl_msg_handlers[protocol]; + else + tab = NULL; + + if (tab == NULL || tab[msgindex].calcit == NULL) + tab = rtnl_msg_handlers[PF_UNSPEC]; + + return tab ? tab[msgindex].calcit : NULL; +} + +/** + * __rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @calcit: Function pointer to calc size of dump message + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + rtnl_calcit_func calcit) +{ + struct rtnl_link *tab; + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + tab = rtnl_msg_handlers[protocol]; + if (tab == NULL) { + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); + if (tab == NULL) + return -ENOBUFS; + + rtnl_msg_handlers[protocol] = tab; + } + + if (doit) + tab[msgindex].doit = doit; + + if (dumpit) + tab[msgindex].dumpit = dumpit; + + if (calcit) + tab[msgindex].calcit = calcit; + + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register - Register a rtnetlink message type + * + * Identical to __rtnl_register() but panics on failure. This is useful + * as failure of this function is very unlikely, it can only happen due + * to lack of memory when allocating the chain to store all message + * handlers for a protocol. Meant for use in init functions where lack + * of memory implies no sense in continuing. + */ +void rtnl_register(int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + rtnl_calcit_func calcit) +{ + if (__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0) + panic("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", + protocol, msgtype); +} +EXPORT_SYMBOL_GPL(rtnl_register); + +/** + * rtnl_unregister - Unregister a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * + * Returns 0 on success or a negative error code. + */ +int rtnl_unregister(int protocol, int msgtype) +{ + int msgindex; + + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + msgindex = rtm_msgindex(msgtype); + + if (rtnl_msg_handlers[protocol] == NULL) + return -ENOENT; + + rtnl_msg_handlers[protocol][msgindex].doit = NULL; + rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_unregister); + +/** + * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol + * @protocol : Protocol family or PF_UNSPEC + * + * Identical to calling rtnl_unregster() for all registered message types + * of a certain protocol family. + */ +void rtnl_unregister_all(int protocol) +{ + BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); + + kfree(rtnl_msg_handlers[protocol]); + rtnl_msg_handlers[protocol] = NULL; +} +EXPORT_SYMBOL_GPL(rtnl_unregister_all); + +static LIST_HEAD(link_ops); + +static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +{ + const struct rtnl_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) + return ops; + } + return NULL; +} + +/** + * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * The caller must hold the rtnl_mutex. This function should be used + * by drivers that create devices during module initialization. It + * must be called before registering the devices. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_link_register(struct rtnl_link_ops *ops) +{ + if (rtnl_link_ops_get(ops->kind)) + return -EEXIST; + + if (!ops->dellink) + ops->dellink = unregister_netdevice_queue; + + list_add_tail(&ops->list, &link_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_link_register); + +/** + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * @ops: struct rtnl_link_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_link_register(struct rtnl_link_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_link_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_link_register); + +static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) +{ + struct net_device *dev; + LIST_HEAD(list_kill); + + for_each_netdev(net, dev) { + if (dev->rtnl_link_ops == ops) + ops->dellink(dev, &list_kill); + } + unregister_netdevice_many(&list_kill); +} + +/** + * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + struct net *net; + + for_each_net(net) { + __rtnl_kill_links(net, ops); + } + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_link_unregister); + +/** + * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. + * @ops: struct rtnl_link_ops * to unregister + */ +void rtnl_link_unregister(struct rtnl_link_ops *ops) +{ + rtnl_lock(); + __rtnl_link_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_link_unregister); + +static size_t rtnl_link_get_size(const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + size_t size; + + if (!ops) + return 0; + + size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ + nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ + + if (ops->get_size) + /* IFLA_INFO_DATA + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + ops->get_size(dev); + + if (ops->get_xstats_size) + /* IFLA_INFO_XSTATS */ + size += nla_total_size(ops->get_xstats_size(dev)); + + return size; +} + +static LIST_HEAD(rtnl_af_ops); + +static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +{ + const struct rtnl_af_ops *ops; + + list_for_each_entry(ops, &rtnl_af_ops, list) { + if (ops->family == family) + return ops; + } + + return NULL; +} + +/** + * __rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * The caller must hold the rtnl_mutex. + * + * Returns 0 on success or a negative error code. + */ +int __rtnl_af_register(struct rtnl_af_ops *ops) +{ + list_add_tail(&ops->list, &rtnl_af_ops); + return 0; +} +EXPORT_SYMBOL_GPL(__rtnl_af_register); + +/** + * rtnl_af_register - Register rtnl_af_ops with rtnetlink. + * @ops: struct rtnl_af_ops * to register + * + * Returns 0 on success or a negative error code. + */ +int rtnl_af_register(struct rtnl_af_ops *ops) +{ + int err; + + rtnl_lock(); + err = __rtnl_af_register(ops); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(rtnl_af_register); + +/** + * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + * + * The caller must hold the rtnl_mutex. + */ +void __rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + list_del(&ops->list); +} +EXPORT_SYMBOL_GPL(__rtnl_af_unregister); + +/** + * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. + * @ops: struct rtnl_af_ops * to unregister + */ +void rtnl_af_unregister(struct rtnl_af_ops *ops) +{ + rtnl_lock(); + __rtnl_af_unregister(ops); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(rtnl_af_unregister); + +static size_t rtnl_link_get_af_size(const struct net_device *dev) +{ + struct rtnl_af_ops *af_ops; + size_t size; + + /* IFLA_AF_SPEC */ + size = nla_total_size(sizeof(struct nlattr)); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_link_af_size) { + /* AF_* + nested data */ + size += nla_total_size(sizeof(struct nlattr)) + + af_ops->get_link_af_size(dev); + } + } + + return size; +} + +static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) +{ + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + struct nlattr *linkinfo, *data; + int err = -EMSGSIZE; + + linkinfo = nla_nest_start(skb, IFLA_LINKINFO); + if (linkinfo == NULL) + goto out; + + if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) + goto err_cancel_link; + if (ops->fill_xstats) { + err = ops->fill_xstats(skb, dev); + if (err < 0) + goto err_cancel_link; + } + if (ops->fill_info) { + data = nla_nest_start(skb, IFLA_INFO_DATA); + if (data == NULL) + goto err_cancel_link; + err = ops->fill_info(skb, dev); + if (err < 0) + goto err_cancel_data; + nla_nest_end(skb, data); + } + + nla_nest_end(skb, linkinfo); + return 0; + +err_cancel_data: + nla_nest_cancel(skb, data); +err_cancel_link: + nla_nest_cancel(skb, linkinfo); +out: + return err; +} + +static const int rtm_min[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + [RTM_FAM(RTM_NEWADDR)] = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + [RTM_FAM(RTM_NEWROUTE)] = NLMSG_LENGTH(sizeof(struct rtmsg)), + [RTM_FAM(RTM_NEWRULE)] = NLMSG_LENGTH(sizeof(struct fib_rule_hdr)), + [RTM_FAM(RTM_NEWQDISC)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTCLASS)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWTFILTER)] = NLMSG_LENGTH(sizeof(struct tcmsg)), + [RTM_FAM(RTM_NEWACTION)] = NLMSG_LENGTH(sizeof(struct tcamsg)), + [RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), + [RTM_FAM(RTM_GETANYCAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)), +}; + +static const int rta_max[RTM_NR_FAMILIES] = +{ + [RTM_FAM(RTM_NEWLINK)] = IFLA_MAX, + [RTM_FAM(RTM_NEWADDR)] = IFA_MAX, + [RTM_FAM(RTM_NEWROUTE)] = RTA_MAX, + [RTM_FAM(RTM_NEWRULE)] = FRA_MAX, + [RTM_FAM(RTM_NEWQDISC)] = TCA_MAX, + [RTM_FAM(RTM_NEWTCLASS)] = TCA_MAX, + [RTM_FAM(RTM_NEWTFILTER)] = TCA_MAX, + [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); + memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); +} +EXPORT_SYMBOL(__rta_fill); + +int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo) +{ + struct sock *rtnl = net->rtnl; + int err = 0; + + NETLINK_CB(skb).dst_group = group; + if (echo) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); + if (echo) + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + return err; +} + +int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) +{ + struct sock *rtnl = net->rtnl; + + return nlmsg_unicast(rtnl, skb, pid); +} +EXPORT_SYMBOL(rtnl_unicast); + +void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, + struct nlmsghdr *nlh, gfp_t flags) +{ + struct sock *rtnl = net->rtnl; + int report = 0; + + if (nlh) + report = nlmsg_report(nlh); + + nlmsg_notify(rtnl, skb, pid, group, report, flags); +} +EXPORT_SYMBOL(rtnl_notify); + +void rtnl_set_sk_err(struct net *net, u32 group, int error) +{ + struct sock *rtnl = net->rtnl; + + netlink_set_err(rtnl, 0, group, error); +} +EXPORT_SYMBOL(rtnl_set_sk_err); + +int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) +{ + struct nlattr *mx; + int i, valid = 0; + + mx = nla_nest_start(skb, RTA_METRICS); + if (mx == NULL) + return -ENOBUFS; + + for (i = 0; i < RTAX_MAX; i++) { + if (metrics[i]) { + valid++; + NLA_PUT_U32(skb, i+1, metrics[i]); + } + } + + if (!valid) { + nla_nest_cancel(skb, mx); + return 0; + } + + return nla_nest_end(skb, mx); + +nla_put_failure: + nla_nest_cancel(skb, mx); + return -EMSGSIZE; +} +EXPORT_SYMBOL(rtnetlink_put_metrics); + +int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + u32 ts, u32 tsage, long expires, u32 error) +{ + struct rta_cacheinfo ci = { + .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), + .rta_used = dst->__use, + .rta_clntref = atomic_read(&(dst->__refcnt)), + .rta_error = error, + .rta_id = id, + .rta_ts = ts, + .rta_tsage = tsage, + }; + + if (expires) + ci.rta_expires = jiffies_to_clock_t(expires); + + return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); +} +EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); + +static void set_operstate(struct net_device *dev, unsigned char transition) +{ + unsigned char operstate = dev->operstate; + + switch (transition) { + case IF_OPER_UP: + if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_UNKNOWN) && + !netif_dormant(dev)) + operstate = IF_OPER_UP; + break; + + case IF_OPER_DORMANT: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_DORMANT; + break; + } + + if (dev->operstate != operstate) { + write_lock_bh(&dev_base_lock); + dev->operstate = operstate; + write_unlock_bh(&dev_base_lock); + netdev_state_change(dev); + } +} + +static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, + const struct ifinfomsg *ifm) +{ + unsigned int flags = ifm->ifi_flags; + + /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ + if (ifm->ifi_change) + flags = (flags & ifm->ifi_change) | + (dev->flags & ~ifm->ifi_change); + + return flags; +} + +static void copy_rtnl_link_stats(struct rtnl_link_stats *a, + const struct rtnl_link_stats64 *b) +{ + a->rx_packets = b->rx_packets; + a->tx_packets = b->tx_packets; + a->rx_bytes = b->rx_bytes; + a->tx_bytes = b->tx_bytes; + a->rx_errors = b->rx_errors; + a->tx_errors = b->tx_errors; + a->rx_dropped = b->rx_dropped; + a->tx_dropped = b->tx_dropped; + + a->multicast = b->multicast; + a->collisions = b->collisions; + + a->rx_length_errors = b->rx_length_errors; + a->rx_over_errors = b->rx_over_errors; + a->rx_crc_errors = b->rx_crc_errors; + a->rx_frame_errors = b->rx_frame_errors; + a->rx_fifo_errors = b->rx_fifo_errors; + a->rx_missed_errors = b->rx_missed_errors; + + a->tx_aborted_errors = b->tx_aborted_errors; + a->tx_carrier_errors = b->tx_carrier_errors; + a->tx_fifo_errors = b->tx_fifo_errors; + a->tx_heartbeat_errors = b->tx_heartbeat_errors; + a->tx_window_errors = b->tx_window_errors; + + a->rx_compressed = b->rx_compressed; + a->tx_compressed = b->tx_compressed; +} + +static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) +{ + memcpy(v, b, sizeof(*b)); +} + +/* All VF info */ +static inline int rtnl_vfinfo_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + if (dev->dev.parent && dev_is_pci(dev->dev.parent) && + (ext_filter_mask & RTEXT_FILTER_VF)) { + int num_vfs = dev_num_vf(dev->dev.parent); + size_t size = nla_total_size(sizeof(struct nlattr)); + size += nla_total_size(num_vfs * sizeof(struct nlattr)); + size += num_vfs * + (nla_total_size(sizeof(struct ifla_vf_mac)) + + nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + + nla_total_size(sizeof(struct ifla_vf_spoofchk))); + return size; + } else + return 0; +} + +static size_t rtnl_port_size(const struct net_device *dev) +{ + size_t port_size = nla_total_size(4) /* PORT_VF */ + + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + + nla_total_size(sizeof(struct ifla_port_vsi)) + /* PORT_VSI_TYPE */ + + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + + nla_total_size(1) /* PROT_VDP_REQUEST */ + + nla_total_size(2); /* PORT_VDP_RESPONSE */ + size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); + size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + + port_size; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + if (dev_num_vf(dev->dev.parent)) + return port_self_size + vf_ports_size + + vf_port_size * dev_num_vf(dev->dev.parent); + else + return port_self_size; +} + +static noinline size_t if_nlmsg_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size(sizeof(struct rtnl_link_stats64)) + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + + nla_total_size(4) /* IFLA_TXQLEN */ + + nla_total_size(4) /* IFLA_WEIGHT */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(ext_filter_mask + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ +} + +static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *vf_ports; + struct nlattr *vf_port; + int vf; + int err; + + vf_ports = nla_nest_start(skb, IFLA_VF_PORTS); + if (!vf_ports) + return -EMSGSIZE; + + for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { + vf_port = nla_nest_start(skb, IFLA_VF_PORT); + if (!vf_port) + goto nla_put_failure; + NLA_PUT_U32(skb, IFLA_PORT_VF, vf); + err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); + if (err == -EMSGSIZE) + goto nla_put_failure; + if (err) { + nla_nest_cancel(skb, vf_port); + continue; + } + nla_nest_end(skb, vf_port); + } + + nla_nest_end(skb, vf_ports); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, vf_ports); + return -EMSGSIZE; +} + +static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) +{ + struct nlattr *port_self; + int err; + + port_self = nla_nest_start(skb, IFLA_PORT_SELF); + if (!port_self) + return -EMSGSIZE; + + err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); + if (err) { + nla_nest_cancel(skb, port_self); + return (err == -EMSGSIZE) ? err : 0; + } + + nla_nest_end(skb, port_self); + + return 0; +} + +static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + + if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent) + return 0; + + err = rtnl_port_self_fill(skb, dev); + if (err) + return err; + + if (dev_num_vf(dev->dev.parent)) { + err = rtnl_vf_ports_fill(skb, dev); + if (err) + return err; + } + + return 0; +} + +static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, u32 ext_filter_mask) +{ + struct ifinfomsg *ifm; + struct nlmsghdr *nlh; + struct rtnl_link_stats64 temp; + const struct rtnl_link_stats64 *stats; + struct nlattr *attr, *af_spec; + struct rtnl_af_ops *af_ops; + + ASSERT_RTNL(); + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_UNSPEC; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = change; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + NLA_PUT_U32(skb, IFLA_TXQLEN, dev->tx_queue_len); + NLA_PUT_U8(skb, IFLA_OPERSTATE, + netif_running(dev) ? dev->operstate : IF_OPER_DOWN); + NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U32(skb, IFLA_GROUP, dev->group); + + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + if (dev->master) + NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex); + + if (dev->qdisc) + NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id); + + if (dev->ifalias) + NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias); + + if (1) { + struct rtnl_link_ifmap map = { + .mem_start = dev->mem_start, + .mem_end = dev->mem_end, + .base_addr = dev->base_addr, + .irq = dev->irq, + .dma = dev->dma, + .port = dev->if_port, + }; + NLA_PUT(skb, IFLA_MAP, sizeof(map), &map); + } + + if (dev->addr_len) { + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + NLA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast); + } + + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); + if (attr == NULL) + goto nla_put_failure; + + stats = dev_get_stats(dev, &temp); + copy_rtnl_link_stats(nla_data(attr), stats); + + attr = nla_reserve(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64)); + if (attr == NULL) + goto nla_put_failure; + copy_rtnl_link_stats64(nla_data(attr), stats); + + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) + NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); + + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent + && (ext_filter_mask & RTEXT_FILTER_VF)) { + int i; + + struct nlattr *vfinfo, *vf; + int num_vfs = dev_num_vf(dev->dev.parent); + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + goto nla_put_failure; + for (i = 0; i < num_vfs; i++) { + struct ifla_vf_info ivi; + struct ifla_vf_mac vf_mac; + struct ifla_vf_vlan vf_vlan; + struct ifla_vf_tx_rate vf_tx_rate; + struct ifla_vf_spoofchk vf_spoofchk; + + /* + * Not all SR-IOV capable drivers support the + * spoofcheck query. Preset to -1 so the user + * space tool can detect that the driver didn't + * report anything. + */ + ivi.spoofchk = -1; + if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi)) + break; + vf_mac.vf = + vf_vlan.vf = + vf_tx_rate.vf = + vf_spoofchk.vf = ivi.vf; + + memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + vf_vlan.vlan = ivi.vlan; + vf_vlan.qos = ivi.qos; + vf_tx_rate.rate = ivi.tx_rate; + vf_spoofchk.setting = ivi.spoofchk; + vf = nla_nest_start(skb, IFLA_VF_INFO); + if (!vf) { + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac); + NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan); + NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), + &vf_tx_rate); + NLA_PUT(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), + &vf_spoofchk); + nla_nest_end(skb, vf); + } + nla_nest_end(skb, vfinfo); + } + + if (rtnl_port_fill(skb, dev)) + goto nla_put_failure; + + if (dev->rtnl_link_ops) { + if (rtnl_link_fill(skb, dev) < 0) + goto nla_put_failure; + } + + if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_link_af) { + struct nlattr *af; + int err; + + if (!(af = nla_nest_start(skb, af_ops->family))) + goto nla_put_failure; + + err = af_ops->fill_link_af(skb, dev); + + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, af_spec); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int h, s_h; + int idx = 0, s_idx; + struct net_device *dev; + struct hlist_head *head; + struct hlist_node *node; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + + rcu_read_lock(); + cb->seq = net->dev_base_seq; + + if (nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy) >= 0) { + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, 0, + NLM_F_MULTI, + ext_filter_mask) <= 0) + goto out; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + rcu_read_unlock(); + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + +const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, + [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, + [IFLA_MTU] = { .type = NLA_U32 }, + [IFLA_LINK] = { .type = NLA_U32 }, + [IFLA_MASTER] = { .type = NLA_U32 }, + [IFLA_TXQLEN] = { .type = NLA_U32 }, + [IFLA_WEIGHT] = { .type = NLA_U32 }, + [IFLA_OPERSTATE] = { .type = NLA_U8 }, + [IFLA_LINKMODE] = { .type = NLA_U8 }, + [IFLA_LINKINFO] = { .type = NLA_NESTED }, + [IFLA_NET_NS_PID] = { .type = NLA_U32 }, + [IFLA_NET_NS_FD] = { .type = NLA_U32 }, + [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 }, + [IFLA_VFINFO_LIST] = {. type = NLA_NESTED }, + [IFLA_VF_PORTS] = { .type = NLA_NESTED }, + [IFLA_PORT_SELF] = { .type = NLA_NESTED }, + [IFLA_AF_SPEC] = { .type = NLA_NESTED }, + [IFLA_EXT_MASK] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL(ifla_policy); + +static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { + [IFLA_INFO_KIND] = { .type = NLA_STRING }, + [IFLA_INFO_DATA] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = { + [IFLA_VF_INFO] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { + [IFLA_VF_MAC] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_mac) }, + [IFLA_VF_VLAN] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_TX_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_tx_rate) }, + [IFLA_VF_SPOOFCHK] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_vf_spoofchk) }, +}; + +static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { + [IFLA_PORT_VF] = { .type = NLA_U32 }, + [IFLA_PORT_PROFILE] = { .type = NLA_STRING, + .len = PORT_PROFILE_MAX }, + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi)}, + [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, + .len = PORT_UUID_MAX }, + [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, + .len = PORT_UUID_MAX }, + [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, + [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, +}; + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net; + /* Examine the link attributes and figure out which + * network namespace we are talking about. + */ + if (tb[IFLA_NET_NS_PID]) + net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); + else if (tb[IFLA_NET_NS_FD]) + net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); + else + net = get_net(src_net); + return net; +} +EXPORT_SYMBOL(rtnl_link_get_net); + +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +{ + if (dev) { + if (tb[IFLA_ADDRESS] && + nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) + return -EINVAL; + + if (tb[IFLA_BROADCAST] && + nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) + return -EINVAL; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem, err; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + return -EAFNOSUPPORT; + + if (!af_ops->set_link_af) + return -EOPNOTSUPP; + + if (af_ops->validate_link_af) { + err = af_ops->validate_link_af(dev, af); + if (err < 0) + return err; + } + } + } + + return 0; +} + +static int do_setvfinfo(struct net_device *dev, struct nlattr *attr) +{ + int rem, err = -EINVAL; + struct nlattr *vf; + const struct net_device_ops *ops = dev->netdev_ops; + + nla_for_each_nested(vf, attr, rem) { + switch (nla_type(vf)) { + case IFLA_VF_MAC: { + struct ifla_vf_mac *ivm; + ivm = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_mac) + err = ops->ndo_set_vf_mac(dev, ivm->vf, + ivm->mac); + break; + } + case IFLA_VF_VLAN: { + struct ifla_vf_vlan *ivv; + ivv = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_vlan) + err = ops->ndo_set_vf_vlan(dev, ivv->vf, + ivv->vlan, + ivv->qos); + break; + } + case IFLA_VF_TX_RATE: { + struct ifla_vf_tx_rate *ivt; + ivt = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_tx_rate) + err = ops->ndo_set_vf_tx_rate(dev, ivt->vf, + ivt->rate); + break; + } + case IFLA_VF_SPOOFCHK: { + struct ifla_vf_spoofchk *ivs; + ivs = nla_data(vf); + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_spoofchk) + err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, + ivs->setting); + break; + } + default: + err = -EINVAL; + break; + } + if (err) + break; + } + return err; +} + +static int do_set_master(struct net_device *dev, int ifindex) +{ + struct net_device *master_dev; + const struct net_device_ops *ops; + int err; + + if (dev->master) { + if (dev->master->ifindex == ifindex) + return 0; + ops = dev->master->netdev_ops; + if (ops->ndo_del_slave) { + err = ops->ndo_del_slave(dev->master, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + + if (ifindex) { + master_dev = __dev_get_by_index(dev_net(dev), ifindex); + if (!master_dev) + return -EINVAL; + ops = master_dev->netdev_ops; + if (ops->ndo_add_slave) { + err = ops->ndo_add_slave(master_dev, dev); + if (err) + return err; + } else { + return -EOPNOTSUPP; + } + } + return 0; +} + +static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, + struct nlattr **tb, char *ifname, int modified) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int send_addr_notify = 0; + int err; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { + struct net *net = rtnl_link_get_net(dev_net(dev), tb); + if (IS_ERR(net)) { + err = PTR_ERR(net); + goto errout; + } + err = dev_change_net_namespace(dev, net, ifname); + put_net(net); + if (err) + goto errout; + modified = 1; + } + + if (tb[IFLA_MAP]) { + struct rtnl_link_ifmap *u_map; + struct ifmap k_map; + + if (!ops->ndo_set_config) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + u_map = nla_data(tb[IFLA_MAP]); + k_map.mem_start = (unsigned long) u_map->mem_start; + k_map.mem_end = (unsigned long) u_map->mem_end; + k_map.base_addr = (unsigned short) u_map->base_addr; + k_map.irq = (unsigned char) u_map->irq; + k_map.dma = (unsigned char) u_map->dma; + k_map.port = (unsigned char) u_map->port; + + err = ops->ndo_set_config(dev, &k_map); + if (err < 0) + goto errout; + + modified = 1; + } + + if (tb[IFLA_ADDRESS]) { + struct sockaddr *sa; + int len; + + if (!ops->ndo_set_mac_address) { + err = -EOPNOTSUPP; + goto errout; + } + + if (!netif_device_present(dev)) { + err = -ENODEV; + goto errout; + } + + len = sizeof(sa_family_t) + dev->addr_len; + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto errout; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), + dev->addr_len); + err = ops->ndo_set_mac_address(dev, sa); + kfree(sa); + if (err) + goto errout; + send_addr_notify = 1; + modified = 1; + } + + if (tb[IFLA_MTU]) { + err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + modified = 1; + } + + /* + * Interface selected by interface index but interface + * name provided implies that a name change has been + * requested. + */ + if (ifm->ifi_index > 0 && ifname[0]) { + err = dev_change_name(dev, ifname); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_IFALIAS]) { + err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_BROADCAST]) { + nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); + send_addr_notify = 1; + } + + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + goto errout; + } + + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto errout; + modified = 1; + } + + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + + if (tb[IFLA_LINKMODE]) { + write_lock_bh(&dev_base_lock); + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + write_unlock_bh(&dev_base_lock); + } + + if (tb[IFLA_VFINFO_LIST]) { + struct nlattr *attr; + int rem; + nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { + if (nla_type(attr) != IFLA_VF_INFO) { + err = -EINVAL; + goto errout; + } + err = do_setvfinfo(dev, attr); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_VF_PORTS]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + struct nlattr *attr; + int vf; + int rem; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_port) + goto errout; + + nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { + if (nla_type(attr) != IFLA_VF_PORT) + continue; + err = nla_parse_nested(port, IFLA_PORT_MAX, + attr, ifla_port_policy); + if (err < 0) + goto errout; + if (!port[IFLA_PORT_VF]) { + err = -EOPNOTSUPP; + goto errout; + } + vf = nla_get_u32(port[IFLA_PORT_VF]); + err = ops->ndo_set_vf_port(dev, vf, port); + if (err < 0) + goto errout; + modified = 1; + } + } + err = 0; + + if (tb[IFLA_PORT_SELF]) { + struct nlattr *port[IFLA_PORT_MAX+1]; + + err = nla_parse_nested(port, IFLA_PORT_MAX, + tb[IFLA_PORT_SELF], ifla_port_policy); + if (err < 0) + goto errout; + + err = -EOPNOTSUPP; + if (ops->ndo_set_vf_port) + err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); + if (err < 0) + goto errout; + modified = 1; + } + + if (tb[IFLA_AF_SPEC]) { + struct nlattr *af; + int rem; + + nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { + const struct rtnl_af_ops *af_ops; + + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + BUG(); + + err = af_ops->set_link_af(dev, af); + if (err < 0) + goto errout; + + modified = 1; + } + } + err = 0; + +errout: + if (err < 0 && modified && net_ratelimit()) + printk(KERN_WARNING "A link change request failed with " + "some changes committed already. Interface %s may " + "have been left with an inconsistent configuration, " + "please check.\n", dev->name); + + if (send_addr_notify) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + + return err; +} + +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + int err; + struct nlattr *tb[IFLA_MAX+1]; + char ifname[IFNAMSIZ]; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + goto errout; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + err = -EINVAL; + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + goto errout; + + if (dev == NULL) { + err = -ENODEV; + goto errout; + } + + err = validate_linkmsg(dev, tb); + if (err < 0) + goto errout; + + err = do_setlink(dev, ifm, tb, ifname, 0); +errout: + return err; +} + +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + int err; + LIST_HEAD(list_kill); + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + ops = dev->rtnl_link_ops; + if (!ops) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + list_del(&list_kill); + return 0; +} + +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +{ + unsigned int old_flags; + int err; + + old_flags = dev->flags; + if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { + err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm)); + if (err < 0) + return err; + } + + dev->rtnl_link_state = RTNL_LINK_INITIALIZED; + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + + __dev_notify_flags(dev, old_flags); + return 0; +} +EXPORT_SYMBOL(rtnl_configure_link); + +struct net_device *rtnl_create_link(struct net *src_net, struct net *net, + char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) +{ + int err; + struct net_device *dev; + unsigned int num_queues = 1; + unsigned int real_num_queues = 1; + + if (ops->get_tx_queues) { + err = ops->get_tx_queues(src_net, tb, &num_queues, + &real_num_queues); + if (err) + goto err; + } + err = -ENOMEM; + dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); + if (!dev) + goto err; + + dev_net_set(dev, net); + dev->rtnl_link_ops = ops; + dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + + if (tb[IFLA_MTU]) + dev->mtu = nla_get_u32(tb[IFLA_MTU]); + if (tb[IFLA_ADDRESS]) + memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); + if (tb[IFLA_BROADCAST]) + memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), + nla_len(tb[IFLA_BROADCAST])); + if (tb[IFLA_TXQLEN]) + dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); + if (tb[IFLA_OPERSTATE]) + set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); + if (tb[IFLA_LINKMODE]) + dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (tb[IFLA_GROUP]) + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + + return dev; + +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL(rtnl_create_link); + +static int rtnl_group_changelink(struct net *net, int group, + struct ifinfomsg *ifm, + struct nlattr **tb) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + if (dev->group == group) { + err = do_setlink(dev, ifm, tb, NULL, 0); + if (err < 0) + return err; + } + } + + return 0; +} + +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + const struct rtnl_link_ops *ops; + struct net_device *dev; + struct ifinfomsg *ifm; + char kind[MODULE_NAME_LEN]; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + struct nlattr *linkinfo[IFLA_INFO_MAX+1]; + int err; + +#ifdef CONFIG_MODULES +replay: +#endif + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + else + ifname[0] = '\0'; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else { + if (ifname[0]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; + } + + err = validate_linkmsg(dev, tb); + if (err < 0) + return err; + + if (tb[IFLA_LINKINFO]) { + err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], ifla_info_policy); + if (err < 0) + return err; + } else + memset(linkinfo, 0, sizeof(linkinfo)); + + if (linkinfo[IFLA_INFO_KIND]) { + nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); + } else { + kind[0] = '\0'; + ops = NULL; + } + + if (1) { + struct nlattr *attr[ops ? ops->maxtype + 1 : 0], **data = NULL; + struct net *dest_net; + + if (ops) { + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + err = nla_parse_nested(attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy); + if (err < 0) + return err; + data = attr; + } + if (ops->validate) { + err = ops->validate(tb, data); + if (err < 0) + return err; + } + } + + if (dev) { + int modified = 0; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || + !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data); + if (err < 0) + return err; + modified = 1; + } + + return do_setlink(dev, ifm, tb, ifname, modified); + } + + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, tb); + return -ENODEV; + } + + if (ifm->ifi_index) + return -EOPNOTSUPP; + if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) + return -EOPNOTSUPP; + + if (!ops) { +#ifdef CONFIG_MODULES + if (kind[0]) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + if (ops) + goto replay; + } +#endif + return -EOPNOTSUPP; + } + + if (!ifname[0]) + snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); + + dest_net = rtnl_link_get_net(net, tb); + if (IS_ERR(dest_net)) + return PTR_ERR(dest_net); + + dev = rtnl_create_link(net, dest_net, ifname, ops, tb); + + if (IS_ERR(dev)) + err = PTR_ERR(dev); + else if (ops->newlink) + err = ops->newlink(net, dev, tb, data); + else + err = register_netdevice(dev); + + if (err < 0 && !IS_ERR(dev)) + free_netdev(dev); + if (err < 0) + goto out; + + err = rtnl_configure_link(dev, ifm); + if (err < 0) + unregister_netdevice(dev); +out: + put_net(dest_net); + return err; + } +} + +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + char ifname[IFNAMSIZ]; + struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + struct sk_buff *nskb; + int err; + u32 ext_filter_mask = 0; + + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + if (err < 0) + return err; + + if (tb[IFLA_IFNAME]) + nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + + ifm = nlmsg_data(nlh); + if (ifm->ifi_index > 0) + dev = __dev_get_by_index(net, ifm->ifi_index); + else if (tb[IFLA_IFNAME]) + dev = __dev_get_by_name(net, ifname); + else + return -EINVAL; + + if (dev == NULL) + return -ENODEV; + + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); + if (nskb == NULL) + return -ENOBUFS; + + err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, 0, 0, ext_filter_mask); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid); + + return err; +} + +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + u16 min_ifinfo_dump_size = 0; + + if (nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy) >= 0) { + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + if (!ext_filter_mask) + return NLMSG_GOODSIZE; + /* + * traverse the list of net devices and compute the minimum + * buffer size based upon the filter mask. + */ + list_for_each_entry(dev, &net->dev_base_head, dev_list) { + min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, + if_nlmsg_size(dev, + ext_filter_mask)); + } + + return min_ifinfo_dump_size; +} + +static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->family; + + if (s_idx == 0) + s_idx = 1; + for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + int type = cb->nlh->nlmsg_type-RTM_BASE; + if (idx < s_idx || idx == PF_PACKET) + continue; + if (rtnl_msg_handlers[idx] == NULL || + rtnl_msg_handlers[idx][type].dumpit == NULL) + continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) + break; + } + cb->family = idx; + + return skb->len; +} + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + size_t if_info_size; + + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); +} + +/* Protected by RTNL sempahore. */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. */ + +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + rtnl_doit_func doit; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + type = nlh->nlmsg_type; + if (type > RTM_MAX) + return -EOPNOTSUPP; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg *)NLMSG_DATA(nlh))->rtgen_family; + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && !capable(CAP_NET_ADMIN)) + return -EPERM; + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + struct sock *rtnl; + rtnl_dumpit_func dumpit; + rtnl_calcit_func calcit; + u16 min_dump_alloc = 0; + + dumpit = rtnl_get_dumpit(family, type); + if (dumpit == NULL) + return -EOPNOTSUPP; + calcit = rtnl_get_calcit(family, type); + if (calcit) + min_dump_alloc = calcit(skb, nlh); + + __rtnl_unlock(); + rtnl = net->rtnl; + { + struct netlink_dump_control c = { + .dump = dumpit, + .min_dump_alloc = min_dump_alloc, + }; + err = netlink_dump_start(rtnl, skb, nlh, &c); + } + rtnl_lock(); + return err; + } + + memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + return -EINVAL; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + return -EINVAL; + rta_buf[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + doit = rtnl_get_doit(family, type); + if (doit == NULL) + return -EOPNOTSUPP; + + return doit(skb, nlh, (void *)&rta_buf[0]); +} + +static void rtnetlink_rcv(struct sk_buff *skb) +{ + rtnl_lock(); + netlink_rcv_skb(skb, &rtnetlink_rcv_msg); + rtnl_unlock(); +} + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_PRE_UP: + case NETDEV_POST_INIT: + case NETDEV_REGISTER: + case NETDEV_CHANGE: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_GOING_DOWN: + case NETDEV_UNREGISTER: + case NETDEV_UNREGISTER_BATCH: + case NETDEV_RELEASE: + case NETDEV_JOIN: + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + + +static int __net_init rtnetlink_net_init(struct net *net) +{ + struct sock *sk; + sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, + rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + if (!sk) + return -ENOMEM; + net->rtnl = sk; + return 0; +} + +static void __net_exit rtnetlink_net_exit(struct net *net) +{ + netlink_kernel_release(net->rtnl); + net->rtnl = NULL; +} + +static struct pernet_operations rtnetlink_net_ops = { + .init = rtnetlink_net_init, + .exit = rtnetlink_net_exit, +}; + +void __init rtnetlink_init(void) +{ + int i; + + rtattr_max = 0; + for (i = 0; i < ARRAY_SIZE(rta_max); i++) + if (rta_max[i] > rtattr_max) + rtattr_max = rta_max[i]; + rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); + if (!rta_buf) + panic("rtnetlink_init: cannot allocate rta_buf\n"); + + if (register_pernet_subsys(&rtnetlink_net_ops)) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); + register_netdevice_notifier(&rtnetlink_dev_notifier); + + rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, + rtnl_dump_ifinfo, rtnl_calcit); + rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL); + rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); + rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); +} + diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 00000000..611c5efd --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,345 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/fcntl.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/netdevice.h> +#include <linux/security.h> +#include <linux/pid.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> + +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/compat.h> +#include <net/scm.h> + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. + */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + const struct cred *cred = current_cred(); + + if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + ((creds->uid == cred->uid || creds->uid == cred->euid || + creds->uid == cred->suid) || capable(CAP_SETUID)) && + ((creds->gid == cred->gid || creds->gid == cred->egid || + creds->gid == cred->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + fpl->max = SCM_MAX_FD; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > fpl->max) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget_raw(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + if (current->scm_work_list) { + list_add_tail(&fpl->list, current->scm_work_list); + } else { + LIST_HEAD(work_list); + + current->scm_work_list = &work_list; + + list_add(&fpl->list, &work_list); + while (!list_empty(&work_list)) { + fpl = list_first_entry(&work_list, struct scm_fp_list, list); + + list_del(&fpl->list); + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } + + current->scm_work_list = NULL; + } + } +} +EXPORT_SYMBOL(__scm_destroy); + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (!CMSG_OK(msg, cmsg)) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + if (!sock->ops || sock->ops->family != PF_UNIX) + goto error; + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + + if (!p->pid || pid_vnr(p->pid) != p->creds.pid) { + struct pid *pid; + err = -ESRCH; + pid = find_get_pid(p->creds.pid); + if (!pid) + goto error; + put_pid(p->pid); + p->pid = pid; + } + + if (!p->cred || + (p->cred->euid != p->creds.uid) || + (p->cred->egid != p->creds.gid)) { + struct cred *cred; + err = -ENOMEM; + cred = prepare_creds(); + if (!cred) + goto error; + + cred->uid = cred->euid = p->creds.uid; + cred->gid = cred->egid = p->creds.gid; + if (p->cred) + put_cred(p->cred); + p->cred = cred; + } + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + return 0; + +error: + scm_destroy(p); + return err; +} +EXPORT_SYMBOL(__scm_send); + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr __user *cm + = (__force struct cmsghdr __user *)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (MSG_CMSG_COMPAT & msg->msg_flags) + return put_cmsg_compat(msg, level, type, len, data); + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. */ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} +EXPORT_SYMBOL(put_cmsg); + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm + = (__force struct cmsghdr __user*)msg->msg_control; + + int fdmax = 0; + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (MSG_CMSG_COMPAT & msg->msg_flags) { + scm_detach_fds_compat(msg, scm); + return; + } + + if (msg->msg_controllen > sizeof(struct cmsghdr)) + fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) + / sizeof(int)); + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; + i++, cmfptr++) + { + int new_fd; + err = security_file_receive(fp[i]); + if (err) + break; + err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags + ? O_CLOEXEC : 0); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. */ + get_file(fp[i]); + fd_install(new_fd, fp[i]); + } + + if (i > 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum || (fdnum && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} +EXPORT_SYMBOL(scm_detach_fds); + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), + GFP_KERNEL); + if (new_fpl) { + for (i = 0; i < fpl->count; i++) + get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; + } + return new_fpl; +} +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c new file mode 100644 index 00000000..99b25965 --- /dev/null +++ b/net/core/secure_seq.c @@ -0,0 +1,186 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/cryptohash.h> +#include <linux/module.h> +#include <linux/cache.h> +#include <linux/random.h> +#include <linux/hrtimer.h> +#include <linux/ktime.h> +#include <linux/string.h> + +#include <net/secure_seq.h> + +static u32 net_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned; + +static int __init net_secret_init(void) +{ + get_random_bytes(net_secret, sizeof(net_secret)); + return 0; +} +late_initcall(net_secret_init); + +#ifdef CONFIG_INET +static u32 seq_scale(u32 seq) +{ + /* + * As close as possible to RFC 793, which + * suggests using a 250 kHz clock. + * Further reading shows this assumes 2 Mb/s networks. + * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. + * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but + * we also need to limit the resolution so that the u32 seq + * overlaps less than one time per MSL (2 minutes). + * Choosing a clock of 64 ns period is OK. (period of 274 s) + */ + return seq + (ktime_to_ns(ktime_get_real()) >> 6); +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + (__force u32)daddr[i]; + secret[4] = net_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return seq_scale(hash[0]); +} +EXPORT_SYMBOL(secure_tcpv6_sequence_number); + +u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, + __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + (__force u32) daddr[i]; + secret[4] = net_secret[4] + (__force u32)dport; + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + return hash[0]; +} +#endif + +#ifdef CONFIG_INET +__u32 secure_ip_id(__be32 daddr) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force __u32) daddr; + hash[1] = net_secret[13]; + hash[2] = net_secret[14]; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return hash[0]; +} + +__u32 secure_ipv6_id(const __be32 daddr[4]) +{ + __u32 hash[4]; + + memcpy(hash, daddr, 16); + md5_transform(hash, net_secret); + + return hash[0]; +} + +__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return seq_scale(hash[0]); +} + +u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = (__force u32)dport ^ net_secret[14]; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + return hash[0]; +} +EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); +#endif + +#if IS_ENABLED(CONFIG_IP_DCCP) +u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash[MD5_DIGEST_WORDS]; + u64 seq; + + hash[0] = (__force u32)saddr; + hash[1] = (__force u32)daddr; + hash[2] = ((__force u16)sport << 16) + (__force u16)dport; + hash[3] = net_secret[15]; + + md5_transform(hash, net_secret); + + seq = hash[0] | (((u64)hash[1]) << 32); + seq += ktime_to_ns(ktime_get_real()); + seq &= (1ull << 48) - 1; + + return seq; +} +EXPORT_SYMBOL(secure_dccp_sequence_number); + +#if IS_ENABLED(CONFIG_IPV6) +u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport) +{ + u32 secret[MD5_MESSAGE_BYTES / 4]; + u32 hash[MD5_DIGEST_WORDS]; + u64 seq; + u32 i; + + memcpy(hash, saddr, 16); + for (i = 0; i < 4; i++) + secret[i] = net_secret[i] + daddr[i]; + secret[4] = net_secret[4] + + (((__force u16)sport << 16) + (__force u16)dport); + for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) + secret[i] = net_secret[i]; + + md5_transform(hash, secret); + + seq = hash[0] | (((u64)hash[1]) << 32); + seq += ktime_to_ns(ktime_get_real()); + seq &= (1ull << 48) - 1; + + return seq; +} +EXPORT_SYMBOL(secure_dccpv6_sequence_number); +#endif +#endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 00000000..e99aedd9 --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,3283 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> + * Florian La Roche <rzsfl@rz.uni-sb.de> + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/kmemcheck.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#ifdef CONFIG_NET_CLS_ACT +#include <net/pkt_sched.h> +#endif +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/splice.h> +#include <linux/cache.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/scatterlist.h> +#include <linux/errqueue.h> +#include <linux/prefetch.h> + +#include <net/protocol.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/checksum.h> +#include <net/xfrm.h> + +#include <asm/uaccess.h> +#include <trace/events/skb.h> + +#include "kmap_skb.h" + +static struct kmem_cache *skbuff_head_cache __read_mostly; +static struct kmem_cache *skbuff_fclone_cache __read_mostly; + +static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + put_page(buf->page); +} + +static void sock_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + get_page(buf->page); +} + +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + + +/* Pipe buffer operations for a socket. */ +static const struct pipe_buf_operations sock_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = sock_pipe_buf_release, + .steal = sock_pipe_buf_steal, + .get = sock_pipe_buf_get, +}; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +static void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. + */ + +static void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " + "data:%p tail:%#lx end:%#lx dev:%s\n", + here, skb->len, sz, skb->head, skb->data, + (unsigned long)skb->tail, (unsigned long)skb->end, + skb->dev ? skb->dev->name : "<NULL>"); + BUG(); +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * __alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * @fclone: allocate from fclone cache instead of head cache + * and allocate a cloned (child) skb + * @node: numa node to allocate memory on + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + int fclone, int node) +{ + struct kmem_cache *cache; + struct skb_shared_info *shinfo; + struct sk_buff *skb; + u8 *data; + + cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); + if (!skb) + goto out; + prefetchw(skb); + + /* We do our best to align skb_shared_info on a separate cache + * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives + * aligned memory blocks, unless SLUB/SLAB debug is enabled. + * Both skb->head and skb_shared_info are cache line aligned. + */ + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + data = kmalloc_node_track_caller(size, gfp_mask, node); + if (!data) + goto nodata; + /* kmalloc(size) might give us more room than requested. + * Put skb_shared_info exactly at the end of allocated zone, + * to allow max possible filling before reallocation. + */ + size = SKB_WITH_OVERHEAD(ksize(data)); + prefetchw(data + size); + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + /* Account for allocated memory : skb + skb->head */ + skb->truesize = SKB_TRUESIZE(size); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); + + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); + + kmemcheck_annotate_bitfield(child, flags1); + kmemcheck_annotate_bitfield(child, flags2); + skb->fclone = SKB_FCLONE_ORIG; + atomic_set(fclone_ref, 1); + + child->fclone = SKB_FCLONE_UNAVAILABLE; + } +out: + return skb; +nodata: + kmem_cache_free(cache, skb); + skb = NULL; + goto out; +} +EXPORT_SYMBOL(__alloc_skb); + +/** + * build_skb - build a network buffer + * @data: data buffer provided by caller + * + * Allocate a new &sk_buff. Caller provides space holding head and + * skb_shared_info. @data must have been allocated by kmalloc() + * The return is the new skb buffer. + * On a failure the return is %NULL, and @data is not freed. + * Notes : + * Before IO, driver allocates only data buffer where NIC put incoming frame + * Driver should add room at head (NET_SKB_PAD) and + * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) + * After IO, driver calls build_skb(), to allocate sk_buff and populate it + * before giving packet to stack. + * RX rings only contains data buffers, not full skbs. + */ +struct sk_buff *build_skb(void *data) +{ + struct skb_shared_info *shinfo; + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (!skb) + return NULL; + + size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->truesize = SKB_TRUESIZE(size); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif + + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + kmemcheck_annotate_variable(shinfo->destructor_arg); + + return skb; +} +EXPORT_SYMBOL(build_skb); + +/** + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. + */ +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, + unsigned int length, gfp_t gfp_mask) +{ + struct sk_buff *skb; + + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); + if (likely(skb)) { + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } + return skb; +} +EXPORT_SYMBOL(__netdev_alloc_skb); + +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size, unsigned int truesize) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} +EXPORT_SYMBOL(skb_add_rx_frag); + +/** + * dev_alloc_skb - allocate an skbuff for receiving + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned if there is no free memory. Although this function + * allocates memory it can be called from an interrupt. + */ +struct sk_buff *dev_alloc_skb(unsigned int length) +{ + /* + * There is more code here than it seems: + * __dev_alloc_skb is an inline + */ + return __dev_alloc_skb(length, GFP_ATOMIC); +} +EXPORT_SYMBOL(dev_alloc_skb); + +static void skb_drop_list(struct sk_buff **listp) +{ + struct sk_buff *list = *listp; + + *listp = NULL; + + do { + struct sk_buff *this = list; + list = list->next; + kfree_skb(this); + } while (list); +} + +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(&skb_shinfo(skb)->frag_list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + skb_walk_frags(skb, list) + skb_get(list); +} + +static void skb_release_data(struct sk_buff *skb) +{ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_unref(skb, i); + } + + /* + * If skb buf is from userspace, we need to notify the caller + * the lower device DMA has done; + */ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; + + uarg = skb_shinfo(skb)->destructor_arg; + if (uarg->callback) + uarg->callback(uarg); + } + + if (skb_has_frag_list(skb)) + skb_drop_fraglist(skb); + + kfree(skb->head); + } +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +static void kfree_skbmem(struct sk_buff *skb) +{ + struct sk_buff *other; + atomic_t *fclone_ref; + + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); + break; + + case SKB_FCLONE_ORIG: + fclone_ref = (atomic_t *) (skb + 2); + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, skb); + break; + + case SKB_FCLONE_CLONE: + fclone_ref = (atomic_t *) (skb + 1); + other = skb - 1; + + /* The clone portion is available for + * fast-cloning again. + */ + skb->fclone = SKB_FCLONE_UNAVAILABLE; + + if (atomic_dec_and_test(fclone_ref)) + kmem_cache_free(skbuff_fclone_cache, other); + break; + } +} + +static void skb_release_head_state(struct sk_buff *skb) +{ + skb_dst_drop(skb); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); + } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb->nfct); +#endif +#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(skb->nf_bridge); +#endif +/* XXX: IS this still necessary? - JHS */ +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; +#endif +#endif +} + +/* Free everything but the sk_buff shell. */ +static void skb_release_all(struct sk_buff *skb) +{ + skb_release_head_state(skb); + skb_release_data(skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + skb_release_all(skb); + kfree_skbmem(skb); +} +EXPORT_SYMBOL(__kfree_skb); + +/** + * kfree_skb - free an sk_buff + * @skb: buffer to free + * + * Drop a reference to the buffer and free it if the usage count has + * hit zero. + */ +void kfree_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + trace_kfree_skb(skb, __builtin_return_address(0)); + __kfree_skb(skb); +} +EXPORT_SYMBOL(kfree_skb); + +/** + * consume_skb - free an skbuff + * @skb: buffer to free + * + * Drop a ref to the buffer and free it if the usage count has hit zero + * Functions identically to kfree_skb, but kfree_skb assumes that the frame + * is being dropped after a failure and notes that + */ +void consume_skb(struct sk_buff *skb) +{ + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) + smp_rmb(); + else if (likely(!atomic_dec_and_test(&skb->users))) + return; + trace_consume_skb(skb); + __kfree_skb(skb); +} +EXPORT_SYMBOL(consume_skb); + +/** + * skb_recycle - clean up an skb for reuse + * @skb: buffer + * + * Recycles the skb to be reused as a receive buffer. This + * function does any necessary reference count dropping, and + * cleans up the skbuff as if it just came from __alloc_skb(). + */ +void skb_recycle(struct sk_buff *skb) +{ + struct skb_shared_info *shinfo; + + skb_release_head_state(skb); + + shinfo = skb_shinfo(skb); + memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); + atomic_set(&shinfo->dataref, 1); + + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = skb->head + NET_SKB_PAD; + skb_reset_tail_pointer(skb); +} +EXPORT_SYMBOL(skb_recycle); + +/** + * skb_recycle_check - check if skb can be reused for receive + * @skb: buffer + * @skb_size: minimum receive buffer size + * + * Checks that the skb passed in is not shared or cloned, and + * that it is linear and its head portion at least as large as + * skb_size so that it can be recycled as a receive buffer. + * If these conditions are met, this function does any necessary + * reference count dropping and cleans up the skbuff as if it + * just came from __alloc_skb(). + */ +bool skb_recycle_check(struct sk_buff *skb, int skb_size) +{ + if (!skb_is_recycleable(skb, skb_size)) + return false; + + skb_recycle(skb); + + return true; +} +EXPORT_SYMBOL(skb_recycle_check); + +static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + new->tstamp = old->tstamp; + new->dev = old->dev; + new->transport_header = old->transport_header; + new->network_header = old->network_header; + new->mac_header = old->mac_header; + skb_dst_copy(new, old); + new->rxhash = old->rxhash; + new->ooo_okay = old->ooo_okay; + new->l4_rxhash = old->l4_rxhash; + new->no_fcs = old->no_fcs; +#ifdef CONFIG_XFRM + new->sp = secpath_get(old->sp); +#endif + memcpy(new->cb, old->cb, sizeof(old->cb)); + new->csum = old->csum; + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; + new->ip_summed = old->ip_summed; + skb_copy_queue_mapping(new, old); + new->priority = old->priority; +#if IS_ENABLED(CONFIG_IP_VS) + new->ipvs_property = old->ipvs_property; +#endif + new->protocol = old->protocol; + new->mark = old->mark; + new->skb_iif = old->skb_iif; + __nf_copy(new, old); +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) + new->nf_trace = old->nf_trace; +#endif +#ifdef CONFIG_NET_SCHED + new->tc_index = old->tc_index; +#ifdef CONFIG_NET_CLS_ACT + new->tc_verd = old->tc_verd; +#endif +#endif + new->vlan_tci = old->vlan_tci; + + skb_copy_secmark(new, old); +} + +/* + * You should not add any new code to this function. Add it to + * __copy_skb_header above instead. + */ +static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) +{ +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->sk = NULL; + __copy_skb_header(n, skb); + + C(len); + C(data_len); + C(mac_len); + n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; + n->cloned = 1; + n->nohdr = 0; + n->destructor = NULL; + C(tail); + C(end); + C(head); + C(data); + C(truesize); + atomic_set(&n->users, 1); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +#undef C +} + +/** + * skb_morph - morph one skb into another + * @dst: the skb to receive the contents + * @src: the skb to supply the contents + * + * This is identical to skb_clone except that the target skb is + * supplied by the user. + * + * The target skb is returned upon exit. + */ +struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) +{ + skb_release_all(dst); + return __skb_clone(dst, src); +} +EXPORT_SYMBOL_GPL(skb_morph); + +/* skb_copy_ubufs - copy userspace skb frags buffers to kernel + * @skb: the skb to modify + * @gfp_mask: allocation priority + * + * This must be called on SKBTX_DEV_ZEROCOPY skb. + * It will copy all frags into kernel and drop the reference + * to userspace pages. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + * + * Returns 0 on success or a negative error code on failure + * to allocate kernel memory to copy to. + */ +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) +{ + int i; + int num_frags = skb_shinfo(skb)->nr_frags; + struct page *page, *head = NULL; + struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + + for (i = 0; i < num_frags; i++) { + u8 *vaddr; + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + + page = alloc_page(GFP_ATOMIC); + if (!page) { + while (head) { + struct page *next = (struct page *)head->private; + put_page(head); + head = next; + } + return -ENOMEM; + } + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(page_address(page), + vaddr + f->page_offset, skb_frag_size(f)); + kunmap_skb_frag(vaddr); + page->private = (unsigned long)head; + head = page; + } + + /* skb frags release userspace buffers */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_unref(skb, i); + + uarg->callback(uarg); + + /* skb frags point to kernel buffers */ + for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { + __skb_fill_page_desc(skb, i-1, head, 0, + skb_shinfo(skb)->frags[i - 1].size); + head = (struct page *)head->private; + } + + skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + return 0; +} + + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. + */ + +struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) +{ + struct sk_buff *n; + + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, gfp_mask)) + return NULL; + } + + n = skb + 1; + if (skb->fclone == SKB_FCLONE_ORIG && + n->fclone == SKB_FCLONE_UNAVAILABLE) { + atomic_t *fclone_ref = (atomic_t *) (n + 1); + n->fclone = SKB_FCLONE_CLONE; + atomic_inc(fclone_ref); + } else { + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + if (!n) + return NULL; + + kmemcheck_annotate_bitfield(n, flags1); + kmemcheck_annotate_bitfield(n, flags2); + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + + return __skb_clone(n, skb); +} +EXPORT_SYMBOL(skb_clone); + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; +#endif + + __copy_skb_header(new, old); + +#ifndef NET_SKBUFF_DATA_USES_OFFSET + /* {transport,network,mac}_header are relative to skb->head */ + new->transport_header += offset; + new->network_header += offset; + if (skb_mac_header_was_set(new)) + new->mac_header += offset; +#endif + skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; + skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; + skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. + */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) +{ + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) + BUG(); + + copy_skb_header(n, skb); + return n; +} +EXPORT_SYMBOL(skb_copy); + +/** + * __pskb_copy - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @headroom: headroom of new skb + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. + */ + +struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) +{ + unsigned int size = skb_headlen(skb) + headroom; + struct sk_buff *n = alloc_skb(size, gfp_mask); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, headroom); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + skb_copy_from_linear_data(skb, n->data, n->len); + + n->truesize += skb->data_len; + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, gfp_mask)) { + kfree_skb(n); + n = NULL; + goto out; + } + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + skb_frag_ref(skb, i); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_has_frag_list(skb)) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} +EXPORT_SYMBOL(__pskb_copy); + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, + gfp_t gfp_mask) +{ + int i; + u8 *data; + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; + long off; + bool fastpath; + + BUG_ON(nhead < 0); + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } + + if (fastpath && + size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { + memmove(skb->head + size, skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + memmove(skb->head + nhead, skb->head, + skb_tail_pointer(skb) - skb->head); + off = nhead; + goto adjust_others; + } + + data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask); + if (!data) + goto nodata; + size = SKB_WITH_OVERHEAD(ksize(data)); + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); + + if (fastpath) { + kfree(skb->head); + } else { + /* copy this zero copy skb frags */ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + if (skb_copy_ubufs(skb, gfp_mask)) + goto nofrags; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + skb_release_data(skb); + } + off = (data + nhead) - skb->head; + + skb->head = data; +adjust_others: + skb->data += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + off = nhead; +#else + skb->end = skb->head + size; +#endif + /* {transport,network,mac}_header and tail are relative to skb->head */ + skb->tail += off; + skb->transport_header += off; + skb->network_header += off; + if (skb_mac_header_was_set(skb)) + skb->mac_header += off; + /* Only adjust this if it actually is csum_start rather than csum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb->csum_start += nhead; + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; + +nofrags: + kfree(data); +nodata: + return -ENOMEM; +} +EXPORT_SYMBOL(pskb_expand_head); + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} +EXPORT_SYMBOL(skb_realloc_headroom); + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, + gfp_t gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int oldheadroom = skb_headroom(skb); + int head_copy_len, head_copy_off; + int off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = oldheadroom; + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + off = newheadroom - oldheadroom; + if (n->ip_summed == CHECKSUM_PARTIAL) + n->csum_start += off; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + n->transport_header += off; + n->network_header += off; + if (skb_mac_header_was_set(skb)) + n->mac_header += off; +#endif + + return n; +} +EXPORT_SYMBOL(skb_copy_expand); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ + +int skb_pad(struct sk_buff *skb, int pad) +{ + int err; + int ntail; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return 0; + } + + ntail = skb->data_len + pad - (skb->end - skb->tail); + if (likely(skb_cloned(skb) || ntail > 0)) { + err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); + if (unlikely(err)) + goto free_skb; + } + + /* FIXME: The use of this function with non-linear skb's really needs + * to be audited. + */ + err = skb_linearize(skb); + if (unlikely(err)) + goto free_skb; + + memset(skb->data + skb->len, 0, pad); + return 0; + +free_skb: + kfree_skb(skb); + return err; +} +EXPORT_SYMBOL(skb_pad); + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ +unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp = skb_tail_pointer(skb); + SKB_LINEAR_ASSERT(skb); + skb->tail += len; + skb->len += len; + if (unlikely(skb->tail > skb->end)) + skb_over_panic(skb, len, __builtin_return_address(0)); + return tmp; +} +EXPORT_SYMBOL(skb_put); + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. + */ +unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data -= len; + skb->len += len; + if (unlikely(skb->data<skb->head)) + skb_under_panic(skb, len, __builtin_return_address(0)); + return skb->data; +} +EXPORT_SYMBOL(skb_push); + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ +unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +{ + return skb_pull_inline(skb, len); +} +EXPORT_SYMBOL(skb_pull); + +/** + * skb_trim - remove end from a buffer + * @skb: buffer to alter + * @len: new length + * + * Cut the length of a buffer down by removing data from the tail. If + * the buffer is already under the length specified it is not modified. + * The skb must be linear. + */ +void skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (skb->len > len) + __skb_trim(skb, len); +} +EXPORT_SYMBOL(skb_trim); + +/* Trims skb to length len. It can change skb pointers. + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len) +{ + struct sk_buff **fragp; + struct sk_buff *frag; + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + int err; + + if (skb_cloned(skb) && + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) + return err; + + i = 0; + if (offset >= len) + goto drop_pages; + + for (; i < nfrags; i++) { + int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (end < len) { + offset = end; + continue; + } + + skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); + +drop_pages: + skb_shinfo(skb)->nr_frags = i; + + for (; i < nfrags; i++) + skb_frag_unref(skb, i); + + if (skb_has_frag_list(skb)) + skb_drop_fraglist(skb); + goto done; + } + + for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); + fragp = &frag->next) { + int end = offset + frag->len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag->next = frag->next; + kfree_skb(frag); + frag = nfrag; + *fragp = frag; + } + + if (end < len) { + offset = end; + continue; + } + + if (end > len && + unlikely((err = pskb_trim(frag, len - offset)))) + return err; + + if (frag->next) + skb_drop_list(&frag->next); + break; + } + +done: + if (len > skb_headlen(skb)) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + skb->len = len; + skb->data_len = 0; + skb_set_tail_pointer(skb, len); + } + + return 0; +} +EXPORT_SYMBOL(___pskb_trim); + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) + BUG(); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_has_frag_list(skb)) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size >= eat) + goto pull_pages; + eat -= size; + } + + /* If we need update frag list, we are in troubles. + * Certainly, it possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. + * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + BUG_ON(!list); + + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + + if (skb_shared(list)) { + /* Sucks! We need to fork list. :-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (size <= eat) { + skb_frag_unref(skb, i); + eat -= size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail += delta; + skb->data_len -= delta; + + return skb_tail_pointer(skb); +} +EXPORT_SYMBOL(__pskb_pull_tail); + +/** + * skb_copy_bits - copy bits from skb to kernel buffer + * @skb: source skb + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source skb to the + * destination buffer. + * + * CAUTION ! : + * If its prototype is ever changed, + * check arch/{*}/net/{*}.S files, + * since it is called from BPF assembly code. + */ +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_from_linear_data_offset(skb, offset, to, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(to, + vaddr + skb_shinfo(skb)->frags[i].page_offset+ + offset - start, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(frag_iter, offset - start, to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_bits); + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + put_page(spd->pages[i]); +} + +static inline struct page *linear_to_page(struct page *page, unsigned int *len, + unsigned int *offset, + struct sk_buff *skb, struct sock *sk) +{ + struct page *p = sk->sk_sndmsg_page; + unsigned int off; + + if (!p) { +new_page: + p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); + if (!p) + return NULL; + + off = sk->sk_sndmsg_off = 0; + /* hold one ref to this page until it's full */ + } else { + unsigned int mlen; + + off = sk->sk_sndmsg_off; + mlen = PAGE_SIZE - off; + if (mlen < 64 && mlen < *len) { + put_page(p); + goto new_page; + } + + *len = min_t(unsigned int, *len, mlen); + } + + memcpy(page_address(p) + off, page_address(page) + *offset, *len); + sk->sk_sndmsg_off += *len; + *offset = off; + get_page(p); + + return p; +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, + unsigned int *len, unsigned int offset, + struct sk_buff *skb, int linear, + struct sock *sk) +{ + if (unlikely(spd->nr_pages == pipe->buffers)) + return 1; + + if (linear) { + page = linear_to_page(page, len, &offset, skb, sk); + if (!page) + return 1; + } else + get_page(page); + + spd->pages[spd->nr_pages] = page; + spd->partial[spd->nr_pages].len = *len; + spd->partial[spd->nr_pages].offset = offset; + spd->nr_pages++; + + return 0; +} + +static inline void __segment_seek(struct page **page, unsigned int *poff, + unsigned int *plen, unsigned int off) +{ + unsigned long n; + + *poff += off; + n = *poff / PAGE_SIZE; + if (n) + *page = nth_page(*page, n); + + *poff = *poff % PAGE_SIZE; + *plen -= off; +} + +static inline int __splice_segment(struct page *page, unsigned int poff, + unsigned int plen, unsigned int *off, + unsigned int *len, struct sk_buff *skb, + struct splice_pipe_desc *spd, int linear, + struct sock *sk, + struct pipe_inode_info *pipe) +{ + if (!*len) + return 1; + + /* skip this segment if already processed */ + if (*off >= plen) { + *off -= plen; + return 0; + } + + /* ignore any bits we already processed */ + if (*off) { + __segment_seek(&page, &poff, &plen, *off); + *off = 0; + } + + do { + unsigned int flen = min(*len, plen); + + /* the linear region may spread across several pages */ + flen = min_t(unsigned int, flen, PAGE_SIZE - poff); + + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) + return 1; + + __segment_seek(&page, &poff, &plen, flen); + *len -= flen; + + } while (*len && plen); + + return 0; +} + +/* + * Map linear and fragment data from the skb to spd. It reports failure if the + * pipe is full or if we already spliced the requested length. + */ +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, + unsigned int *offset, unsigned int *len, + struct splice_pipe_desc *spd, struct sock *sk) +{ + int seg; + + /* + * map the linear part + */ + if (__splice_segment(virt_to_page(skb->data), + (unsigned long) skb->data & (PAGE_SIZE - 1), + skb_headlen(skb), + offset, len, skb, spd, 1, sk, pipe)) + return 1; + + /* + * then map the fragments + */ + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { + const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + + if (__splice_segment(skb_frag_page(f), + f->page_offset, skb_frag_size(f), + offset, len, skb, spd, 0, sk, pipe)) + return 1; + } + + return 0; +} + +/* + * Map data from the skb to a pipe. Should handle both the linear part, + * the fragments, and the frag list. It does NOT handle frag lists within + * the frag list, if such a thing exists. We'd probably need to recurse to + * handle that cleanly. + */ +int skb_splice_bits(struct sk_buff *skb, unsigned int offset, + struct pipe_inode_info *pipe, unsigned int tlen, + unsigned int flags) +{ + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, + .flags = flags, + .ops = &sock_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *frag_iter; + struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + /* + * __skb_splice_bits() only fails if the output has no room left, + * so no point in going over the frag_list for the error case. + */ + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) + goto done; + else if (!tlen) + goto done; + + /* + * now see if we have a frag_list to map + */ + skb_walk_frags(skb, frag_iter) { + if (!tlen) + break; + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) + break; + } + +done: + if (spd.nr_pages) { + /* + * Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, &spd); + lock_sock(sk); + } + + splice_shrink_spd(&spd); + return ret; +} + +/** + * skb_store_bits - store bits from kernel buffer to skb + * @skb: destination buffer + * @offset: offset in destination + * @from: source buffer + * @len: number of bytes to copy + * + * Copy the specified number of bytes from the source buffer to the + * destination skb. This function handles all the messy bits of + * traversing fragment lists and such. + */ + +int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) +{ + int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; + + if (offset > (int)skb->len - len) + goto fault; + + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + skb_copy_to_linear_data_offset(skb, offset, from, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(frag); + memcpy(vaddr + frag->page_offset + offset - start, + from, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(frag_iter, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_store_bits); + +/* Checksum skb data. */ + +__wsum skb_checksum(const struct sk_buff *skb, int offset, + int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Checksum header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial(skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial(vaddr + frag->page_offset + + offset - start, copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(frag_iter, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + + return csum; +} +EXPORT_SYMBOL(skb_checksum); + +/* Both of above in one bottle. */ + +__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len, __wsum csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + __wsum csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial_copy_nocheck(vaddr + + frag->page_offset + + offset - start, to, + copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + __wsum csum2; + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(frag_iter, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + BUG_ON(len); + return csum; +} +EXPORT_SYMBOL(skb_copy_and_csum_bits); + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + __wsum csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csstart = skb_checksum_start_offset(skb); + else + csstart = skb_headlen(skb); + + BUG_ON(csstart > skb_headlen(skb)); + + skb_copy_from_linear_data(skb, to, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart, 0); + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + long csstuff = csstart + skb->csum_offset; + + *((__sum16 *)(to + csstuff)) = csum_fold(csum); + } +} +EXPORT_SYMBOL(skb_copy_and_csum_dev); + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue); + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} +EXPORT_SYMBOL(skb_dequeue_tail); + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} +EXPORT_SYMBOL(skb_queue_purge); + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_head); + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_queue_tail); + +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * @list: list to use + * + * Remove a packet from a list. The list locks are taken and this + * function is atomic with respect to other list locked calls + * + * You must know what list the SKB is on. + */ +void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_unlink(skb, list); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_unlink); + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ +void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_after(list, old, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_append); + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * @list: list to use + * + * Place a packet before a given packet in a list. The list locks are + * taken and this function is atomic with respect to other list locked + * calls. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_insert(newsk, old->prev, old, list); + spin_unlock_irqrestore(&list->lock, flags); +} +EXPORT_SYMBOL(skb_insert); + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), + pos - len); + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb_set_tail_pointer(skb, len); +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + skb_frag_ref(skb, i); + skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); + skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + + if (len < pos) /* Split line is inside header. */ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. */ + skb_split_no_header(skb, skb1, len, pos); +} +EXPORT_SYMBOL(skb_split); + +/* Shifting from/to a cloned skb is a no-go. + * + * Caller cannot keep skb_shinfo related pointers past calling here! + */ +static int skb_prepare_for_shift(struct sk_buff *skb) +{ + return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + +/** + * skb_shift - Shifts paged data partially from skb to another + * @tgt: buffer into which tail data gets added + * @skb: buffer from which the paged data comes from + * @shiftlen: shift up to this many bytes + * + * Attempts to shift up to shiftlen worth of bytes, which may be less than + * the length of the skb, from skb to tgt. Returns number bytes shifted. + * It's up to caller to free skb if everything was shifted. + * + * If @tgt runs out of frags, the whole operation is aborted. + * + * Skb cannot include anything else but paged data while tgt is allowed + * to have non-paged data as well. + * + * TODO: full sized shift could be optimized but that would need + * specialized skb free'er to handle frags without up-to-date nr_frags. + */ +int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) +{ + int from, to, merge, todo; + struct skb_frag_struct *fragfrom, *fragto; + + BUG_ON(shiftlen > skb->len); + BUG_ON(skb_headlen(skb)); /* Would corrupt stream */ + + todo = shiftlen; + from = 0; + to = skb_shinfo(tgt)->nr_frags; + fragfrom = &skb_shinfo(skb)->frags[from]; + + /* Actual merge is delayed until the point when we know we can + * commit all, so that we don't have to undo partial changes + */ + if (!to || + !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + fragfrom->page_offset)) { + merge = -1; + } else { + merge = to - 1; + + todo -= skb_frag_size(fragfrom); + if (todo < 0) { + if (skb_prepare_for_shift(skb) || + skb_prepare_for_shift(tgt)) + return 0; + + /* All previous frag pointers might be stale! */ + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, shiftlen); + skb_frag_size_sub(fragfrom, shiftlen); + fragfrom->page_offset += shiftlen; + + goto onlymerged; + } + + from++; + } + + /* Skip full, not-fitting skb to avoid expensive operations */ + if ((shiftlen == skb->len) && + (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) + return 0; + + if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) + return 0; + + while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { + if (to == MAX_SKB_FRAGS) + return 0; + + fragfrom = &skb_shinfo(skb)->frags[from]; + fragto = &skb_shinfo(tgt)->frags[to]; + + if (todo >= skb_frag_size(fragfrom)) { + *fragto = *fragfrom; + todo -= skb_frag_size(fragfrom); + from++; + to++; + + } else { + __skb_frag_ref(fragfrom); + fragto->page = fragfrom->page; + fragto->page_offset = fragfrom->page_offset; + skb_frag_size_set(fragto, todo); + + fragfrom->page_offset += todo; + skb_frag_size_sub(fragfrom, todo); + todo = 0; + + to++; + break; + } + } + + /* Ready to "commit" this state change to tgt */ + skb_shinfo(tgt)->nr_frags = to; + + if (merge >= 0) { + fragfrom = &skb_shinfo(skb)->frags[0]; + fragto = &skb_shinfo(tgt)->frags[merge]; + + skb_frag_size_add(fragto, skb_frag_size(fragfrom)); + __skb_frag_unref(fragfrom); + } + + /* Reposition in the original skb */ + to = 0; + while (from < skb_shinfo(skb)->nr_frags) + skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; + skb_shinfo(skb)->nr_frags = to; + + BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); + +onlymerged: + /* Most likely the tgt won't ever need its checksum anymore, skb on + * the other hand might need it if it needs to be resent + */ + tgt->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; + + /* Yak, is it really working this way? Some helper please? */ + skb->len -= shiftlen; + skb->data_len -= shiftlen; + skb->truesize -= shiftlen; + tgt->len += shiftlen; + tgt->data_len += shiftlen; + tgt->truesize += shiftlen; + + return shiftlen; +} + +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} +EXPORT_SYMBOL(skb_prepare_seq_read); + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note 1: The size of each block of data returned can be arbitrary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. + * + * Note 2: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; + + if (abs_offset < block_limit && !st->frag_data) { + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = skb_frag_size(frag) + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += skb_frag_size(frag); + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + st->frag_idx = 0; + goto next_skb; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } + + return 0; +} +EXPORT_SYMBOL(skb_seq_read); + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} +EXPORT_SYMBOL(skb_abort_seq_read); + +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + unsigned int ret; + + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + ret = textsearch_find(config, state); + return (ret <= to - from ? ret : UINT_MAX); +} +EXPORT_SYMBOL(skb_find_text); + +/** + * skb_append_datato_frags: - append the user data to a skb + * @sk: sock structure + * @skb: skb structure to be appened with user data. + * @getfrag: call back function to be used for getting the user data + * @from: pointer to user message iov + * @length: length of the iov message + * + * Description: This procedure append the user data in the fragment part + * of the skb if any page alloc fails user this procedure returns -ENOMEM + */ +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int (*getfrag)(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length) +{ + int frg_cnt = 0; + skb_frag_t *frag = NULL; + struct page *page = NULL; + int copy, left; + int offset = 0; + int ret; + + do { + /* Return error if we don't have space for new frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + if (frg_cnt >= MAX_SKB_FRAGS) + return -EFAULT; + + /* allocate a new page for next frag */ + page = alloc_pages(sk->sk_allocation, 0); + + /* If alloc_page fails just return failure and caller will + * free previous allocated pages by doing kfree_skb() + */ + if (page == NULL) + return -ENOMEM; + + /* initialize the next frag */ + skb_fill_page_desc(skb, frg_cnt, page, 0, 0); + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + + /* get the new initialized frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; + + /* copy the user data to page */ + left = PAGE_SIZE - frag->page_offset; + copy = (length > left)? left : length; + + ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), + offset, copy, 0, skb); + if (ret < 0) + return -EFAULT; + + /* copy was successful so update the size parameters */ + skb_frag_size_add(frag, copy); + skb->len += copy; + skb->data_len += copy; + offset += copy; + length -= copy; + + } while (length > 0); + + return 0; +} +EXPORT_SYMBOL(skb_append_datato_frags); + +/** + * skb_pull_rcsum - pull skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_pull on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_pull unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +{ + BUG_ON(len > skb->len); + skb->len -= len; + BUG_ON(skb->len < skb->data_len); + skb_postpull_rcsum(skb, skb->data, len); + return skb->data += len; +} +EXPORT_SYMBOL_GPL(skb_pull_rcsum); + +/** + * skb_segment - Perform protocol segmentation on skb. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + * + * This function performs segmentation on the given skb. It returns + * a pointer to the first in a list of new skbs for the segments. + * In case of error it returns ERR_PTR(err). + */ +struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) +{ + struct sk_buff *segs = NULL; + struct sk_buff *tail = NULL; + struct sk_buff *fskb = skb_shinfo(skb)->frag_list; + unsigned int mss = skb_shinfo(skb)->gso_size; + unsigned int doffset = skb->data - skb_mac_header(skb); + unsigned int offset = doffset; + unsigned int headroom; + unsigned int len; + int sg = !!(features & NETIF_F_SG); + int nfrags = skb_shinfo(skb)->nr_frags; + int err = -ENOMEM; + int i = 0; + int pos; + + __skb_push(skb, doffset); + headroom = skb_headroom(skb); + pos = skb_headlen(skb); + + do { + struct sk_buff *nskb; + skb_frag_t *frag; + int hsize; + int size; + + len = skb->len - offset; + if (len > mss) + len = mss; + + hsize = skb_headlen(skb) - offset; + if (hsize < 0) + hsize = 0; + if (hsize > len || !sg) + hsize = len; + + if (!hsize && i >= nfrags) { + BUG_ON(fskb->len != len); + + pos += len; + nskb = skb_clone(fskb, GFP_ATOMIC); + fskb = fskb->next; + + if (unlikely(!nskb)) + goto err; + + hsize = skb_end_pointer(nskb) - nskb->head; + if (skb_cow_head(nskb, doffset + headroom)) { + kfree_skb(nskb); + goto err; + } + + nskb->truesize += skb_end_pointer(nskb) - nskb->head - + hsize; + skb_release_head_state(nskb); + __skb_push(nskb, doffset); + } else { + nskb = alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC); + + if (unlikely(!nskb)) + goto err; + + skb_reserve(nskb, headroom); + __skb_put(nskb, doffset); + } + + if (segs) + tail->next = nskb; + else + segs = nskb; + tail = nskb; + + __copy_skb_header(nskb, skb); + nskb->mac_len = skb->mac_len; + + /* nskb and skb might have different headroom */ + if (nskb->ip_summed == CHECKSUM_PARTIAL) + nskb->csum_start += skb_headroom(nskb) - headroom; + + skb_reset_mac_header(nskb); + skb_set_network_header(nskb, skb->mac_len); + nskb->transport_header = (nskb->network_header + + skb_network_header_len(skb)); + skb_copy_from_linear_data(skb, nskb->data, doffset); + + if (fskb != skb_shinfo(skb)->frag_list) + continue; + + if (!sg) { + nskb->ip_summed = CHECKSUM_NONE; + nskb->csum = skb_copy_and_csum_bits(skb, offset, + skb_put(nskb, len), + len, 0); + continue; + } + + frag = skb_shinfo(nskb)->frags; + + skb_copy_from_linear_data_offset(skb, offset, + skb_put(nskb, hsize), hsize); + + while (pos < offset + len && i < nfrags) { + *frag = skb_shinfo(skb)->frags[i]; + __skb_frag_ref(frag); + size = skb_frag_size(frag); + + if (pos < offset) { + frag->page_offset += offset - pos; + skb_frag_size_sub(frag, offset - pos); + } + + skb_shinfo(nskb)->nr_frags++; + + if (pos + size <= offset + len) { + i++; + pos += size; + } else { + skb_frag_size_sub(frag, pos + size - (offset + len)); + goto skip_fraglist; + } + + frag++; + } + + if (pos < offset + len) { + struct sk_buff *fskb2 = fskb; + + BUG_ON(pos + fskb->len != offset + len); + + pos += fskb->len; + fskb = fskb->next; + + if (fskb2->next) { + fskb2 = skb_clone(fskb2, GFP_ATOMIC); + if (!fskb2) + goto err; + } else + skb_get(fskb2); + + SKB_FRAG_ASSERT(nskb); + skb_shinfo(nskb)->frag_list = fskb2; + } + +skip_fraglist: + nskb->data_len = len - hsize; + nskb->len += nskb->data_len; + nskb->truesize += nskb->data_len; + } while ((offset += len) < skb->len); + + return segs; + +err: + while ((skb = segs)) { + segs = skb->next; + kfree_skb(skb); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(skb_segment); + +int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff *p = *head; + struct sk_buff *nskb; + struct skb_shared_info *skbinfo = skb_shinfo(skb); + struct skb_shared_info *pinfo = skb_shinfo(p); + unsigned int headroom; + unsigned int len = skb_gro_len(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); + + if (p->len + len >= 65536) + return -E2BIG; + + if (pinfo->frag_list) + goto merge; + else if (headlen <= offset) { + skb_frag_t *frag; + skb_frag_t *frag2; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; + + offset -= headlen; + + if (nr_frags > MAX_SKB_FRAGS) + return -E2BIG; + + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; + + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; + do { + *--frag = *--frag2; + } while (--i); + + frag->page_offset += offset; + skb_frag_size_sub(frag, offset); + + skb->truesize -= skb->data_len; + skb->len -= skb->data_len; + skb->data_len = 0; + + NAPI_GRO_CB(skb)->free = 1; + goto done; + } else if (skb_gro_len(p) != pinfo->gso_size) + return -E2BIG; + + headroom = skb_headroom(p); + nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); + if (unlikely(!nskb)) + return -ENOMEM; + + __copy_skb_header(nskb, p); + nskb->mac_len = p->mac_len; + + skb_reserve(nskb, headroom); + __skb_put(nskb, skb_gro_offset(p)); + + skb_set_mac_header(nskb, skb_mac_header(p) - p->data); + skb_set_network_header(nskb, skb_network_offset(p)); + skb_set_transport_header(nskb, skb_transport_offset(p)); + + __skb_pull(p, skb_gro_offset(p)); + memcpy(skb_mac_header(nskb), skb_mac_header(p), + p->data - skb_mac_header(p)); + + *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); + skb_shinfo(nskb)->frag_list = p; + skb_shinfo(nskb)->gso_size = pinfo->gso_size; + pinfo->gso_size = 0; + skb_header_release(p); + nskb->prev = p; + + nskb->data_len += p->len; + nskb->truesize += p->truesize; + nskb->len += p->len; + + *head = nskb; + nskb->next = p->next; + p->next = NULL; + + p = nskb; + +merge: + p->truesize += skb->truesize - len; + if (offset > headlen) { + unsigned int eat = offset - headlen; + + skbinfo->frags[0].page_offset += eat; + skb_frag_size_sub(&skbinfo->frags[0], eat); + skb->data_len -= eat; + skb->len -= eat; + offset = headlen; + } + + __skb_pull(skb, offset); + + p->prev->next = skb; + p->prev = skb; + skb_header_release(skb); + +done: + NAPI_GRO_CB(p)->count++; + p->data_len += len; + p->truesize += len; + p->len += len; + + NAPI_GRO_CB(skb)->same_flow = 1; + return 0; +} +EXPORT_SYMBOL_GPL(skb_gro_receive); + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); + skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", + (2*sizeof(struct sk_buff)) + + sizeof(atomic_t), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} + +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. + */ +static int +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg_set_buf(sg, skb->data + offset, copy); + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg_set_page(&sg[elt], skb_frag_page(frag), copy, + frag->page_offset+offset-start); + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + BUG_ON(len); + return elt; +} + +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len); + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + +/** + * skb_cow_data - Check that a socket buffer's data buffers are writable + * @skb: The socket buffer to check. + * @tailbits: Amount of trailing space to be added + * @trailer: Returned pointer to the skb where the @tailbits space begins + * + * Make sure that the data buffers attached to a socket buffer are + * writable. If they are not, private copies are made of the data buffers + * and the socket buffer is set to use these instead. + * + * If @tailbits is given, make sure that there is space to write @tailbits + * bytes of data beyond current end of socket buffer. @trailer will be + * set to point to the skb in which this space begins. + * + * The number of scatterlist elements required to completely map the + * COW'd and extended socket buffer will be returned. + */ +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_has_frag_list(skb)) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1) || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_has_frag_list(skb1)) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... */ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb2, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} +EXPORT_SYMBOL_GPL(skb_cow_data); + +static void sock_rmem_free(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + +/* + * Note: We dont mem charge error packets (no sk_forward_alloc changes) + */ +int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) +{ + int len = skb->len; + + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + return -ENOMEM; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sock_rmem_free; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + + /* before exiting rcu section, make sure dst is refcounted */ + skb_dst_force(skb); + + skb_queue_tail(&sk->sk_error_queue, skb); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, len); + return 0; +} +EXPORT_SYMBOL(sock_queue_err_skb); + +void skb_tstamp_tx(struct sk_buff *orig_skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = orig_skb->sk; + struct sock_exterr_skb *serr; + struct sk_buff *skb; + int err; + + if (!sk) + return; + + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) + return; + + if (hwtstamps) { + *skb_hwtstamps(skb) = + *hwtstamps; + } else { + /* + * no hardware time stamps available, + * so keep the shared tx_flags and only + * store software time stamp + */ + skb->tstamp = ktime_get_real(); + } + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + + err = sock_queue_err_skb(sk, skb); + + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_tstamp_tx); + +void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err; + + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + + err = sock_queue_err_skb(sk, skb); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); + + +/** + * skb_partial_csum_set - set up and verify partial csum values for packet + * @skb: the skb to set + * @start: the number of bytes after skb->data to start checksumming. + * @off: the offset from start to place the checksum. + * + * For untrusted partially-checksummed packets, we need to make sure the values + * for skb->csum_start and skb->csum_offset are valid so we don't oops. + * + * This function checks and sets those values and skb->ip_summed: if this + * returns false you should drop the packet. + */ +bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) +{ + if (unlikely(start > skb_headlen(skb)) || + unlikely((int)start + off > skb_headlen(skb) - 2)) { + if (net_ratelimit()) + printk(KERN_WARNING + "bad partial csum: csum=%u/%u len=%u\n", + start, off, skb_headlen(skb)); + return false; + } + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + start; + skb->csum_offset = off; + return true; +} +EXPORT_SYMBOL_GPL(skb_partial_csum_set); + +void __skb_warn_lro_forwarding(const struct sk_buff *skb) +{ + if (net_ratelimit()) + pr_warning("%s: received packets cannot be forwarded" + " while LRO is enabled\n", skb->dev->name); +} +EXPORT_SYMBOL(__skb_warn_lro_forwarding); diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 00000000..376566ad --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,2691 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Authors: Ross Biro + * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> + * Florian La Roche, <flla@stud.uni-sb.de> + * Alan Cox, <A.Cox@swansea.ac.uk> + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. + * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * Chris Evans : Security fixes - signedness again + * Arnaldo C. Melo : cleanups, use skb_queue_purge + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/poll.h> +#include <linux/tcp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/user_namespace.h> +#include <linux/static_key.h> +#include <linux/memcontrol.h> + +#include <asm/uaccess.h> + +#include <linux/netdevice.h> +#include <net/protocol.h> +#include <linux/skbuff.h> +#include <net/net_namespace.h> +#include <net/request_sock.h> +#include <net/sock.h> +#include <linux/net_tstamp.h> +#include <net/xfrm.h> +#include <linux/ipsec.h> +#include <net/cls_cgroup.h> +#include <net/netprio_cgroup.h> + +#include <linux/filter.h> + +#include <trace/events/sock.h> + +#ifdef CONFIG_INET +#include <net/tcp.h> +#endif + +static DEFINE_MUTEX(proto_list_mutex); +static LIST_HEAD(proto_list); + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct proto *proto; + int ret = 0; + + mutex_lock(&proto_list_mutex); + list_for_each_entry(proto, &proto_list, node) { + if (proto->init_cgroup) { + ret = proto->init_cgroup(cgrp, ss); + if (ret) + goto out; + } + } + + mutex_unlock(&proto_list_mutex); + return ret; +out: + list_for_each_entry_continue_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp); + mutex_unlock(&proto_list_mutex); + return ret; +} + +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) +{ + struct proto *proto; + + mutex_lock(&proto_list_mutex); + list_for_each_entry_reverse(proto, &proto_list, node) + if (proto->destroy_cgroup) + proto->destroy_cgroup(cgrp); + mutex_unlock(&proto_list_mutex); +} +#endif + +/* + * Each address family might have different locking rules, so we have + * one slock key per address family: + */ +static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_slock_keys[AF_MAX]; + +struct static_key memcg_socket_limit_enabled; +EXPORT_SYMBOL(memcg_socket_limit_enabled); + +/* + * Make lock validator output more readable. (we pre-construct these + * strings build-time, so that runtime initialization of socket + * locks is fast): + */ +static const char *const af_family_key_strings[AF_MAX+1] = { + "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , + "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", + "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , + "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , + "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , + "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , + "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , + "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , + "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , + "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , + "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , + "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , + "sk_lock-AF_NFC" , "sk_lock-AF_MAX" +}; +static const char *const af_family_slock_key_strings[AF_MAX+1] = { + "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , + "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK", + "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , + "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , + "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , + "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , + "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , + "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , + "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , + "slock-27" , "slock-28" , "slock-AF_CAN" , + "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , + "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , + "slock-AF_NFC" , "slock-AF_MAX" +}; +static const char *const af_family_clock_key_strings[AF_MAX+1] = { + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , + "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , + "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , + "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , + "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , + "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , + "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , + "clock-27" , "clock-28" , "clock-AF_CAN" , + "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , + "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , + "clock-AF_NFC" , "clock-AF_MAX" +}; + +/* + * sk_callback_lock locking rules are per-address-family, + * so split the lock classes by using a per-AF key: + */ +static struct lock_class_key af_callback_keys[AF_MAX]; + +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +/* Run time adjustable parameters. */ +__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancillary data plus some space */ +int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +EXPORT_SYMBOL(sysctl_optmem_max); + +#if defined(CONFIG_CGROUPS) +#if !defined(CONFIG_NET_CLS_CGROUP) +int net_cls_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_cls_subsys_id); +#endif +#if !defined(CONFIG_NETPRIO_CGROUP) +int net_prio_subsys_id = -1; +EXPORT_SYMBOL_GPL(net_prio_subsys_id); +#endif +#endif + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +{ + struct timeval tv; + + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) + return -EDOM; + + if (tv.tv_sec < 0) { + static int warned __read_mostly; + + *timeo_p = 0; + if (warned < 10 && net_ratelimit()) { + warned++; + printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) " + "tries to set negative timeout\n", + current->comm, task_pid_nr(current)); + } + return 0; + } + *timeo_p = MAX_SCHEDULE_TIMEOUT; + if (tv.tv_sec == 0 && tv.tv_usec == 0) + return 0; + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) + //*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + *timeo_p = tv.tv_sec*HZ + (tv.tv_usec*HZ+(1000000-1))/1000000; + return 0; +} + +static void sock_warn_obsolete_bsdism(const char *name) +{ + static int warned; + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +} + +#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) + +static void sock_disable_timestamp(struct sock *sk, unsigned long flags) +{ + if (sk->sk_flags & flags) { + sk->sk_flags &= ~flags; + if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP)) + net_disable_timestamp(); + } +} + + +int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + int skb_len; + unsigned long flags; + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + atomic_inc(&sk->sk_drops); + trace_sock_rcvqueue_full(sk, skb); + return -ENOMEM; + } + + err = sk_filter(sk, skb); + if (err) + return err; + + if (!sk_rmem_schedule(sk, skb->truesize)) { + atomic_inc(&sk->sk_drops); + return -ENOBUFS; + } + + skb->dev = NULL; + skb_set_owner_r(skb, sk); + + /* Cache the SKB length before we tack it onto the receive + * queue. Once it is added it no longer belongs to us and + * may be freed by other threads of control pulling packets + * from the queue. + */ + skb_len = skb->len; + + /* we escape from rcu protected region, make sure we dont leak + * a norefcounted dst + */ + skb_dst_force(skb); + + spin_lock_irqsave(&list->lock, flags); + skb->dropcount = atomic_read(&sk->sk_drops); + __skb_queue_tail(list, skb); + spin_unlock_irqrestore(&list->lock, flags); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + return 0; +} +EXPORT_SYMBOL(sock_queue_rcv_skb); + +int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb)) + goto discard_and_relse; + + skb->dev = NULL; + + if (sk_rcvqueues_full(sk, skb)) { + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + if (nested) + bh_lock_sock_nested(sk); + else + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + /* + * trylock + unlock semantics: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); + + rc = sk_backlog_rcv(sk, skb); + + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + } else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + atomic_inc(&sk->sk_drops); + goto discard_and_relse; + } + + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} +EXPORT_SYMBOL(sk_receive_skb); + +void sk_reset_txq(struct sock *sk) +{ + sk_tx_queue_clear(sk); +} +EXPORT_SYMBOL(sk_reset_txq); + +struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_tx_queue_clear(sk); + RCU_INIT_POINTER(sk->sk_dst_cache, NULL); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(__sk_dst_check); + +struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) +{ + struct dst_entry *dst = sk_dst_get(sk); + + if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + sk_dst_reset(sk); + dst_release(dst); + return NULL; + } + + return dst; +} +EXPORT_SYMBOL(sk_dst_check); + +static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + char devname[IFNAMSIZ]; + int index; + + /* Sorry... */ + ret = -EPERM; + if (!capable(CAP_NET_RAW)) + goto out; + + ret = -EINVAL; + if (optlen < 0) + goto out; + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + if (optlen > IFNAMSIZ - 1) + optlen = IFNAMSIZ - 1; + memset(devname, 0, sizeof(devname)); + + ret = -EFAULT; + if (copy_from_user(devname, optval, optlen)) + goto out; + + index = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_name_rcu(net, devname); + if (dev) + index = dev->ifindex; + rcu_read_unlock(); + ret = -ENODEV; + if (!dev) + goto out; + } + + lock_sock(sk); + sk->sk_bound_dev_if = index; + sk_dst_reset(sk); + release_sock(sk); + + ret = 0; + +out: +#endif + + return ret; +} + +static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) +{ + if (valbool) + sock_set_flag(sk, bit); + else + sock_reset_flag(sk, bit); +} + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + int val; + int valbool; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + + if (optname == SO_BINDTODEVICE) + return sock_bindtodevice(sk, optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + valbool = val ? 1 : 0; + + lock_sock(sk); + + switch (optname) { + case SO_DEBUG: + if (val && !capable(CAP_NET_ADMIN)) + ret = -EACCES; + else + sock_valbool_flag(sk, SOCK_DBG, valbool); + break; + case SO_REUSEADDR: + sk->sk_reuse = valbool; + break; + case SO_TYPE: + case SO_PROTOCOL: + case SO_DOMAIN: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; +set_sndbuf: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + if ((val * 2) < SOCK_MIN_SNDBUF) + sk->sk_sndbuf = SOCK_MIN_SNDBUF; + else + sk->sk_sndbuf = val * 2; + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->sk_write_space(sk); + break; + + case SO_SNDBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_sndbuf; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; +set_rcvbuf: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* + * We double it on the way in to account for + * "struct sk_buff" etc. overhead. Applications + * assume that the SO_RCVBUF setting they make will + * allow that much actual data to be received on that + * socket. + * + * Applications are unaware that "struct sk_buff" and + * other overheads allocate from the receive buffer + * during socket buffer allocation. + * + * And after considering the possible alternatives, + * returning the value we actually used in getsockopt + * is the most desirable behavior. + */ + if ((val * 2) < SOCK_MIN_RCVBUF) + sk->sk_rcvbuf = SOCK_MIN_RCVBUF; + else + sk->sk_rcvbuf = val * 2; + break; + + case SO_RCVBUFFORCE: + if (!capable(CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + goto set_rcvbuf; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->sk_protocol == IPPROTO_TCP) + tcp_set_keepalive(sk, valbool); +#endif + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if (optlen < sizeof(ling)) { + ret = -EINVAL; /* 1003.1g */ + break; + } + if (copy_from_user(&ling, optval, sizeof(ling))) { + ret = -EFAULT; + break; + } + if (!ling.l_onoff) + sock_reset_flag(sk, SOCK_LINGER); + else { +#if (BITS_PER_LONG == 32) + if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else +#endif + sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + case SO_TIMESTAMPNS: + if (valbool) { + if (optname == SO_TIMESTAMP) + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + else + sock_set_flag(sk, SOCK_RCVTSTAMPNS); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + } + break; + + case SO_TIMESTAMPING: + if (val & ~SOF_TIMESTAMPING_MASK) { + ret = -EINVAL; + break; + } + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE, + val & SOF_TIMESTAMPING_TX_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE, + val & SOF_TIMESTAMPING_TX_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE, + val & SOF_TIMESTAMPING_RX_HARDWARE); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) + sock_enable_timestamp(sk, + SOCK_TIMESTAMPING_RX_SOFTWARE); + else + sock_disable_timestamp(sk, + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE, + val & SOF_TIMESTAMPING_SOFTWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE, + val & SOF_TIMESTAMPING_SYS_HARDWARE); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE, + val & SOF_TIMESTAMPING_RAW_HARDWARE); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; + + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + ret = sk_detach_filter(sk); + break; + + case SO_PASSSEC: + if (valbool) + set_bit(SOCK_PASSSEC, &sock->flags); + else + clear_bit(SOCK_PASSSEC, &sock->flags); + break; + case SO_MARK: + if (!capable(CAP_NET_ADMIN)) + ret = -EPERM; + else + sk->sk_mark = val; + break; + + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + case SO_RXQ_OVFL: + sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); + break; + + case SO_WIFI_STATUS: + sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); + break; + + case SO_PEEK_OFF: + if (sock->ops->set_peek_off) + sock->ops->set_peek_off(sk, val); + else + ret = -EOPNOTSUPP; + break; + + case SO_NOFCS: + sock_valbool_flag(sk, SOCK_NOFCS, valbool); + break; + + default: + ret = -ENOPROTOOPT; + break; + } + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(sock_setsockopt); + + +void cred_to_ucred(struct pid *pid, const struct cred *cred, + struct ucred *ucred) +{ + ucred->pid = pid_vnr(pid); + ucred->uid = ucred->gid = -1; + if (cred) { + struct user_namespace *current_ns = current_user_ns(); + + ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid); + ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid); + } +} +EXPORT_SYMBOL_GPL(cred_to_ucred); + +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + union { + int val; + struct linger ling; + struct timeval tm; + } v; + + int lv = sizeof(int); + int len; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + memset(&v, 0, sizeof(v)); + + switch (optname) { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = !!sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_KEEPALIVE: + v.val = !!sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_PROTOCOL: + v.val = sk->sk_protocol; + break; + + case SO_DOMAIN: + v.val = sk->sk_family; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if (v.val == 0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = !!sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP) && + !sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPNS: + v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); + break; + + case SO_TIMESTAMPING: + v.val = 0; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_TX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RX_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_RX_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE)) + v.val |= SOF_TIMESTAMPING_SOFTWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE)) + v.val |= SOF_TIMESTAMPING_SYS_HARDWARE; + if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE)) + v.val |= SOF_TIMESTAMPING_RAW_HARDWARE; + break; + + case SO_RCVTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv = sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; + + case SO_SNDLOWAT: + v.val = 1; + break; + + case SO_PASSCRED: + v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0; + break; + + case SO_PEERCRED: + { + struct ucred peercred; + if (len > sizeof(peercred)) + len = sizeof(peercred); + cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + if (copy_to_user(optval, &peercred, len)) + return -EFAULT; + goto lenout; + } + + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } + + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; + + case SO_PASSSEC: + v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0; + break; + + case SO_PEERSEC: + return security_socket_getpeersec_stream(sock, optval, optlen, len); + + case SO_MARK: + v.val = sk->sk_mark; + break; + + case SO_RXQ_OVFL: + v.val = !!sock_flag(sk, SOCK_RXQ_OVFL); + break; + + case SO_WIFI_STATUS: + v.val = !!sock_flag(sk, SOCK_WIFI_STATUS); + break; + + case SO_PEEK_OFF: + if (!sock->ops->set_peek_off) + return -EOPNOTSUPP; + + v.val = sk->sk_peek_off; + break; + case SO_NOFCS: + v.val = !!sock_flag(sk, SOCK_NOFCS); + break; + default: + return -ENOPROTOOPT; + } + + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) + return -EFAULT; +lenout: + if (put_user(len, optlen)) + return -EFAULT; + return 0; +} + +/* + * Initialize an sk_lock. + * + * (We also register the sk_lock with the lock validator.) + */ +static inline void sock_lock_init(struct sock *sk) +{ + sock_lock_init_class_and_name(sk, + af_family_slock_key_strings[sk->sk_family], + af_family_slock_keys + sk->sk_family, + af_family_key_strings[sk->sk_family], + af_family_keys + sk->sk_family); +} + +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end + */ +static void sock_copy(struct sock *nsk, const struct sock *osk) +{ +#ifdef CONFIG_SECURITY_NETWORK + void *sptr = nsk->sk_security; +#endif + memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); + + memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, + osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); + +#ifdef CONFIG_SECURITY_NETWORK + nsk->sk_security = sptr; + security_sk_clone(osk, nsk); +#endif +} + +/* + * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * un-modified. Special care is taken when initializing object to zero. + */ +static inline void sk_prot_clear_nulls(struct sock *sk, int size) +{ + if (offsetof(struct sock, sk_node.next) != 0) + memset(sk, 0, offsetof(struct sock, sk_node.next)); + memset(&sk->sk_node.pprev, 0, + size - offsetof(struct sock, sk_node.pprev)); +} + +void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) +{ + unsigned long nulls1, nulls2; + + nulls1 = offsetof(struct sock, __sk_common.skc_node.next); + nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); + if (nulls1 > nulls2) + swap(nulls1, nulls2); + + if (nulls1 != 0) + memset((char *)sk, 0, nulls1); + memset((char *)sk + nulls1 + sizeof(void *), 0, + nulls2 - nulls1 - sizeof(void *)); + memset((char *)sk + nulls2 + sizeof(void *), 0, + size - nulls2 - sizeof(void *)); +} +EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); + +static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, + int family) +{ + struct sock *sk; + struct kmem_cache *slab; + + slab = prot->slab; + if (slab != NULL) { + sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); + if (!sk) + return sk; + if (priority & __GFP_ZERO) { + if (prot->clear_sk) + prot->clear_sk(sk, prot->obj_size); + else + sk_prot_clear_nulls(sk, prot->obj_size); + } + } else + sk = kmalloc(prot->obj_size, priority); + + if (sk != NULL) { + kmemcheck_annotate_bitfield(sk, flags); + + if (security_sk_alloc(sk, family, priority)) + goto out_free; + + if (!try_module_get(prot->owner)) + goto out_free_sec; + sk_tx_queue_clear(sk); + } + + return sk; + +out_free_sec: + security_sk_free(sk); +out_free: + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + return NULL; +} + +static void sk_prot_free(struct proto *prot, struct sock *sk) +{ + struct kmem_cache *slab; + struct module *owner; + + owner = prot->owner; + slab = prot->slab; + + security_sk_free(sk); + if (slab != NULL) + kmem_cache_free(slab, sk); + else + kfree(sk); + module_put(owner); +} + +#ifdef CONFIG_CGROUPS +void sock_update_classid(struct sock *sk) +{ + u32 classid; + + rcu_read_lock(); /* doing current task, which cannot vanish. */ + classid = task_cls_classid(current); + rcu_read_unlock(); + if (classid && classid != sk->sk_classid) + sk->sk_classid = classid; +} +EXPORT_SYMBOL(sock_update_classid); + +void sock_update_netprioidx(struct sock *sk) +{ + if (in_interrupt()) + return; + + sk->sk_cgrp_prioidx = task_netprioidx(current); +} +EXPORT_SYMBOL_GPL(sock_update_netprioidx); +#endif + +/** + * sk_alloc - All socket objects are allocated here + * @net: the applicable net namespace + * @family: protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot: struct proto associated with this new sock instance + */ +struct sock *sk_alloc(struct net *net, int family, gfp_t priority, + struct proto *prot) +{ + struct sock *sk; + + sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); + if (sk) { + sk->sk_family = family; + /* + * See comment in struct sock definition to understand + * why we need sk_prot_creator -acme + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); + sock_net_set(sk, get_net(net)); + atomic_set(&sk->sk_wmem_alloc, 1); + + sock_update_classid(sk); + sock_update_netprioidx(sk); + } + + return sk; +} +EXPORT_SYMBOL(sk_alloc); + +static void __sk_free(struct sock *sk) +{ + struct sk_filter *filter; + + if (sk->sk_destruct) + sk->sk_destruct(sk); + + filter = rcu_dereference_check(sk->sk_filter, + atomic_read(&sk->sk_wmem_alloc) == 0); + if (filter) { + sk_filter_uncharge(sk, filter); + RCU_INIT_POINTER(sk->sk_filter, NULL); + } + + sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); + + if (atomic_read(&sk->sk_omem_alloc)) + printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", + __func__, atomic_read(&sk->sk_omem_alloc)); + + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + put_pid(sk->sk_peer_pid); + put_net(sock_net(sk)); + sk_prot_free(sk->sk_prot_creator, sk); +} + +void sk_free(struct sock *sk) +{ + /* + * We subtract one from sk_wmem_alloc and can know if + * some packets are still in some tx queue. + * If not null, sock_wfree() will call __sk_free(sk) later + */ + if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sk_free); + +/* + * Last sock_put should drop reference to sk->sk_net. It has already + * been dropped in sk_change_net. Taking reference to stopping namespace + * is not an option. + * Take reference to a socket to remove it from hash _alive_ and after that + * destroy it in the context of init_net. + */ +void sk_release_kernel(struct sock *sk) +{ + if (sk == NULL || sk->sk_socket == NULL) + return; + + sock_hold(sk); + sock_release(sk->sk_socket); + release_net(sock_net(sk)); + sock_net_set(sk, get_net(&init_net)); + sock_put(sk); +} +EXPORT_SYMBOL(sk_release_kernel); + +static void sk_update_clone(const struct sock *sk, struct sock *newsk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + sock_update_memcg(newsk); +} + +/** + * sk_clone_lock - clone a socket, and lock its clone + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + */ +struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + struct sock *newsk; + + newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); + if (newsk != NULL) { + struct sk_filter *filter; + + sock_copy(newsk, sk); + + /* SANITY */ + get_net(sock_net(newsk)); + sk_node_init(&newsk->sk_node); + sock_lock_init(newsk); + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_backlog.len = 0; + + atomic_set(&newsk->sk_rmem_alloc, 0); + /* + * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) + */ + atomic_set(&newsk->sk_wmem_alloc, 1); + atomic_set(&newsk->sk_omem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif + + spin_lock_init(&newsk->sk_dst_lock); + rwlock_init(&newsk->sk_callback_lock); + lockdep_set_class_and_name(&newsk->sk_callback_lock, + af_callback_keys + newsk->sk_family, + af_family_clock_key_strings[newsk->sk_family]); + + newsk->sk_dst_cache = NULL; + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + newsk->sk_send_head = NULL; + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + + sock_reset_flag(newsk, SOCK_DONE); + skb_queue_head_init(&newsk->sk_error_queue); + + filter = rcu_dereference_protected(newsk->sk_filter, 1); + if (filter != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; + } + + newsk->sk_err = 0; + newsk->sk_priority = 0; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); + atomic_set(&newsk->sk_refcnt, 2); + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy). + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + sk_set_socket(newsk, NULL); + newsk->sk_wq = NULL; + + sk_update_clone(sk, newsk); + + if (newsk->sk_prot->sockets_allocated) + sk_sockets_allocated_inc(newsk); + + if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) + net_enable_timestamp(); + } +out: + return newsk; +} +EXPORT_SYMBOL_GPL(sk_clone_lock); + +void sk_setup_caps(struct sock *sk, struct dst_entry *dst) +{ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) + sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; + sk->sk_route_caps &= ~sk->sk_route_nocaps; + if (sk_can_gso(sk)) { + if (dst->header_len) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; + } else { + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; + sk->sk_gso_max_size = dst->dev->gso_max_size; + } + } +} +EXPORT_SYMBOL_GPL(sk_setup_caps); + +void __init sk_init(void) +{ + if (totalram_pages <= 4096) { + sysctl_wmem_max = 32767; + sysctl_rmem_max = 32767; + sysctl_wmem_default = 32767; + sysctl_rmem_default = 32767; + } else if (totalram_pages >= 131072) { + sysctl_wmem_max = 131071; + sysctl_rmem_max = 131071; + } +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { + /* + * Keep a reference on sk_wmem_alloc, this will be released + * after sk_write_space() call + */ + atomic_sub(len - 1, &sk->sk_wmem_alloc); + sk->sk_write_space(sk); + len = 1; + } + /* + * if sk_wmem_alloc reaches 0, we must finish what sk_free() + * could not do because of in-flight packets + */ + if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) + __sk_free(sk); +} +EXPORT_SYMBOL(sock_wfree); + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + unsigned int len = skb->truesize; + + atomic_sub(len, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, len); +} +EXPORT_SYMBOL(sock_rfree); + + +int sock_i_uid(struct sock *sk) +{ + int uid; + + read_lock_bh(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; + read_unlock_bh(&sk->sk_callback_lock); + return uid; +} +EXPORT_SYMBOL(sock_i_uid); + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + read_lock_bh(&sk->sk_callback_lock); + ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; + read_unlock_bh(&sk->sk_callback_lock); + return ino; +} +EXPORT_SYMBOL(sock_i_ino); + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, + gfp_t priority) +{ + if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_w(skb, sk); + return skb; + } + } + return NULL; +} +EXPORT_SYMBOL(sock_wmalloc); + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, + gfp_t priority) +{ + if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_r(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) +{ + if ((unsigned)size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->sk_omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->sk_omem_alloc); + } + return NULL; +} +EXPORT_SYMBOL(sock_kmalloc); + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree(mem); + atomic_sub(size, &sk->sk_omem_alloc); +} +EXPORT_SYMBOL(sock_kfree_s); + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static long sock_wait_for_wmem(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + for (;;) { + if (!timeo) + break; + if (signal_pending(current)) + break; + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + break; + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(sk), &wait); + return timeo; +} + + +/* + * Generic send/receive buffer handlers + */ + +struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, + unsigned long data_len, int noblock, + int *errcode) +{ + struct sk_buff *skb; + gfp_t gfp_mask; + long timeo; + int err; + int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + + err = -EMSGSIZE; + if (npages > MAX_SKB_FRAGS) + goto failure; + + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + timeo = sock_sndtimeo(sk, noblock); + while (1) { + err = sock_error(sk); + if (err != 0) + goto failure; + + err = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(header_len, gfp_mask); + if (skb) { + int i; + + /* No pages, we're done... */ + if (!data_len) + break; + + skb->truesize += data_len; + skb_shinfo(skb)->nr_frags = npages; + for (i = 0; i < npages; i++) { + struct page *page; + + page = alloc_pages(sk->sk_allocation, 0); + if (!page) { + err = -ENOBUFS; + skb_shinfo(skb)->nr_frags = i; + kfree_skb(skb); + goto failure; + } + + __skb_fill_page_desc(skb, i, + page, 0, + (data_len >= PAGE_SIZE ? + PAGE_SIZE : + data_len)); + data_len -= PAGE_SIZE; + } + + /* Full success... */ + break; + } + err = -ENOBUFS; + goto failure; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + } + + skb_set_owner_w(skb, sk); + return skb; + +interrupted: + err = sock_intr_errno(timeo); +failure: + *errcode = err; + return NULL; +} +EXPORT_SYMBOL(sock_alloc_send_pskb); + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +} +EXPORT_SYMBOL(sock_alloc_send_skb); + +static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_bh(&sk->sk_lock.slock); + schedule(); + spin_lock_bh(&sk->sk_lock.slock); + if (!sock_owned_by_user(sk)) + break; + } + finish_wait(&sk->sk_lock.wq, &wait); +} + +static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + do { + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + bh_unlock_sock(sk); + + do { + struct sk_buff *next = skb->next; + + WARN_ON_ONCE(skb_dst_is_noref(skb)); + skb->next = NULL; + sk_backlog_rcv(sk, skb); + + /* + * We are in process context here with softirqs + * disabled, use cond_resched_softirq() to preempt. + * This is safe to do because we've taken the backlog + * queue private: + */ + cond_resched_softirq(); + + skb = next; + } while (skb != NULL); + + bh_lock_sock(sk); + } while ((skb = sk->sk_backlog.head) != NULL); + + /* + * Doing the zeroing here guarantee we can not loop forever + * while a wild producer attempts to flood us. + */ + sk->sk_backlog.len = 0; +} + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * @sk: sock to wait on + * @timeo: for how long + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo) +{ + int rc; + DEFINE_WAIT(wait); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk_sleep(sk), &wait); + return rc; +} +EXPORT_SYMBOL(sk_wait_data); + +/** + * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated + * @sk: socket + * @size: memory size to allocate + * @kind: allocation type + * + * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means + * rmem allocation. This function assumes that protocols which have + * memory_pressure use sk_wmem_queued as write buffer accounting. + */ +int __sk_mem_schedule(struct sock *sk, int size, int kind) +{ + struct proto *prot = sk->sk_prot; + int amt = sk_mem_pages(size); + long allocated; + int parent_status = UNDER_LIMIT; + + sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; + + allocated = sk_memory_allocated_add(sk, amt, &parent_status); + + /* Under limit. */ + if (parent_status == UNDER_LIMIT && + allocated <= sk_prot_mem_limits(sk, 0)) { + sk_leave_memory_pressure(sk); + return 1; + } + + /* Under pressure. (we or our parents) */ + if ((parent_status > SOFT_LIMIT) || + allocated > sk_prot_mem_limits(sk, 1)) + sk_enter_memory_pressure(sk); + + /* Over hard limit (we or our parents) */ + if ((parent_status == OVER_LIMIT) || + (allocated > sk_prot_mem_limits(sk, 2))) + goto suppress_allocation; + + /* guarantee minimum buffer size under pressure */ + if (kind == SK_MEM_RECV) { + if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) + return 1; + + } else { /* SK_MEM_SEND */ + if (sk->sk_type == SOCK_STREAM) { + if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) + return 1; + } else if (atomic_read(&sk->sk_wmem_alloc) < + prot->sysctl_wmem[0]) + return 1; + } + + if (sk_has_memory_pressure(sk)) { + int alloc; + + if (!sk_under_memory_pressure(sk)) + return 1; + alloc = sk_sockets_allocated_read_positive(sk); + if (sk_prot_mem_limits(sk, 2) > alloc * + sk_mem_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + } + +suppress_allocation: + + if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + trace_sock_exceed_buf_limit(sk, prot, allocated); + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; + + sk_memory_allocated_sub(sk, amt); + + return 0; +} +EXPORT_SYMBOL(__sk_mem_schedule); + +/** + * __sk_reclaim - reclaim memory_allocated + * @sk: socket + */ +void __sk_mem_reclaim(struct sock *sk) +{ + sk_memory_allocated_sub(sk, + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); + sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + + if (sk_under_memory_pressure(sk) && + (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) + sk_leave_memory_pressure(sk); +} +EXPORT_SYMBOL(__sk_mem_reclaim); + + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. + */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_bind); + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_connect); + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_socketpair); + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_accept); + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_getname); + +unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) +{ + return 0; +} +EXPORT_SYMBOL(sock_no_poll); + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_ioctl); + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_listen); + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_shutdown); + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_setsockopt); + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_getsockopt); + +int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_sendmsg); + +int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(sock_no_recvmsg); + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} +EXPORT_SYMBOL(sock_no_mmap); + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg(sock, &msg, &iov, 1, size); + kunmap(page); + return res; +} +EXPORT_SYMBOL(sock_no_sendpage); + +/* + * Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_all(&wq->wait); + rcu_read_unlock(); +} + +static void sock_def_error_report(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLERR); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); + rcu_read_unlock(); +} + +static void sock_def_readable(struct sock *sk, int len) +{ + struct socket_wq *wq; + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +static void sock_def_write_space(struct sock *sk) +{ + struct socket_wq *wq; + + rcu_read_lock(); + + /* Do not wake up a writer until he can make "significant" + * progress. --DaveM + */ + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | + POLLWRNORM | POLLWRBAND); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); + } + + rcu_read_unlock(); +} + +static void sock_def_destruct(struct sock *sk) +{ + kfree(sk->sk_protinfo); +} + +void sk_send_sigurg(struct sock *sk) +{ + if (sk->sk_socket && sk->sk_socket->file) + if (send_sigurg(&sk->sk_socket->file->f_owner)) + sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); +} +EXPORT_SYMBOL(sk_send_sigurg); + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires) +{ + if (!mod_timer(timer, expires)) + sock_hold(sk); +} +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ + if (timer_pending(timer) && del_timer(timer)) + __sock_put(sk); +} +EXPORT_SYMBOL(sk_stop_timer); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif + + sk->sk_send_head = NULL; + + init_timer(&sk->sk_timer); + + sk->sk_allocation = GFP_KERNEL; + sk->sk_rcvbuf = sysctl_rmem_default; + sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_state = TCP_CLOSE; + sk_set_socket(sk, sock); + + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock) { + sk->sk_type = sock->type; + sk->sk_wq = sock->wq; + sock->sk = sk; + } else + sk->sk_wq = NULL; + + spin_lock_init(&sk->sk_dst_lock); + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); + + sk->sk_state_change = sock_def_wakeup; + sk->sk_data_ready = sock_def_readable; + sk->sk_write_space = sock_def_write_space; + sk->sk_error_report = sock_def_error_report; + sk->sk_destruct = sock_def_destruct; + + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + sk->sk_peek_off = -1; + + sk->sk_peer_pid = NULL; + sk->sk_peer_cred = NULL; + sk->sk_write_pending = 0; + sk->sk_rcvlowat = 1; + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + + sk->sk_stamp = ktime_set(-1L, 0); + + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); + atomic_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_drops, 0); +} +EXPORT_SYMBOL(sock_init_data); + +void lock_sock_nested(struct sock *sk, int subclass) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_lock.owned) + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + local_bh_enable(); +} +EXPORT_SYMBOL(lock_sock_nested); + +void release_sock(struct sock *sk) +{ + /* + * The sk_lock has mutex_unlock() semantics: + */ + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); + + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_backlog.tail) + __release_sock(sk); + sk->sk_lock.owned = 0; + if (waitqueue_active(&sk->sk_lock.wq)) + wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); +} +EXPORT_SYMBOL(release_sock); + +/** + * lock_sock_fast - fast version of lock_sock + * @sk: socket + * + * This version should be used for very small section, where process wont block + * return false if fast path is taken + * sk_lock.slock locked, owned = 0, BH disabled + * return true if slow path is taken + * sk_lock.slock unlocked, owned = 1, BH enabled + */ +bool lock_sock_fast(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&sk->sk_lock.slock); + + if (!sk->sk_lock.owned) + /* + * Note : We must disable BH + */ + return false; + + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); + local_bh_enable(); + return true; +} +EXPORT_SYMBOL(lock_sock_fast); + +int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{ + struct timeval tv; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + tv = ktime_to_timeval(sk->sk_stamp); + if (tv.tv_sec == -1) + return -ENOENT; + if (tv.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + tv = ktime_to_timeval(sk->sk_stamp); + } + return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestamp); + +int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp) +{ + struct timespec ts; + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + ts = ktime_to_timespec(sk->sk_stamp); + if (ts.tv_sec == -1) + return -ENOENT; + if (ts.tv_sec == 0) { + sk->sk_stamp = ktime_get_real(); + ts = ktime_to_timespec(sk->sk_stamp); + } + return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestampns); + +void sock_enable_timestamp(struct sock *sk, int flag) +{ + if (!sock_flag(sk, flag)) { + unsigned long previous_flags = sk->sk_flags; + + sock_set_flag(sk, flag); + /* + * we just set one of the two flags which require net + * time stamping, but time stamping might have been on + * already because of the other one + */ + if (!(previous_flags & SK_FLAGS_TIMESTAMP)) + net_enable_timestamp(); + } +} + +/* + * Get a socket option on an socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_getsockopt); + +#ifdef CONFIG_COMPAT +int compat_sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + if (sk->sk_prot->compat_getsockopt != NULL) + return sk->sk_prot->compat_getsockopt(sk, level, optname, + optval, optlen); + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_sock_common_getsockopt); +#endif + +int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + * Set socket options on an inet socket. + */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(sock_common_setsockopt); + +#ifdef CONFIG_COMPAT +int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + + if (sk->sk_prot->compat_setsockopt != NULL) + return sk->sk_prot->compat_setsockopt(sk, level, optname, + optval, optlen); + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} +EXPORT_SYMBOL(compat_sock_common_setsockopt); +#endif + +void sk_common_release(struct sock *sk) +{ + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + + /* + * Observation: when sock_common_release is called, processes have + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. + */ + + sk->sk_prot->unhash(sk); + + /* + * In this point socket cannot receive new packets, but it is possible + * that some packets are in flight because some CPU runs receiver and + * did hash table lookup before we unhashed socket. They will achieve + * receive queue and will be purged by socket destructor. + * + * Also we still have packets pending on receive queue and probably, + * our own packets waiting in device queues. sock_destroy will drain + * receive queue, but transmitted packets will delay socket destruction + * until the last reference will be released. + */ + + sock_orphan(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + sock_put(sk); +} +EXPORT_SYMBOL(sk_common_release); + +#ifdef CONFIG_PROC_FS +#define PROTO_INUSE_NR 64 /* should be enough for the first time */ +struct prot_inuse { + int val[PROTO_INUSE_NR]; +}; + +static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); + +#ifdef CONFIG_NET_NS +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); + +static int __net_init sock_inuse_init_net(struct net *net) +{ + net->core.inuse = alloc_percpu(struct prot_inuse); + return net->core.inuse ? 0 : -ENOMEM; +} + +static void __net_exit sock_inuse_exit_net(struct net *net) +{ + free_percpu(net->core.inuse); +} + +static struct pernet_operations net_inuse_ops = { + .init = sock_inuse_init_net, + .exit = sock_inuse_exit_net, +}; + +static __init int net_inuse_init(void) +{ + if (register_pernet_subsys(&net_inuse_ops)) + panic("Cannot initialize net inuse counters"); + + return 0; +} + +core_initcall(net_inuse_init); +#else +static DEFINE_PER_CPU(struct prot_inuse, prot_inuse); + +void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) +{ + __this_cpu_add(prot_inuse.val[prot->inuse_idx], val); +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_add); + +int sock_prot_inuse_get(struct net *net, struct proto *prot) +{ + int cpu, idx = prot->inuse_idx; + int res = 0; + + for_each_possible_cpu(cpu) + res += per_cpu(prot_inuse, cpu).val[idx]; + + return res >= 0 ? res : 0; +} +EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +#endif + +static void assign_proto_idx(struct proto *prot) +{ + prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); + + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + printk(KERN_ERR "PROTO_INUSE_NR exhausted\n"); + return; + } + + set_bit(prot->inuse_idx, proto_inuse_idx); +} + +static void release_proto_idx(struct proto *prot) +{ + if (prot->inuse_idx != PROTO_INUSE_NR - 1) + clear_bit(prot->inuse_idx, proto_inuse_idx); +} +#else +static inline void assign_proto_idx(struct proto *prot) +{ +} + +static inline void release_proto_idx(struct proto *prot) +{ +} +#endif + +int proto_register(struct proto *prot, int alloc_slab) +{ + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, + SLAB_HWCACHE_ALIGN | prot->slab_flags, + NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", + prot->name); + goto out; + } + + if (prot->rsk_prot != NULL) { + prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name); + if (prot->rsk_prot->slab_name == NULL) + goto out_free_sock_slab; + + prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name, + prot->rsk_prot->obj_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + + if (prot->rsk_prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", + prot->name); + goto out_free_request_sock_slab_name; + } + } + + if (prot->twsk_prot != NULL) { + prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); + + if (prot->twsk_prot->twsk_slab_name == NULL) + goto out_free_request_sock_slab; + + prot->twsk_prot->twsk_slab = + kmem_cache_create(prot->twsk_prot->twsk_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, + SLAB_HWCACHE_ALIGN | + prot->slab_flags, + NULL); + if (prot->twsk_prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; + } + } + + mutex_lock(&proto_list_mutex); + list_add(&prot->node, &proto_list); + assign_proto_idx(prot); + mutex_unlock(&proto_list_mutex); + return 0; + +out_free_timewait_sock_slab_name: + kfree(prot->twsk_prot->twsk_slab_name); +out_free_request_sock_slab: + if (prot->rsk_prot && prot->rsk_prot->slab) { + kmem_cache_destroy(prot->rsk_prot->slab); + prot->rsk_prot->slab = NULL; + } +out_free_request_sock_slab_name: + if (prot->rsk_prot) + kfree(prot->rsk_prot->slab_name); +out_free_sock_slab: + kmem_cache_destroy(prot->slab); + prot->slab = NULL; +out: + return -ENOBUFS; +} +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ + mutex_lock(&proto_list_mutex); + release_proto_idx(prot); + list_del(&prot->node); + mutex_unlock(&proto_list_mutex); + + if (prot->slab != NULL) { + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } + + if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) { + kmem_cache_destroy(prot->rsk_prot->slab); + kfree(prot->rsk_prot->slab_name); + prot->rsk_prot->slab = NULL; + } + + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + kmem_cache_destroy(prot->twsk_prot->twsk_slab); + kfree(prot->twsk_prot->twsk_slab_name); + prot->twsk_prot->twsk_slab = NULL; + } +} +EXPORT_SYMBOL(proto_unregister); + +#ifdef CONFIG_PROC_FS +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(proto_list_mutex) +{ + mutex_lock(&proto_list_mutex); + return seq_list_start_head(&proto_list, *pos); +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return seq_list_next(v, &proto_list, pos); +} + +static void proto_seq_stop(struct seq_file *seq, void *v) + __releases(proto_list_mutex) +{ + mutex_unlock(&proto_list_mutex); +} + +static char proto_method_implemented(const void *method) +{ + return method == NULL ? 'n' : 'y'; +} +static long sock_prot_memory_allocated(struct proto *proto) +{ + return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L; +} + +static char *sock_prot_memory_pressure(struct proto *proto) +{ + return proto->memory_pressure != NULL ? + proto_memory_pressure(proto) ? "yes" : "no" : "NI"; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ + + seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + proto->name, + proto->obj_size, + sock_prot_inuse_get(seq_file_net(seq), proto), + sock_prot_memory_allocated(proto), + sock_prot_memory_pressure(proto), + proto->max_header, + proto->slab == NULL ? "no" : "yes", + module_name(proto->owner), + proto_method_implemented(proto->close), + proto_method_implemented(proto->connect), + proto_method_implemented(proto->disconnect), + proto_method_implemented(proto->accept), + proto_method_implemented(proto->ioctl), + proto_method_implemented(proto->init), + proto_method_implemented(proto->destroy), + proto_method_implemented(proto->shutdown), + proto_method_implemented(proto->setsockopt), + proto_method_implemented(proto->getsockopt), + proto_method_implemented(proto->sendmsg), + proto_method_implemented(proto->recvmsg), + proto_method_implemented(proto->sendpage), + proto_method_implemented(proto->bind), + proto_method_implemented(proto->backlog_rcv), + proto_method_implemented(proto->hash), + proto_method_implemented(proto->unhash), + proto_method_implemented(proto->get_port), + proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ + if (v == &proto_list) + seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", + "protocol", + "size", + "sockets", + "memory", + "press", + "maxhdr", + "slab", + "module", + "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + else + proto_seq_printf(seq, list_entry(v, struct proto, node)); + return 0; +} + +static const struct seq_operations proto_seq_ops = { + .start = proto_seq_start, + .next = proto_seq_next, + .stop = proto_seq_stop, + .show = proto_seq_show, +}; + +static int proto_seq_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &proto_seq_ops, + sizeof(struct seq_net_private)); +} + +static const struct file_operations proto_seq_fops = { + .owner = THIS_MODULE, + .open = proto_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; + +static __net_init int proto_init_net(struct net *net) +{ + if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) + return -ENOMEM; + + return 0; +} + +static __net_exit void proto_exit_net(struct net *net) +{ + proc_net_remove(net, "protocols"); +} + + +static __net_initdata struct pernet_operations proto_net_ops = { + .init = proto_init_net, + .exit = proto_exit_net, +}; + +static int __init proto_init(void) +{ + return register_pernet_subsys(&proto_net_ops); +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c new file mode 100644 index 00000000..b9868e1f --- /dev/null +++ b/net/core/sock_diag.c @@ -0,0 +1,192 @@ +#include <linux/mutex.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/net_namespace.h> +#include <linux/module.h> +#include <linux/rtnetlink.h> +#include <net/sock.h> + +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> + +static struct sock_diag_handler *sock_diag_handlers[AF_MAX]; +static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); +static DEFINE_MUTEX(sock_diag_table_mutex); + +int sock_diag_check_cookie(void *sk, __u32 *cookie) +{ + if ((cookie[0] != INET_DIAG_NOCOOKIE || + cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1])) + return -ESTALE; + else + return 0; +} +EXPORT_SYMBOL_GPL(sock_diag_check_cookie); + +void sock_diag_save_cookie(void *sk, __u32 *cookie) +{ + cookie[0] = (u32)(unsigned long)sk; + cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); +} +EXPORT_SYMBOL_GPL(sock_diag_save_cookie); + +int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) +{ + __u32 *mem; + + mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32))); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + + return 0; + +rtattr_failure: + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); + +void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = fn; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); + +void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +{ + mutex_lock(&sock_diag_table_mutex); + inet_rcv_compat = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); + +int sock_diag_register(struct sock_diag_handler *hndl) +{ + int err = 0; + + if (hndl->family >= AF_MAX) + return -EINVAL; + + mutex_lock(&sock_diag_table_mutex); + if (sock_diag_handlers[hndl->family]) + err = -EBUSY; + else + sock_diag_handlers[hndl->family] = hndl; + mutex_unlock(&sock_diag_table_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(sock_diag_register); + +void sock_diag_unregister(struct sock_diag_handler *hnld) +{ + int family = hnld->family; + + if (family >= AF_MAX) + return; + + mutex_lock(&sock_diag_table_mutex); + BUG_ON(sock_diag_handlers[family] != hnld); + sock_diag_handlers[family] = NULL; + mutex_unlock(&sock_diag_table_mutex); +} +EXPORT_SYMBOL_GPL(sock_diag_unregister); + +static inline struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + if (sock_diag_handlers[family] == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, family); + + mutex_lock(&sock_diag_table_mutex); + return sock_diag_handlers[family]; +} + +static inline void sock_diag_unlock_handler(struct sock_diag_handler *h) +{ + mutex_unlock(&sock_diag_table_mutex); +} + +static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int err; + struct sock_diag_req *req = NLMSG_DATA(nlh); + struct sock_diag_handler *hndl; + + if (nlmsg_len(nlh) < sizeof(*req)) + return -EINVAL; + + hndl = sock_diag_lock_handler(req->sdiag_family); + if (hndl == NULL) + err = -ENOENT; + else + err = hndl->dump(skb, nlh); + sock_diag_unlock_handler(hndl); + + return err; +} + +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + int ret; + + switch (nlh->nlmsg_type) { + case TCPDIAG_GETSOCK: + case DCCPDIAG_GETSOCK: + if (inet_rcv_compat == NULL) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + + mutex_lock(&sock_diag_table_mutex); + if (inet_rcv_compat != NULL) + ret = inet_rcv_compat(skb, nlh); + else + ret = -EOPNOTSUPP; + mutex_unlock(&sock_diag_table_mutex); + + return ret; + case SOCK_DIAG_BY_FAMILY: + return __sock_diag_rcv_msg(skb, nlh); + default: + return -EINVAL; + } +} + +static DEFINE_MUTEX(sock_diag_mutex); + +static void sock_diag_rcv(struct sk_buff *skb) +{ + mutex_lock(&sock_diag_mutex); + netlink_rcv_skb(skb, &sock_diag_rcv_msg); + mutex_unlock(&sock_diag_mutex); +} + +struct sock *sock_diag_nlsk; +EXPORT_SYMBOL_GPL(sock_diag_nlsk); + +static int __init sock_diag_init(void) +{ + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, + sock_diag_rcv, NULL, THIS_MODULE); + return sock_diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __exit sock_diag_exit(void) +{ + netlink_kernel_release(sock_diag_nlsk); +} + +module_init(sock_diag_init); +module_exit(sock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG); diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 00000000..f5df85dc --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,208 @@ +/* + * SUCS NET3: + * + * Generic stream handling routines. These are generic for most + * protocols. Even IP. Tonight 8-). + * This is used because TCP, LLC (others too) layer all have mostly + * identical sendmsg() and recvmsg() code. + * So we (will) share it here. + * + * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> + * (from old tcp.c code) + * Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-)) + */ + +#include <linux/module.h> +#include <linux/net.h> +#include <linux/signal.h> +#include <linux/tcp.h> +#include <linux/wait.h> +#include <net/sock.h> + +/** + * sk_stream_write_space - stream socket write_space callback. + * @sk: socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct socket_wq *wq; + + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, POLLOUT | + POLLWRNORM | POLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} +EXPORT_SYMBOL(sk_stream_write_space); + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk: sock to wait on + * @timeo_p: for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + int done; + + do { + int err = sock_error(sk); + if (err) + return err; + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + done = sk_wait_event(sk, timeo_p, + !sk->sk_err && + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); + finish_wait(sk_sleep(sk), &wait); + sk->sk_write_pending--; + } while (!done); + return 0; +} +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. + * @sk: socket to verify + */ +static inline int sk_stream_closing(struct sock *sk) +{ + return (1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + while (1) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sk_stream_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, ¤t_timeo, sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + (sk_stream_memory_free(sk) && + !vm_wait)); + sk->sk_write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + finish_wait(sk_sleep(sk), &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} +EXPORT_SYMBOL(sk_stream_wait_memory); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} +EXPORT_SYMBOL(sk_stream_error); + +void sk_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + + /* Account for returned memory. */ + sk_mem_reclaim(sk); + + WARN_ON(sk->sk_wmem_queued); + WARN_ON(sk->sk_forward_alloc); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +} +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 00000000..0c285087 --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,264 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/netdevice.h> +#include <linux/ratelimit.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/slab.h> + +#include <net/ip.h> +#include <net/sock.h> +#include <net/net_ratelimit.h> + +#ifdef CONFIG_RPS +static int rps_sock_flow_sysctl(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned int orig_size, size; + int ret, i; + ctl_table tmp = { + .data = &size, + .maxlen = sizeof(size), + .mode = table->mode + }; + struct rps_sock_flow_table *orig_sock_table, *sock_table; + static DEFINE_MUTEX(sock_flow_mutex); + + mutex_lock(&sock_flow_mutex); + + orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + lockdep_is_held(&sock_flow_mutex)); + size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + if (write) { + if (size) { + if (size > 1<<30) { + /* Enforce limit to prevent overflow */ + mutex_unlock(&sock_flow_mutex); + return -EINVAL; + } + size = roundup_pow_of_two(size); + if (size != orig_size) { + sock_table = + vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); + if (!sock_table) { + mutex_unlock(&sock_flow_mutex); + return -ENOMEM; + } + + sock_table->mask = size - 1; + } else + sock_table = orig_sock_table; + + for (i = 0; i < size; i++) + sock_table->ents[i] = RPS_NO_CPU; + } else + sock_table = NULL; + + if (sock_table != orig_sock_table) { + rcu_assign_pointer(rps_sock_flow_table, sock_table); + if (sock_table) + static_key_slow_inc(&rps_needed); + if (orig_sock_table) { + static_key_slow_dec(&rps_needed); + synchronize_rcu(); + vfree(orig_sock_table); + } + } + } + + mutex_unlock(&sock_flow_mutex); + + return ret; +} +#endif /* CONFIG_RPS */ + +static struct ctl_table net_core_table[] = { +#ifdef CONFIG_NET + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "dev_weight", + .data = &weight_p, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "netdev_max_backlog", + .data = &netdev_max_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_BPF_JIT + { + .procname = "bpf_jit_enable", + .data = &bpf_jit_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "netdev_tstamp_prequeue", + .data = &netdev_tstamp_prequeue, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "message_cost", + .data = &net_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "message_burst", + .data = &net_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "optmem_max", + .data = &sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_RPS + { + .procname = "rps_sock_flow_entries", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = rps_sock_flow_sysctl + }, +#endif +#endif /* CONFIG_NET */ + { + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "warnings", + .data = &net_msg_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +static struct ctl_table netns_core_table[] = { + { + .procname = "somaxconn", + .data = &init_net.core.sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { } +}; + +__net_initdata struct ctl_path net_core_path[] = { + { .procname = "net", }, + { .procname = "core", }, + { }, +}; + +static __net_init int sysctl_core_net_init(struct net *net) +{ + struct ctl_table *tbl; + + net->core.sysctl_somaxconn = SOMAXCONN; + + tbl = netns_core_table; + if (!net_eq(net, &init_net)) { + tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + + tbl[0].data = &net->core.sysctl_somaxconn; + } + + net->core.sysctl_hdr = register_net_sysctl_table(net, + net_core_path, tbl); + if (net->core.sysctl_hdr == NULL) + goto err_reg; + + return 0; + +err_reg: + if (tbl != netns_core_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_core_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->core.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->core.sysctl_hdr); + BUG_ON(tbl == netns_core_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_core_ops = { + .init = sysctl_core_net_init, + .exit = sysctl_core_net_exit, +}; + +static __init int sysctl_core_init(void) +{ + static struct ctl_table empty[1]; + + register_sysctl_paths(net_core_path, empty); + register_net_sysctl_rotable(net_core_path, net_core_table); + return register_pernet_subsys(&sysctl_core_ops); +} + +fs_initcall(sysctl_core_init); diff --git a/net/core/timestamping.c b/net/core/timestamping.c new file mode 100644 index 00000000..661b5a40 --- /dev/null +++ b/net/core/timestamping.c @@ -0,0 +1,139 @@ +/* + * PTP 1588 clock support - support for timestamping in PHY devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/errqueue.h> +#include <linux/phy.h> +#include <linux/ptp_classify.h> +#include <linux/skbuff.h> +#include <linux/export.h> + +static struct sock_filter ptp_filter[] = { + PTP_FILTER +}; + +static unsigned int classify(const struct sk_buff *skb) +{ + if (likely(skb->dev && + skb->dev->phydev && + skb->dev->phydev->drv)) + return sk_run_filter(skb, ptp_filter); + else + return PTP_CLASS_NONE; +} + +void skb_clone_tx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + struct sk_buff *clone; + struct sock *sk = skb->sk; + unsigned int type; + + if (!sk) + return; + + type = classify(skb); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->txtstamp)) { + if (!atomic_inc_not_zero(&sk->sk_refcnt)) + return; + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) { + sock_put(sk); + return; + } + clone->sk = sk; + phydev->drv->txtstamp(phydev, clone, type); + } + break; + default: + break; + } +} +EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); + +void skb_complete_tx_timestamp(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps) +{ + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; + int err; + + if (!hwtstamps) { + sock_put(sk); + kfree_skb(skb); + return; + } + + *skb_hwtstamps(skb) = *hwtstamps; + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; + skb->sk = NULL; + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + if (err) + kfree_skb(skb); +} +EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + +bool skb_defer_rx_timestamp(struct sk_buff *skb) +{ + struct phy_device *phydev; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + __skb_push(skb, ETH_HLEN); + + type = classify(skb); + + __skb_pull(skb, ETH_HLEN); + + switch (type) { + case PTP_CLASS_V1_IPV4: + case PTP_CLASS_V1_IPV6: + case PTP_CLASS_V2_IPV4: + case PTP_CLASS_V2_IPV6: + case PTP_CLASS_V2_L2: + case PTP_CLASS_V2_VLAN: + phydev = skb->dev->phydev; + if (likely(phydev->drv->rxtstamp)) + return phydev->drv->rxtstamp(phydev, skb, type); + break; + default: + break; + } + + return false; +} +EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp); + +void __init skb_timestamping_init(void) +{ + BUG_ON(sk_chk_filter(ptp_filter, ARRAY_SIZE(ptp_filter))); +} diff --git a/net/core/user_dma.c b/net/core/user_dma.c new file mode 100644 index 00000000..1b5fefdb --- /dev/null +++ b/net/core/user_dma.c @@ -0,0 +1,131 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include <linux/dmaengine.h> +#include <linux/socket.h> +#include <linux/export.h> +#include <net/tcp.h> +#include <net/netdma.h> + +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; +EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); + +/** + * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * @pinned_list - locked iovec buffer data + * + * Note: the iovec is modified during the copy. + */ +int dma_skb_copy_datagram_iovec(struct dma_chan *chan, + struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + dma_cookie_t cookie = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_memcpy_to_iovec(chan, to, pinned_list, + skb->data + offset, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + + cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, + frag->page_offset + offset - start, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, + offset - start, + to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + +end: + if (!len) { + skb->dma_cookie = cookie; + return cookie; + } + +fault: + return -EFAULT; +} diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 00000000..dc3c3faf --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,322 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andi Kleen + * in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project + * + * Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/inet.h> +#include <linux/mm.h> +#include <linux/net.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/ratelimit.h> + +#include <net/sock.h> +#include <net/net_ratelimit.h> + +#include <asm/byteorder.h> +#include <asm/uaccess.h> + +int net_msg_warn __read_mostly = 1; +EXPORT_SYMBOL(net_msg_warn); + +DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10); +/* + * All net warning printk()s should be guarded by this function. + */ +int net_ratelimit(void) +{ + return __ratelimit(&net_ratelimit_state); +} +EXPORT_SYMBOL(net_ratelimit); + +/* + * Convert an ASCII string to binary IP. + * This is outside of net/ipv4/ because various code that uses IP addresses + * is otherwise not dependent on the TCP/IP stack. + */ + +__be32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.' && *str != '\n') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return htonl(l); +} +EXPORT_SYMBOL(in_aton); + +#define IN6PTON_XDIGIT 0x00010000 +#define IN6PTON_DIGIT 0x00020000 +#define IN6PTON_COLON_MASK 0x00700000 +#define IN6PTON_COLON_1 0x00100000 /* single : requested */ +#define IN6PTON_COLON_2 0x00200000 /* second : requested */ +#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */ +#define IN6PTON_DOT 0x00800000 /* . */ +#define IN6PTON_DELIM 0x10000000 +#define IN6PTON_NULL 0x20000000 /* first/tail */ +#define IN6PTON_UNKNOWN 0x40000000 + +static inline int xdigit2bin(char c, int delim) +{ + int val; + + if (c == delim || c == '\0') + return IN6PTON_DELIM; + if (c == ':') + return IN6PTON_COLON_MASK; + if (c == '.') + return IN6PTON_DOT; + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + + if (delim == -1) + return IN6PTON_DELIM; + return IN6PTON_UNKNOWN; +} + +int in4_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s; + u8 *d; + u8 dbuf[4]; + int ret = 0; + int i; + int w = 0; + + if (srclen < 0) + srclen = strlen(src); + s = src; + d = dbuf; + i = 0; + while(1) { + int c; + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) { + goto out; + } + if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (w == 0) + goto out; + *d++ = w & 0xff; + w = 0; + i++; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + if (i != 4) + goto out; + break; + } + goto cont; + } + w = (w * 10) + c; + if ((w & 0xffff) > 255) { + goto out; + } +cont: + if (i >= 4) + goto out; + s++; + srclen--; + } + ret = 1; + memcpy(dst, dbuf, sizeof(dbuf)); +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in4_pton); + +int in6_pton(const char *src, int srclen, + u8 *dst, + int delim, const char **end) +{ + const char *s, *tok = NULL; + u8 *d, *dc = NULL; + u8 dbuf[16]; + int ret = 0; + int i; + int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL; + int w = 0; + + memset(dbuf, 0, sizeof(dbuf)); + + s = src; + d = dbuf; + if (srclen < 0) + srclen = strlen(src); + + while (1) { + int c; + + c = xdigit2bin(srclen > 0 ? *s : '\0', delim); + if (!(c & state)) + goto out; + if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) { + /* process one 16-bit word */ + if (!(state & IN6PTON_NULL)) { + *d++ = (w >> 8) & 0xff; + *d++ = w & 0xff; + } + w = 0; + if (c & IN6PTON_DELIM) { + /* We've processed last word */ + break; + } + /* + * COLON_1 => XDIGIT + * COLON_2 => XDIGIT|DELIM + * COLON_1_2 => COLON_2 + */ + switch (state & IN6PTON_COLON_MASK) { + case IN6PTON_COLON_2: + dc = d; + state = IN6PTON_XDIGIT | IN6PTON_DELIM; + if (dc - dbuf >= sizeof(dbuf)) + state |= IN6PTON_NULL; + break; + case IN6PTON_COLON_1|IN6PTON_COLON_1_2: + state = IN6PTON_XDIGIT | IN6PTON_COLON_2; + break; + case IN6PTON_COLON_1: + state = IN6PTON_XDIGIT; + break; + case IN6PTON_COLON_1_2: + state = IN6PTON_COLON_2; + break; + default: + state = 0; + } + tok = s + 1; + goto cont; + } + + if (c & IN6PTON_DOT) { + ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s); + if (ret > 0) { + d += 4; + break; + } + goto out; + } + + w = (w << 4) | (0xff & c); + state = IN6PTON_COLON_1 | IN6PTON_DELIM; + if (!(w & 0xf000)) { + state |= IN6PTON_XDIGIT; + } + if (!dc && d + 2 < dbuf + sizeof(dbuf)) { + state |= IN6PTON_COLON_1_2; + state &= ~IN6PTON_DELIM; + } + if (d + 2 >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2); + } +cont: + if ((dc && d + 4 < dbuf + sizeof(dbuf)) || + d + 4 == dbuf + sizeof(dbuf)) { + state |= IN6PTON_DOT; + } + if (d >= dbuf + sizeof(dbuf)) { + state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK); + } + s++; + srclen--; + } + + i = 15; d--; + + if (dc) { + while(d >= dc) + dst[i--] = *d--; + while(i >= dc - dbuf) + dst[i--] = 0; + while(i >= 0) + dst[i--] = *d--; + } else + memcpy(dst, dbuf, sizeof(dbuf)); + + ret = 1; +out: + if (end) + *end = s; + return ret; +} +EXPORT_SYMBOL(in6_pton); + +void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr) +{ + __be32 diff[] = { ~from, to }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial(diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial(diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial(diff, sizeof(diff), + csum_unfold(*sum))); +} +EXPORT_SYMBOL(inet_proto_csum_replace4); + +int mac_pton(const char *s, u8 *mac) +{ + int i; + + /* XX:XX:XX:XX:XX:XX */ + if (strlen(s) < 3 * ETH_ALEN - 1) + return 0; + + /* Don't dirty result unless string is valid MAC. */ + for (i = 0; i < ETH_ALEN; i++) { + if (!strchr("0123456789abcdefABCDEF", s[i * 3])) + return 0; + if (!strchr("0123456789abcdefABCDEF", s[i * 3 + 1])) + return 0; + if (i != ETH_ALEN - 1 && s[i * 3 + 2] != ':') + return 0; + } + for (i = 0; i < ETH_ALEN; i++) { + mac[i] = (hex_to_bin(s[i * 3]) << 4) | hex_to_bin(s[i * 3 + 1]); + } + return 1; +} +EXPORT_SYMBOL(mac_pton); |