diff options
Diffstat (limited to 'fs/proc')
-rw-r--r-- | fs/proc/Kconfig | 69 | ||||
-rw-r--r-- | fs/proc/Makefile | 30 | ||||
-rw-r--r-- | fs/proc/array.c | 558 | ||||
-rw-r--r-- | fs/proc/base.c | 3678 | ||||
-rw-r--r-- | fs/proc/cmdline.c | 29 | ||||
-rw-r--r-- | fs/proc/consoles.c | 114 | ||||
-rw-r--r-- | fs/proc/cpuinfo.c | 24 | ||||
-rw-r--r-- | fs/proc/devices.c | 70 | ||||
-rw-r--r-- | fs/proc/generic.c | 852 | ||||
-rw-r--r-- | fs/proc/inode.c | 503 | ||||
-rw-r--r-- | fs/proc/internal.h | 154 | ||||
-rw-r--r-- | fs/proc/interrupts.c | 53 | ||||
-rw-r--r-- | fs/proc/kcore.c | 637 | ||||
-rw-r--r-- | fs/proc/kmsg.c | 64 | ||||
-rw-r--r-- | fs/proc/loadavg.c | 45 | ||||
-rw-r--r-- | fs/proc/meminfo.c | 215 | ||||
-rw-r--r-- | fs/proc/mmu.c | 60 | ||||
-rw-r--r-- | fs/proc/namespaces.c | 200 | ||||
-rw-r--r-- | fs/proc/nommu.c | 136 | ||||
-rw-r--r-- | fs/proc/page.c | 214 | ||||
-rw-r--r-- | fs/proc/proc_devtree.c | 241 | ||||
-rw-r--r-- | fs/proc/proc_net.c | 241 | ||||
-rw-r--r-- | fs/proc/proc_sysctl.c | 1606 | ||||
-rw-r--r-- | fs/proc/proc_tty.c | 189 | ||||
-rw-r--r-- | fs/proc/root.c | 278 | ||||
-rw-r--r-- | fs/proc/softirqs.c | 44 | ||||
-rw-r--r-- | fs/proc/stat.c | 218 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 1286 | ||||
-rw-r--r-- | fs/proc/task_nommu.c | 318 | ||||
-rw-r--r-- | fs/proc/uptime.c | 53 | ||||
-rw-r--r-- | fs/proc/version.c | 40 | ||||
-rw-r--r-- | fs/proc/vmcore.c | 725 |
32 files changed, 12944 insertions, 0 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig new file mode 100644 index 00000000..15af6222 --- /dev/null +++ b/fs/proc/Kconfig @@ -0,0 +1,69 @@ +config PROC_FS + bool "/proc file system support" if EXPERT + default y + help + This is a virtual file system providing information about the status + of the system. "Virtual" means that it doesn't take up any space on + your hard disk: the files are created on the fly by the kernel when + you try to access them. Also, you cannot read the files with older + version of the program less: you need to use more or cat. + + It's totally cool; for example, "cat /proc/interrupts" gives + information about what the different IRQs are used for at the moment + (there is a small number of Interrupt ReQuest lines in your computer + that are used by the attached devices to gain the CPU's attention -- + often a source of trouble if two devices are mistakenly configured + to use the same IRQ). The program procinfo to display some + information about your system gathered from the /proc file system. + + Before you can use the /proc file system, it has to be mounted, + meaning it has to be given a location in the directory hierarchy. + That location should be /proc. A command such as "mount -t proc proc + /proc" or the equivalent line in /etc/fstab does the job. + + The /proc file system is explained in the file + <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + ("man 5 proc"). + + This option will enlarge your kernel by about 67 KB. Several + programs depend on this, so everyone should say Y here. + +config PROC_KCORE + bool "/proc/kcore support" if !ARM + depends on PROC_FS && MMU + +config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP + default y + help + Exports the dump image of crashed kernel in ELF format. + +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EXPERT + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in <file:Documentation/sysctl/>. Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EXPERT + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/Makefile b/fs/proc/Makefile new file mode 100644 index 00000000..c1c72933 --- /dev/null +++ b/fs/proc/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for the Linux proc filesystem routines. +# + +obj-y += proc.o + +proc-y := nommu.o task_nommu.o +proc-$(CONFIG_MMU) := mmu.o task_mmu.o + +proc-y += inode.o root.o base.o generic.o array.o \ + proc_tty.o +proc-y += cmdline.o +proc-y += consoles.o +proc-y += cpuinfo.o +proc-y += devices.o +proc-y += interrupts.o +proc-y += loadavg.o +proc-y += meminfo.o +proc-y += stat.o +proc-y += uptime.o +proc-y += version.o +proc-y += softirqs.o +proc-y += namespaces.o +proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o +proc-$(CONFIG_NET) += proc_net.o +proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o +proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o +proc-$(CONFIG_PRINTK) += kmsg.o +proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o diff --git a/fs/proc/array.c b/fs/proc/array.c new file mode 100644 index 00000000..f9bd395b --- /dev/null +++ b/fs/proc/array.c @@ -0,0 +1,558 @@ +/* + * linux/fs/proc/array.c + * + * Copyright (C) 1992 by Linus Torvalds + * based on ideas by Darren Senn + * + * Fixes: + * Michael. K. Johnson: stat,statm extensions. + * <johnsonm@stolaf.edu> + * + * Pauline Middelink : Made cmdline,envline only break at '\0's, to + * make sure SET_PROCTITLE works. Also removed + * bad '!' which forced address recalculation for + * EVERY character on the current page. + * <middelin@polyware.iaf.nl> + * + * Danny ter Haar : added cpuinfo + * <dth@cistron.nl> + * + * Alessandro Rubini : profile extension. + * <rubini@ipvvis.unipv.it> + * + * Jeff Tranter : added BogoMips field to cpuinfo + * <Jeff_Tranter@Mitel.COM> + * + * Bruno Haible : remove 4K limit for the maps file + * <haible@ma2s2.mathematik.uni-karlsruhe.de> + * + * Yves Arrouye : remove removal of trailing spaces in get_array. + * <Yves.Arrouye@marin.fdn.fr> + * + * Jerome Forissier : added per-CPU time information to /proc/stat + * and /proc/<pid>/cpu extension + * <forissier@isia.cma.fr> + * - Incorporation and non-SMP safe operation + * of forissier patch in 2.1.78 by + * Hans Marcus <crowbar@concepts.nl> + * + * aeb@cwi.nl : /proc/partitions + * + * + * Alan Cox : security fixes. + * <alan@lxorguk.ukuu.org.uk> + * + * Al Viro : safe handling of mm_struct + * + * Gerhard Wichert : added BIGMEM support + * Siemens AG <Gerhard.Wichert@pdb.siemens.de> + * + * Al Viro & Jeff Garzik : moved most of the thing into base.c and + * : proc_misc.c. The rest may eventually go into + * : base.c too. + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/kernel_stat.h> +#include <linux/tty.h> +#include <linux/string.h> +#include <linux/mman.h> +#include <linux/proc_fs.h> +#include <linux/ioport.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/signal.h> +#include <linux/highmem.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/times.h> +#include <linux/cpuset.h> +#include <linux/rcupdate.h> +#include <linux/delayacct.h> +#include <linux/seq_file.h> +#include <linux/pid_namespace.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> + +#include <asm/pgtable.h> +#include <asm/processor.h> +#include "internal.h" + +static inline void task_name(struct seq_file *m, struct task_struct *p) +{ + int i; + char *buf, *end; + char *name; + char tcomm[sizeof(p->comm)]; + + get_task_comm(tcomm, p); + + seq_puts(m, "Name:\t"); + end = m->buf + m->size; + buf = m->buf + m->count; + name = tcomm; + i = sizeof(tcomm); + while (i && (buf < end)) { + unsigned char c = *name; + name++; + i--; + *buf = c; + if (!c) + break; + if (c == '\\') { + buf++; + if (buf < end) + *buf++ = c; + continue; + } + if (c == '\n') { + *buf++ = '\\'; + if (buf < end) + *buf++ = 'n'; + continue; + } + buf++; + } + m->count = buf - m->buf; + seq_putc(m, '\n'); +} + +/* + * The task state array is a strange "bitmap" of + * reasons to sleep. Thus "running" is zero, and + * you can test for combinations of others with + * simple bit tests. + */ +static const char * const task_state_array[] = { + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ +}; + +static inline const char *get_task_state(struct task_struct *tsk) +{ + unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; + const char * const *p = &task_state_array[0]; + + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + + while (state) { + p++; + state >>= 1; + } + return *p; +} + +static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *p) +{ + struct group_info *group_info; + int g; + struct fdtable *fdt = NULL; + const struct cred *cred; + pid_t ppid, tpid; + + rcu_read_lock(); + ppid = pid_alive(p) ? + task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; + tpid = 0; + if (pid_alive(p)) { + struct task_struct *tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + } + cred = get_task_cred(p); + seq_printf(m, + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), + task_tgid_nr_ns(p, ns), + pid_nr_ns(pid, ns), + ppid, tpid, + cred->uid, cred->euid, cred->suid, cred->fsuid, + cred->gid, cred->egid, cred->sgid, cred->fsgid); + + task_lock(p); + if (p->files) + fdt = files_fdtable(p->files); + seq_printf(m, + "FDSize:\t%d\n" + "Groups:\t", + fdt ? fdt->max_fds : 0); + rcu_read_unlock(); + + group_info = cred->group_info; + task_unlock(p); + + for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) + seq_printf(m, "%d ", GROUP_AT(group_info, g)); + put_cred(cred); + + seq_putc(m, '\n'); +} + +static void render_sigset_t(struct seq_file *m, const char *header, + sigset_t *set) +{ + int i; + + seq_puts(m, header); + + i = _NSIG; + do { + int x = 0; + + i -= 4; + if (sigismember(set, i+1)) x |= 1; + if (sigismember(set, i+2)) x |= 2; + if (sigismember(set, i+3)) x |= 4; + if (sigismember(set, i+4)) x |= 8; + seq_printf(m, "%x", x); + } while (i >= 4); + + seq_putc(m, '\n'); +} + +static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + sigset_t *catch) +{ + struct k_sigaction *k; + int i; + + k = p->sighand->action; + for (i = 1; i <= _NSIG; ++i, ++k) { + if (k->sa.sa_handler == SIG_IGN) + sigaddset(ign, i); + else if (k->sa.sa_handler != SIG_DFL) + sigaddset(catch, i); + } +} + +static inline void task_sig(struct seq_file *m, struct task_struct *p) +{ + unsigned long flags; + sigset_t pending, shpending, blocked, ignored, caught; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; + + sigemptyset(&pending); + sigemptyset(&shpending); + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); + + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = get_nr_threads(p); + rcu_read_lock(); /* FIXME: is this correct? */ + qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); + qlim = task_rlimit(p, RLIMIT_SIGPENDING); + unlock_task_sighand(p, &flags); + } + + seq_printf(m, "Threads:\t%d\n", num_threads); + seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); + + /* render them all */ + render_sigset_t(m, "SigPnd:\t", &pending); + render_sigset_t(m, "ShdPnd:\t", &shpending); + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); +} + +static void render_cap_t(struct seq_file *m, const char *header, + kernel_cap_t *a) +{ + unsigned __capi; + + seq_puts(m, header); + CAP_FOR_EACH_U32(__capi) { + seq_printf(m, "%08x", + a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); + } + seq_putc(m, '\n'); +} + +static inline void task_cap(struct seq_file *m, struct task_struct *p) +{ + const struct cred *cred; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + + rcu_read_lock(); + cred = __task_cred(p); + cap_inheritable = cred->cap_inheritable; + cap_permitted = cred->cap_permitted; + cap_effective = cred->cap_effective; + cap_bset = cred->cap_bset; + rcu_read_unlock(); + + render_cap_t(m, "CapInh:\t", &cap_inheritable); + render_cap_t(m, "CapPrm:\t", &cap_permitted); + render_cap_t(m, "CapEff:\t", &cap_effective); + render_cap_t(m, "CapBnd:\t", &cap_bset); +} + +static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) +{ + seq_printf(m, "voluntary_ctxt_switches:\t%lu\n" + "nonvoluntary_ctxt_switches:\t%lu\n", + p->nvcsw, + p->nivcsw); +} + +static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_puts(m, "Cpus_allowed:\t"); + seq_cpumask(m, &task->cpus_allowed); + seq_putc(m, '\n'); + seq_puts(m, "Cpus_allowed_list:\t"); + seq_cpumask_list(m, &task->cpus_allowed); + seq_putc(m, '\n'); +} + +int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm = get_task_mm(task); + + task_name(m, task); + task_state(m, ns, pid, task); + + if (mm) { + task_mem(m, mm); + mmput(mm); + } + task_sig(m, task); + task_cap(m, task); + task_cpus_allowed(m, task); + cpuset_task_status_allowed(m, task); + task_context_switch_counts(m, task); + return 0; +} + +static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task, int whole) +{ + unsigned long vsize, eip, esp, wchan = ~0UL; + long priority, nice; + int tty_pgrp = -1, tty_nr = 0; + sigset_t sigign, sigcatch; + char state; + pid_t ppid = 0, pgid = -1, sid = -1; + int num_threads = 0; + int permitted; + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + cputime_t cutime, cstime, utime, stime; + cputime_t cgtime, gtime; + unsigned long rsslim = 0; + char tcomm[sizeof(task->comm)]; + unsigned long flags; + + state = *get_task_state(task); + vsize = eip = esp = 0; + permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + mm = get_task_mm(task); + if (mm) { + vsize = task_vsize(mm); + if (permitted) { + eip = KSTK_EIP(task); + esp = KSTK_ESP(task); + } + } + + get_task_comm(tcomm, task); + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; + + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + + if (sig->tty) { + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); + tty_nr = new_encode_dev(tty_devnum(sig->tty)); + } + + num_threads = get_nr_threads(task); + collect_sigign_sigcatch(task, &sigign, &sigcatch); + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + cgtime = sig->cgtime; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + gtime += t->gtime; + t = next_thread(t); + } while (t != task); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + thread_group_times(task, &utime, &stime); + gtime += sig->gtime; + } + + sid = task_session_nr_ns(task, ns); + ppid = task_tgid_nr_ns(task->real_parent, ns); + pgid = task_pgrp_nr_ns(task, ns); + + unlock_task_sighand(task, &flags); + } + + if (permitted && (!whole || num_threads < 2)) + wchan = get_wchan(task); + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + task_times(task, &utime, &stime); + gtime = task->gtime; + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); + nice = task_nice(task); + + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ + start_time = + (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC + + task->real_start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + + seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); + seq_put_decimal_ll(m, ' ', ppid); + seq_put_decimal_ll(m, ' ', pgid); + seq_put_decimal_ll(m, ' ', sid); + seq_put_decimal_ll(m, ' ', tty_nr); + seq_put_decimal_ll(m, ' ', tty_pgrp); + seq_put_decimal_ull(m, ' ', task->flags); + seq_put_decimal_ull(m, ' ', min_flt); + seq_put_decimal_ull(m, ' ', cmin_flt); + seq_put_decimal_ull(m, ' ', maj_flt); + seq_put_decimal_ull(m, ' ', cmaj_flt); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime)); + seq_put_decimal_ll(m, ' ', priority); + seq_put_decimal_ll(m, ' ', nice); + seq_put_decimal_ll(m, ' ', num_threads); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', start_time); + seq_put_decimal_ull(m, ' ', vsize); + seq_put_decimal_ll(m, ' ', mm ? get_mm_rss(mm) : 0); + seq_put_decimal_ull(m, ' ', rsslim); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0); + seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0); + seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0); + seq_put_decimal_ull(m, ' ', esp); + seq_put_decimal_ull(m, ' ', eip); + /* The signal information here is obsolete. + * It must be decimal for Linux 2.0 compatibility. + * Use /proc/#/status for real-time signals. + */ + seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); + seq_put_decimal_ull(m, ' ', wchan); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ll(m, ' ', task->exit_signal); + seq_put_decimal_ll(m, ' ', task_cpu(task)); + seq_put_decimal_ull(m, ' ', task->rt_priority); + seq_put_decimal_ull(m, ' ', task->policy); + seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task)); + seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime)); + seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime)); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->end_data : 0); + seq_put_decimal_ull(m, ' ', (mm && permitted) ? mm->start_brk : 0); + seq_putc(m, '\n'); + if (mm) + mmput(mm); + return 0; +} + +int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 0); +} + +int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return do_task_stat(m, ns, pid, task, 1); +} + +int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; + struct mm_struct *mm = get_task_mm(task); + + if (mm) { + size = task_statm(mm, &shared, &text, &data, &resident); + mmput(mm); + } + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, 0, size); + seq_put_decimal_ull(m, ' ', resident); + seq_put_decimal_ull(m, ' ', shared); + seq_put_decimal_ull(m, ' ', text); + seq_put_decimal_ull(m, ' ', 0); + seq_put_decimal_ull(m, ' ', data); + seq_put_decimal_ull(m, ' ', 0); + seq_putc(m, '\n'); + + return 0; +} diff --git a/fs/proc/base.c b/fs/proc/base.c new file mode 100644 index 00000000..c8cb15dc --- /dev/null +++ b/fs/proc/base.c @@ -0,0 +1,3678 @@ +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + * + * + * Changelog: + * 17-Jan-2005 + * Allan Bezerra + * Bruna Moreira <bruna.moreira@indt.org.br> + * Edjard Mota <edjard.mota@indt.org.br> + * Ilias Biris <ilias.biris@indt.org.br> + * Mauricio Lin <mauricio.lin@indt.org.br> + * + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * + * A new process specific entry (smaps) included in /proc. It shows the + * size of rss for each memory area. The maps entry lacks information + * about physical memory size (rss) for each mapped file, i.e., + * rss information for executables and library files. + * This additional information is useful for any tools that need to know + * about physical memory consumption for a process specific library. + * + * Changelog: + * 21-Feb-2005 + * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT + * Pud inclusion in the page table walking. + * + * ChangeLog: + * 10-Mar-2005 + * 10LE Instituto Nokia de Tecnologia - INdT: + * A better way to walks through the page table as suggested by Hugh Dickins. + * + * Simo Piiroinen <simo.piiroinen@nokia.com>: + * Smaps information related to shared, private, clean and dirty pages. + * + * Paul Mundt <paul.mundt@nokia.com>: + * Overall revision about smaps. + */ + +#include <asm/uaccess.h> + +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/init.h> +#include <linux/capability.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/namei.h> +#include <linux/mnt_namespace.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/rcupdate.h> +#include <linux/kallsyms.h> +#include <linux/stacktrace.h> +#include <linux/resource.h> +#include <linux/module.h> +#include <linux/mount.h> +#include <linux/security.h> +#include <linux/ptrace.h> +#include <linux/tracehook.h> +#include <linux/cgroup.h> +#include <linux/cpuset.h> +#include <linux/audit.h> +#include <linux/poll.h> +#include <linux/nsproxy.h> +#include <linux/oom.h> +#include <linux/elf.h> +#include <linux/pid_namespace.h> +#include <linux/fs_struct.h> +#include <linux/slab.h> +#include <linux/flex_array.h> +#ifdef CONFIG_HARDWALL +#include <asm/hardwall.h> +#endif +#include <trace/events/oom.h> +#include "internal.h" + +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + +struct pid_entry { + char *name; + int len; + umode_t mode; + const struct inode_operations *iop; + const struct file_operations *fop; + union proc_op op; +}; + +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .name = (NAME), \ + .len = sizeof(NAME) - 1, \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ +} + +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define INF(NAME, MODE, read) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_info_file_operations, \ + { .proc_read = read } ) +#define ONE(NAME, MODE, show) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_single_file_operations, \ + { .proc_show = show } ) + +static int proc_fd_permission(struct inode *inode, int mask); + +/* ANDROID is for special files in /proc. */ +#define ANDROID(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + &proc_##OTYPE##_inode_operations, \ + &proc_##OTYPE##_operations, {}) + +/* + * Count the number of hardlinks for the pid_entry table, excluding the . + * and .. links. + */ +static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, + unsigned int n) +{ + unsigned int i; + unsigned int count; + + count = 0; + for (i = 0; i < n; ++i) { + if (S_ISDIR(entries[i].mode)) + ++count; + } + + return count; +} + +static int get_task_root(struct task_struct *task, struct path *root) +{ + int result = -ENOENT; + + task_lock(task); + if (task->fs) { + get_fs_root(task->fs, root); + result = 0; + } + task_unlock(task); + return result; +} + +static int proc_cwd_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(dentry->d_inode); + int result = -ENOENT; + + if (task) { + task_lock(task); + if (task->fs) { + get_fs_pwd(task->fs, path); + result = 0; + } + task_unlock(task); + put_task_struct(task); + } + return result; +} + +static int proc_root_link(struct dentry *dentry, struct path *path) +{ + struct task_struct *task = get_proc_task(dentry->d_inode); + int result = -ENOENT; + + if (task) { + result = get_task_root(task, path); + put_task_struct(task); + } + return result; +} + +struct mm_struct *mm_for_maps(struct task_struct *task) +{ + return mm_access(task, PTRACE_MODE_READ); +} + +static int proc_pid_cmdline(struct task_struct *task, char * buffer) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + len = mm->arg_end - mm->arg_start; + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + + // If the nul at the end of args has been overwritten, then + // assume application is using setproctitle(3). + if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = mm->env_end - mm->env_start; + if (len > PAGE_SIZE - res) + len = PAGE_SIZE - res; + res += access_process_vm(task, mm->env_start, buffer+res, len, 0); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + +static int proc_pid_auxv(struct task_struct *task, char *buffer) +{ + struct mm_struct *mm = mm_for_maps(task); + int res = PTR_ERR(mm); + if (mm && !IS_ERR(mm)) { + unsigned int nwords = 0; + do { + nwords += 2; + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + res = nwords * sizeof(mm->saved_auxv[0]); + if (res > PAGE_SIZE) + res = PAGE_SIZE; + memcpy(buffer, mm->saved_auxv, res); + mmput(mm); + } + return res; +} + + +#ifdef CONFIG_KALLSYMS +/* + * Provides a wchan file via kallsyms in a proper one-value-per-file format. + * Returns the resolved symbol. If that fails, simply return the address. + */ +static int proc_pid_wchan(struct task_struct *task, char *buffer) +{ + unsigned long wchan; + char symname[KSYM_NAME_LEN]; + + wchan = get_wchan(task); + + if (lookup_symbol_name(wchan, symname) < 0) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return 0; + else + return sprintf(buffer, "%lu", wchan); + else + return sprintf(buffer, "%s", symname); +} +#endif /* CONFIG_KALLSYMS */ + +static int lock_trace(struct task_struct *task) +{ + int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return err; + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + mutex_unlock(&task->signal->cred_guard_mutex); + return -EPERM; + } + return 0; +} + +static void unlock_trace(struct task_struct *task) +{ + mutex_unlock(&task->signal->cred_guard_mutex); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int err; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + + err = lock_trace(task); + if (!err) { + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%pK>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + unlock_trace(task); + } + kfree(entries); + + return err; +} +#endif + +#ifdef CONFIG_SCHEDSTATS +/* + * Provides /proc/PID/schedstat + */ +static int proc_pid_schedstat(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); +} +#endif + +#ifdef CONFIG_LATENCYTOP +static int lstats_show_proc(struct seq_file *m, void *v) +{ + int i; + struct inode *inode = m->private; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + seq_puts(m, "Latency Top version : v0.1\n"); + for (i = 0; i < 32; i++) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + if (!bt) + break; + if (bt == ULONG_MAX) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_putc(m, '\n'); + } + + } + put_task_struct(task); + return 0; +} + +static int lstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lstats_show_proc, inode); +} + +static ssize_t lstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + + if (!task) + return -ESRCH; + clear_all_latency_tracing(task); + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_lstats_operations = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +static int proc_oom_score(struct task_struct *task, char *buffer) +{ + unsigned long points = 0; + + read_lock(&tasklist_lock); + if (pid_alive(task)) + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); + read_unlock(&tasklist_lock); + return sprintf(buffer, "%lu\n", points); +} + +struct limit_names { + char *name; + char *unit; +}; + +static const struct limit_names lnames[RLIM_NLIMITS] = { + [RLIMIT_CPU] = {"Max cpu time", "seconds"}, + [RLIMIT_FSIZE] = {"Max file size", "bytes"}, + [RLIMIT_DATA] = {"Max data size", "bytes"}, + [RLIMIT_STACK] = {"Max stack size", "bytes"}, + [RLIMIT_CORE] = {"Max core file size", "bytes"}, + [RLIMIT_RSS] = {"Max resident set", "bytes"}, + [RLIMIT_NPROC] = {"Max processes", "processes"}, + [RLIMIT_NOFILE] = {"Max open files", "files"}, + [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"}, + [RLIMIT_AS] = {"Max address space", "bytes"}, + [RLIMIT_LOCKS] = {"Max file locks", "locks"}, + [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"}, + [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"}, + [RLIMIT_NICE] = {"Max nice priority", NULL}, + [RLIMIT_RTPRIO] = {"Max realtime priority", NULL}, + [RLIMIT_RTTIME] = {"Max realtime timeout", "us"}, +}; + +/* Display limits for a process */ +static int proc_pid_limits(struct task_struct *task, char *buffer) +{ + unsigned int i; + int count = 0; + unsigned long flags; + char *bufptr = buffer; + + struct rlimit rlim[RLIM_NLIMITS]; + + if (!lock_task_sighand(task, &flags)) + return 0; + memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); + unlock_task_sighand(task, &flags); + + /* + * print the file header + */ + count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + + for (i = 0; i < RLIM_NLIMITS; i++) { + if (rlim[i].rlim_cur == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-25s %-20s ", + lnames[i].name, "unlimited"); + else + count += sprintf(&bufptr[count], "%-25s %-20lu ", + lnames[i].name, rlim[i].rlim_cur); + + if (rlim[i].rlim_max == RLIM_INFINITY) + count += sprintf(&bufptr[count], "%-20s ", "unlimited"); + else + count += sprintf(&bufptr[count], "%-20lu ", + rlim[i].rlim_max); + + if (lnames[i].unit) + count += sprintf(&bufptr[count], "%-10s\n", + lnames[i].unit); + else + count += sprintf(&bufptr[count], "\n"); + } + + return count; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK +static int proc_pid_syscall(struct task_struct *task, char *buffer) +{ + long nr; + unsigned long args[6], sp, pc; + int res = lock_trace(task); + if (res) + return res; + + if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) + res = sprintf(buffer, "running\n"); + else if (nr < 0) + res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); + else + res = sprintf(buffer, + "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", + nr, + args[0], args[1], args[2], args[3], args[4], args[5], + sp, pc); + unlock_trace(task); + return res; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ +static int proc_fd_access_allowed(struct inode *inode) +{ + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_access(task, PTRACE_MODE_READ); + put_task_struct(task); + } + return allowed; +} + +int proc_setattr(struct dentry *dentry, struct iattr *attr) +{ + int error; + struct inode *inode = dentry->d_inode; + + if (attr->ia_valid & ATTR_MODE) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + +static const struct inode_operations proc_def_inode_operations = { + .setattr = proc_setattr, +}; + +#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ + +static ssize_t proc_info_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + unsigned long page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + + if (count > PROC_BLOCK_SIZE) + count = PROC_BLOCK_SIZE; + + length = -ENOMEM; + if (!(page = __get_free_page(GFP_TEMPORARY))) + goto out; + + length = PROC_I(inode)->op.proc_read(task, (char*)page); + + if (length >= 0) + length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); + free_page(page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_info_file_operations = { + .read = proc_info_read, + .llseek = generic_file_llseek, +}; + +static int proc_single_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct pid_namespace *ns; + struct pid *pid; + struct task_struct *task; + int ret; + + ns = inode->i_sb->s_fs_info; + pid = proc_pid(inode); + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) + return -ESRCH; + + ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); + + put_task_struct(task); + return ret; +} + +static int proc_single_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, proc_single_show, inode); +} + +static const struct file_operations proc_single_file_operations = { + .open = proc_single_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_open(struct inode* inode, struct file* file) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + + if (!task) + return -ESRCH; + + mm = mm_access(task, PTRACE_MODE_ATTACH); + put_task_struct(task); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + + /* OK to pass negative loff_t, we can catch out-of-range */ + file->f_mode |= FMODE_UNSIGNED_OFFSET; + file->private_data = mm; + + return 0; +} + +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct mm_struct *mm = file->private_data; + unsigned long addr = *ppos; + ssize_t copied; + char *page; + + if (!mm) + return 0; + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + + copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + + while (count > 0) { + int this_len = min_t(int, count, PAGE_SIZE); + + if (write && copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { + if (!copied) + copied = -EIO; + break; + } + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; + } + *ppos = addr; + + mmput(mm); +free: + free_page((unsigned long) page); + return copied; +} + +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + +loff_t mem_lseek(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case 0: + file->f_pos = offset; + break; + case 1: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + force_successful_syscall_return(); + return file->f_pos; +} + +static int mem_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + if (mm) + mmdrop(mm); + return 0; +} + +static const struct file_operations proc_mem_operations = { + .llseek = mem_lseek, + .read = mem_read, + .write = mem_write, + .open = mem_open, + .release = mem_release, +}; + +static ssize_t environ_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char *page; + unsigned long src = *ppos; + int ret = -ESRCH; + struct mm_struct *mm; + + if (!task) + goto out_no_task; + + ret = -ENOMEM; + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + ret = 0; + while (count > 0) { + int this_len, retval, max_len; + + this_len = mm->env_end - (mm->env_start + src); + + if (this_len <= 0) + break; + + max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; + this_len = (this_len > max_len) ? max_len : this_len; + + retval = access_process_vm(task, (mm->env_start + src), + page, this_len, 0); + + if (retval <= 0) { + ret = retval; + break; + } + + if (copy_to_user(buf, page, retval)) { + ret = -EFAULT; + break; + } + + ret += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + + mmput(mm); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return ret; +} + +static const struct file_operations proc_environ_operations = { + .read = environ_read, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_adjust_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int oom_adjust = OOM_DISABLE; + unsigned long flags; + + if (!task) + return -ESRCH; + + if (lock_task_sighand(task, &flags)) { + oom_adjust = task->signal->oom_adj; + unlock_task_sighand(task, &flags); + } + + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_adjust_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + int oom_adjust; + unsigned long flags; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_adjust); + if (err) + goto out; + if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && + oom_adjust != OOM_DISABLE) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. + */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + task->signal->oom_adj = oom_adjust; + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; + trace_oom_score_adj_update(task); +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static int oom_adjust_permission(struct inode *inode, int mask) +{ + uid_t uid; + struct task_struct *p; + + p = get_proc_task(inode); + if(p) { + uid = task_uid(p); + put_task_struct(p); + } + + /* + * System Server (uid == 1000) is granted access to oom_adj of all + * android applications (uid > 10000) as and services (uid >= 1000) + */ + if (p && (current_fsuid() == 1000) && (uid >= 1000)) { + if (inode->i_mode >> 6 & mask) { + return 0; + } + } + + /* Fall back to default. */ + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_oom_adjust_inode_operations = { + .permission = oom_adjust_permission, +}; + +static const struct file_operations proc_oom_adjust_operations = { + .read = oom_adjust_read, + .write = oom_adjust_write, + .llseek = generic_file_llseek, +}; + +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + int oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); + if (err) + goto out; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) { + err = -EINVAL; + goto out; + } + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) { + err = -ESRCH; + goto out; + } + + task_lock(task); + if (!task->mm) { + err = -EINVAL; + goto err_task_lock; + } + + if (!lock_task_sighand(task, &flags)) { + err = -ESRCH; + goto err_task_lock; + } + + if (oom_score_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_sighand; + } + + task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. + */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; +err_sighand: + unlock_task_sighand(task, &flags); +err_task_lock: + task_unlock(task); + put_task_struct(task); +out: + return err < 0 ? err : count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_AUDITSYSCALL +#define TMPBUFLEN 21 +static ssize_t proc_loginuid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_loginuid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page, *tmp; + ssize_t length; + uid_t loginuid; + + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + if (count >= PAGE_SIZE) + count = PAGE_SIZE - 1; + + if (*ppos != 0) { + /* No partial writes. */ + return -EINVAL; + } + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + return -ENOMEM; + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free_page; + + page[count] = '\0'; + loginuid = simple_strtoul(page, &tmp, 10); + if (tmp == page) { + length = -EINVAL; + goto out_free_page; + + } + length = audit_set_loginuid(loginuid); + if (likely(length == 0)) + length = count; + +out_free_page: + free_page((unsigned long) page); + return length; +} + +static const struct file_operations proc_loginuid_operations = { + .read = proc_loginuid_read, + .write = proc_loginuid_write, + .llseek = generic_file_llseek, +}; + +static ssize_t proc_sessionid_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + ssize_t length; + char tmpbuf[TMPBUFLEN]; + + if (!task) + return -ESRCH; + length = scnprintf(tmpbuf, TMPBUFLEN, "%u", + audit_get_sessionid(task)); + put_task_struct(task); + return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); +} + +static const struct file_operations proc_sessionid_operations = { + .read = proc_sessionid_read, + .llseek = generic_file_llseek, +}; +#endif + +#ifdef CONFIG_FAULT_INJECTION +static ssize_t proc_fault_inject_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; + size_t len; + int make_it_fail; + + if (!task) + return -ESRCH; + make_it_fail = task->make_it_fail; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t proc_fault_inject_write(struct file * file, + const char __user * buf, size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + int make_it_fail; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + make_it_fail = simple_strtol(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; + task->make_it_fail = make_it_fail; + put_task_struct(task); + + return count; +} + +static const struct file_operations proc_fault_inject_operations = { + .read = proc_fault_inject_read, + .write = proc_fault_inject_write, + .llseek = generic_file_llseek, +}; +#endif + + +#ifdef CONFIG_SCHED_DEBUG +/* + * Print out various scheduling related per-task fields: + */ +static int sched_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_set_task(p); + + put_task_struct(p); + + return count; +} + +static int sched_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_show, inode); +} + +static const struct file_operations proc_pid_sched_operations = { + .open = sched_open, + .read = seq_read, + .write = sched_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +/* + * Print out autogroup related information: + */ +static int sched_autogroup_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + proc_sched_autogroup_show_task(p, m); + + put_task_struct(p); + + return 0; +} + +static ssize_t +sched_autogroup_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[PROC_NUMBUF]; + int nice; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + err = proc_sched_autogroup_set_nice(p, nice); + if (err) + count = err; + + put_task_struct(p); + + return count; +} + +static int sched_autogroup_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = single_open(filp, sched_autogroup_show, NULL); + if (!ret) { + struct seq_file *m = filp->private_data; + + m->private = inode; + } + return ret; +} + +static const struct file_operations proc_pid_sched_autogroup_operations = { + .open = sched_autogroup_open, + .read = seq_read, + .write = sched_autogroup_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_AUTOGROUP */ + +static ssize_t comm_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct task_struct *p; + char buffer[TASK_COMM_LEN]; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (same_thread_group(current, p)) + set_task_comm(p, buffer); + else + count = -EINVAL; + + put_task_struct(p); + + return count; +} + +static int comm_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + task_lock(p); + seq_printf(m, "%s\n", p->comm); + task_unlock(p); + + put_task_struct(p); + + return 0; +} + +static int comm_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, comm_show, inode); +} + +static const struct file_operations proc_pid_set_comm_operations = { + .open = comm_open, + .read = seq_read, + .write = comm_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(dentry->d_inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + +static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int error = -EACCES; + + /* We don't need a base pointer in the /proc filesystem */ + path_put(&nd->path); + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); +out: + return ERR_PTR(error); +} + +static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) +{ + char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *pathname; + int len; + + if (!tmp) + return -ENOMEM; + + pathname = d_path(path, tmp, PAGE_SIZE); + len = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + len = tmp + PAGE_SIZE - 1 - pathname; + + if (len > buflen) + len = buflen; + if (copy_to_user(buffer, pathname, len)) + len = -EFAULT; + out: + free_page((unsigned long)tmp); + return len; +} + +static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = dentry->d_inode; + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) + goto out; + + error = PROC_I(inode)->op.proc_get_link(dentry, &path); + if (error) + goto out; + + error = do_proc_readlink(&path, buffer, buflen); + path_put(&path); +out: + return error; +} + +static const struct inode_operations proc_pid_link_inode_operations = { + .readlink = proc_pid_readlink, + .follow_link = proc_pid_follow_link, + .setattr = proc_setattr, +}; + + +/* building an inode */ + +static int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = get_dumpable(mm); + task_unlock(task); + if(dumpable == 1) + return 1; + return 0; +} + +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +{ + struct inode * inode; + struct proc_inode *ei; + const struct cred *cred; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_op = &proc_def_inode_operations; + + /* + * grab the reference to task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_unlock; + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } + security_task_to_inode(task, inode); + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; + + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + cred = __task_cred(task); + stat->uid = cred->euid; + stat->gid = cred->egid; + } + } + rcu_read_unlock(); + return 0; +} + +/* dentry stuff */ + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + * + * Rewrite the inode's ownerships here because the owning task may have + * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. + */ +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + d_drop(dentry); + return 0; +} + +static int pid_delete_dentry(const struct dentry * dentry) +{ + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +const struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = name; + qname.len = len; + qname.hash = full_name_hash(name, len); + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + child = instantiate(dir->d_inode, new, task, ptr); + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino = find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, name, len, filp->f_pos, ino, type); +} + +static unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +#define PROC_FDINFO_MAX 64 + +static int proc_fd_info(struct inode *inode, struct path *path, char *info) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (close_on_exec(fd, fdt)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; + } + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; +} + +static int proc_fd_link(struct dentry *dentry, struct path *path) +{ + return proc_fd_info(dentry->d_inode, path, NULL); +} + +static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + struct task_struct *task; + int fd; + struct files_struct *files; + const struct cred *cred; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + inode = dentry->d_inode; + task = get_proc_task(inode); + fd = proc_fd(inode); + + if (task) { + files = get_files_struct(task); + if (files) { + struct file *file; + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + unsigned f_mode = file->f_mode; + + rcu_read_unlock(); + put_files_struct(files); + + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } + + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } + rcu_read_unlock(); + put_files_struct(files); + } + put_task_struct(task); + } + d_drop(dentry); + return 0; +} + +static const struct dentry_operations tid_fd_dentry_operations = +{ + .d_revalidate = tid_fd_revalidate, + .d_delete = pid_delete_dentry, +}; + +static struct dentry *proc_fd_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(const unsigned *)ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + + inode->i_mode = S_IFLNK; + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + ei->op.proc_get_link = proc_fd_link; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +} + +static struct dentry *proc_lookupfd_common(struct inode *dir, + struct dentry *dentry, + instantiate_t instantiate) +{ + struct task_struct *task = get_proc_task(dir); + unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = instantiate(dir, dentry, task, &fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_readfd_common(struct file * filp, void * dirent, + filldir_t filldir, instantiate_t instantiate) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + unsigned int fd, ino; + int retval; + struct files_struct * files; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + for (fd = filp->f_pos-2; + fd < files_fdtable(files)->max_fds; + fd++, filp->f_pos++) { + char name[PROC_NUMBUF]; + int len; + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + len = snprintf(name, sizeof(name), "%d", fd); + if (proc_fill_cache(filp, dirent, filldir, + name, len, instantiate, + p, &fd) < 0) { + rcu_read_lock(); + break; + } + rcu_read_lock(); + } + rcu_read_unlock(); + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} + +static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fd_instantiate); +} + +static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); +} + +static ssize_t proc_fdinfo_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + char tmp[PROC_FDINFO_MAX]; + int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp); + if (!err) + err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp)); + return err; +} + +static const struct file_operations proc_fdinfo_file_operations = { + .open = nonseekable_open, + .read = proc_fdinfo_read, + .llseek = no_llseek, +}; + +static const struct file_operations proc_fd_operations = { + .read = generic_read_dir, + .readdir = proc_readfd, + .llseek = default_llseek, +}; + +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_put_task; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_put_task; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_put_task; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_put_task; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_put_task; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. + */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + +/* + * /proc/pid/fd needs a special permission handler so that a process can still + * access /proc/self/fd after it has executed a setuid(). + */ +static int proc_fd_permission(struct inode *inode, int mask) +{ + int rv = generic_permission(inode, mask); + if (rv == 0) + return 0; + if (task_pid(current) == proc_pid(inode)) + rv = 0; + return rv; +} + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fd_inode_operations = { + .lookup = proc_lookupfd, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static struct dentry *proc_fdinfo_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + unsigned fd = *(unsigned *)ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + ei = PROC_I(inode); + ei->fd = fd; + inode->i_mode = S_IFREG | S_IRUSR; + inode->i_fop = &proc_fdinfo_file_operations; + d_set_d_op(dentry, &tid_fd_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + error = NULL; + + out: + return error; +} + +static struct dentry *proc_lookupfdinfo(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); +} + +static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) +{ + return proc_readfd_common(filp, dirent, filldir, + proc_fdinfo_instantiate); +} + +static const struct file_operations proc_fdinfo_operations = { + .read = generic_read_dir, + .readdir = proc_readfdinfo, + .llseek = default_llseek, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_fdinfo_inode_operations = { + .lookup = proc_lookupfdinfo, + .setattr = proc_setattr, +}; + + +static struct dentry *proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + set_nlink(inode, 2); /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + const struct pid_entry *ents, + unsigned int nents) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc/<tgid>/ without very good reasons. + */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_pident_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_pident_instantiate, task, p); +} + +static int proc_pident_readdir(struct file *filp, + void *dirent, filldir_t filldir, + const struct pid_entry *ents, unsigned int nents) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct pid_entry *p, *last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= nents) { + ret = 1; + goto out; + } + p = ents + i; + last = &ents[nents - 1]; + while (p <= last) { + if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) + goto out; + filp->f_pos++; + p++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +#ifdef CONFIG_SECURITY +static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *p = NULL; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + length = security_getprocattr(task, + (char*)file->f_path.dentry->d_name.name, + &p); + put_task_struct(task); + if (length > 0) + length = simple_read_from_buffer(buf, count, ppos, p, length); + kfree(p); + return length; +} + +static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t length; + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; + page = (char*)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out; + + length = -EFAULT; + if (copy_from_user(page, buf, count)) + goto out_free; + + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + if (length < 0) + goto out_free; + + length = security_setprocattr(task, + (char*)file->f_path.dentry->d_name.name, + (void*)page, count); + mutex_unlock(&task->signal->cred_guard_mutex); +out_free: + free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: + return length; +} + +static const struct file_operations proc_pid_attr_operations = { + .read = proc_pid_attr_read, + .write = proc_pid_attr_write, + .llseek = generic_file_llseek, +}; + +static const struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), +}; + +static int proc_attr_dir_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .readdir = proc_attr_dir_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static const struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +#endif + +#ifdef CONFIG_ELF_CORE +static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + struct mm_struct *mm; + char buffer[PROC_NUMBUF]; + size_t len; + int ret; + + if (!task) + return -ESRCH; + + ret = 0; + mm = get_task_mm(task); + if (mm) { + len = snprintf(buffer, sizeof(buffer), "%08lx\n", + ((mm->flags & MMF_DUMP_FILTER_MASK) >> + MMF_DUMP_FILTER_SHIFT)); + mmput(mm); + ret = simple_read_from_buffer(buf, count, ppos, buffer, len); + } + + put_task_struct(task); + + return ret; +} + +static ssize_t proc_coredump_filter_write(struct file *file, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + struct task_struct *task; + struct mm_struct *mm; + char buffer[PROC_NUMBUF], *end; + unsigned int val; + int ret; + int i; + unsigned long mask; + + ret = -EFAULT; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + goto out_no_task; + + ret = -EINVAL; + val = (unsigned int)simple_strtoul(buffer, &end, 0); + if (*end == '\n') + end++; + if (end - buffer == 0) + goto out_no_task; + + ret = -ESRCH; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + goto out_no_task; + + ret = end - buffer; + mm = get_task_mm(task); + if (!mm) + goto out_no_mm; + + for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { + if (val & mask) + set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + else + clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + } + + mmput(mm); + out_no_mm: + put_task_struct(task); + out_no_task: + return ret; +} + +static const struct file_operations proc_coredump_filter_operations = { + .read = proc_coredump_filter_read, + .write = proc_coredump_filter_write, + .llseek = generic_file_llseek, +}; +#endif + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char tmp[PROC_NUMBUF]; + if (!tgid) + return -ENOENT; + sprintf(tmp, "%d", tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct pid_namespace *ns = dentry->d_sb->s_fs_info; + pid_t tgid = task_tgid_nr_ns(current, ns); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); +} + +static const struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static const struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error; + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_ino = get_next_ino(); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + set_nlink(inode, 2); + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_base_instantiate(dir, dentry, task, p); + +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, const struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + +#ifdef CONFIG_TASK_IO_ACCOUNTING +static int do_io_accounting(struct task_struct *task, char *buffer, int whole) +{ + struct task_io_accounting acct = task->ioac; + unsigned long flags; + int result; + + result = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (result) + return result; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + result = -EACCES; + goto out_unlock; + } + + if (whole && lock_task_sighand(task, &flags)) { + struct task_struct *t = task; + + task_io_accounting_add(&acct, &task->signal->ioac); + while_each_thread(task, t) + task_io_accounting_add(&acct, &t->ioac); + + unlock_task_sighand(task, &flags); + } + result = sprintf(buffer, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); +out_unlock: + mutex_unlock(&task->signal->cred_guard_mutex); + return result; +} + +static int proc_tid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 0); +} + +static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) +{ + return do_io_accounting(task, buffer, 1); +} +#endif /* CONFIG_TASK_IO_ACCOUNTING */ + +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + int err = lock_trace(task); + if (!err) { + seq_printf(m, "%08x\n", task->personality); + unlock_trace(task); + } + return err; +} + +/* + * Thread groups + */ +static const struct file_operations proc_task_operations; +static const struct inode_operations proc_task_inode_operations; + +static const struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), +#ifdef CONFIG_NET + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), +#endif + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif +#ifdef CONFIG_SCHED_AUTOGROUP + REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_pid_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + ANDROID("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_ELF_CORE + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tgid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tgid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct file_operations proc_tgid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tgid_base_readdir, + .llseek = default_llseek, +}; + +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} + +static const struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) +{ + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", tgid); + leader = d_hash_and_lookup(mnt->mnt_root, &name); + if (!leader) + goto out; + + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; +} + +/** + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * @task: task that should be flushed. + * + * When flushing dentries from proc, one needs to flush them from global + * proc (proc_mnt) and from all the namespaces' procs this task was seen + * in. This call is supposed to do all of this job. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. + * + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. + * + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. + */ + +void proc_flush_task(struct task_struct *task) +{ + int i; + struct pid *pid, *tgid; + struct upid *upid; + + pid = task_pid(task); + tgid = task_tgid(task); + + for (i = 0; i <= pid->level; i++) { + upid = &pid->numbers[i]; + proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, + tgid->numbers[i].nr); + } + + upid = &pid->numbers[pid->level]; + if (upid->nr == 1) + pid_ns_release_proc(upid->ns); +} + +static struct dentry *proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, + struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tgid_base_inode_operations; + inode->i_fop = &proc_tgid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, + ARRAY_SIZE(tgid_base_stuff))); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result; + struct task_struct *task; + unsigned tgid; + struct pid_namespace *ns; + + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; + + tgid = name_to_int(dentry); + if (tgid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tgid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + + result = proc_pid_instantiate(dir, dentry, task, NULL); + put_task_struct(task); +out: + return result; +} + +/* + * Find the first task with tgid >= tgid + * + */ +struct tgid_iter { + unsigned int tgid; + struct task_struct *task; +}; +static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter) +{ + struct pid *pid; + + if (iter.task) + put_task_struct(iter.task); + rcu_read_lock(); +retry: + iter.task = NULL; + pid = find_ge_pid(iter.tgid, ns); + if (pid) { + iter.tgid = pid_nr_ns(pid, ns); + iter.task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. + */ + if (!iter.task || !has_group_leader_pid(iter.task)) { + iter.tgid += 1; + goto retry; + } + get_task_struct(iter.task); + } + rcu_read_unlock(); + return iter; +} + +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) + +static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct tgid_iter iter) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", iter.tgid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, iter.task, NULL); +} + +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + +/* for the /proc/ directory itself, after non-process stuff has been done */ +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned int nr; + struct task_struct *reaper; + struct tgid_iter iter; + struct pid_namespace *ns; + filldir_t __filldir; + + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + const struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; + } + + ns = filp->f_dentry->d_sb->s_fs_info; + iter.task = NULL; + iter.tgid = filp->f_pos - TGID_OFFSET; + for (iter = next_tgid(ns, iter); + iter.task; + iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + + filp->f_pos = iter.tgid + TGID_OFFSET; + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { + put_task_struct(iter.task); + goto out; + } + } + filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; +out: + put_task_struct(reaper); +out_no_task: + return 0; +} + +/* + * Tasks + */ +static const struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUGO, proc_pid_personality), + INF("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_DEBUG + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), +#endif + REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + INF("syscall", S_IRUGO, proc_pid_syscall), +#endif + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_tid_maps_operations), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), +#endif + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), +#ifdef CONFIG_PROC_PAGE_MONITOR + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("pagemap", S_IRUGO, proc_pagemap_operations), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUGO, proc_pid_stack), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, proc_pid_schedstat), +#endif +#ifdef CONFIG_LATENCYTOP + REG("latency", S_IRUGO, proc_lstats_operations), +#endif +#ifdef CONFIG_PROC_PID_CPUSET + REG("cpuset", S_IRUGO, proc_cpuset_operations), +#endif +#ifdef CONFIG_CGROUPS + REG("cgroup", S_IRUGO, proc_cgroup_operations), +#endif + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), +#endif +#ifdef CONFIG_FAULT_INJECTION + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUSR, proc_tid_io_accounting), +#endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif +}; + +static int proc_tid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static const struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tid_base_readdir, + .llseek = default_llseek, +}; + +static const struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + + set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, + ARRAY_SIZE(tid_base_stuff))); + + d_set_d_op(dentry, &pid_dentry_operations); + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result = ERR_PTR(-ENOENT); + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + struct pid_namespace *ns; + + if (!leader) + goto out_no_task; + + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; + + ns = dentry->d_sb->s_fs_info; + rcu_read_lock(); + task = find_task_by_pid_ns(tid, ns); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (!same_thread_group(leader, task)) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr, struct pid_namespace *ns) +{ + struct task_struct *pos; + + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid_ns(tid, ns); + if (pos && (pos->group_leader == leader)) + goto found; + } + + /* If nr exceeds the number of threads there is nothing todo */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; + + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; + } + } +found: + get_task_struct(pos); +out: + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; +} + +static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tid) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_task_instantiate, task, NULL); +} + +/* for the /proc/TGID/task/ directories */ +static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *leader = NULL; + struct task_struct *task; + int retval = -ENOENT; + ino_t ino; + int tid; + struct pid_namespace *ns; + + task = get_proc_task(inode); + if (!task) + goto out_no_task; + rcu_read_lock(); + if (pid_alive(task)) { + leader = task->group_leader; + get_task_struct(leader); + } + rcu_read_unlock(); + put_task_struct(task); + if (!leader) + goto out_no_task; + retval = 0; + + switch ((unsigned long)filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + /* fall through */ + } + + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. + */ + ns = filp->f_dentry->d_sb->s_fs_info; + tid = (int)filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); + task; + task = next_tid(task), filp->f_pos++) { + tid = task_pid_nr_ns(task, ns); + if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = (u64)tid; + put_task_struct(task); + break; + } + } +out: + put_task_struct(leader); +out_no_task: + return retval; +} + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + stat->nlink += get_nr_threads(p); + put_task_struct(p); + } + + return 0; +} + +static const struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, + .permission = proc_pid_permission, +}; + +static const struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, + .llseek = default_llseek, +}; diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c new file mode 100644 index 00000000..82676e3f --- /dev/null +++ b/fs/proc/cmdline.c @@ -0,0 +1,29 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static int cmdline_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%s\n", saved_command_line); + return 0; +} + +static int cmdline_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cmdline_proc_show, NULL); +} + +static const struct file_operations cmdline_proc_fops = { + .open = cmdline_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_cmdline_init(void) +{ + proc_create("cmdline", 0, NULL, &cmdline_proc_fops); + return 0; +} +module_init(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c new file mode 100644 index 00000000..b701eaa4 --- /dev/null +++ b/fs/proc/consoles.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 Werner Fink, Jiri Slaby + * + * Licensed under GPLv2 + */ + +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/tty_driver.h> + +/* + * This is handler for /proc/consoles + */ +static int show_console_dev(struct seq_file *m, void *v) +{ + static const struct { + short flag; + char name; + } con_flags[] = { + { CON_ENABLED, 'E' }, + { CON_CONSDEV, 'C' }, + { CON_BOOT, 'B' }, + { CON_PRINTBUFFER, 'p' }, + { CON_BRL, 'b' }, + { CON_ANYTIME, 'a' }, + }; + char flags[ARRAY_SIZE(con_flags) + 1]; + struct console *con = v; + unsigned int a; + int len; + dev_t dev = 0; + + if (con->device) { + const struct tty_driver *driver; + int index; + driver = con->device(con, &index); + if (driver) { + dev = MKDEV(driver->major, driver->minor_start); + dev += index; + } + } + + for (a = 0; a < ARRAY_SIZE(con_flags); a++) + flags[a] = (con->flags & con_flags[a].flag) ? + con_flags[a].name : ' '; + flags[a] = 0; + + seq_printf(m, "%s%d%n", con->name, con->index, &len); + len = 21 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-', + con->write ? 'W' : '-', con->unblank ? 'U' : '-', + flags); + if (dev) + seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); + + seq_printf(m, "\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + struct console *con; + loff_t off = 0; + + console_lock(); + for_each_console(con) + if (off++ == *pos) + break; + + return con; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct console *con = v; + ++*pos; + return con->next; +} + +static void c_stop(struct seq_file *m, void *v) +{ + console_unlock(); +} + +static const struct seq_operations consoles_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_console_dev +}; + +static int consoles_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &consoles_op); +} + +static const struct file_operations proc_consoles_operations = { + .open = consoles_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_consoles_init(void) +{ + proc_create("consoles", 0, NULL, &proc_consoles_operations); + return 0; +} +module_init(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c new file mode 100644 index 00000000..5a1e539a --- /dev/null +++ b/fs/proc/cpuinfo.c @@ -0,0 +1,24 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +extern const struct seq_operations cpuinfo_op; +static int cpuinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuinfo_op); +} + +static const struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_cpuinfo_init(void) +{ + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + return 0; +} +module_init(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c new file mode 100644 index 00000000..b1434716 --- /dev/null +++ b/fs/proc/devices.c @@ -0,0 +1,70 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static int devinfo_show(struct seq_file *f, void *v) +{ + int i = *(loff_t *) v; + + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_puts(f, "Character devices:\n"); + chrdev_show(f, i); + } +#ifdef CONFIG_BLOCK + else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_puts(f, "\nBlock devices:\n"); + blkdev_show(f, i); + } +#endif + return 0; +} + +static void *devinfo_start(struct seq_file *f, loff_t *pos) +{ + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; +} + +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; +} + +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show +}; + +static int devinfo_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &devinfo_ops); +} + +static const struct file_operations proc_devinfo_operations = { + .open = devinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_devices_init(void) +{ + proc_create("devices", 0, NULL, &proc_devinfo_operations); + return 0; +} +module_init(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c new file mode 100644 index 00000000..2edf34f2 --- /dev/null +++ b/fs/proc/generic.c @@ -0,0 +1,852 @@ +/* + * proc/fs/generic.c --- generic routines for the proc-fs + * + * This file contains generic proc-fs routines for handling + * directories and files. + * + * Copyright (C) 1991, 1992 Linus Torvalds. + * Copyright (C) 1997 Theodore Ts'o + */ + +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/idr.h> +#include <linux/namei.h> +#include <linux/bitops.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <asm/uaccess.h> + +#include "internal.h" + +DEFINE_SPINLOCK(proc_subdir_lock); + +static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +{ + if (de->namelen != len) + return 0; + return !memcmp(name, de->name, len); +} + +/* buffer size is one page but our output routines use some slack for overruns */ +#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024) + +static ssize_t +__proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + char *page; + ssize_t retval=0; + int eof=0; + ssize_t n, count; + char *start; + struct proc_dir_entry * dp; + unsigned long long pos; + + /* + * Gaah, please just use "seq_file" instead. The legacy /proc + * interfaces cut loff_t down to off_t for reads, and ignore + * the offset entirely for writes.. + */ + pos = *ppos; + if (pos > MAX_NON_LFS) + return 0; + if (nbytes > MAX_NON_LFS - pos) + nbytes = MAX_NON_LFS - pos; + + dp = PDE(inode); + if (!(page = (char*) __get_free_page(GFP_TEMPORARY))) + return -ENOMEM; + + while ((nbytes > 0) && !eof) { + count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); + + start = NULL; + if (dp->read_proc) { + /* + * How to be a proc read function + * ------------------------------ + * Prototype: + * int f(char *buffer, char **start, off_t offset, + * int count, int *peof, void *dat) + * + * Assume that the buffer is "count" bytes in size. + * + * If you know you have supplied all the data you + * have, set *peof. + * + * You have three ways to return data: + * 0) Leave *start = NULL. (This is the default.) + * Put the data of the requested offset at that + * offset within the buffer. Return the number (n) + * of bytes there are from the beginning of the + * buffer up to the last byte of data. If the + * number of supplied bytes (= n - offset) is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by the number of bytes + * absorbed. This interface is useful for files + * no larger than the buffer. + * 1) Set *start = an unsigned long value less than + * the buffer address but greater than zero. + * Put the data of the requested offset at the + * beginning of the buffer. Return the number of + * bytes of data placed there. If this number is + * greater than zero and you didn't signal eof + * and the reader is prepared to take more data + * you will be called again with the requested + * offset advanced by *start. This interface is + * useful when you have a large file consisting + * of a series of blocks which you want to count + * and return as wholes. + * (Hack by Paul.Russell@rustcorp.com.au) + * 2) Set *start = an address within the buffer. + * Put the data of the requested offset at *start. + * Return the number of bytes of data placed there. + * If this number is greater than zero and you + * didn't signal eof and the reader is prepared to + * take more data you will be called again with the + * requested offset advanced by the number of bytes + * absorbed. + */ + n = dp->read_proc(page, &start, *ppos, + count, &eof, dp->data); + } else + break; + + if (n == 0) /* end of file */ + break; + if (n < 0) { /* error */ + if (retval == 0) + retval = n; + break; + } + + if (start == NULL) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + n -= *ppos; + if (n <= 0) + break; + if (n > count) + n = count; + start = page + *ppos; + } else if (start < page) { + if (n > PAGE_SIZE) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE; + } + if (n > count) { + /* + * Don't reduce n because doing so might + * cut off part of a data block. + */ + printk(KERN_WARNING + "proc_file_read: Read count exceeded\n"); + } + } else /* start >= page */ { + unsigned long startoff = (unsigned long)(start - page); + if (n > (PAGE_SIZE - startoff)) { + printk(KERN_ERR + "proc_file_read: Apparent buffer overflow!\n"); + n = PAGE_SIZE - startoff; + } + if (n > count) + n = count; + } + + n -= copy_to_user(buf, start < page ? page : start, n); + if (n == 0) { + if (retval == 0) + retval = -EFAULT; + break; + } + + *ppos += start < page ? (unsigned long)start : n; + nbytes -= n; + buf += n; + retval += n; + } + free_page((unsigned long) page); + return retval; +} + +static ssize_t +proc_file_read(struct file *file, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + rv = __proc_file_read(file, buf, nbytes, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t +proc_file_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + + if (pde->write_proc) { + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + spin_unlock(&pde->pde_unload_lock); + + /* FIXME: does this routine need ppos? probably... */ + rv = pde->write_proc(file, buffer, count, pde->data); + pde_users_dec(pde); + } + return rv; +} + + +static loff_t +proc_file_lseek(struct file *file, loff_t offset, int orig) +{ + loff_t retval = -EINVAL; + switch (orig) { + case 1: + offset += file->f_pos; + /* fallthrough */ + case 0: + if (offset < 0 || offset > MAX_NON_LFS) + break; + file->f_pos = retval = offset; + } + return retval; +} + +static const struct file_operations proc_file_operations = { + .llseek = proc_file_lseek, + .read = proc_file_read, + .write = proc_file_write, +}; + +static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PDE(inode); + int error; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, iattr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + de->uid = inode->i_uid; + de->gid = inode->i_gid; + de->mode = inode->i_mode; + return 0; +} + +static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct proc_dir_entry *de = PROC_I(inode)->pde; + if (de && de->nlink) + set_nlink(inode, de->nlink); + + generic_fillattr(inode, stat); + return 0; +} + +static const struct inode_operations proc_file_inode_operations = { + .setattr = proc_notify_change, +}; + +/* + * This function parses a name such as "tty/driver/serial", and + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. + */ +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + const char *cp = name, *next; + struct proc_dir_entry *de; + unsigned int len; + + de = *ret; + if (!de) + de = &proc_root; + + while (1) { + next = strchr(cp, '/'); + if (!next) + break; + + len = next - cp; + for (de = de->subdir; de ; de = de->next) { + if (proc_match(len, cp, de)) + break; + } + if (!de) { + WARN(1, "name '%s'\n", name); + return -ENOENT; + } + cp += len + 1; + } + *residual = cp; + *ret = de; + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); + spin_unlock(&proc_subdir_lock); + return rv; +} + +static DEFINE_IDA(proc_inum_ida); +static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +#define PROC_DYNAMIC_FIRST 0xF0000000U + +/* + * Return an inode number between PROC_DYNAMIC_FIRST and + * 0xffffffff, or zero on failure. + */ +static unsigned int get_inode_number(void) +{ + unsigned int i; + int error; + +retry: + if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) + return 0; + + spin_lock(&proc_inum_lock); + error = ida_get_new(&proc_inum_ida, &i); + spin_unlock(&proc_inum_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + return 0; + + if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, i); + spin_unlock(&proc_inum_lock); + return 0; + } + return PROC_DYNAMIC_FIRST + i; +} + +static void release_inode_number(unsigned int inum) +{ + spin_lock(&proc_inum_lock); + ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + spin_unlock(&proc_inum_lock); +} + +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + nd_set_link(nd, PDE(dentry->d_inode)->data); + return NULL; +} + +static const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, +}; + +/* + * As some entries in /proc are volatile, we want to + * get rid of unused dentries. This could be made + * smarter: we could keep a "volatile" flag in the + * inode to indicate which ones to keep. + */ +static int proc_delete_dentry(const struct dentry * dentry) +{ + return 1; +} + +static const struct dentry_operations proc_dentry_operations = +{ + .d_delete = proc_delete_dentry, +}; + +/* + * Don't create negative dentries here, return -ENOENT by hand + * instead. + */ +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = NULL; + int error = -ENOENT; + + spin_lock(&proc_subdir_lock); + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, de); + goto out_unlock; + } + } + spin_unlock(&proc_subdir_lock); +out_unlock: + + if (inode) { + d_set_d_op(dentry, &proc_dentry_operations); + d_add(dentry, inode); + return NULL; + } + if (de) + pde_put(de); + return ERR_PTR(error); +} + +struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + return proc_lookup_de(PDE(dir), dir, dentry); +} + +/* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should + * continue with the <pid> entries.. + * + * Note that the VFS-layer doesn't care about the return + * value of the readdir() call, as long as it's non-negative + * for success.. + */ +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir) +{ + unsigned int ino; + int i; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; + + ino = inode->i_ino; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, + parent_ino(filp->f_path.dentry), + DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + spin_lock(&proc_subdir_lock); + de = de->subdir; + i -= 2; + for (;;) { + if (!de) { + ret = 1; + spin_unlock(&proc_subdir_lock); + goto out; + } + if (!i) + break; + de = de->next; + i--; + } + + do { + struct proc_dir_entry *next; + + /* filldir passes info to user space */ + pde_get(de); + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { + pde_put(de); + goto out; + } + spin_lock(&proc_subdir_lock); + filp->f_pos++; + next = de->next; + pde_put(de); + de = next; + } while (de); + spin_unlock(&proc_subdir_lock); + } + ret = 1; +out: + return ret; +} + +int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + + return proc_readdir_de(PDE(inode), filp, dirent, filldir); +} + +/* + * These are the generic /proc directory operations. They + * use the in-memory "struct proc_dir_entry" tree to parse + * the /proc directory. + */ +static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_readdir, +}; + +/* + * proc directories can do almost nothing.. + */ +static const struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, + .getattr = proc_getattr, + .setattr = proc_notify_change, +}; + +static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +{ + unsigned int i; + struct proc_dir_entry *tmp; + + i = get_inode_number(); + if (i == 0) + return -EAGAIN; + dp->low_ino = i; + + if (S_ISDIR(dp->mode)) { + if (dp->proc_iops == NULL) { + dp->proc_fops = &proc_dir_operations; + dp->proc_iops = &proc_dir_inode_operations; + } + dir->nlink++; + } else if (S_ISLNK(dp->mode)) { + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_link_inode_operations; + } else if (S_ISREG(dp->mode)) { + if (dp->proc_fops == NULL) + dp->proc_fops = &proc_file_operations; + if (dp->proc_iops == NULL) + dp->proc_iops = &proc_file_inode_operations; + } + + spin_lock(&proc_subdir_lock); + + for (tmp = dir->subdir; tmp; tmp = tmp->next) + if (strcmp(tmp->name, dp->name) == 0) { + WARN(1, KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + break; + } + + dp->next = dir->subdir; + dp->parent = dir; + dir->subdir = dp; + spin_unlock(&proc_subdir_lock); + + return 0; +} + +static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, + const char *name, + umode_t mode, + nlink_t nlink) +{ + struct proc_dir_entry *ent = NULL; + const char *fn = name; + unsigned int len; + + /* make sure name is valid */ + if (!name || !strlen(name)) goto out; + + if (xlate_proc_name(name, parent, &fn) != 0) + goto out; + + /* At this point there must not be any '/' characters beyond *fn */ + if (strchr(fn, '/')) + goto out; + + len = strlen(fn); + + ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + if (!ent) goto out; + + memset(ent, 0, sizeof(struct proc_dir_entry)); + memcpy(ent->name, fn, len + 1); + ent->namelen = len; + ent->mode = mode; + ent->nlink = nlink; + atomic_set(&ent->count, 1); + ent->pde_users = 0; + spin_lock_init(&ent->pde_unload_lock); + ent->pde_unload_completion = NULL; + INIT_LIST_HEAD(&ent->pde_openers); + out: + return ent; +} + +struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent, const char *dest) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, + (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); + + if (ent) { + ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + if (ent->data) { + strcpy((char*)ent->data,dest); + if (proc_register(parent, ent) < 0) { + kfree(ent->data); + kfree(ent); + ent = NULL; + } + } else { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_symlink); + +struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | mode, 2); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(proc_mkdir_mode); + +struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); + if (ent) { + ent->data = net; + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL_GPL(proc_net_mkdir); + +struct proc_dir_entry *proc_mkdir(const char *name, + struct proc_dir_entry *parent) +{ + return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); +} +EXPORT_SYMBOL(proc_mkdir); + +struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + ent = __proc_create(&parent, name, mode, nlink); + if (ent) { + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL(create_proc_entry); + +struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) +{ + struct proc_dir_entry *pde; + nlink_t nlink; + + if (S_ISDIR(mode)) { + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO | S_IXUGO; + nlink = 2; + } else { + if ((mode & S_IFMT) == 0) + mode |= S_IFREG; + if ((mode & S_IALLUGO) == 0) + mode |= S_IRUGO; + nlink = 1; + } + + pde = __proc_create(&parent, name, mode, nlink); + if (!pde) + goto out; + pde->proc_fops = proc_fops; + pde->data = data; + if (proc_register(parent, pde) < 0) + goto out_free; + return pde; +out_free: + kfree(pde); +out: + return NULL; +} +EXPORT_SYMBOL(proc_create_data); + +static void free_proc_entry(struct proc_dir_entry *de) +{ + release_inode_number(de->low_ino); + + if (S_ISLNK(de->mode)) + kfree(de->data); + kfree(de); +} + +void pde_put(struct proc_dir_entry *pde) +{ + if (atomic_dec_and_test(&pde->count)) + free_proc_entry(pde); +} + +/* + * Remove a /proc entry and free it if it's not currently in use. + */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. */ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; + + spin_unlock(&de->pde_unload_lock); + + wait_for_completion(de->pde_unload_completion); + + spin_lock(&de->pde_unload_lock); + } + + while (!list_empty(&de->pde_openers)) { + struct pde_opener *pdeo; + + pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh); + list_del(&pdeo->lh); + spin_unlock(&de->pde_unload_lock); + pdeo->release(pdeo->inode, pdeo->file); + kfree(pdeo); + spin_lock(&de->pde_unload_lock); + } + spin_unlock(&de->pde_unload_lock); + + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + pde_put(de); +} +EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/inode.c b/fs/proc/inode.c new file mode 100644 index 00000000..205c9228 --- /dev/null +++ b/fs/proc/inode.c @@ -0,0 +1,503 @@ +/* + * linux/fs/proc/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/kernel.h> +#include <linux/pid_namespace.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/completion.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/limits.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/mount.h> + +#include <asm/uaccess.h> + +#include "internal.h" + +static void proc_evict_inode(struct inode *inode) +{ + struct proc_dir_entry *de; + struct ctl_table_header *head; + const struct proc_ns_operations *ns_ops; + + truncate_inode_pages(&inode->i_data, 0); + end_writeback(inode); + + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); + + /* Let go of any associated proc directory entry */ + de = PROC_I(inode)->pde; + if (de) + pde_put(de); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } + /* Release any associated namespace */ + ns_ops = PROC_I(inode)->ns_ops; + if (ns_ops && ns_ops->put) + ns_ops->put(PROC_I(inode)->ns); +} + +static struct kmem_cache * proc_inode_cachep; + +static struct inode *proc_alloc_inode(struct super_block *sb) +{ + struct proc_inode *ei; + struct inode *inode; + + ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + if (!ei) + return NULL; + ei->pid = NULL; + ei->fd = 0; + ei->op.proc_get_link = NULL; + ei->pde = NULL; + ei->sysctl = NULL; + ei->sysctl_entry = NULL; + ei->ns = NULL; + ei->ns_ops = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + return inode; +} + +static void proc_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(proc_inode_cachep, PROC_I(inode)); +} + +static void proc_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, proc_i_callback); +} + +static void init_once(void *foo) +{ + struct proc_inode *ei = (struct proc_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void __init proc_init_inodecache(void) +{ + proc_inode_cachep = kmem_cache_create("proc_inode_cache", + sizeof(struct proc_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_PANIC), + init_once); +} + +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + + return 0; +} + +static const struct super_operations proc_sops = { + .alloc_inode = proc_alloc_inode, + .destroy_inode = proc_destroy_inode, + .drop_inode = generic_delete_inode, + .evict_inode = proc_evict_inode, + .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, +}; + +static void __pde_users_dec(struct proc_dir_entry *pde) +{ + pde->pde_users--; + if (pde->pde_unload_completion && pde->pde_users == 0) + complete(pde->pde_unload_completion); +} + +void pde_users_dec(struct proc_dir_entry *pde) +{ + spin_lock(&pde->pde_unload_lock); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); +} + +static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + loff_t rv = -EINVAL; + loff_t (*llseek)(struct file *, loff_t, int); + + spin_lock(&pde->pde_unload_lock); + /* + * remove_proc_entry() is going to delete PDE (as part of module + * cleanup sequence). No new callers into module allowed. + */ + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + /* + * Bump refcount so that remove_proc_entry will wail for ->llseek to + * complete. + */ + pde->pde_users++; + /* + * Save function pointer under lock, to protect against ->proc_fops + * NULL'ifying right after ->pde_unload_lock is dropped. + */ + llseek = pde->proc_fops->llseek; + spin_unlock(&pde->pde_unload_lock); + + if (!llseek) + llseek = default_llseek; + rv = llseek(file, offset, whence); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + read = pde->proc_fops->read; + spin_unlock(&pde->pde_unload_lock); + + if (read) + rv = read(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + ssize_t rv = -EIO; + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + write = pde->proc_fops->write; + spin_unlock(&pde->pde_unload_lock); + + if (write) + rv = write(file, buf, count, ppos); + + pde_users_dec(pde); + return rv; +} + +static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + unsigned int rv = DEFAULT_POLLMASK; + unsigned int (*poll)(struct file *, struct poll_table_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + poll = pde->proc_fops->poll; + spin_unlock(&pde->pde_unload_lock); + + if (poll) + rv = poll(file, pts); + + pde_users_dec(pde); + return rv; +} + +static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + ioctl = pde->proc_fops->unlocked_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (ioctl) + rv = ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} + +#ifdef CONFIG_COMPAT +static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + long rv = -ENOTTY; + long (*compat_ioctl)(struct file *, unsigned int, unsigned long); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + compat_ioctl = pde->proc_fops->compat_ioctl; + spin_unlock(&pde->pde_unload_lock); + + if (compat_ioctl) + rv = compat_ioctl(file, cmd, arg); + + pde_users_dec(pde); + return rv; +} +#endif + +static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + int rv = -EIO; + int (*mmap)(struct file *, struct vm_area_struct *); + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + mmap = pde->proc_fops->mmap; + spin_unlock(&pde->pde_unload_lock); + + if (mmap) + rv = mmap(file, vma); + + pde_users_dec(pde); + return rv; +} + +static int proc_reg_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*open)(struct inode *, struct file *); + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + /* + * What for, you ask? Well, we can have open, rmmod, remove_proc_entry + * sequence. ->release won't be called because ->proc_fops will be + * cleared. Depending on complexity of ->release, consequences vary. + * + * We can't wait for mercy when close will be done for real, it's + * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release + * by hand in remove_proc_entry(). For this, save opener's credentials + * for later. + */ + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); + if (!pdeo) + return -ENOMEM; + + spin_lock(&pde->pde_unload_lock); + if (!pde->proc_fops) { + spin_unlock(&pde->pde_unload_lock); + kfree(pdeo); + return -ENOENT; + } + pde->pde_users++; + open = pde->proc_fops->open; + release = pde->proc_fops->release; + spin_unlock(&pde->pde_unload_lock); + + if (open) + rv = open(inode, file); + + spin_lock(&pde->pde_unload_lock); + if (rv == 0 && release) { + /* To know what to release. */ + pdeo->inode = inode; + pdeo->file = file; + /* Strictly for "too late" ->release in proc_reg_release(). */ + pdeo->release = release; + list_add(&pdeo->lh, &pde->pde_openers); + } else + kfree(pdeo); + __pde_users_dec(pde); + spin_unlock(&pde->pde_unload_lock); + return rv; +} + +static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde, + struct inode *inode, struct file *file) +{ + struct pde_opener *pdeo; + + list_for_each_entry(pdeo, &pde->pde_openers, lh) { + if (pdeo->inode == inode && pdeo->file == file) + return pdeo; + } + return NULL; +} + +static int proc_reg_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + int rv = 0; + int (*release)(struct inode *, struct file *); + struct pde_opener *pdeo; + + spin_lock(&pde->pde_unload_lock); + pdeo = find_pde_opener(pde, inode, file); + if (!pde->proc_fops) { + /* + * Can't simply exit, __fput() will think that everything is OK, + * and move on to freeing struct file. remove_proc_entry() will + * find slacker in opener's list and will try to do non-trivial + * things with struct file. Therefore, remove opener from list. + * + * But if opener is removed from list, who will ->release it? + */ + if (pdeo) { + list_del(&pdeo->lh); + spin_unlock(&pde->pde_unload_lock); + rv = pdeo->release(inode, file); + kfree(pdeo); + } else + spin_unlock(&pde->pde_unload_lock); + return rv; + } + pde->pde_users++; + release = pde->proc_fops->release; + if (pdeo) { + list_del(&pdeo->lh); + kfree(pdeo); + } + spin_unlock(&pde->pde_unload_lock); + + if (release) + rv = release(inode, file); + + pde_users_dec(pde); + return rv; +} + +static const struct file_operations proc_reg_file_ops = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = proc_reg_compat_ioctl, +#endif + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; + +#ifdef CONFIG_COMPAT +static const struct file_operations proc_reg_file_ops_no_compat = { + .llseek = proc_reg_llseek, + .read = proc_reg_read, + .write = proc_reg_write, + .poll = proc_reg_poll, + .unlocked_ioctl = proc_reg_unlocked_ioctl, + .mmap = proc_reg_mmap, + .open = proc_reg_open, + .release = proc_reg_release, +}; +#endif + +struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) +{ + struct inode * inode; + + inode = iget_locked(sb, de->low_ino); + if (!inode) + return NULL; + if (inode->i_state & I_NEW) { + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + PROC_I(inode)->fd = 0; + PROC_I(inode)->pde = de; + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + set_nlink(inode, de->nlink); + if (de->proc_iops) + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { +#ifdef CONFIG_COMPAT + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else +#endif + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; + } + } + unlock_new_inode(inode); + } else + pde_put(de); + return inode; +} + +int proc_fill_super(struct super_block *s) +{ + s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + pde_get(&proc_root); + s->s_root = d_make_root(proc_get_inode(s, &proc_root)); + if (s->s_root) + return 0; + + printk("proc_read_super: get root inode failed\n"); + pde_put(&proc_root); + return -ENOMEM; +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h new file mode 100644 index 00000000..5f79bb8b --- /dev/null +++ b/fs/proc/internal.h @@ -0,0 +1,154 @@ +/* internal.h: internal procfs definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/proc_fs.h> +struct ctl_table_header; + +extern struct proc_dir_entry proc_root; +#ifdef CONFIG_PROC_SYSCTL +extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head); +#else +static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } +#endif +#ifdef CONFIG_NET +extern int proc_net_init(void); +#else +static inline int proc_net_init(void) { return 0; } +#endif + +struct vmalloc_info { + unsigned long used; + unsigned long largest_chunk; +}; + +extern struct mm_struct *mm_for_maps(struct task_struct *); + +#ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +extern void get_vmalloc_info(struct vmalloc_info *vmi); +#else + +#define VMALLOC_TOTAL 0UL +#define get_vmalloc_info(vmi) \ +do { \ + (vmi)->used = 0; \ + (vmi)->largest_chunk = 0; \ +} while(0) +#endif + +extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); + +extern const struct file_operations proc_pid_maps_operations; +extern const struct file_operations proc_tid_maps_operations; +extern const struct file_operations proc_pid_numa_maps_operations; +extern const struct file_operations proc_tid_numa_maps_operations; +extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_tid_smaps_operations; +extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_net_operations; +extern const struct inode_operations proc_net_inode_operations; + +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +}; + +void proc_init_inodecache(void); + +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) +{ + return get_pid_task(proc_pid(inode), PIDTYPE_PID); +} + +static inline int proc_fd(struct inode *inode) +{ + return PROC_I(inode)->fd; +} + +struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, + struct dentry *dentry); +int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + filldir_t filldir); + +struct pde_opener { + struct inode *inode; + struct file *file; + int (*release)(struct inode *, struct file *); + struct list_head lh; +}; +void pde_users_dec(struct proc_dir_entry *pde); + +extern spinlock_t proc_subdir_lock; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +unsigned long task_statm(struct mm_struct *, + unsigned long *, unsigned long *, unsigned long *, unsigned long *); +void task_mem(struct seq_file *, struct mm_struct *); + +static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) +{ + atomic_inc(&pde->count); + return pde; +} +void pde_put(struct proc_dir_entry *pde); + +int proc_fill_super(struct super_block *); +struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/<pid> subdirectories. + */ +int proc_readdir(struct file *, void *, filldir_t); +struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); + + + +/* Lookups */ +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, + struct task_struct *, const void *); +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, + instantiate_t instantiate, struct task_struct *task, const void *ptr); +int pid_revalidate(struct dentry *dentry, struct nameidata *nd); +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); +extern const struct dentry_operations pid_dentry_operations; +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int proc_setattr(struct dentry *dentry, struct iattr *attr); + +extern const struct inode_operations proc_ns_dir_inode_operations; +extern const struct file_operations proc_ns_dir_operations; + diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c new file mode 100644 index 00000000..05029c0e --- /dev/null +++ b/fs/proc/interrupts.c @@ -0,0 +1,53 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/irqnr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* + * /proc/interrupts + */ +static void *int_seq_start(struct seq_file *f, loff_t *pos) +{ + return (*pos <= nr_irqs) ? pos : NULL; +} + +static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + if (*pos > nr_irqs) + return NULL; + return pos; +} + +static void int_seq_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static const struct seq_operations int_seq_ops = { + .start = int_seq_start, + .next = int_seq_next, + .stop = int_seq_stop, + .show = show_interrupts +}; + +static int interrupts_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &int_seq_ops); +} + +static const struct file_operations proc_interrupts_operations = { + .open = interrupts_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_interrupts_init(void) +{ + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + return 0; +} +module_init(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c new file mode 100644 index 00000000..86c67eee --- /dev/null +++ b/fs/proc/kcore.c @@ -0,0 +1,637 @@ +/* + * fs/proc/kcore.c kernel ELF core dumper + * + * Modelled on fs/exec.c:aout_core_dump() + * Jeremy Fitzhardinge <jeremy@sw.oz.au> + * ELF version written by David Howells <David.Howells@nexor.co.uk> + * Modified and incorporated into 2.3.x by Tigran Aivazian <tigran@veritas.com> + * Support to dump vmalloc'd areas (ELF only), Tigran Aivazian <tigran@veritas.com> + * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> + */ + +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/user.h> +#include <linux/capability.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <linux/list.h> +#include <linux/ioport.h> +#include <linux/memory.h> +#include <asm/sections.h> + +#define CORE_STR "CORE" + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +static struct proc_dir_entry *proc_root_kcore; + + +#ifndef kc_vaddr_to_offset +#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) +#endif +#ifndef kc_offset_to_vaddr +#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) +#endif + +/* An ELF note in memory */ +struct memelfnote +{ + const char *name; + int type; + unsigned int datasz; + void *data; +}; + +static LIST_HEAD(kclist_head); +static DEFINE_RWLOCK(kclist_lock); +static int kcore_need_update = 1; + +void +kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +{ + new->addr = (unsigned long)addr; + new->size = size; + new->type = type; + + write_lock(&kclist_lock); + list_add_tail(&new->list, &kclist_head); + write_unlock(&kclist_lock); +} + +static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +{ + size_t try, size; + struct kcore_list *m; + + *nphdr = 1; /* PT_NOTE */ + size = 0; + + list_for_each_entry(m, &kclist_head, list) { + try = kc_vaddr_to_offset((size_t)m->addr + m->size); + if (try > size) + size = try; + *nphdr = *nphdr + 1; + } + *elf_buflen = sizeof(struct elfhdr) + + (*nphdr + 2)*sizeof(struct elf_phdr) + + 3 * ((sizeof(struct elf_note)) + + roundup(sizeof(CORE_STR), 4)) + + roundup(sizeof(struct elf_prstatus), 4) + + roundup(sizeof(struct elf_prpsinfo), 4) + + roundup(sizeof(struct task_struct), 4); + *elf_buflen = PAGE_ALIGN(*elf_buflen); + return size + *elf_buflen; +} + +static void free_kclist_ents(struct list_head *head) +{ + struct kcore_list *tmp, *pos; + + list_for_each_entry_safe(pos, tmp, head, list) { + list_del(&pos->list); + kfree(pos); + } +} +/* + * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. + */ +static void __kcore_update_ram(struct list_head *list) +{ + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + LIST_HEAD(garbage); + + write_lock(&kclist_lock); + if (kcore_need_update) { + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM + || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(list, &kclist_head); + } else + list_splice(list, &garbage); + kcore_need_update = 0; + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + write_unlock(&kclist_lock); + + free_kclist_ents(&garbage); +} + + +#ifdef CONFIG_HIGHMEM +/* + * If no highmem, we can assume [0...max_low_pfn) continuous range of memory + * because memory hole is not as big as !HIGHMEM case. + * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) + */ +static int kcore_update_ram(void) +{ + LIST_HEAD(head); + struct kcore_list *ent; + int ret = 0; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va(0); + ent->size = max_low_pfn << PAGE_SHIFT; + ent->type = KCORE_RAM; + list_add(&ent->list, &head); + __kcore_update_ram(&head); + return ret; +} + +#else /* !CONFIG_HIGHMEM */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* calculate vmemmap's address from given system ram pfn and register it */ +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; + unsigned long nr_pages = ent->size >> PAGE_SHIFT; + unsigned long start, end; + struct kcore_list *vmm, *tmp; + + + start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; + end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; + end = ALIGN(end, PAGE_SIZE); + /* overlap check (because we have to align page */ + list_for_each_entry(tmp, head, list) { + if (tmp->type != KCORE_VMEMMAP) + continue; + if (start < tmp->addr + tmp->size) + if (end > tmp->addr) + end = tmp->addr; + } + if (start < end) { + vmm = kmalloc(sizeof(*vmm), GFP_KERNEL); + if (!vmm) + return 0; + vmm->addr = start; + vmm->size = end - start; + vmm->type = KCORE_VMEMMAP; + list_add_tail(&vmm->list, head); + } + return 1; + +} +#else +static int +get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) +{ + return 1; +} + +#endif + +static int +kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) +{ + struct list_head *head = (struct list_head *)arg; + struct kcore_list *ent; + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT)); + ent->size = nr_pages << PAGE_SHIFT; + + /* Sanity check: Can happen in 32bit arch...maybe */ + if (ent->addr < (unsigned long) __va(0)) + goto free_out; + + /* cut not-mapped area. ....from ppc-32 code. */ + if (ULONG_MAX - ent->addr < ent->size) + ent->size = ULONG_MAX - ent->addr; + + /* cut when vmalloc() area is higher than direct-map area */ + if (VMALLOC_START > (unsigned long)__va(0)) { + if (ent->addr > VMALLOC_START) + goto free_out; + if (VMALLOC_START - ent->addr < ent->size) + ent->size = VMALLOC_START - ent->addr; + } + + ent->type = KCORE_RAM; + list_add_tail(&ent->list, head); + + if (!get_sparsemem_vmemmap_info(ent, head)) { + list_del(&ent->list); + goto free_out; + } + + return 0; +free_out: + kfree(ent); + return 1; +} + +static int kcore_update_ram(void) +{ + int nid, ret; + unsigned long end_pfn; + LIST_HEAD(head); + + /* Not inialized....update now */ + /* find out "max pfn" */ + end_pfn = 0; + for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long node_end; + node_end = NODE_DATA(nid)->node_start_pfn + + NODE_DATA(nid)->node_spanned_pages; + if (end_pfn < node_end) + end_pfn = node_end; + } + /* scan 0 to max_pfn */ + ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); + if (ret) { + free_kclist_ents(&head); + return -ENOMEM; + } + __kcore_update_ram(&head); + return ret; +} +#endif /* CONFIG_HIGHMEM */ + +/*****************************************************************************/ +/* + * determine size of ELF note + */ +static int notesize(struct memelfnote *en) +{ + int sz; + + sz = sizeof(struct elf_note); + sz += roundup((strlen(en->name) + 1), 4); + sz += roundup(en->datasz, 4); + + return sz; +} /* end notesize() */ + +/*****************************************************************************/ +/* + * store a note in the header buffer + */ +static char *storenote(struct memelfnote *men, char *bufp) +{ + struct elf_note en; + +#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) + + en.n_namesz = strlen(men->name) + 1; + en.n_descsz = men->datasz; + en.n_type = men->type; + + DUMP_WRITE(&en, sizeof(en)); + DUMP_WRITE(men->name, en.n_namesz); + + /* XXX - cast from long long to long to avoid need for libgcc.a */ + bufp = (char*) roundup((unsigned long)bufp,4); + DUMP_WRITE(men->data, men->datasz); + bufp = (char*) roundup((unsigned long)bufp,4); + +#undef DUMP_WRITE + + return bufp; +} /* end storenote() */ + +/* + * store an ELF coredump header in the supplied buffer + * nphdr is the number of elf_phdr to insert + */ +static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) +{ + struct elf_prstatus prstatus; /* NT_PRSTATUS */ + struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ + struct elf_phdr *nhdr, *phdr; + struct elfhdr *elf; + struct memelfnote notes[3]; + off_t offset = 0; + struct kcore_list *m; + + /* setup ELF header */ + elf = (struct elfhdr *) bufp; + bufp += sizeof(struct elfhdr); + offset += sizeof(struct elfhdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION]= EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize= sizeof(struct elf_phdr); + elf->e_phnum = nphdr; + elf->e_shentsize= 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + /* setup ELF PT_NOTE program header */ + nhdr = (struct elf_phdr *) bufp; + bufp += sizeof(struct elf_phdr); + offset += sizeof(struct elf_phdr); + nhdr->p_type = PT_NOTE; + nhdr->p_offset = 0; + nhdr->p_vaddr = 0; + nhdr->p_paddr = 0; + nhdr->p_filesz = 0; + nhdr->p_memsz = 0; + nhdr->p_flags = 0; + nhdr->p_align = 0; + + /* setup ELF PT_LOAD program header for every area */ + list_for_each_entry(m, &kclist_head, list) { + phdr = (struct elf_phdr *) bufp; + bufp += sizeof(struct elf_phdr); + offset += sizeof(struct elf_phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; + phdr->p_vaddr = (size_t)m->addr; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + } + + /* + * Set up the notes in similar form to SVR4 core dumps made + * with info from their /proc. + */ + nhdr->p_offset = offset; + + /* set up the process status */ + notes[0].name = CORE_STR; + notes[0].type = NT_PRSTATUS; + notes[0].datasz = sizeof(struct elf_prstatus); + notes[0].data = &prstatus; + + memset(&prstatus, 0, sizeof(struct elf_prstatus)); + + nhdr->p_filesz = notesize(¬es[0]); + bufp = storenote(¬es[0], bufp); + + /* set up the process info */ + notes[1].name = CORE_STR; + notes[1].type = NT_PRPSINFO; + notes[1].datasz = sizeof(struct elf_prpsinfo); + notes[1].data = &prpsinfo; + + memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo)); + prpsinfo.pr_state = 0; + prpsinfo.pr_sname = 'R'; + prpsinfo.pr_zomb = 0; + + strcpy(prpsinfo.pr_fname, "vmlinux"); + strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); + + nhdr->p_filesz += notesize(¬es[1]); + bufp = storenote(¬es[1], bufp); + + /* set up the task structure */ + notes[2].name = CORE_STR; + notes[2].type = NT_TASKSTRUCT; + notes[2].datasz = sizeof(struct task_struct); + notes[2].data = current; + + nhdr->p_filesz += notesize(¬es[2]); + bufp = storenote(¬es[2], bufp); + +} /* end elf_kcore_store_hdr() */ + +/*****************************************************************************/ +/* + * read from the ELF header and then kernel memory + */ +static ssize_t +read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0; + size_t size, tsz; + size_t elf_buflen; + int nphdr; + unsigned long start; + + read_lock(&kclist_lock); + size = get_kcore_size(&nphdr, &elf_buflen); + + if (buflen == 0 || *fpos >= size) { + read_unlock(&kclist_lock); + return 0; + } + + /* trim buflen to not go beyond EOF */ + if (buflen > size - *fpos) + buflen = size - *fpos; + + /* construct an ELF core header if we'll need some of it */ + if (*fpos < elf_buflen) { + char * elf_buf; + + tsz = elf_buflen - *fpos; + if (buflen < tsz) + tsz = buflen; + elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); + if (!elf_buf) { + read_unlock(&kclist_lock); + return -ENOMEM; + } + elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); + read_unlock(&kclist_lock); + if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { + kfree(elf_buf); + return -EFAULT; + } + kfree(elf_buf); + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } else + read_unlock(&kclist_lock); + + /* + * Check to see if our file offset matches with any of + * the addresses in the elf_phdr on our list. + */ + start = kc_offset_to_vaddr(*fpos - elf_buflen); + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + while (buflen) { + struct kcore_list *m; + + read_lock(&kclist_lock); + list_for_each_entry(m, &kclist_head, list) { + if (start >= m->addr && start < (m->addr+m->size)) + break; + } + read_unlock(&kclist_lock); + + if (&m->list == &kclist_head) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else if (is_vmalloc_or_module_addr((void *)start)) { + char * elf_buf; + + elf_buf = kzalloc(tsz, GFP_KERNEL); + if (!elf_buf) + return -ENOMEM; + vread(elf_buf, (char *)start, tsz); + /* we have to zero-fill user buffer even if no read */ + if (copy_to_user(buffer, elf_buf, tsz)) { + kfree(elf_buf); + return -EFAULT; + } + kfree(elf_buf); + } else { + if (kern_addr_valid(start)) { + unsigned long n; + + n = copy_to_user(buffer, (char *)start, tsz); + /* + * We cannot distinguish between fault on source + * and fault on destination. When this happens + * we clear too and hope it will trigger the + * EFAULT again. + */ + if (n) { + if (clear_user(buffer + tsz - n, + n)) + return -EFAULT; + } + } else { + if (clear_user(buffer, tsz)) + return -EFAULT; + } + } + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + start += tsz; + tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); + } + + return acc; +} + + +static int open_kcore(struct inode *inode, struct file *filp) +{ + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + if (kcore_need_update) + kcore_update_ram(); + if (i_size_read(inode) != proc_root_kcore->size) { + mutex_lock(&inode->i_mutex); + i_size_write(inode, proc_root_kcore->size); + mutex_unlock(&inode->i_mutex); + } + return 0; +} + + +static const struct file_operations proc_kcore_operations = { + .read = read_kcore, + .open = open_kcore, + .llseek = default_llseek, +}; + +#ifdef CONFIG_MEMORY_HOTPLUG +/* just remember that we have to update kcore */ +static int __meminit kcore_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + write_lock(&kclist_lock); + kcore_need_update = 1; + write_unlock(&kclist_lock); + } + return NOTIFY_OK; +} +#endif + + +static struct kcore_list kcore_vmalloc; + +#ifdef CONFIG_ARCH_PROC_KCORE_TEXT +static struct kcore_list kcore_text; +/* + * If defined, special segment is used for mapping kernel text instead of + * direct-map area. We need to create special TEXT section. + */ +static void __init proc_kcore_text_init(void) +{ + kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT); +} +#else +static void __init proc_kcore_text_init(void) +{ +} +#endif + +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) +/* + * MODULES_VADDR has no intersection with VMALLOC_ADDR. + */ +struct kcore_list kcore_modules; +static void __init add_modules_range(void) +{ + kclist_add(&kcore_modules, (void *)MODULES_VADDR, + MODULES_END - MODULES_VADDR, KCORE_VMALLOC); +} +#else +static void __init add_modules_range(void) +{ +} +#endif + +static int __init proc_kcore_init(void) +{ + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, + &proc_kcore_operations); + if (!proc_root_kcore) { + printk(KERN_ERR "couldn't create /proc/kcore\n"); + return 0; /* Always returns 0. */ + } + /* Store text area if it's special */ + proc_kcore_text_init(); + /* Store vmalloc area */ + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + VMALLOC_END - VMALLOC_START, KCORE_VMALLOC); + add_modules_range(); + /* Store direct-map area from physical memory map */ + kcore_update_ram(); + hotplug_memory_notifier(kcore_callback, 0); + + return 0; +} +module_init(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c new file mode 100644 index 00000000..bd4b5a74 --- /dev/null +++ b/fs/proc/kmsg.c @@ -0,0 +1,64 @@ +/* + * linux/fs/proc/kmsg.c + * + * Copyright (C) 1992 by Linus Torvalds + * + */ + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/syslog.h> + +#include <asm/uaccess.h> +#include <asm/io.h> + +extern wait_queue_head_t log_wait; + +static int kmsg_open(struct inode * inode, struct file * file) +{ + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); +} + +static int kmsg_release(struct inode * inode, struct file * file) +{ + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + return 0; +} + +static ssize_t kmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + return -EAGAIN; + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); +} + +static unsigned int kmsg_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &log_wait, wait); + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + return POLLIN | POLLRDNORM; + return 0; +} + + +static const struct file_operations proc_kmsg_operations = { + .read = kmsg_read, + .poll = kmsg_poll, + .open = kmsg_open, + .release = kmsg_release, + .llseek = generic_file_llseek, +}; + +static int __init proc_kmsg_init(void) +{ + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + return 0; +} +module_init(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c new file mode 100644 index 00000000..1afa4dd4 --- /dev/null +++ b/fs/proc/loadavg.c @@ -0,0 +1,45 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/pid_namespace.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/seqlock.h> +#include <linux/time.h> + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static int loadavg_proc_show(struct seq_file *m, void *v) +{ + unsigned long avnrun[3]; + + get_avenrun(avnrun, FIXED_1/200, 0); + + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), + nr_running(), nr_threads, + task_active_pid_ns(current)->last_pid); + return 0; +} + +static int loadavg_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, loadavg_proc_show, NULL); +} + +static const struct file_operations loadavg_proc_fops = { + .open = loadavg_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_loadavg_init(void) +{ + proc_create("loadavg", 0, NULL, &loadavg_proc_fops); + return 0; +} +module_init(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c new file mode 100644 index 00000000..3fdbefd4 --- /dev/null +++ b/fs/proc/meminfo.c @@ -0,0 +1,215 @@ +#include <linux/fs.h> +#include <linux/hugetlb.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmzone.h> +#include <linux/proc_fs.h> +#include <linux/quicklist.h> +#include <linux/seq_file.h> +#include <linux/swap.h> +#include <linux/vmstat.h> +#include <linux/atomic.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include "internal.h" + +extern int wmt_getsyspara(char *varname, unsigned char *varval, int *varlen); + +void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) +{ +} + +static int meminfo_proc_show(struct seq_file *m, void *v) +{ + struct sysinfo i; + unsigned long committed; + unsigned long allowed; + struct vmalloc_info vmi; + long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; + unsigned long misc_mem_total,misc_mem_free,m_mem_total,m_mem_free; + char param_name[] = "wmt.ram.fake_size"; + int ret = -1; + unsigned char buf[80]; + unsigned int paramlen= 80; + unsigned char colon; +/* + * display in kilobytes. + */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + si_meminfo(&i); + si_swapinfo(&i); + + ret = wmt_getsyspara(param_name, buf, ¶mlen); + if(ret == 0) + { + sscanf(buf,"%8lu%c%8lu", &misc_mem_total,&colon,&misc_mem_free); + m_mem_total = K(i.totalram)+misc_mem_total*1024; + m_mem_free = K(i.freeram)+misc_mem_free*1024; + } + else + { + m_mem_total = K(i.totalram); + m_mem_free = K(i.freeram); + } + + committed = percpu_counter_read_positive(&vm_committed_as); + allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + + cached = global_page_state(NR_FILE_PAGES) - + total_swapcache_pages - i.bufferram; + if (cached < 0) + cached = 0; + + get_vmalloc_info(&vmi); + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + /* + * Tagged format, for easy grepping and expansion. + */ + seq_printf(m, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#ifdef CONFIG_HIGHMEM + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" +#endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Shmem: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "KernelStack: %8lu kB\n" + "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n" +#ifdef CONFIG_MEMORY_FAILURE + "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" +#endif + , + m_mem_total, + m_mem_free, + K(i.bufferram), + K(cached), + K(total_swapcache_pages), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#ifdef CONFIG_HIGHMEM + K(i.totalhigh), + K(i.freehigh), + K(i.totalram-i.totalhigh), + K(i.freeram-i.freehigh), +#endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), +#endif + K(i.totalswap), + K(i.freeswap), + K(global_page_state(NR_FILE_DIRTY)), + K(global_page_state(NR_WRITEBACK)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(global_page_state(NR_ANON_PAGES) + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR), +#else + K(global_page_state(NR_ANON_PAGES)), +#endif + K(global_page_state(NR_FILE_MAPPED)), + K(global_page_state(NR_SHMEM)), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), + global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif + K(global_page_state(NR_UNSTABLE_NFS)), + K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), + K(allowed), + K(committed), + (unsigned long)VMALLOC_TOTAL >> 10, + vmi.used >> 10, + vmi.largest_chunk >> 10 +#ifdef CONFIG_MEMORY_FAILURE + ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) +#endif + ); + + hugetlb_report_meminfo(m); + + arch_report_meminfo(m); + + return 0; +#undef K +} + +static int meminfo_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, meminfo_proc_show, NULL); +} + +static const struct file_operations meminfo_proc_fops = { + .open = meminfo_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_meminfo_init(void) +{ + proc_create("meminfo", 0, NULL, &meminfo_proc_fops); + return 0; +} +module_init(proc_meminfo_init); diff --git a/fs/proc/mmu.c b/fs/proc/mmu.c new file mode 100644 index 00000000..8ae221df --- /dev/null +++ b/fs/proc/mmu.c @@ -0,0 +1,60 @@ +/* mmu.c: mmu memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <asm/pgtable.h> +#include "internal.h" + +void get_vmalloc_info(struct vmalloc_info *vmi) +{ + struct vm_struct *vma; + unsigned long free_area_size; + unsigned long prev_end; + + vmi->used = 0; + + if (!vmlist) { + vmi->largest_chunk = VMALLOC_TOTAL; + } + else { + vmi->largest_chunk = 0; + + prev_end = VMALLOC_START; + + read_lock(&vmlist_lock); + + for (vma = vmlist; vma; vma = vma->next) { + unsigned long addr = (unsigned long) vma->addr; + + /* + * Some archs keep another range for modules in vmlist + */ + if (addr < VMALLOC_START) + continue; + if (addr >= VMALLOC_END) + break; + + vmi->used += vma->size; + + free_area_size = addr - prev_end; + if (vmi->largest_chunk < free_area_size) + vmi->largest_chunk = free_area_size; + + prev_end = vma->size + addr; + } + + if (VMALLOC_END - prev_end > vmi->largest_chunk) + vmi->largest_chunk = VMALLOC_END - prev_end; + + read_unlock(&vmlist_lock); + } +} diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c new file mode 100644 index 00000000..0d9e23a3 --- /dev/null +++ b/fs/proc/namespaces.c @@ -0,0 +1,200 @@ +#include <linux/proc_fs.h> +#include <linux/nsproxy.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <net/net_namespace.h> +#include <linux/ipc_namespace.h> +#include <linux/pid_namespace.h> +#include "internal.h" + + +static const struct proc_ns_operations *ns_entries[] = { +#ifdef CONFIG_NET_NS + &netns_operations, +#endif +#ifdef CONFIG_UTS_NS + &utsns_operations, +#endif +#ifdef CONFIG_IPC_NS + &ipcns_operations, +#endif +}; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static struct dentry *proc_ns_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, const void *ptr) +{ + const struct proc_ns_operations *ns_ops = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); + void *ns; + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + + ei = PROC_I(inode); + inode->i_mode = S_IFREG|S_IRUSR; + inode->i_fop = &ns_file_operations; + ei->ns_ops = ns_ops; + ei->ns = ns; + + d_set_d_op(dentry, &pid_dentry_operations); + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static int proc_ns_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, struct task_struct *task, + const struct proc_ns_operations *ops) +{ + return proc_fill_cache(filp, dirent, filldir, + ops->name, strlen(ops->name), + proc_ns_instantiate, task, ops); +} + +static int proc_ns_dir_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int i; + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + const struct proc_ns_operations **entry, **last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = -EPERM; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + ret = 0; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= ARRAY_SIZE(ns_entries)) { + ret = 1; + goto out; + } + entry = ns_entries + i; + last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; + while (entry <= last) { + if (proc_ns_fill_cache(filp, dirent, filldir, + task, *entry) < 0) + goto out; + filp->f_pos++; + entry++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} + +const struct file_operations proc_ns_dir_operations = { + .read = generic_read_dir, + .readdir = proc_ns_dir_readdir, +}; + +static struct dentry *proc_ns_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + const struct proc_ns_operations **entry, **last; + unsigned int len = dentry->d_name.len; + + error = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + + error = ERR_PTR(-EPERM); + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + last = &ns_entries[ARRAY_SIZE(ns_entries)]; + for (entry = ns_entries; entry < last; entry++) { + if (strlen((*entry)->name) != len) + continue; + if (!memcmp(dentry->d_name.name, (*entry)->name, len)) + break; + } + error = ERR_PTR(-ENOENT); + if (entry == last) + goto out; + + error = proc_ns_instantiate(dir, dentry, task, *entry); +out: + put_task_struct(task); +out_no_task: + return error; +} + +const struct inode_operations proc_ns_dir_inode_operations = { + .lookup = proc_ns_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c new file mode 100644 index 00000000..b1822dde --- /dev/null +++ b/fs/proc/nommu.c @@ -0,0 +1,136 @@ +/* nommu.c: mmu-less memory info files + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/mman.h> +#include <linux/proc_fs.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/seq_file.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/div64.h> +#include "internal.h" + +/* + * display a single region to a sequenced file + */ +static int nommu_region_show(struct seq_file *m, struct vm_region *region) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = region->vm_flags; + file = region->vm_file; + + if (file) { + struct inode *inode = region->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + region->vm_start, + region->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display a list of all the REGIONs the kernel knows about + * - nommu kernels have a single flat list + */ +static int nommu_region_list_show(struct seq_file *m, void *_p) +{ + struct rb_node *p = _p; + + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); +} + +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) +{ + struct rb_node *p; + loff_t pos = *_pos; + + down_read(&nommu_region_sem); + + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; +} + +static void nommu_region_list_stop(struct seq_file *m, void *v) +{ + up_read(&nommu_region_sem); +} + +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return rb_next((struct rb_node *) v); +} + +static const struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show +}; + +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proc_nommu_region_list_seqop); +} + +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proc_nommu_init(void) +{ + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); + return 0; +} + +module_init(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c new file mode 100644 index 00000000..7fcd0d60 --- /dev/null +++ b/fs/proc/page.c @@ -0,0 +1,214 @@ +#include <linux/bootmem.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ksm.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/hugetlb.h> +#include <linux/kernel-page-flags.h> +#include <asm/uaccess.h> +#include "internal.h" + +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) + +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + if (!ppage || PageSlab(ppage)) + pcount = 0; + else + pcount = page_mapcount(ppage); + + if (put_user(pcount, out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +u64 stable_page_flags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + if (PageKsm(page)) + u |= 1 << KPF_KSM; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + else if (PageTransCompound(page)) + u |= 1 << KPF_THP; + + /* + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. + */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); + +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EINVAL; + + while (count > 0) { + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + else + ppage = NULL; + + if (put_user(stable_page_flags(ppage), out)) { + ret = -EFAULT; + break; + } + + pfn++; + out++; + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static const struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + +static int __init proc_page_init(void) +{ + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + return 0; +} +module_init(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c new file mode 100644 index 00000000..927cbd11 --- /dev/null +++ b/fs/proc/proc_devtree.c @@ -0,0 +1,241 @@ +/* + * proc_devtree.c - handles /proc/device-tree + * + * Copyright 1997 Paul Mackerras + */ +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/of.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/prom.h> +#include <asm/uaccess.h> +#include "internal.h" + +static inline void set_node_proc_entry(struct device_node *np, + struct proc_dir_entry *de) +{ +#ifdef HAVE_ARCH_DEVTREE_FIXUPS + np->pde = de; +#endif +} + +static struct proc_dir_entry *proc_device_tree; + +/* + * Supply data on a read from /proc/device-tree/node/property. + */ +static int property_proc_show(struct seq_file *m, void *v) +{ + struct property *pp = m->private; + + seq_write(m, pp->value, pp->length); + return 0; +} + +static int property_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, property_proc_show, PDE(inode)->data); +} + +static const struct file_operations property_proc_fops = { + .owner = THIS_MODULE, + .open = property_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * For a node with a name like "gc@10", we make symlinks called "gc" + * and "@10" to it. + */ + +/* + * Add a property to a node + */ +static struct proc_dir_entry * +__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, + const char *name) +{ + struct proc_dir_entry *ent; + + /* + * Unfortunately proc_register puts each new entry + * at the beginning of the list. So we rearrange them. + */ + ent = proc_create_data(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, + de, &property_proc_fops, pp); + if (ent == NULL) + return NULL; + + if (!strncmp(name, "security-", 9)) + ent->size = 0; /* don't leak number of password chars */ + else + ent->size = pp->length; + + return ent; +} + + +void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) +{ + __proc_device_tree_add_prop(pde, prop, prop->name); +} + +void proc_device_tree_remove_prop(struct proc_dir_entry *pde, + struct property *prop) +{ + remove_proc_entry(prop->name, pde); +} + +void proc_device_tree_update_prop(struct proc_dir_entry *pde, + struct property *newprop, + struct property *oldprop) +{ + struct proc_dir_entry *ent; + + for (ent = pde->subdir; ent != NULL; ent = ent->next) + if (ent->data == oldprop) + break; + if (ent == NULL) { + printk(KERN_WARNING "device-tree: property \"%s\" " + " does not exist\n", oldprop->name); + } else { + ent->data = newprop; + ent->size = newprop->length; + } +} + +/* + * Various dodgy firmware might give us nodes and/or properties with + * conflicting names. That's generally ok, except for exporting via /proc, + * so munge names here to ensure they're unique. + */ + +static int duplicate_name(struct proc_dir_entry *de, const char *name) +{ + struct proc_dir_entry *ent; + int found = 0; + + spin_lock(&proc_subdir_lock); + + for (ent = de->subdir; ent != NULL; ent = ent->next) { + if (strcmp(ent->name, name) == 0) { + found = 1; + break; + } + } + + spin_unlock(&proc_subdir_lock); + + return found; +} + +static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, + const char *name) +{ + char *fixed_name; + int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ + int i = 1, size; + +realloc: + fixed_name = kmalloc(fixup_len, GFP_KERNEL); + if (fixed_name == NULL) { + printk(KERN_ERR "device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); + return name; + } + +retry: + size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); + size++; /* account for NULL */ + + if (size > fixup_len) { + /* We ran out of space, free and reallocate. */ + kfree(fixed_name); + fixup_len = size; + goto realloc; + } + + if (duplicate_name(de, fixed_name)) { + /* Multiple duplicates. Retry with a different offset. */ + i++; + goto retry; + } + + printk(KERN_WARNING "device-tree: Duplicate name in %s, " + "renamed to \"%s\"\n", np->full_name, fixed_name); + + return fixed_name; +} + +/* + * Process a node, adding entries for its children and its properties. + */ +void proc_device_tree_add_node(struct device_node *np, + struct proc_dir_entry *de) +{ + struct property *pp; + struct proc_dir_entry *ent; + struct device_node *child; + const char *p; + + set_node_proc_entry(np, de); + for (child = NULL; (child = of_get_next_child(np, child));) { + /* Use everything after the last slash, or the full name */ + p = strrchr(child->full_name, '/'); + if (!p) + p = child->full_name; + else + ++p; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = proc_mkdir(p, de); + if (ent == NULL) + break; + proc_device_tree_add_node(child, ent); + } + of_node_put(child); + + for (pp = np->properties; pp != NULL; pp = pp->next) { + p = pp->name; + + if (strchr(p, '/')) + continue; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + + ent = __proc_device_tree_add_prop(de, pp, p); + if (ent == NULL) + break; + } +} + +/* + * Called on initialization to set up the /proc/device-tree subtree + */ +void __init proc_device_tree_init(void) +{ + struct device_node *root; + + proc_device_tree = proc_mkdir("device-tree", NULL); + if (proc_device_tree == NULL) + return; + root = of_find_node_by_path("/"); + if (root == NULL) { + pr_debug("/proc/device-tree: can't find root\n"); + return; + } + proc_device_tree_add_node(root, proc_device_tree); + of_node_put(root); +} diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c new file mode 100644 index 00000000..06e1cc17 --- /dev/null +++ b/fs/proc/proc_net.c @@ -0,0 +1,241 @@ +/* + * linux/fs/proc/net.c + * + * Copyright (C) 2007 + * + * Author: Eric Biederman <ebiederm@xmission.com> + * + * proc net directory handling functions + */ + +#include <asm/uaccess.h> + +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> +#include <linux/seq_file.h> + +#include "internal.h" + + +static struct net *get_proc_net(const struct inode *inode) +{ + return maybe_get_net(PDE_NET(PDE(inode))); +} + +int seq_open_net(struct inode *ino, struct file *f, + const struct seq_operations *ops, int size) +{ + struct net *net; + struct seq_net_private *p; + + BUG_ON(size < sizeof(*p)); + + net = get_proc_net(ino); + if (net == NULL) + return -ENXIO; + + p = __seq_open_private(f, ops, size); + if (p == NULL) { + put_net(net); + return -ENOMEM; + } +#ifdef CONFIG_NET_NS + p->net = net; +#endif + return 0; +} +EXPORT_SYMBOL_GPL(seq_open_net); + +int single_open_net(struct inode *inode, struct file *file, + int (*show)(struct seq_file *, void *)) +{ + int err; + struct net *net; + + err = -ENXIO; + net = get_proc_net(inode); + if (net == NULL) + goto err_net; + + err = single_open(file, show, net); + if (err < 0) + goto err_open; + + return 0; + +err_open: + put_net(net); +err_net: + return err; +} +EXPORT_SYMBOL_GPL(single_open_net); + +int seq_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq; + + seq = f->private_data; + + put_net(seq_file_net(seq)); + seq_release_private(ino, f); + return 0; +} +EXPORT_SYMBOL_GPL(seq_release_net); + +int single_release_net(struct inode *ino, struct file *f) +{ + struct seq_file *seq = f->private_data; + put_net(seq->private); + return single_release(ino, f); +} +EXPORT_SYMBOL_GPL(single_release_net); + +static struct net *get_proc_task_net(struct inode *dir) +{ + struct task_struct *task; + struct nsproxy *ns; + struct net *net = NULL; + + rcu_read_lock(); + task = pid_task(proc_pid(dir), PIDTYPE_PID); + if (task != NULL) { + ns = task_nsproxy(task); + if (ns != NULL) + net = get_net(ns->net_ns); + } + rcu_read_unlock(); + + return net; +} + +static struct dentry *proc_tgid_net_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *de; + struct net *net; + + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { + de = proc_lookup_de(net->proc_net, dir, dentry); + put_net(net); + } + return de; +} + +static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct net *net; + + net = get_proc_task_net(inode); + + generic_fillattr(inode, stat); + + if (net != NULL) { + stat->nlink = net->proc_net->nlink; + put_net(net); + } + + return 0; +} + +const struct inode_operations proc_net_inode_operations = { + .lookup = proc_tgid_net_lookup, + .getattr = proc_tgid_net_getattr, +}; + +static int proc_tgid_net_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + int ret; + struct net *net; + + ret = -EINVAL; + net = get_proc_task_net(filp->f_path.dentry->d_inode); + if (net != NULL) { + ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); + put_net(net); + } + return ret; +} + +const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = proc_tgid_net_readdir, +}; + + +struct proc_dir_entry *proc_net_fops_create(struct net *net, + const char *name, umode_t mode, const struct file_operations *fops) +{ + return proc_create(name, mode, net->proc_net, fops); +} +EXPORT_SYMBOL_GPL(proc_net_fops_create); + +void proc_net_remove(struct net *net, const char *name) +{ + remove_proc_entry(name, net->proc_net); +} +EXPORT_SYMBOL_GPL(proc_net_remove); + +static __net_init int proc_net_ns_init(struct net *net) +{ + struct proc_dir_entry *netd, *net_statd; + int err; + + err = -ENOMEM; + netd = kzalloc(sizeof(*netd) + 4, GFP_KERNEL); + if (!netd) + goto out; + + netd->data = net; + netd->nlink = 2; + netd->namelen = 3; + netd->parent = &proc_root; + memcpy(netd->name, "net", 4); + + err = -EEXIST; + net_statd = proc_net_mkdir(net, "stat", netd); + if (!net_statd) + goto free_net; + + net->proc_net = netd; + net->proc_net_stat = net_statd; + return 0; + +free_net: + kfree(netd); +out: + return err; +} + +static __net_exit void proc_net_ns_exit(struct net *net) +{ + remove_proc_entry("stat", net->proc_net); + kfree(net->proc_net); +} + +static struct pernet_operations __net_initdata proc_net_ns_ops = { + .init = proc_net_ns_init, + .exit = proc_net_ns_exit, +}; + +int __init proc_net_init(void) +{ + proc_symlink("net", NULL, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); +} diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c new file mode 100644 index 00000000..21d836f4 --- /dev/null +++ b/fs/proc/proc_sysctl.c @@ -0,0 +1,1606 @@ +/* + * /proc/sys support + */ +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/module.h> +#include "internal.h" + +static const struct dentry_operations proc_sys_dentry_operations; +static const struct file_operations proc_sys_file_operations; +static const struct inode_operations proc_sys_inode_operations; +static const struct file_operations proc_sys_dir_file_operations; +static const struct inode_operations proc_sys_dir_operations; + +void proc_sys_poll_notify(struct ctl_table_poll *poll) +{ + if (!poll) + return; + + atomic_inc(&poll->event); + wake_up_interruptible(&poll->wait); +} + +static struct ctl_table root_table[] = { + { + .procname = "", + .mode = S_IFDIR|S_IRUGO|S_IXUGO, + }, + { } +}; +static struct ctl_table_root sysctl_table_root = { + .default_set.dir.header = { + {{.count = 1, + .nreg = 1, + .ctl_table = root_table }}, + .ctl_table_arg = root_table, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, + }, +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +static void drop_sysctl_table(struct ctl_table_header *header); +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces); +static int insert_links(struct ctl_table_header *head); +static void put_links(struct ctl_table_header *header); + +static void sysctl_print_dir(struct ctl_dir *dir) +{ + if (dir->header.parent) + sysctl_print_dir(dir->header.parent); + printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname); +} + +static int namecmp(const char *name1, int len1, const char *name2, int len2) +{ + int minlen; + int cmp; + + minlen = len1; + if (minlen > len2) + minlen = len2; + + cmp = memcmp(name1, name2, minlen); + if (cmp == 0) + cmp = len1 - len2; + return cmp; +} + +/* Called under sysctl_lock */ +static struct ctl_table *find_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + struct rb_node *node = dir->root.rb_node; + + while (node) + { + struct ctl_node *ctl_node; + const char *procname; + int cmp; + + ctl_node = rb_entry(node, struct ctl_node, node); + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + procname = entry->procname; + + cmp = namecmp(name, namelen, procname, strlen(procname)); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else { + *phead = head; + return entry; + } + } + return NULL; +} + +static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + struct rb_node **p = &head->parent->root.rb_node; + struct rb_node *parent = NULL; + const char *name = entry->procname; + int namelen = strlen(name); + + while (*p) { + struct ctl_table_header *parent_head; + struct ctl_table *parent_entry; + struct ctl_node *parent_node; + const char *parent_name; + int cmp; + + parent = *p; + parent_node = rb_entry(parent, struct ctl_node, node); + parent_head = parent_node->header; + parent_entry = &parent_head->ctl_table[parent_node - parent_head->node]; + parent_name = parent_entry->procname; + + cmp = namecmp(name, namelen, parent_name, strlen(parent_name)); + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + printk(KERN_ERR "sysctl duplicate entry: "); + sysctl_print_dir(head->parent); + printk(KERN_CONT "/%s\n", entry->procname); + return -EEXIST; + } + } + + rb_link_node(node, parent, p); + return 0; +} + +static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) +{ + struct rb_node *node = &head->node[entry - head->ctl_table].node; + + rb_erase(node, &head->parent->root); +} + +static void init_header(struct ctl_table_header *head, + struct ctl_table_root *root, struct ctl_table_set *set, + struct ctl_node *node, struct ctl_table *table) +{ + head->ctl_table = table; + head->ctl_table_arg = table; + head->used = 0; + head->count = 1; + head->nreg = 1; + head->unregistering = NULL; + head->root = root; + head->set = set; + head->parent = NULL; + head->node = node; + if (node) { + struct ctl_table *entry; + for (entry = table; entry->procname; entry++, node++) { + rb_init_node(&node->node); + node->header = head; + } + } +} + +static void erase_header(struct ctl_table_header *head) +{ + struct ctl_table *entry; + for (entry = head->ctl_table; entry->procname; entry++) + erase_entry(head, entry); +} + +static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) +{ + struct ctl_table *entry; + int err; + + dir->header.nreg++; + header->parent = dir; + err = insert_links(header); + if (err) + goto fail_links; + for (entry = header->ctl_table; entry->procname; entry++) { + err = insert_entry(header, entry); + if (err) + goto fail; + } + return 0; +fail: + erase_header(header); + put_links(header); +fail_links: + header->parent = NULL; + drop_sysctl_table(&dir->header); + return err; +} + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + erase_header(p); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct ctl_table *lookup_entry(struct ctl_table_header **phead, + struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + spin_lock(&sysctl_lock); + entry = find_entry(&head, dir, name, namelen); + if (entry && use_table(head)) + *phead = head; + else + entry = NULL; + spin_unlock(&sysctl_lock); + return entry; +} + +static struct ctl_node *first_usable_entry(struct rb_node *node) +{ + struct ctl_node *ctl_node; + + for (;node; node = rb_next(node)) { + ctl_node = rb_entry(node, struct ctl_node, node); + if (use_table(ctl_node->header)) + return ctl_node; + } + return NULL; +} + +static void first_entry(struct ctl_dir *dir, + struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = NULL; + struct ctl_table *entry = NULL; + struct ctl_node *ctl_node; + + spin_lock(&sysctl_lock); + ctl_node = first_usable_entry(rb_first(&dir->root)); + spin_unlock(&sysctl_lock); + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry) +{ + struct ctl_table_header *head = *phead; + struct ctl_table *entry = *pentry; + struct ctl_node *ctl_node = &head->node[entry - head->ctl_table]; + + spin_lock(&sysctl_lock); + unuse_table(head); + + ctl_node = first_usable_entry(rb_next(&ctl_node->node)); + spin_unlock(&sysctl_lock); + head = NULL; + if (ctl_node) { + head = ctl_node->header; + entry = &head->ctl_table[ctl_node - head->node]; + } + *phead = head; + *pentry = entry; +} + +void register_sysctl_root(struct ctl_table_root *root) +{ +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static struct inode *proc_sys_make_inode(struct super_block *sb, + struct ctl_table_header *head, struct ctl_table *table) +{ + struct inode *inode; + struct proc_inode *ei; + + inode = new_inode(sb); + if (!inode) + goto out; + + inode->i_ino = get_next_ino(); + + sysctl_head_get(head); + ei = PROC_I(inode); + ei->sysctl = head; + ei->sysctl_entry = table; + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mode = table->mode; + if (!S_ISDIR(table->mode)) { + inode->i_mode |= S_IFREG; + inode->i_op = &proc_sys_inode_operations; + inode->i_fop = &proc_sys_file_operations; + } else { + inode->i_mode |= S_IFDIR; + inode->i_op = &proc_sys_dir_operations; + inode->i_fop = &proc_sys_dir_file_operations; + } +out: + return inode; +} + +static struct ctl_table_header *grab_header(struct inode *inode) +{ + struct ctl_table_header *head = PROC_I(inode)->sysctl; + if (!head) + head = &sysctl_table_root.default_set.dir.header; + return sysctl_head_grab(head); +} + +static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct ctl_table_header *head = grab_header(dir); + struct ctl_table_header *h = NULL; + struct qstr *name = &dentry->d_name; + struct ctl_table *p; + struct inode *inode; + struct dentry *err = ERR_PTR(-ENOENT); + struct ctl_dir *ctl_dir; + int ret; + + if (IS_ERR(head)) + return ERR_CAST(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + p = lookup_entry(&h, ctl_dir, name->name, name->len); + if (!p) + goto out; + + if (S_ISLNK(p->mode)) { + ret = sysctl_follow_link(&h, &p, current->nsproxy); + err = ERR_PTR(ret); + if (ret) + goto out; + } + + err = ERR_PTR(-ENOMEM); + inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); + if (h) + sysctl_head_finish(h); + + if (!inode) + goto out; + + err = NULL; + d_set_d_op(dentry, &proc_sys_dentry_operations); + d_add(dentry, inode); + +out: + sysctl_head_finish(head); + return err; +} + +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + ssize_t error; + size_t res; + + if (IS_ERR(head)) + return PTR_ERR(head); + + /* + * At this point we know that the sysctl was not unregistered + * and won't be until we finish. + */ + error = -EPERM; + if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) + goto out; + + /* if that can happen at all, it should be -EINVAL, not -EISDIR */ + error = -EINVAL; + if (!table->proc_handler) + goto out; + + /* careful: calling conventions are nasty here */ + res = count; + error = table->proc_handler(table, write, buf, &res, ppos); + if (!error) + error = res; +out: + sysctl_head_finish(head); + + return error; +} + +static ssize_t proc_sys_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} + +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); +} + +static int proc_sys_open(struct inode *inode, struct file *filp) +{ + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return PTR_ERR(head); + + if (table->poll) + filp->private_data = proc_sys_poll_event(table->poll); + + sysctl_head_finish(head); + + return 0; +} + +static unsigned int proc_sys_poll(struct file *filp, poll_table *wait) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + unsigned int ret = DEFAULT_POLLMASK; + unsigned long event; + + /* sysctl was unregistered */ + if (IS_ERR(head)) + return POLLERR | POLLHUP; + + if (!table->proc_handler) + goto out; + + if (!table->poll) + goto out; + + event = (unsigned long)filp->private_data; + poll_wait(filp, &table->poll->wait, wait); + + if (event != atomic_read(&table->poll->event)) { + filp->private_data = proc_sys_poll_event(table->poll); + ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + +out: + sysctl_head_finish(head); + + return ret; +} + +static int proc_sys_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + struct dentry *child, *dir = filp->f_path.dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = table->procname; + qname.len = strlen(table->procname); + qname.hash = full_name_hash(qname.name, qname.len); + + child = d_lookup(dir, &qname); + if (!child) { + child = d_alloc(dir, &qname); + if (child) { + inode = proc_sys_make_inode(dir->d_sb, head, table); + if (!inode) { + dput(child); + return -ENOMEM; + } else { + d_set_d_op(child, &proc_sys_dentry_operations); + d_add(child, inode); + } + } else { + return -ENOMEM; + } + } + inode = child->d_inode; + ino = inode->i_ino; + type = inode->i_mode >> 12; + dput(child); + return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); +} + +static int proc_sys_link_fill_cache(struct file *filp, void *dirent, + filldir_t filldir, + struct ctl_table_header *head, + struct ctl_table *table) +{ + int err, ret = 0; + head = sysctl_head_grab(head); + + if (S_ISLNK(table->mode)) { + /* It is not an error if we can not follow the link ignore it */ + err = sysctl_follow_link(&head, &table, current->nsproxy); + if (err) + goto out; + } + + ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); +out: + sysctl_head_finish(head); + return ret; +} + +static int scan(struct ctl_table_header *head, ctl_table *table, + unsigned long *pos, struct file *file, + void *dirent, filldir_t filldir) +{ + int res; + + if ((*pos)++ < file->f_pos) + return 0; + + if (unlikely(S_ISLNK(table->mode))) + res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); + else + res = proc_sys_fill_cache(file, dirent, filldir, head, table); + + if (res == 0) + file->f_pos = *pos; + + return res; +} + +static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table_header *h = NULL; + struct ctl_table *entry; + struct ctl_dir *ctl_dir; + unsigned long pos; + int ret = -EINVAL; + + if (IS_ERR(head)) + return PTR_ERR(head); + + ctl_dir = container_of(head, struct ctl_dir, header); + + ret = 0; + /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ + if (filp->f_pos == 0) { + if (filldir(dirent, ".", 1, filp->f_pos, + inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + } + if (filp->f_pos == 1) { + if (filldir(dirent, "..", 2, filp->f_pos, + parent_ino(dentry), DT_DIR) < 0) + goto out; + filp->f_pos++; + } + pos = 2; + + for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { + ret = scan(h, entry, &pos, filp, dirent, filldir); + if (ret) { + sysctl_head_finish(h); + break; + } + } + ret = 1; +out: + sysctl_head_finish(head); + return ret; +} + +static int proc_sys_permission(struct inode *inode, int mask) +{ + /* + * sysctl entries that are not writeable, + * are _NOT_ writeable, capabilities or not. + */ + struct ctl_table_header *head; + struct ctl_table *table; + int error; + + /* Executable files are not allowed under /proc/sys/ */ + if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) + return -EACCES; + + head = grab_header(inode); + if (IS_ERR(head)) + return PTR_ERR(head); + + table = PROC_I(inode)->sysctl_entry; + if (!table) /* global root - r-xr-xr-x */ + error = mask & MAY_WRITE ? -EACCES : 0; + else /* Use the permissions on the sysctl table entry */ + error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); + + sysctl_head_finish(head); + return error; +} + +static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) + return -EPERM; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size != i_size_read(inode)) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct ctl_table_header *head = grab_header(inode); + struct ctl_table *table = PROC_I(inode)->sysctl_entry; + + if (IS_ERR(head)) + return PTR_ERR(head); + + generic_fillattr(inode, stat); + if (table) + stat->mode = (stat->mode & S_IFMT) | table->mode; + + sysctl_head_finish(head); + return 0; +} + +static const struct file_operations proc_sys_file_operations = { + .open = proc_sys_open, + .poll = proc_sys_poll, + .read = proc_sys_read, + .write = proc_sys_write, + .llseek = default_llseek, +}; + +static const struct file_operations proc_sys_dir_file_operations = { + .read = generic_read_dir, + .readdir = proc_sys_readdir, + .llseek = generic_file_llseek, +}; + +static const struct inode_operations proc_sys_inode_operations = { + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static const struct inode_operations proc_sys_dir_operations = { + .lookup = proc_sys_lookup, + .permission = proc_sys_permission, + .setattr = proc_sys_setattr, + .getattr = proc_sys_getattr, +}; + +static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + return !PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int proc_sys_delete(const struct dentry *dentry) +{ + return !!PROC_I(dentry->d_inode)->sysctl->unregistering; +} + +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +static int proc_sys_compare(const struct dentry *parent, + const struct inode *pinode, + const struct dentry *dentry, const struct inode *inode, + unsigned int len, const char *str, const struct qstr *name) +{ + struct ctl_table_header *head; + /* Although proc doesn't have negative dentries, rcu-walk means + * that inode here can be NULL */ + /* AV: can it, indeed? */ + if (!inode) + return 1; + if (name->len != len) + return 1; + if (memcmp(name->name, str, len)) + return 1; + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); +} + +static const struct dentry_operations proc_sys_dentry_operations = { + .d_revalidate = proc_sys_revalidate, + .d_delete = proc_sys_delete, + .d_compare = proc_sys_compare, +}; + +static struct ctl_dir *find_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_header *head; + struct ctl_table *entry; + + entry = find_entry(&head, dir, name, namelen); + if (!entry) + return ERR_PTR(-ENOENT); + if (!S_ISDIR(entry->mode)) + return ERR_PTR(-ENOTDIR); + return container_of(head, struct ctl_dir, header); +} + +static struct ctl_dir *new_dir(struct ctl_table_set *set, + const char *name, int namelen) +{ + struct ctl_table *table; + struct ctl_dir *new; + struct ctl_node *node; + char *new_name; + + new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + + sizeof(struct ctl_table)*2 + namelen + 1, + GFP_KERNEL); + if (!new) + return NULL; + + node = (struct ctl_node *)(new + 1); + table = (struct ctl_table *)(node + 1); + new_name = (char *)(table + 2); + memcpy(new_name, name, namelen); + new_name[namelen] = '\0'; + table[0].procname = new_name; + table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; + init_header(&new->header, set->dir.header.root, set, node, table); + + return new; +} + +/** + * get_subdir - find or create a subdir with the specified name. + * @dir: Directory to create the subdirectory in + * @name: The name of the subdirectory to find or create + * @namelen: The length of name + * + * Takes a directory with an elevated reference count so we know that + * if we drop the lock the directory will not go away. Upon success + * the reference is moved from @dir to the returned subdirectory. + * Upon error an error code is returned and the reference on @dir is + * simply dropped. + */ +static struct ctl_dir *get_subdir(struct ctl_dir *dir, + const char *name, int namelen) +{ + struct ctl_table_set *set = dir->header.set; + struct ctl_dir *subdir, *new = NULL; + int err; + + spin_lock(&sysctl_lock); + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + spin_unlock(&sysctl_lock); + new = new_dir(set, name, namelen); + spin_lock(&sysctl_lock); + subdir = ERR_PTR(-ENOMEM); + if (!new) + goto failed; + + /* Was the subdir added while we dropped the lock? */ + subdir = find_subdir(dir, name, namelen); + if (!IS_ERR(subdir)) + goto found; + if (PTR_ERR(subdir) != -ENOENT) + goto failed; + + /* Nope. Use the our freshly made directory entry. */ + err = insert_header(dir, &new->header); + subdir = ERR_PTR(err); + if (err) + goto failed; + subdir = new; +found: + subdir->header.nreg++; +failed: + if (unlikely(IS_ERR(subdir))) { + printk(KERN_ERR "sysctl could not get directory: "); + sysctl_print_dir(dir); + printk(KERN_CONT "/%*.*s %ld\n", + namelen, namelen, name, PTR_ERR(subdir)); + } + drop_sysctl_table(&dir->header); + if (new) + drop_sysctl_table(&new->header); + spin_unlock(&sysctl_lock); + return subdir; +} + +static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) +{ + struct ctl_dir *parent; + const char *procname; + if (!dir->header.parent) + return &set->dir; + parent = xlate_dir(set, dir->header.parent); + if (IS_ERR(parent)) + return parent; + procname = dir->header.ctl_table[0].procname; + return find_subdir(parent, procname, strlen(procname)); +} + +static int sysctl_follow_link(struct ctl_table_header **phead, + struct ctl_table **pentry, struct nsproxy *namespaces) +{ + struct ctl_table_header *head; + struct ctl_table_root *root; + struct ctl_table_set *set; + struct ctl_table *entry; + struct ctl_dir *dir; + int ret; + + ret = 0; + spin_lock(&sysctl_lock); + root = (*pentry)->data; + set = lookup_header_set(root, namespaces); + dir = xlate_dir(set, (*phead)->parent); + if (IS_ERR(dir)) + ret = PTR_ERR(dir); + else { + const char *procname = (*pentry)->procname; + head = NULL; + entry = find_entry(&head, dir, procname, strlen(procname)); + ret = -ENOENT; + if (entry && use_table(head)) { + unuse_table(*phead); + *phead = head; + *pentry = entry; + ret = 0; + } + } + + spin_unlock(&sysctl_lock); + return ret; +} + +static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n", + path, table->procname, &vaf); + + va_end(args); + return -EINVAL; +} + +static int sysctl_check_table(const char *path, struct ctl_table *table) +{ + int err = 0; + for (; table->procname; table++) { + if (table->child) + err = sysctl_err(path, table, "Not a file"); + + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + err = sysctl_err(path, table, "No data"); + if (!table->maxlen) + err = sysctl_err(path, table, "No maxlen"); + } + if (!table->proc_handler) + err = sysctl_err(path, table, "No proc_handler"); + + if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode) + err = sysctl_err(path, table, "bogus .mode 0%o", + table->mode); + } + return err; +} + +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, + struct ctl_table_root *link_root) +{ + struct ctl_table *link_table, *entry, *link; + struct ctl_table_header *links; + struct ctl_node *node; + char *link_name; + int nr_entries, name_bytes; + + name_bytes = 0; + nr_entries = 0; + for (entry = table; entry->procname; entry++) { + nr_entries++; + name_bytes += strlen(entry->procname) + 1; + } + + links = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries + + sizeof(struct ctl_table)*(nr_entries + 1) + + name_bytes, + GFP_KERNEL); + + if (!links) + return NULL; + + node = (struct ctl_node *)(links + 1); + link_table = (struct ctl_table *)(node + nr_entries); + link_name = (char *)&link_table[nr_entries + 1]; + + for (link = link_table, entry = table; entry->procname; link++, entry++) { + int len = strlen(entry->procname) + 1; + memcpy(link_name, entry->procname, len); + link->procname = link_name; + link->mode = S_IFLNK|S_IRWXUGO; + link->data = link_root; + link_name += len; + } + init_header(links, dir->header.root, dir->header.set, node, link_table); + links->nreg = nr_entries; + + return links; +} + +static bool get_links(struct ctl_dir *dir, + struct ctl_table *table, struct ctl_table_root *link_root) +{ + struct ctl_table_header *head; + struct ctl_table *entry, *link; + + /* Are there links available for every entry in table? */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + if (!link) + return false; + if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) + continue; + if (S_ISLNK(link->mode) && (link->data == link_root)) + continue; + return false; + } + + /* The checks passed. Increase the registration count on the links */ + for (entry = table; entry->procname; entry++) { + const char *procname = entry->procname; + link = find_entry(&head, dir, procname, strlen(procname)); + head->nreg++; + } + return true; +} + +static int insert_links(struct ctl_table_header *head) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_dir *core_parent = NULL; + struct ctl_table_header *links; + int err; + + if (head->set == root_set) + return 0; + + core_parent = xlate_dir(root_set, head->parent); + if (IS_ERR(core_parent)) + return 0; + + if (get_links(core_parent, head->ctl_table, head->root)) + return 0; + + core_parent->header.nreg++; + spin_unlock(&sysctl_lock); + + links = new_links(core_parent, head->ctl_table, head->root); + + spin_lock(&sysctl_lock); + err = -ENOMEM; + if (!links) + goto out; + + err = 0; + if (get_links(core_parent, head->ctl_table, head->root)) { + kfree(links); + goto out; + } + + err = insert_header(core_parent, links); + if (err) + kfree(links); +out: + drop_sysctl_table(&core_parent->header); + return err; +} + +/** + * __register_sysctl_table - register a leaf sysctl table + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file + * + * child - must be %NULL. + * + * proc_handler - the text handler routine (described below) + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * There must be a proc_handler routine for any terminal nodes. + * Several default handlers are available to cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_table( + struct ctl_table_set *set, + const char *path, struct ctl_table *table) +{ + struct ctl_table_root *root = set->dir.header.root; + struct ctl_table_header *header; + const char *name, *nextname; + struct ctl_dir *dir; + struct ctl_table *entry; + struct ctl_node *node; + int nr_entries = 0; + + for (entry = table; entry->procname; entry++) + nr_entries++; + + header = kzalloc(sizeof(struct ctl_table_header) + + sizeof(struct ctl_node)*nr_entries, GFP_KERNEL); + if (!header) + return NULL; + + node = (struct ctl_node *)(header + 1); + init_header(header, root, set, node, table); + if (sysctl_check_table(path, table)) + goto fail; + + spin_lock(&sysctl_lock); + dir = &set->dir; + /* Reference moved down the diretory tree get_subdir */ + dir->header.nreg++; + spin_unlock(&sysctl_lock); + + /* Find the directory for the ctl_table */ + for (name = path; name; name = nextname) { + int namelen; + nextname = strchr(name, '/'); + if (nextname) { + namelen = nextname - name; + nextname++; + } else { + namelen = strlen(name); + } + if (namelen == 0) + continue; + + dir = get_subdir(dir, name, namelen); + if (IS_ERR(dir)) + goto fail; + } + + spin_lock(&sysctl_lock); + if (insert_header(dir, header)) + goto fail_put_dir_locked; + + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); + + return header; + +fail_put_dir_locked: + drop_sysctl_table(&dir->header); + spin_unlock(&sysctl_lock); +fail: + kfree(header); + dump_stack(); + return NULL; +} + +/** + * register_sysctl - register a sysctl table + * @path: The path to the directory the sysctl table is in. + * @table: the table structure + * + * Register a sysctl table. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +{ + return __register_sysctl_table(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl); + +static char *append_path(const char *path, char *pos, const char *name) +{ + int namelen; + namelen = strlen(name); + if (((pos - path) + namelen + 2) >= PATH_MAX) + return NULL; + memcpy(pos, name, namelen); + pos[namelen] = '/'; + pos[namelen + 1] = '\0'; + pos += namelen + 1; + return pos; +} + +static int count_subheaders(struct ctl_table *table) +{ + int has_files = 0; + int nr_subheaders = 0; + struct ctl_table *entry; + + /* special case: no directory and empty directory */ + if (!table || !table->procname) + return 1; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_subheaders += count_subheaders(entry->child); + else + has_files = 1; + } + return nr_subheaders + has_files; +} + +static int register_leaf_sysctl_tables(const char *path, char *pos, + struct ctl_table_header ***subheader, struct ctl_table_set *set, + struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = NULL; + struct ctl_table *entry, *files; + int nr_files = 0; + int nr_dirs = 0; + int err = -ENOMEM; + + for (entry = table; entry->procname; entry++) { + if (entry->child) + nr_dirs++; + else + nr_files++; + } + + files = table; + /* If there are mixed files and directories we need a new table */ + if (nr_dirs && nr_files) { + struct ctl_table *new; + files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1), + GFP_KERNEL); + if (!files) + goto out; + + ctl_table_arg = files; + for (new = files, entry = table; entry->procname; entry++) { + if (entry->child) + continue; + *new = *entry; + new++; + } + } + + /* Register everything except a directory full of subdirectories */ + if (nr_files || !nr_dirs) { + struct ctl_table_header *header; + header = __register_sysctl_table(set, path, files); + if (!header) { + kfree(ctl_table_arg); + goto out; + } + + /* Remember if we need to free the file table */ + header->ctl_table_arg = ctl_table_arg; + **subheader = header; + (*subheader)++; + } + + /* Recurse into the subdirectories. */ + for (entry = table; entry->procname; entry++) { + char *child_pos; + + if (!entry->child) + continue; + + err = -ENAMETOOLONG; + child_pos = append_path(path, pos, entry->procname); + if (!child_pos) + goto out; + + err = register_leaf_sysctl_tables(path, child_pos, subheader, + set, entry->child); + pos[0] = '\0'; + if (err) + goto out; + } + err = 0; +out: + /* On failure our caller will unregister all registered subheaders */ + return err; +} + +/** + * __register_sysctl_paths - register a sysctl table hierarchy + * @set: Sysctl tree to register on + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_table for more details. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_set *set, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table *ctl_table_arg = table; + int nr_subheaders = count_subheaders(table); + struct ctl_table_header *header = NULL, **subheaders, **subheader; + const struct ctl_path *component; + char *new_path, *pos; + + pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!new_path) + return NULL; + + pos[0] = '\0'; + for (component = path; component->procname; component++) { + pos = append_path(new_path, pos, component->procname); + if (!pos) + goto out; + } + while (table->procname && table->child && !table[1].procname) { + pos = append_path(new_path, pos, table->procname); + if (!pos) + goto out; + table = table->child; + } + if (nr_subheaders == 1) { + header = __register_sysctl_table(set, new_path, table); + if (header) + header->ctl_table_arg = ctl_table_arg; + } else { + header = kzalloc(sizeof(*header) + + sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); + if (!header) + goto out; + + subheaders = (struct ctl_table_header **) (header + 1); + subheader = subheaders; + header->ctl_table_arg = ctl_table_arg; + + if (register_leaf_sysctl_tables(new_path, pos, &subheader, + set, table)) + goto err_register_leaves; + } + +out: + kfree(new_path); + return header; + +err_register_leaves: + while (subheader > subheaders) { + struct ctl_table_header *subh = *(--subheader); + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + header = NULL; + goto out; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root.default_set, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +static void put_links(struct ctl_table_header *header) +{ + struct ctl_table_set *root_set = &sysctl_table_root.default_set; + struct ctl_table_root *root = header->root; + struct ctl_dir *parent = header->parent; + struct ctl_dir *core_parent; + struct ctl_table *entry; + + if (header->set == root_set) + return; + + core_parent = xlate_dir(root_set, parent); + if (IS_ERR(core_parent)) + return; + + for (entry = header->ctl_table; entry->procname; entry++) { + struct ctl_table_header *link_head; + struct ctl_table *link; + const char *name = entry->procname; + + link = find_entry(&link_head, core_parent, name, strlen(name)); + if (link && + ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) || + (S_ISLNK(link->mode) && (link->data == root)))) { + drop_sysctl_table(link_head); + } + else { + printk(KERN_ERR "sysctl link missing during unregister: "); + sysctl_print_dir(parent); + printk(KERN_CONT "/%s\n", name); + } + } +} + +static void drop_sysctl_table(struct ctl_table_header *header) +{ + struct ctl_dir *parent = header->parent; + + if (--header->nreg) + return; + + put_links(header); + start_unregistering(header); + if (!--header->count) + kfree_rcu(header, rcu); + + if (parent) + drop_sysctl_table(&parent->header); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + int nr_subheaders; + might_sleep(); + + if (header == NULL) + return; + + nr_subheaders = count_subheaders(header->ctl_table_arg); + if (unlikely(nr_subheaders > 1)) { + struct ctl_table_header **subheaders; + int i; + + subheaders = (struct ctl_table_header **)(header + 1); + for (i = nr_subheaders -1; i >= 0; i--) { + struct ctl_table_header *subh = subheaders[i]; + struct ctl_table *table = subh->ctl_table_arg; + unregister_sysctl_table(subh); + kfree(table); + } + kfree(header); + return; + } + + spin_lock(&sysctl_lock); + drop_sysctl_table(header); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *set, + struct ctl_table_root *root, + int (*is_seen)(struct ctl_table_set *)) +{ + memset(set, 0, sizeof(*set)); + set->is_seen = is_seen; + init_header(&set->dir.header, root, set, NULL, root_table); +} + +void retire_sysctl_set(struct ctl_table_set *set) +{ + WARN_ON(!RB_EMPTY_ROOT(&set->dir.root)); +} + +int __init proc_sys_init(void) +{ + struct proc_dir_entry *proc_sys_root; + + proc_sys_root = proc_mkdir("sys", NULL); + proc_sys_root->proc_iops = &proc_sys_dir_operations; + proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->nlink = 0; + + return sysctl_init(); +} diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c new file mode 100644 index 00000000..cb761f01 --- /dev/null +++ b/fs/proc/proc_tty.c @@ -0,0 +1,189 @@ +/* + * proc_tty.c -- handles /proc/tty + * + * Copyright 1997, Theodore Ts'o + */ + +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/tty.h> +#include <linux/seq_file.h> +#include <linux/bitops.h> + +/* + * The /proc/tty directory inodes... + */ +static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; + +/* + * This is the handler for /proc/tty/drivers + */ +static void show_tty_range(struct seq_file *m, struct tty_driver *p, + dev_t from, int num) +{ + seq_printf(m, "%-20s ", p->driver_name ? p->driver_name : "unknown"); + seq_printf(m, "/dev/%-8s ", p->name); + if (p->num > 1) { + seq_printf(m, "%3d %d-%d ", MAJOR(from), MINOR(from), + MINOR(from) + num - 1); + } else { + seq_printf(m, "%3d %7d ", MAJOR(from), MINOR(from)); + } + switch (p->type) { + case TTY_DRIVER_TYPE_SYSTEM: + seq_puts(m, "system"); + if (p->subtype == SYSTEM_TYPE_TTY) + seq_puts(m, ":/dev/tty"); + else if (p->subtype == SYSTEM_TYPE_SYSCONS) + seq_puts(m, ":console"); + else if (p->subtype == SYSTEM_TYPE_CONSOLE) + seq_puts(m, ":vtmaster"); + break; + case TTY_DRIVER_TYPE_CONSOLE: + seq_puts(m, "console"); + break; + case TTY_DRIVER_TYPE_SERIAL: + seq_puts(m, "serial"); + break; + case TTY_DRIVER_TYPE_PTY: + if (p->subtype == PTY_TYPE_MASTER) + seq_puts(m, "pty:master"); + else if (p->subtype == PTY_TYPE_SLAVE) + seq_puts(m, "pty:slave"); + else + seq_puts(m, "pty"); + break; + default: + seq_printf(m, "type:%d.%d", p->type, p->subtype); + } + seq_putc(m, '\n'); +} + +static int show_tty_driver(struct seq_file *m, void *v) +{ + struct tty_driver *p = list_entry(v, struct tty_driver, tty_drivers); + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 0); + seq_puts(m, "system:/dev/tty\n"); + seq_printf(m, "%-20s /dev/%-8s ", "/dev/console", "console"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 1); + seq_puts(m, "system:console\n"); +#ifdef CONFIG_UNIX98_PTYS + seq_printf(m, "%-20s /dev/%-8s ", "/dev/ptmx", "ptmx"); + seq_printf(m, "%3d %7d ", TTYAUX_MAJOR, 2); + seq_puts(m, "system\n"); +#endif +#ifdef CONFIG_VT + seq_printf(m, "%-20s /dev/%-8s ", "/dev/vc/0", "vc/0"); + seq_printf(m, "%3d %7d ", TTY_MAJOR, 0); + seq_puts(m, "system:vtmaster\n"); +#endif + } + + while (MAJOR(from) < MAJOR(to)) { + dev_t next = MKDEV(MAJOR(from)+1, 0); + show_tty_range(m, p, from, next - from); + from = next; + } + if (from != to) + show_tty_range(m, p, from, to - from); + return 0; +} + +/* iterator */ +static void *t_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&tty_mutex); + return seq_list_start(&tty_drivers, *pos); +} + +static void *t_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &tty_drivers, pos); +} + +static void t_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tty_mutex); +} + +static const struct seq_operations tty_drivers_op = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = show_tty_driver +}; + +static int tty_drivers_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_drivers_op); +} + +static const struct file_operations proc_tty_drivers_operations = { + .open = tty_drivers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * This function is called by tty_register_driver() to handle + * registering the driver's /proc handler into /proc/tty/driver/<foo> + */ +void proc_tty_register_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + if (!driver->driver_name || driver->proc_entry || + !driver->ops->proc_fops) + return; + + ent = proc_create_data(driver->driver_name, 0, proc_tty_driver, + driver->ops->proc_fops, driver); + driver->proc_entry = ent; +} + +/* + * This function is called by tty_unregister_driver() + */ +void proc_tty_unregister_driver(struct tty_driver *driver) +{ + struct proc_dir_entry *ent; + + ent = driver->proc_entry; + if (!ent) + return; + + remove_proc_entry(driver->driver_name, proc_tty_driver); + + driver->proc_entry = NULL; +} + +/* + * Called by proc_root_init() to initialize the /proc/tty subtree + */ +void __init proc_tty_init(void) +{ + if (!proc_mkdir("tty", NULL)) + return; + proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + /* + * /proc/tty/driver/serial reveals the exact character counts for + * serial links which is just too easy to abuse for inferring + * password lengths and inter-keystroke timings during password + * entry. + */ + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); +} diff --git a/fs/proc/root.c b/fs/proc/root.c new file mode 100644 index 00000000..eed44bfc --- /dev/null +++ b/fs/proc/root.c @@ -0,0 +1,278 @@ +/* + * linux/fs/proc/root.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc root directory handling functions + */ + +#include <asm/uaccess.h> + +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/bitops.h> +#include <linux/mount.h> +#include <linux/pid_namespace.h> +#include <linux/parser.h> + +#include "internal.h" + +static int proc_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int proc_set_super(struct super_block *sb, void *data) +{ + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; +} + +enum { + Opt_gid, Opt_hidepid, Opt_err, +}; + +static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + int err; + struct super_block *sb; + struct pid_namespace *ns; + struct proc_inode *ei; + char *options; + + if (flags & MS_KERNMOUNT) { + ns = (struct pid_namespace *)data; + options = NULL; + } else { + ns = current->nsproxy->pid_ns; + options = data; + } + + sb = sget(fs_type, proc_test_super, proc_set_super, ns); + if (IS_ERR(sb)) + return ERR_CAST(sb); + + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } + + if (!sb->s_root) { + sb->s_flags = flags; + err = proc_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return ERR_PTR(err); + } + + sb->s_flags |= MS_ACTIVE; + } + + ei = PROC_I(sb->s_root->d_inode); + if (!ei->pid) { + rcu_read_lock(); + ei->pid = get_pid(find_pid_ns(1, ns)); + rcu_read_unlock(); + } + + return dget(sb->s_root); +} + +static void proc_kill_sb(struct super_block *sb) +{ + struct pid_namespace *ns; + + ns = (struct pid_namespace *)sb->s_fs_info; + kill_anon_super(sb); + put_pid_ns(ns); +} + +static struct file_system_type proc_fs_type = { + .name = "proc", + .mount = proc_mount, + .kill_sb = proc_kill_sb, +}; + +void __init proc_root_init(void) +{ + int err; + + proc_init_inodecache(); + err = register_filesystem(&proc_fs_type); + if (err) + return; + err = pid_ns_prepare_proc(&init_pid_ns); + if (err) { + unregister_filesystem(&proc_fs_type); + return; + } + + proc_symlink("mounts", NULL, "self/mounts"); + + proc_net_init(); + +#ifdef CONFIG_SYSVIPC + proc_mkdir("sysvipc", NULL); +#endif + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); + proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ +#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_mkdir("openprom", NULL); +#endif + proc_tty_init(); +#ifdef CONFIG_PROC_DEVICETREE + proc_device_tree_init(); +#endif + proc_mkdir("bus", NULL); + proc_sys_init(); +} + +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) +{ + generic_fillattr(dentry->d_inode, stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} + +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + if (!proc_lookup(dir, dentry, nd)) { + return NULL; + } + + return proc_pid_lookup(dir, dentry, nd); +} + +static int proc_root_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + unsigned int nr = filp->f_pos; + int ret; + + if (nr < FIRST_PROCESS_ENTRY) { + int error = proc_readdir(filp, dirent, filldir); + if (error <= 0) + return error; + filp->f_pos = FIRST_PROCESS_ENTRY; + } + + ret = proc_pid_readdir(filp, dirent, filldir); + return ret; +} + +/* + * The root /proc directory is special, as it has the + * <pid> directories. Thus we don't use the generic + * directory handling functions for that.. + */ +static const struct file_operations proc_root_operations = { + .read = generic_read_dir, + .readdir = proc_root_readdir, + .llseek = default_llseek, +}; + +/* + * proc root can do almost nothing.. + */ +static const struct inode_operations proc_root_inode_operations = { + .lookup = proc_root_lookup, + .getattr = proc_root_getattr, +}; + +/* + * This is the root "inode" in the /proc tree.. + */ +struct proc_dir_entry proc_root = { + .low_ino = PROC_ROOT_INO, + .namelen = 5, + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .nlink = 2, + .count = ATOMIC_INIT(1), + .proc_iops = &proc_root_inode_operations, + .proc_fops = &proc_root_operations, + .parent = &proc_root, + .name = "/proc", +}; + +int pid_ns_prepare_proc(struct pid_namespace *ns) +{ + struct vfsmount *mnt; + + mnt = kern_mount_data(&proc_fs_type, ns); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + ns->proc_mnt = mnt; + return 0; +} + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + kern_unmount(ns->proc_mnt); +} diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c new file mode 100644 index 00000000..62604be9 --- /dev/null +++ b/fs/proc/softirqs.c @@ -0,0 +1,44 @@ +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* + * /proc/softirqs ... display the number of softirqs + */ +static int show_softirqs(struct seq_file *p, void *v) +{ + int i, j; + + seq_puts(p, " "); + for_each_possible_cpu(i) + seq_printf(p, "CPU%-8d", i); + seq_putc(p, '\n'); + + for (i = 0; i < NR_SOFTIRQS; i++) { + seq_printf(p, "%12s:", softirq_to_name[i]); + for_each_possible_cpu(j) + seq_printf(p, " %10u", kstat_softirqs_cpu(i, j)); + seq_putc(p, '\n'); + } + return 0; +} + +static int softirqs_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_softirqs, NULL); +} + +static const struct file_operations proc_softirqs_operations = { + .open = softirqs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_softirqs_init(void) +{ + proc_create("softirqs", 0, NULL, &proc_softirqs_operations); + return 0; +} +module_init(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c new file mode 100644 index 00000000..64c3b317 --- /dev/null +++ b/fs/proc/stat.c @@ -0,0 +1,218 @@ +#include <linux/cpumask.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/irqnr.h> +#include <asm/cputime.h> +#include <linux/tick.h> + +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif +#ifndef arch_irq_stat +#define arch_irq_stat() 0 +#endif + +#ifdef arch_idle_time + +static cputime64_t get_idle_time(int cpu) +{ + cputime64_t idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + cputime64_t iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else + +static u64 get_idle_time(int cpu) +{ + u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); + + if (idle_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.idle */ + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + else + idle = usecs_to_cputime64(idle_time); + + return idle; +} + +static u64 get_iowait_time(int cpu) +{ + u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); + + if (iowait_time == -1ULL) + /* !NO_HZ so we can rely on cpustat.iowait */ + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + else + iowait = usecs_to_cputime64(iowait_time); + + return iowait; +} + +#endif + +static int show_stat(struct seq_file *p, void *v) +{ + int i, j; + unsigned long jif; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; + u64 sum = 0; + u64 sum_softirq = 0; + unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; + struct timespec boottime; + + user = nice = system = idle = iowait = + irq = softirq = steal = 0; + guest = guest_nice = 0; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + sum += kstat_cpu_irqs_sum(i); + sum += arch_irq_stat_cpu(i); + + for (j = 0; j < NR_SOFTIRQS; j++) { + unsigned int softirq_stat = kstat_softirqs_cpu(j, i); + + per_softirq_sums[j] += softirq_stat; + sum_softirq += softirq_stat; + } + } + sum += arch_irq_stat(); + + seq_puts(p, "cpu "); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + + for_each_online_cpu(i) { + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(i); + iowait = get_iowait_time(i); + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + seq_printf(p, "cpu%d", i); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); + seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); + seq_putc(p, '\n'); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + /* sum again ? it could be updated? */ + for_each_irq_nr(j) + seq_put_decimal_ull(p, ' ', kstat_irqs(j)); + + seq_printf(p, + "\nctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + nr_context_switches(), + (unsigned long)jif, + total_forks, + nr_running(), + nr_iowait()); + + seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); + + for (i = 0; i < NR_SOFTIRQS; i++) + seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); + seq_putc(p, '\n'); + + return 0; +} + +static int stat_open(struct inode *inode, struct file *file) +{ + unsigned size = 1024 + 128 * num_possible_cpus(); + char *buf; + struct seq_file *m; + int res; + + /* minimum size to display an interrupt count : 2 bytes */ + size += 2 * nr_irqs; + + /* don't ask for more than the kmalloc() max size */ + if (size > KMALLOC_MAX_SIZE) + size = KMALLOC_MAX_SIZE; + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + res = single_open(file, show_stat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = ksize(buf); + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_stat_operations = { + .open = stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_stat_init(void) +{ + proc_create("stat", 0, NULL, &proc_stat_operations); + return 0; +} +module_init(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c new file mode 100644 index 00000000..7faaf2ac --- /dev/null +++ b/fs/proc/task_mmu.c @@ -0,0 +1,1286 @@ +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/huge_mm.h> +#include <linux/mount.h> +#include <linux/seq_file.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/mempolicy.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> + +#include <asm/elf.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> +#include "internal.h" + +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + unsigned long data, text, lib, swap; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. + */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; + + data = mm->total_vm - mm->shared_vm - mm->stack_vm; + text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; + lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); + seq_printf(m, + "VmPeak:\t%8lu kB\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmPin:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB\n" + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + mm->locked_vm << (PAGE_SHIFT-10), + mm->pinned_vm << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), + data << (PAGE_SHIFT-10), + mm->stack_vm << (PAGE_SHIFT-10), text, lib, + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + return PAGE_SIZE * mm->total_vm; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + *shared = get_mm_counter(mm, MM_FILEPAGES); + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = mm->total_vm - mm->shared_vm; + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); + return mm->total_vm; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) + return mm; + down_read(&mm->mmap_sem); + + tail_vma = get_gate_vma(priv->task->mm); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + if (!IS_ERR(vma)) + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static void +show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + vm_flags_t flags = vma->vm_flags; + unsigned long ino = 0; + unsigned long long pgoff = 0; + unsigned long start, end; + dev_t dev = 0; + int len; + const char *name = NULL; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; + } + + /* We don't show the stack guard page in /proc/maps */ + start = vma->vm_start; + if (stack_guard_page_start(vma, start)) + start += PAGE_SIZE; + end = vma->vm_end; + if (stack_guard_page_end(vma, end)) + end -= PAGE_SIZE; + + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, "\n"); + goto done; + } + + name = arch_vma_name(vma); + if (!name) { + pid_t tid; + + if (!mm) { + name = "[vdso]"; + goto done; + } + + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { + name = "[heap]"; + goto done; + } + + tid = vm_is_stack(task, vma, is_pid); + + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) { + name = "[stack]"; + } else { + /* Thread stack in /proc/PID/maps */ + pad_len_spaces(m, len); + seq_printf(m, "[stack:%d]", tid); + } + } + } + +done: + if (name) { + pad_len_spaces(m, len); + seq_puts(m, name); + } + seq_putc(m, '\n'); +} + +static int show_map(struct seq_file *m, void *v, int is_pid) +{ + struct vm_area_struct *vma = v; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + + show_map_vma(m, vma, is_pid); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static int show_pid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 1); +} + +static int show_tid_map(struct seq_file *m, void *v) +{ + return show_map(m, v, 0); +} + +static const struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map +}; + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_maps_op); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +#ifdef CONFIG_PROC_PAGE_MONITOR +struct mem_size_stats { + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + unsigned long anonymous; + unsigned long anonymous_thp; + unsigned long swap; + u64 pss; +}; + + +static void smaps_pte_entry(pte_t ptent, unsigned long addr, + unsigned long ptent_size, struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + struct page *page; + int mapcount; + + if (is_swap_pte(ptent)) { + mss->swap += ptent_size; + return; + } + + if (!pte_present(ptent)) + return; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + return; + + if (PageAnon(page)) + mss->anonymous += ptent_size; + + mss->resident += ptent_size; + /* Accumulate the size in pages that have been accessed. */ + if (pte_young(ptent) || PageReferenced(page)) + mss->referenced += ptent_size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + if (pte_dirty(ptent) || PageDirty(page)) + mss->shared_dirty += ptent_size; + else + mss->shared_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT) / mapcount; + } else { + if (pte_dirty(ptent) || PageDirty(page)) + mss->private_dirty += ptent_size; + else + mss->private_clean += ptent_size; + mss->pss += (ptent_size << PSS_SHIFT); + } +} + +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + pte_t *pte; + spinlock_t *ptl; + + if (pmd_trans_huge_lock(pmd, vma) == 1) { + smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); + spin_unlock(&walk->mm->page_table_lock); + mss->anonymous_thp += HPAGE_PMD_SIZE; + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + /* + * The mmap_sem held all the way back in m_start() is what + * keeps khugepaged out of here and from collapsing things + * in here. + */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +static int show_smap(struct seq_file *m, void *v, int is_pid) +{ + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + struct mm_walk smaps_walk = { + .pmd_entry = smaps_pte_range, + .mm = vma->vm_mm, + .private = &mss, + }; + + memset(&mss, 0, sizeof mss); + mss.vma = vma; + /* mmap_sem is held in m_start */ + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) + walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); + + show_map_vma(m, vma, is_pid); + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n" + "Locked: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10, + mss.anonymous >> 10, + mss.anonymous_thp >> 10, + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10, + (vma->vm_flags & VM_LOCKED) ? + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); + + if (m->count < m->size) /* vma is copied successfully */ + m->version = (vma != get_gate_vma(task->mm)) + ? vma->vm_start : 0; + return 0; +} + +static int show_pid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 1); +} + +static int show_tid_smap(struct seq_file *m, void *v) +{ + return show_smap(m, v, 0); +} + +static const struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_smap +}; + +static const struct seq_operations proc_tid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_smap +}; + +static int pid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +static int tid_smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_tid_smaps_op); +} + +const struct file_operations proc_pid_smaps_operations = { + .open = pid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_smaps_operations = { + .open = tid_smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_trans_unstable(pmd)) + return 0; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* Clear accessed and referenced bits. */ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + +#define CLEAR_REFS_ALL 1 +#define CLEAR_REFS_ANON 2 +#define CLEAR_REFS_MAPPED 3 + +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + struct mm_struct *mm; + struct vm_area_struct *vma; + int type; + int rv; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + rv = kstrtoint(strstrip(buffer), 10, &type); + if (rv < 0) + return rv; + if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + return -EINVAL; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + struct mm_walk clear_refs_walk = { + .pmd_entry = clear_refs_pte_range, + .mm = mm, + }; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + clear_refs_walk.private = vma; + if (is_vm_hugetlb_page(vma)) + continue; + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * + * Writing 2 to /proc/pid/clear_refs only affects + * Anonymous pages. + * + * Writing 3 to /proc/pid/clear_refs only affects file + * mapped pages. + */ + if (type == CLEAR_REFS_ANON && vma->vm_file) + continue; + if (type == CLEAR_REFS_MAPPED && !vma->vm_file) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &clear_refs_walk); + } + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + + return count; +} + +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, + .llseek = noop_llseek, +}; + +typedef struct { + u64 pme; +} pagemap_entry_t; + +struct pagemapread { + int pos, len; + pagemap_entry_t *buffer; +}; + +#define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_STATUS_BITS 3 +#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) +#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) +#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) +#define PM_PSHIFT_BITS 6 +#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) +#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) +#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) +#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) + +#define PM_PRESENT PM_STATUS(4LL) +#define PM_SWAP PM_STATUS(2LL) +#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_END_OF_BUFFER 1 + +static inline pagemap_entry_t make_pme(u64 val) +{ + return (pagemap_entry_t) { .pme = val }; +} + +static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, + struct pagemapread *pm) +{ + pm->buffer[pm->pos++] = *pme; + if (pm->pos >= pm->len) + return PM_END_OF_BUFFER; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + unsigned long addr; + int err = 0; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + return err; +} + +static u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static void pte_to_pagemap_entry(pagemap_entry_t *pme, pte_t pte) +{ + if (is_swap_pte(pte)) + *pme = make_pme(PM_PFRAME(swap_pte_to_pagemap_entry(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP); + else if (pte_present(pte)) + *pme = make_pme(PM_PFRAME(pte_pfn(pte)) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, + pmd_t pmd, int offset) +{ + /* + * Currently pmd for thp is always present because thp can not be + * swapped-out, migrated, or HWPOISONed (split in such cases instead.) + * This if-check is just to prepare for future implementation. + */ + if (pmd_present(pmd)) + *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); +} +#else +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, + pmd_t pmd, int offset) +{ +} +#endif + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + struct pagemapread *pm = walk->private; + pte_t *pte; + int err = 0; + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + + /* find the first VMA at or above 'addr' */ + vma = find_vma(walk->mm, addr); + if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + for (; addr != end; addr += PAGE_SIZE) { + unsigned long offset; + + offset = (addr & ~PAGEMAP_WALK_MASK) >> + PAGE_SHIFT; + thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + err = add_to_pagemap(addr, &pme, pm); + if (err) + break; + } + spin_unlock(&walk->mm->page_table_lock); + return err; + } + + if (pmd_trans_unstable(pmd)) + return 0; + for (; addr != end; addr += PAGE_SIZE) { + + /* check to see if we've left 'vma' behind + * and need a new, higher one */ + if (vma && (addr >= vma->vm_end)) { + vma = find_vma(walk->mm, addr); + pme = make_pme(PM_NOT_PRESENT); + } + + /* check that 'vma' actually covers this address, + * and that it isn't a huge page vma */ + if (vma && (vma->vm_start <= addr) && + !is_vm_hugetlb_page(vma)) { + pte = pte_offset_map(pmd, addr); + pte_to_pagemap_entry(&pme, *pte); + /* unmap before userspace copy */ + pte_unmap(pte); + } + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, + pte_t pte, int offset) +{ + if (pte_present(pte)) + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) + | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + else + *pme = make_pme(PM_NOT_PRESENT); +} + +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct pagemapread *pm = walk->private; + int err = 0; + pagemap_entry_t pme; + + for (; addr != end; addr += PAGE_SIZE) { + int offset = (addr & ~hmask) >> PAGE_SHIFT; + huge_pte_to_pagemap_entry(&pme, *pte, offset); + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} +#endif /* HUGETLB_PAGE */ + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit entry + * consisting of the following: + * + * Bits 0-55 page frame number (PFN) if present + * Bits 0-4 swap type if swapped + * Bits 5-55 swap offset if swapped + * Bits 55-60 page shift (page size = 1<<page shift) + * Bit 61 reserved for future use + * Bit 62 page swapped + * Bit 63 page present + * + * If the page is not present but in swap, then the PFN contains an + * encoding of the swap file number and the page's offset into the + * swap. Unmapped pages return a null PFN. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct mm_struct *mm; + struct pagemapread pm; + int ret = -ESRCH; + struct mm_walk pagemap_walk = {}; + unsigned long src; + unsigned long svpfn; + unsigned long start_vaddr; + unsigned long end_vaddr; + int copied = 0; + + if (!task) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) + goto out_task; + + ret = 0; + if (!count) + goto out_task; + + pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); + ret = -ENOMEM; + if (!pm.buffer) + goto out_task; + + mm = mm_for_maps(task); + ret = PTR_ERR(mm); + if (!mm || IS_ERR(mm)) + goto out_free; + + pagemap_walk.pmd_entry = pagemap_pte_range; + pagemap_walk.pte_hole = pagemap_pte_hole; +#ifdef CONFIG_HUGETLB_PAGE + pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; +#endif + pagemap_walk.mm = mm; + pagemap_walk.private = ± + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; + start_vaddr = svpfn << PAGE_SHIFT; + end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_mm; + } + copied += len; + buf += len; + count -= len; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + +out_mm: + mmput(mm); +out_free: + kfree(pm.buffer); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; +#endif /* CONFIG_PROC_PAGE_MONITOR */ + +#ifdef CONFIG_NUMA + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) +{ + int count = page_mapcount(page); + + md->pages += nr_pages; + if (pte_dirty || PageDirty(page)) + md->dirty += nr_pages; + + if (PageSwapCache(page)) + md->swapcache += nr_pages; + + if (PageActive(page) || PageUnevictable(page)) + md->active += nr_pages; + + if (PageWriteback(page)) + md->writeback += nr_pages; + + if (PageAnon(page)) + md->anon += nr_pages; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)] += nr_pages; +} + +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + + if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + if (!page) + continue; + gather_stats(page, md, pte_dirty(*pte), 1); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte), 1); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v, int is_pid) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else { + pid_t tid = vm_is_stack(proc_priv->task, vma, is_pid); + if (tid != 0) { + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, " stack"); + else + seq_printf(m, " stack:%d", tid); + } + } + + if (is_vm_hugetlb_page(vma)) + seq_printf(m, " huge"); + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; + return 0; +} + +static int show_pid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 1); +} + +static int show_tid_numa_map(struct seq_file *m, void *v) +{ + return show_numa_map(m, v, 0); +} + +static const struct seq_operations proc_pid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_numa_map, +}; + +static const struct seq_operations proc_tid_numa_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_numa_map, +}; + +static int numa_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int pid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +static int tid_numa_maps_open(struct inode *inode, struct file *file) +{ + return numa_maps_open(inode, file, &proc_tid_numa_maps_op); +} + +const struct file_operations proc_pid_numa_maps_operations = { + .open = pid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_numa_maps_operations = { + .open = tid_numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c new file mode 100644 index 00000000..74fe164d --- /dev/null +++ b/fs/proc/task_nommu.c @@ -0,0 +1,318 @@ + +#include <linux/mm.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include "internal.h" + +/* + * Logic: we've got two memory sums for each process, "shared", and + * "non-shared". Shared memory may get counted more than once, for + * each process that owns it. Non-shared memory is counted + * accurately. + */ +void task_mem(struct seq_file *m, struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } + + if (atomic_read(&mm->mm_count) > 1 || + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; + } else { + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; + } + } + + if (atomic_read(&mm->mm_count) > 1) + sbytes += kobjsize(mm); + else + bytes += kobjsize(mm); + + if (current->fs && current->fs->users > 1) + sbytes += kobjsize(current->fs); + else + bytes += kobjsize(current->fs); + + if (current->files && atomic_read(¤t->files->count) > 1) + sbytes += kobjsize(current->files); + else + bytes += kobjsize(current->files); + + if (current->sighand && atomic_read(¤t->sighand->count) > 1) + sbytes += kobjsize(current->sighand); + else + bytes += kobjsize(current->sighand); + + bytes += kobjsize(current); /* includes kernel stack */ + + seq_printf(m, + "Mem:\t%8lu bytes\n" + "Slack:\t%8lu bytes\n" + "Shared:\t%8lu bytes\n", + bytes, slack, sbytes); + + up_read(&mm->mmap_sem); +} + +unsigned long task_vsize(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct rb_node *p; + unsigned long vsize = 0; + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; + } + up_read(&mm->mmap_sem); + return vsize; +} + +unsigned long task_statm(struct mm_struct *mm, + unsigned long *shared, unsigned long *text, + unsigned long *data, unsigned long *resident) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long size = kobjsize(mm); + + down_read(&mm->mmap_sem); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += kobjsize(region); + size += region->vm_end - region->vm_start; + } + } + + *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) + >> PAGE_SHIFT; + *data = (PAGE_ALIGN(mm->start_stack) - (mm->start_data & PAGE_MASK)) + >> PAGE_SHIFT; + up_read(&mm->mmap_sem); + size >>= PAGE_SHIFT; + size += *text + *data; + *resident = size; + return size; +} + +static void pad_len_spaces(struct seq_file *m, int len) +{ + len = 25 + sizeof(void*) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); +} + +/* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, + int is_pid) +{ + struct mm_struct *mm = vma->vm_mm; + struct proc_maps_private *priv = m->private; + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + unsigned long long pgoff = 0; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + pad_len_spaces(m, len); + seq_path(m, &file->f_path, ""); + } else if (mm) { + pid_t tid = vm_is_stack(priv->task, vma, is_pid); + + if (tid != 0) { + pad_len_spaces(m, len); + /* + * Thread stack in /proc/PID/task/TID/maps or + * the main process stack. + */ + if (!is_pid || (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack)) + seq_printf(m, "[stack]"); + else + seq_printf(m, "[stack:%d]", tid); + } + } + + seq_putc(m, '\n'); + return 0; +} + +/* + * display mapping lines for a particular process's /proc/pid/maps + */ +static int show_map(struct seq_file *m, void *_p, int is_pid) +{ + struct rb_node *p = _p; + + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), + is_pid); +} + +static int show_pid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 1); +} + +static int show_tid_map(struct seq_file *m, void *_p) +{ + return show_map(m, _p, 0); +} + +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct mm_struct *mm; + struct rb_node *p; + loff_t n = *pos; + + /* pin the task and mm whilst we play with them */ + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return ERR_PTR(-ESRCH); + + mm = mm_for_maps(priv->task); + if (!mm || IS_ERR(mm)) { + put_task_struct(priv->task); + priv->task = NULL; + return mm; + } + down_read(&mm->mmap_sem); + + /* start from the Nth VMA */ + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) + if (n-- == 0) + return p; + return NULL; +} + +static void m_stop(struct seq_file *m, void *_vml) +{ + struct proc_maps_private *priv = m->private; + + if (priv->task) { + struct mm_struct *mm = priv->task->mm; + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + } +} + +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +{ + struct rb_node *p = _p; + + (*pos)++; + return p ? rb_next(p) : NULL; +} + +static const struct seq_operations proc_pid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_pid_map +}; + +static const struct seq_operations proc_tid_maps_ops = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_tid_map +}; + +static int maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int pid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_pid_maps_ops); +} + +static int tid_maps_open(struct inode *inode, struct file *file) +{ + return maps_open(inode, file, &proc_tid_maps_ops); +} + +const struct file_operations proc_pid_maps_operations = { + .open = pid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +const struct file_operations proc_tid_maps_operations = { + .open = tid_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c new file mode 100644 index 00000000..9610ac77 --- /dev/null +++ b/fs/proc/uptime.c @@ -0,0 +1,53 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/time.h> +#include <linux/kernel_stat.h> +#include <asm/cputime.h> + +static int uptime_proc_show(struct seq_file *m, void *v) +{ + struct timespec uptime; + struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; + int i; + + idletime = 0; + for_each_possible_cpu(i) + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + + do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; + seq_printf(m, "%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), + (unsigned long) idle.tv_sec, + (idle.tv_nsec / (NSEC_PER_SEC / 100))); + return 0; +} + +static int uptime_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, uptime_proc_show, NULL); +} + +static const struct file_operations uptime_proc_fops = { + .open = uptime_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_uptime_init(void) +{ + proc_create("uptime", 0, NULL, &uptime_proc_fops); + return 0; +} +module_init(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c new file mode 100644 index 00000000..8d181cdc --- /dev/null +++ b/fs/proc/version.c @@ -0,0 +1,40 @@ +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/utsname.h> +#include <asm/mach/version.h> + +static int version_proc_show(struct seq_file *m, void *v) +{ +#if 1 /* add to show kernel version */ + seq_printf(m, "%s\n",linux_banner); + seq_printf(m, "Kernel %s (%s - %s)\n",CONFIG_KERNEL_VERSION,__DATE__,__TIME__); +#else + seq_printf(m, linux_proc_banner, + utsname()->sysname, + utsname()->release, + utsname()->version); +#endif + return 0; +} + +static int version_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, version_proc_show, NULL); +} + +static const struct file_operations version_proc_fops = { + .open = version_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_version_init(void) +{ + proc_create("version", 0, NULL, &version_proc_fops); + return 0; +} +module_init(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c new file mode 100644 index 00000000..0d5071d2 --- /dev/null +++ b/fs/proc/vmcore.c @@ -0,0 +1,725 @@ +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + * + */ + +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/crash_dump.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +static struct proc_dir_entry *proc_vmcore = NULL; + +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * The called function has to take care of module refcounting. + */ +static int (*oldmem_pfn_is_ram)(unsigned long pfn); + +int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (oldmem_pfn_is_ram) + return -EBUSY; + oldmem_pfn_is_ram = fn; + return 0; +} +EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); + +void unregister_oldmem_pfn_is_ram(void) +{ + oldmem_pfn_is_ram = NULL; + wmb(); +} +EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); + +static int pfn_is_ram(unsigned long pfn) +{ + int (*fn)(unsigned long pfn); + /* pfn is ram unless fn() checks pagetype */ + int ret = 1; + + /* + * Ask hypervisor if the pfn is really ram. + * A ballooned page contains no data and reading from such a page + * will cause high load in the hypervisor. + */ + fn = oldmem_pfn_is_ram; + if (fn) + ret = fn(pfn); + + return ret; +} + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + /* If pfn is not ram, return zeros for sparse dump files */ + if (pfn_is_ram(pfn) == 0) + memset(buf, 0, nr_bytes); + else { + tmp = copy_oldmem_page(pfn, buf, nr_bytes, + offset, userbuf); + if (tmp < 0) + return tmp; + } + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* Maps vmcore file offset to respective physical address in memroy. */ +static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, + struct vmcore **m_ptr) +{ + struct vmcore *m; + u64 paddr; + + list_for_each_entry(m, vc_list, list) { + u64 start, end; + start = m->offset; + end = m->offset + m->size - 1; + if (offset >= start && offset <= end) { + paddr = m->paddr + offset - start; + *m_ptr = m; + return paddr; + } + } + *m_ptr = NULL; + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. + */ +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz; + u64 start, nr_bytes; + struct vmcore *curr_m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > vmcore_size - *fpos) + buflen = vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = elfcorebuf_sz - *fpos; + if (buflen < tsz) + tsz = buflen; + if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); + if (!curr_m) + return -EINVAL; + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + + while (buflen) { + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + if (start >= (curr_m->paddr + curr_m->size)) { + if (curr_m->list.next == &vmcore_list) + return acc; /*EOF*/ + curr_m = list_entry(curr_m->list.next, + struct vmcore, list); + start = curr_m->paddr; + } + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + } + return acc; +} + +static const struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .llseek = default_llseek, +}; + +static struct vmcore* __init get_new_element(void) +{ + return kzalloc(sizeof(struct vmcore), GFP_KERNEL); +} + +static u64 __init get_vmcore_size_elf64(char *elfptr) +{ + int i; + u64 size; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +static u64 __init get_vmcore_size_elf32(char *elfptr) +{ + int i; + u64 size; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr, *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf64_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr, *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf32_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf64(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf64_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf32(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf32_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !vmcore_elf64_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + printk(KERN_WARNING "Warning: Core image elf header" + " not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf64(elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf32(elfcorebuf); + } else { + printk(KERN_WARNING "Warning: Core image elf header is not" + " sane\n"); + return -EINVAL; + } + return 0; +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + if (!(is_vmcore_usable())) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + return rc; + } + + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +module_init(vmcore_init) + +/* Cleanup function for vmcore module. */ +void vmcore_cleanup(void) +{ + struct list_head *pos, *next; + + if (proc_vmcore) { + remove_proc_entry(proc_vmcore->name, proc_vmcore->parent); + proc_vmcore = NULL; + } + + /* clear the vmcore list. */ + list_for_each_safe(pos, next, &vmcore_list) { + struct vmcore *m; + + m = list_entry(pos, struct vmcore, list); + list_del(&m->list); + kfree(m); + } + kfree(elfcorebuf); + elfcorebuf = NULL; +} +EXPORT_SYMBOL_GPL(vmcore_cleanup); |