From 871480933a1c28f8a9fed4c4d34d06c439a7a422 Mon Sep 17 00:00:00 2001 From: Srikant Patnaik Date: Sun, 11 Jan 2015 12:28:04 +0530 Subject: Moved, renamed, and deleted files The original directory structure was scattered and unorganized. Changes are basically to make it look like kernel structure. --- init/Kconfig | 1445 +++++++++++++++++++++++++++++++++++++++++++++++ init/Makefile | 32 ++ init/calibrate.c | 301 ++++++++++ init/do_mounts.c | 560 ++++++++++++++++++ init/do_mounts.h | 76 +++ init/do_mounts_initrd.c | 125 ++++ init/do_mounts_md.c | 302 ++++++++++ init/do_mounts_rd.c | 340 +++++++++++ init/initramfs.c | 611 ++++++++++++++++++++ init/main.c | 894 +++++++++++++++++++++++++++++ init/noinitramfs.c | 52 ++ init/version.c | 48 ++ 12 files changed, 4786 insertions(+) create mode 100644 init/Kconfig create mode 100644 init/Makefile create mode 100644 init/calibrate.c create mode 100644 init/do_mounts.c create mode 100644 init/do_mounts.h create mode 100644 init/do_mounts_initrd.c create mode 100644 init/do_mounts_md.c create mode 100644 init/do_mounts_rd.c create mode 100644 init/initramfs.c create mode 100644 init/main.c create mode 100644 init/noinitramfs.c create mode 100644 init/version.c (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig new file mode 100644 index 00000000..a7cffc83 --- /dev/null +++ b/init/Kconfig @@ -0,0 +1,1445 @@ +config ARCH + string + option env="ARCH" + +config KERNELVERSION + string + option env="KERNELVERSION" + +config DEFCONFIG_LIST + string + depends on !UML + option defconfig_list + default "/lib/modules/$UNAME_RELEASE/.config" + default "/etc/kernel-config" + default "/boot/config-$UNAME_RELEASE" + default "$ARCH_DEFCONFIG" + default "arch/$ARCH/defconfig" + +config CONSTRUCTORS + bool + depends on !UML + +config HAVE_IRQ_WORK + bool + +config IRQ_WORK + bool + depends on HAVE_IRQ_WORK + +menu "General setup" + +config EXPERIMENTAL + bool "Prompt for development and/or incomplete code/drivers" + ---help--- + Some of the various things that Linux supports (such as network + drivers, file systems, network protocols, etc.) can be in a state + of development where the functionality, stability, or the level of + testing is not yet high enough for general use. This is usually + known as the "alpha-test" phase among developers. If a feature is + currently in alpha-test, then the developers usually discourage + uninformed widespread use of this feature by the general public to + avoid "Why doesn't this work?" type mail messages. However, active + testing and use of these systems is welcomed. Just be aware that it + may not meet the normal level of reliability or it may fail to work + in some special cases. Detailed bug reports from people familiar + with the kernel internals are usually welcomed by the developers + (before submitting bug reports, please read the documents + , , , + , and + in the kernel source). + + This option will also make obsoleted drivers available. These are + drivers that have been replaced by something else, and/or are + scheduled to be removed in a future kernel release. + + Unless you intend to help test and develop a feature or driver that + falls into this category, or you have a situation that requires + using these features, you should probably say N here, which will + cause the configurator to present you with fewer choices. If + you say Y here, you will be offered the choice of using features or + drivers that are currently considered to be in the alpha-test phase. + +config BROKEN + bool + +config BROKEN_ON_SMP + bool + depends on BROKEN || !SMP + default y + +config INIT_ENV_ARG_LIMIT + int + default 32 if !UML + default 128 if UML + help + Maximum of each of the number of arguments and environment + variables passed to init from the kernel command line. + + +config CROSS_COMPILE + string "Cross-compiler tool prefix" + help + Same as running 'make CROSS_COMPILE=prefix-' but stored for + default make runs in this kernel build directory. You don't + need to set this unless you want the configured kernel build + directory to select the cross-compiler automatically. + +config LOCALVERSION + string "Local version - append to kernel release" + help + Append an extra string to the end of your kernel version. + This will show up when you type uname, for example. + The string you set here will be appended after the contents of + any files with a filename matching localversion* in your + object and source tree, in that order. Your total string can + be a maximum of 64 characters. + +config LOCALVERSION_AUTO + bool "Automatically append version information to the version string" + default y + help + This will try to automatically determine if the current tree is a + release tree by looking for git tags that belong to the current + top of tree revision. + + A string of the format -gxxxxxxxx will be added to the localversion + if a git-based tree is found. The string generated by this will be + appended after any matching localversion* files, and after the value + set in CONFIG_LOCALVERSION. + + (The actual string used here is the first eight characters produced + by running the command: + + $ git rev-parse --verify HEAD + + which is done within the script "scripts/setlocalversion".) + +config HAVE_KERNEL_GZIP + bool + +config HAVE_KERNEL_BZIP2 + bool + +config HAVE_KERNEL_LZMA + bool + +config HAVE_KERNEL_XZ + bool + +config HAVE_KERNEL_LZO + bool + +choice + prompt "Kernel compression mode" + default KERNEL_GZIP + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO + help + The linux kernel is a kind of self-extracting executable. + Several compression algorithms are available, which differ + in efficiency, compression and decompression speed. + Compression speed is only relevant when building a kernel. + Decompression speed is relevant at each boot. + + If you have any problems with bzip2 or lzma compressed + kernels, mail me (Alain Knaff) . (An older + version of this functionality (bzip2 only), for 2.4, was + supplied by Christian Ludwig) + + High compression options are mostly useful for users, who + are low on disk space (embedded systems), but for whom ram + size matters less. + + If in doubt, select 'gzip' + +config KERNEL_GZIP + bool "Gzip" + depends on HAVE_KERNEL_GZIP + help + The old and tried gzip compression. It provides a good balance + between compression ratio and decompression speed. + +config KERNEL_BZIP2 + bool "Bzip2" + depends on HAVE_KERNEL_BZIP2 + help + Its compression ratio and speed is intermediate. + Decompression speed is slowest among the three. The kernel + size is about 10% smaller with bzip2, in comparison to gzip. + Bzip2 uses a large amount of memory. For modern kernels you + will need at least 8MB RAM or more for booting. + +config KERNEL_LZMA + bool "LZMA" + depends on HAVE_KERNEL_LZMA + help + The most recent compression algorithm. + Its ratio is best, decompression speed is between the other + two. Compression is slowest. The kernel size is about 33% + smaller with LZMA in comparison to gzip. + +config KERNEL_XZ + bool "XZ" + depends on HAVE_KERNEL_XZ + help + XZ uses the LZMA2 algorithm and instruction set specific + BCJ filters which can improve compression ratio of executable + code. The size of the kernel is about 30% smaller with XZ in + comparison to gzip. On architectures for which there is a BCJ + filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ + will create a few percent smaller kernel than plain LZMA. + + The speed is about the same as with LZMA: The decompression + speed of XZ is better than that of bzip2 but worse than gzip + and LZO. Compression is slow. + +config KERNEL_LZO + bool "LZO" + depends on HAVE_KERNEL_LZO + help + Its compression ratio is the poorest among the 4. The kernel + size is about 10% bigger than gzip; however its speed + (both compression and decompression) is the fastest. + +endchoice + +config DEFAULT_HOSTNAME + string "Default hostname" + default "(none)" + help + This option determines the default system hostname before userspace + calls sethostname(2). The kernel traditionally uses "(none)" here, + but you may wish to use a different default here to make a minimal + system more usable with less configuration. + +config SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK + default y + help + This option allows you to choose whether you want to have support + for so called swap devices or swap files in your kernel that are + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + +config SYSVIPC + bool "System V IPC" + ---help--- + Inter Process Communication is a suite of library functions and + system calls which let processes (running programs) synchronize and + exchange information. It is generally considered to be a good thing, + and some programs won't run unless you say Y here. In particular, if + you want to run the DOS emulator dosemu under Linux (read the + DOSEMU-HOWTO, available from ), + you'll need to say Y here. + + You can find documentation about IPC with "info ipc" and also in + section 6.4 of the Linux Programmer's Guide, available from + . + +config SYSVIPC_SYSCTL + bool + depends on SYSVIPC + depends on SYSCTL + default y + +config POSIX_MQUEUE + bool "POSIX Message Queues" + depends on NET && EXPERIMENTAL + ---help--- + POSIX variant of message queues is a part of IPC. In POSIX message + queues every message has a priority which decides about succession + of receiving it by a process. If you want to compile and run + programs written e.g. for Solaris with use of its POSIX message + queues (functions mq_*) say Y here. + + POSIX message queues are visible as a filesystem called 'mqueue' + and can be mounted somewhere if you want to do filesystem + operations on message queues. + + If unsure, say Y. + +config POSIX_MQUEUE_SYSCTL + bool + depends on POSIX_MQUEUE + depends on SYSCTL + default y + +config BSD_PROCESS_ACCT + bool "BSD Process Accounting" + help + If you say Y here, a user level program will be able to instruct the + kernel (via a special system call) to write process accounting + information to a file: whenever a process exits, information about + that process will be appended to the file by the kernel. The + information includes things such as creation time, owning user, + command name, memory usage, controlling terminal etc. (the complete + list is in the struct acct in ). It is + up to the user level program to do useful things with this + information. This is generally a good idea, so say Y. + +config BSD_PROCESS_ACCT_V3 + bool "BSD Process Accounting version 3 file format" + depends on BSD_PROCESS_ACCT + default n + help + If you say Y here, the process accounting information is written + in a new file format that also logs the process IDs of each + process and it's parent. Note that this file format is incompatible + with previous v0/v1/v2 file formats, so you will need updated tools + for processing it. A preliminary version of these tools is available + at . + +config FHANDLE + bool "open by fhandle syscalls" + select EXPORTFS + help + If you say Y here, a user level program will be able to map + file names to handle and then later use the handle for + different file system operations. This is useful in implementing + userspace file servers, which now track files using handles instead + of names. The handle would remain the same even if file names + get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) + syscalls. + +config TASKSTATS + bool "Export task/process statistics through netlink (EXPERIMENTAL)" + depends on NET + default n + help + Export selected statistics for tasks/processes through the + generic netlink interface. Unlike BSD process accounting, the + statistics are available during the lifetime of tasks/processes as + responses to commands. Like BSD accounting, they are sent to user + space on task exit. + + Say N if unsure. + +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting (EXPERIMENTAL)" + depends on TASKSTATS + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. + +config TASK_XACCT + bool "Enable extended accounting over taskstats (EXPERIMENTAL)" + depends on TASKSTATS + help + Collect extended task accounting data and send the data + to userland for processing over the taskstats interface. + + Say N if unsure. + +config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" + depends on TASK_XACCT + help + Collect information on the number of bytes of storage I/O which this + task has caused. + + Say N if unsure. + +config AUDIT + bool "Auditing support" + depends on NET + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for + logging of avc messages output). Does not do system-call + auditing without CONFIG_AUDITSYSCALL. + +config AUDITSYSCALL + bool "Enable system-call auditing support" + depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM) + default y if SECURITY_SELINUX + help + Enable low-overhead system-call auditing infrastructure that + can be used independently or with another kernel subsystem, + such as SELinux. + +config AUDIT_WATCH + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +config AUDIT_TREE + def_bool y + depends on AUDITSYSCALL + select FSNOTIFY + +config AUDIT_LOGINUID_IMMUTABLE + bool "Make audit loginuid immutable" + depends on AUDIT + help + The config option toggles if a task setting its loginuid requires + CAP_SYS_AUDITCONTROL or if that task should require no special permissions + but should instead only allow setting its loginuid if it was never + previously set. On systems which use systemd or a similar central + process to restart login services this should be set to true. On older + systems in which an admin would typically have to directly stop and + start processes this should be set to false. Setting this to true allows + one to drop potentially dangerous capabilites from the login tasks, + but may not be backwards compatible with older init systems. + +source "kernel/irq/Kconfig" + +menu "RCU Subsystem" + +choice + prompt "RCU Implementation" + default TREE_RCU + +config TREE_RCU + bool "Tree-based hierarchical RCU" + depends on !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config TREE_PREEMPT_RCU + bool "Preemptible tree-based hierarchical RCU" + depends on PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + +config TINY_RCU + bool "UP-only small-memory-footprint RCU" + depends on !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. + +config TINY_PREEMPT_RCU + bool "Preemptible UP-only small-memory-footprint RCU" + depends on PREEMPT && !SMP + help + This option selects the RCU implementation that is designed + for real-time UP systems. This option greatly reduces the + memory footprint of RCU. + +endchoice + +config PREEMPT_RCU + def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU ) + help + This option enables preemptible-RCU code that is common between + the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU || TREE_PREEMPT_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU || TREE_PREEMPT_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say N if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ && SMP + default n + help + This option causes RCU to attempt to accelerate grace periods + in order to allow CPUs to enter dynticks-idle state more + quickly. On the other hand, this option increases the overhead + of the dynticks-idle checking, particularly on systems with + large numbers of CPUs. + + Say Y if energy efficiency is critically important, particularly + if you have relatively few CPUs. + + Say N if you are unsure. + +config TREE_RCU_TRACE + def_bool RCU_TRACE && ( TREE_RCU || TREE_PREEMPT_RCU ) + select DEBUG_FS + help + This option provides tracing for the TREE_RCU and + TREE_PREEMPT_RCU implementations, permitting Makefile to + trivially select kernel/rcutree_trace.c. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_PRIO + int "Real-time priority to boost RCU readers to" + range 1 99 + depends on RCU_BOOST + default 1 + help + This option specifies the real-time priority to which preempted + RCU readers are to be boosted. If you are working with CPU-bound + real-time applications, you should specify a priority higher then + the highest-priority CPU-bound application. + + Specify the real-time priority, or take the default if unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +endmenu # "RCU Subsystem" + +config IKCONFIG + tristate "Kernel .config support" + ---help--- + This option enables the complete Linux kernel ".config" file + contents to be saved in the kernel. It provides documentation + of which kernel options are used in a running kernel or in an + on-disk kernel. This information can be extracted from the kernel + image file with the script scripts/extract-ikconfig and used as + input to rebuild the current kernel or to build another kernel. + It can also be extracted from a running kernel by reading + /proc/config.gz if enabled (below). + +config IKCONFIG_PROC + bool "Enable access to .config through /proc/config.gz" + depends on IKCONFIG && PROC_FS + ---help--- + This option enables access to the kernel configuration file + through /proc/config.gz. + +config LOG_BUF_SHIFT + int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" + range 12 21 + default 17 + help + Select kernel log buffer size as a power of 2. + Examples: + 17 => 128 KB + 16 => 64 KB + 15 => 32 KB + 14 => 16 KB + 13 => 8 KB + 12 => 4 KB + +# +# Architectures with an unreliable sched_clock() should select this: +# +config HAVE_UNSTABLE_SCHED_CLOCK + bool + +menuconfig CGROUPS + boolean "Control Group support" + depends on EVENTFD + help + This option adds support for grouping sets of processes together, for + use with process control subsystems such as Cpusets, CFS, memory + controls or device isolation. + See + - Documentation/scheduler/sched-design-CFS.txt (CFS) + - Documentation/cgroups/ (features for grouping, isolation + and resource control) + + Say N if unsure. + +if CGROUPS + +config CGROUP_DEBUG + bool "Example debug cgroup subsystem" + default n + help + This option enables a simple cgroup subsystem that + exports useful debugging information about the cgroups + framework. + + Say N if unsure. + +config CGROUP_FREEZER + bool "Freezer cgroup subsystem" + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + +config CGROUP_DEVICE + bool "Device controller for cgroups" + help + Provides a cgroup implementing whitelists for devices which + a process in the cgroup can mknod or open. + +config CPUSETS + bool "Cpuset support" + help + This option will let you create and manage CPUSETs which + allow dynamically partitioning a system into sets of CPUs and + Memory Nodes and assigning tasks to run only within those sets. + This is primarily useful on large SMP or NUMA systems. + + Say N if unsure. + +config PROC_PID_CPUSET + bool "Include legacy /proc//cpuset file" + depends on CPUSETS + default y + +config CGROUP_CPUACCT + bool "Simple CPU accounting cgroup subsystem" + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a cgroup. + +config RESOURCE_COUNTERS + bool "Resource counters" + help + This option enables controller independent resource accounting + infrastructure that works with cgroups. + +config CGROUP_MEM_RES_CTLR + bool "Memory Resource Controller for Control Groups" + depends on RESOURCE_COUNTERS + select MM_OWNER + help + Provides a memory resource controller that manages both anonymous + memory and page cache. (See Documentation/cgroups/memory.txt) + + Note that setting this option increases fixed memory overhead + associated with each page of memory in the system. By this, + 20(40)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory + usage tracking struct at boot. Total amount of this is printed out + at boot. + + Only enable when you're ok with these trade offs and really + sure you need the memory resource controller. Even when you enable + this, you can set "cgroup_disable=memory" at your boot option to + disable memory resource controller and you can avoid overheads. + (and lose benefits of memory resource controller) + + This config option also selects MM_OWNER config option, which + could in turn add some fork/exit overhead. + +config CGROUP_MEM_RES_CTLR_SWAP + bool "Memory Resource Controller Swap Extension" + depends on CGROUP_MEM_RES_CTLR && SWAP + help + Add swap management feature to memory resource controller. When you + enable this, you can limit mem+swap usage per cgroup. In other words, + when you disable this, memory resource controller has no cares to + usage of swap...a process can exhaust all of the swap. This extension + is useful when you want to avoid exhaustion swap but this itself + adds more overheads and consumes memory for remembering information. + Especially if you use 32bit system or small memory system, please + be careful about enabling this. When memory resource controller + is disabled by boot option, this will be automatically disabled and + there will be no overhead from this. Even when you set this config=y, + if boot option "swapaccount=0" is set, swap will not be accounted. + Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page + size is 4096bytes, 512k per 1Gbytes of swap. +config CGROUP_MEM_RES_CTLR_SWAP_ENABLED + bool "Memory Resource Controller Swap Extension enabled by default" + depends on CGROUP_MEM_RES_CTLR_SWAP + default y + help + Memory Resource Controller Swap Extension comes with its price in + a bigger memory consumption. General purpose distribution kernels + which want to enable the feature but keep it disabled by default + and let the user enable it by swapaccount boot command line + parameter should have this option unselected. + For those who want to have the feature enabled by default should + select this option (if, for some reason, they need to disable it + then swapaccount=0 does the trick). +config CGROUP_MEM_RES_CTLR_KMEM + bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" + depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL + default n + help + The Kernel Memory extension for Memory Resource Controller can limit + the amount of memory used by kernel objects in the system. Those are + fundamentally different from the entities handled by the standard + Memory Controller, which are page-based, and can be swapped. Users of + the kmem extension can use it to guarantee that no group of processes + will ever exhaust kernel resources alone. + +config CGROUP_PERF + bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" + depends on PERF_EVENTS && CGROUPS + help + This option extends the per-cpu mode to restrict monitoring to + threads which belong to the cgroup specified and run on the + designated cpu. + + Say N if unsure. + +menuconfig CGROUP_SCHED + bool "Group CPU scheduler" + default n + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +if CGROUP_SCHED +config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED + default CGROUP_SCHED + +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on EXPERIMENTAL + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See tip/Documentation/scheduler/sched-bwc.txt for more information. + +config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on EXPERIMENTAL + depends on CGROUP_SCHED + default n + help + This feature lets you explicitly allocate real CPU bandwidth + to task groups. If enabled, it will also make it impossible to + schedule realtime tasks for non-root users until you allocate + realtime bandwidth for them. + See Documentation/scheduler/sched-rt-group.txt for more information. + +endif #CGROUP_SCHED + +config BLK_CGROUP + tristate "Block IO controller" + depends on BLOCK + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. It is also used by bio throttling logic in + block layer to implement upper limit in IO rates on a device. + + This option only enables generic Block IO controller infrastructure. + One needs to also enable actual IO controlling logic/policy. For + enabling proportional weight division of disk bandwidth in CFQ, set + CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BLK_DEV_THROTTLING=y. + + See Documentation/cgroups/blkio-controller.txt for more information. + +config DEBUG_BLK_CGROUP + bool "Enable Block IO controller debugging" + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + +endif # CGROUPS + +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" if EXPERT + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + +menuconfig NAMESPACES + bool "Namespaces support" if EXPERT + default !EXPERT + help + Provides the way to make tasks work with different objects using + the same id. For example same IPC id may refer to different objects + or same user id or pid may refer to different tasks when used in + different namespaces. + +if NAMESPACES + +config UTS_NS + bool "UTS namespace" + default y + help + In this namespace tasks see different info provided with the + uname() system call + +config IPC_NS + bool "IPC namespace" + depends on (SYSVIPC || POSIX_MQUEUE) + default y + help + In this namespace tasks work with IPC ids which correspond to + different IPC objects in different namespaces. + +config USER_NS + bool "User namespace (EXPERIMENTAL)" + depends on EXPERIMENTAL + default y + help + This allows containers, i.e. vservers, to use user namespaces + to provide different user info for different servers. + If unsure, say N. + +config PID_NS + bool "PID Namespaces" + default y + help + Support process id namespaces. This allows having multiple + processes with the same pid as long as they are in different + pid namespaces. This is a building block of containers. + +config NET_NS + bool "Network namespace" + depends on NET + default y + help + Allow user space to create what appear to be multiple instances + of the network stack. + +endif # NAMESPACES + +config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select EVENTFD + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED + help + This option optimizes the scheduler for common desktop workloads by + automatically creating and populating task groups. This separation + of workloads isolates aggressive CPU burners (like build jobs) from + desktop applications. Task group autogeneration is currently based + upon task session. + +config MM_OWNER + bool + +config SYSFS_DEPRECATED + bool "Enable deprecated sysfs features to support old userspace tools" + depends on SYSFS + default n + help + This option adds code that switches the layout of the "block" class + devices, to not show up in /sys/class/block/, but only in + /sys/block/. + + This switch is only active when the sysfs.deprecated=1 boot option is + passed or the SYSFS_DEPRECATED_V2 option is set. + + This option allows new kernels to run on old distributions and tools, + which might get confused by /sys/class/block/. Since 2007/2008 all + major distributions and tools handle this just fine. + + Recent distributions and userspace tools after 2009/2010 depend on + the existence of /sys/class/block/, and will not work with this + option enabled. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. + +config SYSFS_DEPRECATED_V2 + bool "Enable deprecated sysfs features by default" + default n + depends on SYSFS + depends on SYSFS_DEPRECATED + help + Enable deprecated sysfs by default. + + See the CONFIG_SYSFS_DEPRECATED option for more details about this + option. + + Only if you are using a new kernel on an old distribution, you might + need to say Y here. Even then, odds are you would not need it + enabled, you can always pass the boot option if absolutely necessary. + +config RELAY + bool "Kernel->user space relay support (formerly relayfs)" + help + This option enables support for relay interface support in + certain file systems (such as debugfs). + It is designed to provide an efficient mechanism for tools and + facilities to relay large amounts of data from kernel space to + user space. + + If unsure, say N. + +config BLK_DEV_INITRD + bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support" + depends on BROKEN || !FRV + help + The initial RAM filesystem is a ramfs which is loaded by the + boot loader (loadlin or lilo) and that is mounted as root + before the normal boot procedure. It is typically used to + load modules needed to mount the "real" root file system, + etc. See for details. + + If RAM disk support (BLK_DEV_RAM) is also included, this + also enables initial RAM disk (initrd) support and adds + 15 Kbytes (more on some other architectures) to the kernel size. + + If unsure say Y. + +if BLK_DEV_INITRD + +source "usr/Kconfig" + +endif + +config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size" + help + Enabling this option will pass "-Os" instead of "-O2" to gcc + resulting in a smaller kernel. + + If unsure, say Y. + +config SYSCTL + bool + +config ANON_INODES + bool + +config PANIC_TIMEOUT + int "Default panic timeout" + default 0 + help + Set default panic timeout. + +menuconfig EXPERT + bool "Configure standard kernel features (expert users)" + # Unhide debug options, to make the on-by-default options visible + select DEBUG_KERNEL + help + This option allows certain base kernel options and settings + to be disabled or tweaked. This is for specialized + environments which can tolerate a "non-standard" kernel. + Only use this if you really know what you are doing. + +config UID16 + bool "Enable 16-bit UID system calls" if EXPERT + depends on ARM || BLACKFIN || CRIS || FRV || H8300 || X86_32 || M68K || (S390 && !64BIT) || SUPERH || SPARC32 || (SPARC64 && COMPAT) || UML || (X86_64 && IA32_EMULATION) + default y + help + This enables the legacy 16-bit UID syscall wrappers. + +config SYSCTL_SYSCALL + bool "Sysctl syscall support" if EXPERT + depends on PROC_SYSCTL + default n + select SYSCTL + ---help--- + sys_sysctl uses binary paths that have been found challenging + to properly maintain and use. The interface in /proc/sys + using paths with ascii names is now the primary path to this + information. + + Almost nothing using the binary sysctl interface so if you are + trying to save some space it is probably safe to disable this, + making your kernel marginally smaller. + + If unsure say N here. + +config KALLSYMS + bool "Load all symbols for debugging/ksymoops" if EXPERT + default y + help + Say Y here to let the kernel print out symbolic crash information and + symbolic stack backtraces. This increases the size of the kernel + somewhat, as all symbols have to be loaded into the kernel image. + +config KALLSYMS_ALL + bool "Include all symbols in kallsyms" + depends on DEBUG_KERNEL && KALLSYMS + help + Normally kallsyms only contains the symbols of functions for nicer + OOPS messages and backtraces (i.e., symbols from the text and inittext + sections). This is sufficient for most cases. And only in very rare + cases (e.g., when a debugger is used) all symbols are required (e.g., + names of variables from the data sections, etc). + + This option makes sure that all symbols are loaded into the kernel + image (i.e., symbols from all sections) in cost of increased kernel + size (depending on the kernel configuration, it may be 300KiB or + something like this). + + Say N unless you really need all symbols. + +config HOTPLUG + bool "Support for hot-pluggable devices" if EXPERT + default y + help + This option is provided for the case where no hotplug or uevent + capabilities is wanted by the kernel. You should only consider + disabling this option for embedded systems that do not use modules, a + dynamic /dev tree, or dynamic device discovery. Just say Y. + +config PRINTK + default y + bool "Enable support for printk" if EXPERT + help + This option enables normal printk support. Removing it + eliminates most of the message strings from the kernel image + and makes the kernel more or less silent. As this makes it + very difficult to diagnose system problems, saying N here is + strongly discouraged. + +config BUG + bool "BUG() support" if EXPERT + default y + help + Disabling this option eliminates support for BUG and WARN, reducing + the size of your kernel image and potentially quietly ignoring + numerous fatal conditions. You should only consider disabling this + option for embedded systems with no facilities for reporting errors. + Just say Y. + +config ELF_CORE + default y + bool "Enable ELF core dumps" if EXPERT + help + Enable support for generating core dumps. Disabling saves about 4k. + + +config PCSPKR_PLATFORM + bool "Enable PC-Speaker support" if EXPERT + depends on HAVE_PCSPKR_PLATFORM + select I8253_LOCK + default y + help + This option allows to disable the internal PC-Speaker + support, saving some memory. + +config HAVE_PCSPKR_PLATFORM + bool + +config BASE_FULL + default y + bool "Enable full-sized data structures for core" if EXPERT + help + Disabling this option reduces the size of miscellaneous core + kernel data structures. This saves memory on small machines, + but may reduce performance. + +config FUTEX + bool "Enable futex support" if EXPERT + default y + select RT_MUTEXES + help + Disabling this option will cause the kernel to be built without + support for "fast userspace mutexes". The resulting kernel may not + run glibc-based applications correctly. + +config EPOLL + bool "Enable eventpoll support" if EXPERT + default y + select ANON_INODES + help + Disabling this option will cause the kernel to be built without + support for epoll family of system calls. + +config SIGNALFD + bool "Enable signalfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the signalfd() system call that allows to receive signals + on a file descriptor. + + If unsure, say Y. + +config TIMERFD + bool "Enable timerfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the timerfd() system call that allows to receive timer + events on a file descriptor. + + If unsure, say Y. + +config EVENTFD + bool "Enable eventfd() system call" if EXPERT + select ANON_INODES + default y + help + Enable the eventfd() system call that allows to receive both + kernel notification (ie. KAIO) or userspace notifications. + + If unsure, say Y. + +config SHMEM + bool "Use full shmem filesystem" if EXPERT + default y + depends on MMU + help + The shmem is an internal filesystem used to manage shared memory. + It is backed by swap and manages resource limits. It is also exported + to userspace as tmpfs if TMPFS is enabled. Disabling this + option replaces shmem and tmpfs with the much simpler ramfs code, + which may be appropriate on small systems without swap. + +config AIO + bool "Enable AIO support" if EXPERT + default y + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling + this option saves about 7k. + +config EMBEDDED + bool "Embedded system" + select EXPERT + help + This option should be enabled if compiling the kernel for + an embedded system so certain expert options are available + for configuration. + +config HAVE_PERF_EVENTS + bool + help + See tools/perf/design.txt for details. + +config PERF_USE_VMALLOC + bool + help + See tools/perf/design.txt for details + +menu "Kernel Performance Events And Counters" + +config PERF_EVENTS + bool "Kernel performance events and counters" + default y if (PROFILING || PERF_COUNTERS) + depends on HAVE_PERF_EVENTS + select ANON_INODES + select IRQ_WORK + help + Enable kernel support for various performance events provided + by software and hardware. + + Software events are supported either built-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Event subsystem provides an abstraction of + these software and hardware event capabilities, available via a + system call and used by the "perf" utility in tools/perf/. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +config PERF_COUNTERS + bool "Kernel performance counters (old config option)" + depends on HAVE_PERF_EVENTS + help + This config has been obsoleted by the PERF_EVENTS + config option - please see that one for details. + + It has no effect on the kernel whether you enable + it or not, it is a compatibility placeholder. + + Say N if unsure. + +config DEBUG_PERF_USE_VMALLOC + default n + bool "Debug: use vmalloc to back perf mmap() buffers" + depends on PERF_EVENTS && DEBUG_KERNEL + select PERF_USE_VMALLOC + help + Use vmalloc memory to back perf mmap() buffers. + + Mostly useful for debugging the vmalloc code on platforms + that don't require it. + + Say N if unsure. + +endmenu + +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + +config PCI_QUIRKS + default y + bool "Enable PCI quirk workarounds" if EXPERT + depends on PCI + help + This enables workarounds for various PCI chipset + bugs/quirks. Disable this only if your target machine is + unaffected by PCI quirks. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" + help + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. + +config SLUB + bool "SLUB (Unqueued Allocator)" + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has it's contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/nommu-mmap.txt for more information. + +config PROFILING + bool "Profiling support" + help + Say Y here to enable the extended profiling support mechanisms used + by profilers such as OProfile. + +# +# Place an empty function call at each tracepoint site. Can be +# dynamically changed for a probe function. +# +config TRACEPOINTS + bool + +source "arch/Kconfig" + +endmenu # General setup + +config HAVE_GENERIC_DMA_COHERENT + bool + default n + +config SLABINFO + bool + depends on PROC_FS + depends on SLAB || SLUB_DEBUG + default y + +config RT_MUTEXES + boolean + +config BASE_SMALL + int + default 0 if BASE_FULL + default 1 if !BASE_FULL + +menuconfig MODULES + bool "Enable loadable module support" + help + Kernel modules are small pieces of compiled code which can + be inserted in the running kernel, rather than being + permanently built into the kernel. You use the "modprobe" + tool to add (and sometimes remove) them. If you say Y here, + many parts of the kernel can be built as modules (by + answering M instead of Y where indicated): this is most + useful for infrequently used options which are not required + for booting. For more information, see the man pages for + modprobe, lsmod, modinfo, insmod and rmmod. + + If you say Y here, you will need to run "make + modules_install" to put the modules under /lib/modules/ + where modprobe can find them (you may need to be root to do + this). + + If unsure, say Y. + +if MODULES + +config MODULE_FORCE_LOAD + bool "Forced module loading" + default n + help + Allow loading of modules without version information (ie. modprobe + --force). Forced module loading sets the 'F' (forced) taint flag and + is usually a really bad idea. + +config MODULE_UNLOAD + bool "Module unloading" + help + Without this option you will not be able to unload any + modules (note that some modules may not be unloadable + anyway), which makes your kernel smaller, faster + and simpler. If unsure, say Y. + +config MODULE_FORCE_UNLOAD + bool "Forced module unloading" + depends on MODULE_UNLOAD && EXPERIMENTAL + help + This option allows you to force a module to unload, even if the + kernel believes it is unsafe: the kernel will remove the module + without waiting for anyone to stop using it (using the -f option to + rmmod). This is mainly for kernel developers and desperate users. + If unsure, say N. + +config MODVERSIONS + bool "Module versioning support" + help + Usually, you have to use modules compiled with your kernel. + Saying Y here makes it sometimes possible to use modules + compiled for different kernels, by adding enough information + to the modules to (hopefully) spot any changes which would + make them incompatible with the kernel you are running. If + unsure, say N. + +config MODULE_SRCVERSION_ALL + bool "Source checksum for all modules" + help + Modules which contain a MODULE_VERSION get an extra "srcversion" + field inserted into their modinfo section, which contains a + sum of the source files which made it. This helps maintainers + see exactly which source was used to build a module (since + others sometimes change the module source without updating + the version). With this option, such a "srcversion" field + will be created for all modules. If unsure, say N. + +endif # MODULES + +config INIT_ALL_POSSIBLE + bool + help + Back when each arch used to define their own cpu_online_mask and + cpu_possible_mask, some of them chose to initialize cpu_possible_mask + with all 1s, and others with all 0s. When they were centralised, + it was better to provide this option than to break all the archs + and have several arch maintainers pursuing me down dark alleys. + +config STOP_MACHINE + bool + default y + depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU + help + Need stop_machine() primitive. + +source "block/Kconfig" + +config PREEMPT_NOTIFIERS + bool + +config PADATA + depends on SMP + bool + +source "kernel/Kconfig.locks" diff --git a/init/Makefile b/init/Makefile new file mode 100644 index 00000000..0bf677aa --- /dev/null +++ b/init/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for the linux kernel. +# + +obj-y := main.o version.o mounts.o +ifneq ($(CONFIG_BLK_DEV_INITRD),y) +obj-y += noinitramfs.o +else +obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o +endif +obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o + +mounts-y := do_mounts.o +mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o +mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o +mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o + +# dependencies on generated files need to be listed explicitly +$(obj)/version.o: include/generated/compile.h + +# compile.h changes depending on hostname, generation number, etc, +# so we regenerate it always. +# mkcompile_h will make sure to only update the +# actual file if its content has changed. + + chk_compile.h = : + quiet_chk_compile.h = echo ' CHK $@' +silent_chk_compile.h = : +include/generated/compile.h: FORCE + @$($(quiet)chk_compile.h) + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" diff --git a/init/calibrate.c b/init/calibrate.c new file mode 100644 index 00000000..fda0a7b0 --- /dev/null +++ b/init/calibrate.c @@ -0,0 +1,301 @@ +/* calibrate.c: default delay calibration + * + * Excised from init/main.c + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include + +unsigned long lpj_fine; +unsigned long preset_lpj; +static int __init lpj_setup(char *str) +{ + preset_lpj = simple_strtoul(str,NULL,0); + return 1; +} + +__setup("lpj=", lpj_setup); + +#ifdef ARCH_HAS_READ_CURRENT_TIMER + +/* This routine uses the read_current_timer() routine and gets the + * loops per jiffy directly, instead of guessing it using delay(). + * Also, this code tries to handle non-maskable asynchronous events + * (like SMIs) + */ +#define DELAY_CALIBRATION_TICKS ((HZ < 100) ? 1 : (HZ/100)) +#define MAX_DIRECT_CALIBRATION_RETRIES 5 + +static unsigned long __cpuinit calibrate_delay_direct(void) +{ + unsigned long pre_start, start, post_start; + unsigned long pre_end, end, post_end; + unsigned long start_jiffies; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; + unsigned long measured_times[MAX_DIRECT_CALIBRATION_RETRIES]; + int max = -1; /* index of measured_times with max/min values or not set */ + int min = -1; + int i; + + if (read_current_timer(&pre_start) < 0 ) + return 0; + + /* + * A simple loop like + * while ( jiffies < start_jiffies+1) + * start = read_current_timer(); + * will not do. As we don't really know whether jiffy switch + * happened first or timer_value was read first. And some asynchronous + * event can happen between these two events introducing errors in lpj. + * + * So, we do + * 1. pre_start <- When we are sure that jiffy switch hasn't happened + * 2. check jiffy switch + * 3. start <- timer value before or after jiffy switch + * 4. post_start <- When we are sure that jiffy switch has happened + * + * Note, we don't know anything about order of 2 and 3. + * Now, by looking at post_start and pre_start difference, we can + * check whether any asynchronous event happened or not + */ + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + pre_start = 0; + read_current_timer(&start); + start_jiffies = jiffies; + while (time_before_eq(jiffies, start_jiffies + 1)) { + pre_start = start; + read_current_timer(&start); + } + read_current_timer(&post_start); + + pre_end = 0; + end = post_start; + while (time_before_eq(jiffies, start_jiffies + 1 + + DELAY_CALIBRATION_TICKS)) { + pre_end = end; + read_current_timer(&end); + } + read_current_timer(&post_end); + + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; + + /* + * If the upper limit and lower limit of the timer_rate is + * >= 12.5% apart, redo calibration. + */ + if (start >= post_end) + printk(KERN_NOTICE "calibrate_delay_direct() ignoring " + "timer_rate as we had a TSC wrap around" + " start=%lu >=post_end=%lu\n", + start, post_end); + if (start < post_end && pre_start != 0 && pre_end != 0 && + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; + measured_times[i] = timer_rate_max; + if (max < 0 || timer_rate_max > measured_times[max]) + max = i; + if (min < 0 || timer_rate_max < measured_times[min]) + min = i; + } else + measured_times[i] = 0; + + } + + /* + * Find the maximum & minimum - if they differ too much throw out the + * one with the largest difference from the mean and try again... + */ + while (good_timer_count > 1) { + unsigned long estimate; + unsigned long maxdiff; + + /* compute the estimate */ + estimate = (good_timer_sum/good_timer_count); + maxdiff = estimate >> 3; + + /* if range is within 12% let's take it */ + if ((measured_times[max] - measured_times[min]) < maxdiff) + return estimate; + + /* ok - drop the worse value and try again... */ + good_timer_sum = 0; + good_timer_count = 0; + if ((measured_times[max] - estimate) < + (estimate - measured_times[min])) { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "min bogoMips estimate %d = %lu\n", + min, measured_times[min]); + measured_times[min] = 0; + min = max; + } else { + printk(KERN_NOTICE "calibrate_delay_direct() dropping " + "max bogoMips estimate %d = %lu\n", + max, measured_times[max]); + measured_times[max] = 0; + max = min; + } + + for (i = 0; i < MAX_DIRECT_CALIBRATION_RETRIES; i++) { + if (measured_times[i] == 0) + continue; + good_timer_count++; + good_timer_sum += measured_times[i]; + if (measured_times[i] < measured_times[min]) + min = i; + if (measured_times[i] > measured_times[max]) + max = i; + } + + } + + printk(KERN_NOTICE "calibrate_delay_direct() failed to get a good " + "estimate for loops_per_jiffy.\nProbably due to long platform " + "interrupts. Consider using \"lpj=\" boot option.\n"); + return 0; +} +#else +static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} +#endif + +/* + * This is the number of bits of precision for the loops_per_jiffy. Each + * time we refine our estimate after the first takes 1.5/HZ seconds, so try + * to start with a good estimate. + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as + * the cpu frequency, hence do the calibration for those. + */ +#define LPS_PREC 8 + +static unsigned long __cpuinit calibrate_delay_converge(void) +{ + /* First stage - slowly accelerate to find initial bounds */ + unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit; + int trials = 0, band = 0, trial_in_band = 0; + + lpj = (1<<12); + + /* wait for "start of" clock tick */ + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + /* Go .. */ + ticks = jiffies; + do { + if (++trial_in_band == (1<> LPS_PREC; + while (loopadd > chop_limit) { + lpj += loopadd; + ticks = jiffies; + while (ticks == jiffies) + ; /* nothing */ + ticks = jiffies; + __delay(lpj); + if (jiffies != ticks) /* longer than 1 tick */ + lpj -= loopadd; + loopadd >>= 1; + } + /* + * If we incremented every single time possible, presume we've + * massively underestimated initially, and retry with a higher + * start, and larger range. (Only seen on x86_64, due to SMIs) + */ + if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) { + lpj_base = lpj; + loopadd_base <<= 2; + goto recalibrate; + } + + return lpj; +} + +static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 }; + +/* + * Check if cpu calibration delay is already known. For example, + * some processors with multi-core sockets may have all cores + * with the same calibration delay. + * + * Architectures should override this function if a faster calibration + * method is available. + */ +unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void) +{ + return 0; +} + +void __cpuinit calibrate_delay(void) +{ + unsigned long lpj; + static bool printed; + int this_cpu = smp_processor_id(); + + if (per_cpu(cpu_loops_per_jiffy, this_cpu)) { + lpj = per_cpu(cpu_loops_per_jiffy, this_cpu); + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "already calibrated this CPU"); + } else if (preset_lpj) { + lpj = preset_lpj; + if (!printed) + pr_info("Calibrating delay loop (skipped) " + "preset value.. "); + } else if ((!printed) && lpj_fine) { + lpj = lpj_fine; + pr_info("Calibrating delay loop (skipped), " + "value calculated using timer frequency.. "); + } else if ((lpj = calibrate_delay_is_known())) { + ; + } else if ((lpj = calibrate_delay_direct()) != 0) { + if (!printed) + pr_info("Calibrating delay using timer " + "specific routine.. "); + } else { + if (!printed) + pr_info("Calibrating delay loop... "); + lpj = calibrate_delay_converge(); + } + per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj; + if (!printed) + pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n", + lpj/(500000/HZ), + (lpj/(5000/HZ)) % 100, lpj); + + loops_per_jiffy = lpj; + printed = true; +} diff --git a/init/do_mounts.c b/init/do_mounts.c new file mode 100644 index 00000000..42b0707c --- /dev/null +++ b/init/do_mounts.c @@ -0,0 +1,560 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "do_mounts.h" + +int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */ + +int root_mountflags = MS_RDONLY | MS_SILENT; +static char * __initdata root_device_name; +static char __initdata saved_root_name[64]; +static int root_wait; + +dev_t ROOT_DEV; + +static int __init load_ramdisk(char *str) +{ + rd_doload = simple_strtol(str,NULL,0) & 3; + return 1; +} +__setup("load_ramdisk=", load_ramdisk); + +static int __init readonly(char *str) +{ + if (*str) + return 0; + root_mountflags |= MS_RDONLY; + return 1; +} + +static int __init readwrite(char *str) +{ + if (*str) + return 0; + root_mountflags &= ~MS_RDONLY; + return 1; +} + +__setup("ro", readonly); +__setup("rw", readwrite); + +#ifdef CONFIG_BLOCK +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to a 36 byte char array with a UUID + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, void *data) +{ + u8 *uuid = data; + struct hd_struct *part = dev_to_part(dev); + + if (!part->info) + goto no_match; + + if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid))) + goto no_match; + + return 1; +no_match: + return 0; +} + + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid: min 36 byte char array containing a hex ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static dev_t devt_from_partuuid(char *uuid_str) +{ + dev_t res = 0; + struct device *dev = NULL; + u8 uuid[16]; + struct gendisk *disk; + struct hd_struct *part; + int offset = 0; + + if (strlen(uuid_str) < 36) + goto done; + + /* Check for optional partition number offset attributes. */ + if (uuid_str[36]) { + char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(&uuid_str[36], + "/PARTNROFF=%d%c", &offset, &c) != 1) { + printk(KERN_ERR "VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + printk(KERN_ERR + "Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + goto done; + } + } + + /* Pack the requested UUID in the expected format. */ + part_pack_uuid(uuid_str, uuid); + + dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid); + if (!dev) + goto done; + + res = dev->devt; + + /* Attempt to find the partition by offset. */ + if (!offset) + goto no_offset; + + res = 0; + disk = part_to_disk(dev_to_part(dev)); + part = disk_get_part(disk, dev_to_part(dev)->partno + offset); + if (part) { + res = part_devt(part); + put_device(part_to_dev(part)); + } + +no_offset: + put_device(dev); +done: + return res; +} +#endif + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) device number in hexadecimal represents itself + * 2) /dev/nfs represents Root_NFS (0xff) + * 3) /dev/ represents the device number of disk + * 4) /dev/ represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/p - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * 7) PARTUUID=/PARTNROFF= to select a partition in relation to + * a partition with a known unique id. + * + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. + */ + +dev_t name_to_dev_t(char *name) +{ + char s[32]; + char *p; + dev_t res = 0; + int part; + +#ifdef CONFIG_BLOCK + if (strncmp(name, "PARTUUID=", 9) == 0) { + name += 9; + res = devt_from_partuuid(name); + if (!res) + goto fail; + goto done; + } +#endif + + if (strncmp(name, "/dev/", 5) != 0) { + unsigned maj, min; + + if (sscanf(name, "%u:%u", &maj, &min) == 2) { + res = MKDEV(maj, min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto fail; + } else { + res = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + goto fail; + } + goto done; + } + + name += 5; + res = Root_NFS; + if (strcmp(name, "nfs") == 0) + goto done; + res = Root_RAM0; + if (strcmp(name, "ram") == 0) + goto done; + + if (strlen(name) > 31) + goto fail; + strcpy(s, name); + for (p = s; *p; p++) + if (*p == '/') + *p = '!'; + res = blk_lookup_devt(s, 0); + if (res) + goto done; + + /* + * try non-existent, but valid partition, which may only exist + * after revalidating the disk, like partitioned md devices + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + goto fail; + + /* try disk name without */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + + /* try disk name without p */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + goto fail; + p[-1] = '\0'; + res = blk_lookup_devt(s, part); + if (res) + goto done; + +fail: + return 0; +done: + return res; +} + +static int __init root_dev_setup(char *line) +{ + strlcpy(saved_root_name, line, sizeof(saved_root_name)); + return 1; +} + +__setup("root=", root_dev_setup); + +static int __init rootwait_setup(char *str) +{ + if (*str) + return 0; + root_wait = 1; + return 1; +} + +__setup("rootwait", rootwait_setup); + +static char * __initdata root_mount_data; +static int __init root_data_setup(char *str) +{ + root_mount_data = str; + return 1; +} + +static char * __initdata root_fs_names; +static int __init fs_names_setup(char *str) +{ + root_fs_names = str; + return 1; +} + +static unsigned int __initdata root_delay; +static int __init root_delay_setup(char *str) +{ + root_delay = simple_strtoul(str, NULL, 0); + return 1; +} + +__setup("rootflags=", root_data_setup); +__setup("rootfstype=", fs_names_setup); +__setup("rootdelay=", root_delay_setup); + +static void __init get_fs_names(char *page) +{ + char *s = page; + + if (root_fs_names) { + strcpy(page, root_fs_names); + while (*s++) { + if (s[-1] == ',') + s[-1] = '\0'; + } + } else { + int len = get_filesystem_list(page); + char *p, *next; + + page[len] = '\0'; + for (p = page-1; p; p = next) { + next = strchr(++p, '\n'); + if (*p++ != '\t') + continue; + while ((*s++ = *p++) != '\n') + ; + s[-1] = '\0'; + } + } + *s = '\0'; +} + +static int __init do_mount_root(char *name, char *fs, int flags, void *data) +{ + struct super_block *s; + int err = sys_mount(name, "/root", fs, flags, data); + if (err) + return err; + + sys_chdir((const char __user __force *)"/root"); + s = current->fs->pwd.dentry->d_sb; + ROOT_DEV = s->s_dev; + printk(KERN_INFO + "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n", + s->s_type->name, + s->s_flags & MS_RDONLY ? " readonly" : "", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); + return 0; +} + +void __init mount_block_root(char *name, int flags) +{ + char *fs_names = __getname_gfp(GFP_KERNEL + | __GFP_NOTRACK_FALSE_POSITIVE); + char *p; +#ifdef CONFIG_BLOCK + char b[BDEVNAME_SIZE]; +#else + const char *b = name; +#endif + + get_fs_names(fs_names); +retry: + for (p = fs_names; *p; p += strlen(p)+1) { + int err = do_mount_root(name, p, flags, root_mount_data); + switch (err) { + case 0: + goto out; + case -EACCES: + flags |= MS_RDONLY; + goto retry; + case -EINVAL: + continue; + } + /* + * Allow the user to distinguish between failed sys_open + * and bad superblock on root device. + * and give them a list of the available devices + */ +#ifdef CONFIG_BLOCK + __bdevname(ROOT_DEV, b); +#endif + printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", + root_device_name, b, err); + printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); + + printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif + panic("VFS: Unable to mount root fs on %s", b); + } + + printk("List of all partitions:\n"); + printk_all_partitions(); + printk("No filesystem could mount root, tried: "); + for (p = fs_names; *p; p += strlen(p)+1) + printk(" %s", p); + printk("\n"); +#ifdef CONFIG_BLOCK + __bdevname(ROOT_DEV, b); +#endif + panic("VFS: Unable to mount root fs on %s", b); +out: + putname(fs_names); +} + +#ifdef CONFIG_ROOT_NFS + +#define NFSROOT_TIMEOUT_MIN 5 +#define NFSROOT_TIMEOUT_MAX 30 +#define NFSROOT_RETRY_MAX 5 + +static int __init mount_nfs_root(void) +{ + char *root_dev, *root_data; + unsigned int timeout; + int try, err; + + err = nfs_root_data(&root_dev, &root_data); + if (err != 0) + return 0; + + /* + * The server or network may not be ready, so try several + * times. Stop after a few tries in case the client wants + * to fall back to other boot methods. + */ + timeout = NFSROOT_TIMEOUT_MIN; + for (try = 1; ; try++) { + err = do_mount_root(root_dev, "nfs", + root_mountflags, root_data); + if (err == 0) + return 1; + if (try > NFSROOT_RETRY_MAX) + break; + + /* Wait, in case the server refused us immediately */ + ssleep(timeout); + timeout <<= 1; + if (timeout > NFSROOT_TIMEOUT_MAX) + timeout = NFSROOT_TIMEOUT_MAX; + } + return 0; +} +#endif + +#if defined(CONFIG_BLK_DEV_RAM) || defined(CONFIG_BLK_DEV_FD) +void __init change_floppy(char *fmt, ...) +{ + struct termios termios; + char buf[80]; + char c; + int fd; + va_list args; + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); + fd = sys_open("/dev/root", O_RDWR | O_NDELAY, 0); + if (fd >= 0) { + sys_ioctl(fd, FDEJECT, 0); + sys_close(fd); + } + printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf); + fd = sys_open("/dev/console", O_RDWR, 0); + if (fd >= 0) { + sys_ioctl(fd, TCGETS, (long)&termios); + termios.c_lflag &= ~ICANON; + sys_ioctl(fd, TCSETSF, (long)&termios); + sys_read(fd, &c, 1); + termios.c_lflag |= ICANON; + sys_ioctl(fd, TCSETSF, (long)&termios); + sys_close(fd); + } +} +#endif + +void __init mount_root(void) +{ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + if (mount_nfs_root()) + return; + + printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n"); + ROOT_DEV = Root_FD0; + } +#endif +#ifdef CONFIG_BLK_DEV_FD + if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) { + /* rd_doload is 2 for a dual initrd/ramload setup */ + if (rd_doload==2) { + if (rd_load_disk(1)) { + ROOT_DEV = Root_RAM1; + root_device_name = NULL; + } + } else + change_floppy("root floppy"); + } +#endif +#ifdef CONFIG_BLOCK + create_dev("/dev/root", ROOT_DEV); + mount_block_root("/dev/root", root_mountflags); +#endif +} + +/* + * Prepare the namespace - decide what/where to mount, load ramdisks, etc. + */ +void __init prepare_namespace(void) +{ + int is_floppy; + + if (root_delay) { + printk(KERN_INFO "Waiting %dsec before mounting root device...\n", + root_delay); + ssleep(root_delay); + } + + /* + * wait for the known devices to complete their probing + * + * Note: this is a potential source of long boot delays. + * For example, it is not atypical to wait 5 seconds here + * for the touchpad of a laptop to initialize. + */ + wait_for_device_probe(); + + md_run_setup(); + + if (saved_root_name[0]) { + root_device_name = saved_root_name; + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) { + mount_block_root(root_device_name, root_mountflags); + goto out; + } + ROOT_DEV = name_to_dev_t(root_device_name); + if (strncmp(root_device_name, "/dev/", 5) == 0) + root_device_name += 5; + } + + if (initrd_load()) + goto out; + + /* wait for any asynchronous scanning to complete */ + if ((ROOT_DEV == 0) && root_wait) { + printk(KERN_INFO "Waiting for root device %s...\n", + saved_root_name); + while (driver_probe_done() != 0 || + (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) + msleep(100); + async_synchronize_full(); + } + + is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; + + if (is_floppy && rd_doload && rd_load_disk(0)) + ROOT_DEV = Root_RAM0; + + mount_root(); +out: + devtmpfs_mount("dev"); + sys_mount(".", "/", NULL, MS_MOVE, NULL); + sys_chroot((const char __user __force *)"."); +} diff --git a/init/do_mounts.h b/init/do_mounts.h new file mode 100644 index 00000000..f5b978a9 --- /dev/null +++ b/init/do_mounts.h @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void change_floppy(char *fmt, ...); +void mount_block_root(char *name, int flags); +void mount_root(void); +extern int root_mountflags; + +static inline int create_dev(char *name, dev_t dev) +{ + sys_unlink(name); + return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); +} + +#if BITS_PER_LONG == 32 +static inline u32 bstat(char *name) +{ + struct stat64 stat; + if (sys_stat64(name, &stat) != 0) + return 0; + if (!S_ISBLK(stat.st_mode)) + return 0; + if (stat.st_rdev != (u32)stat.st_rdev) + return 0; + return stat.st_rdev; +} +#else +static inline u32 bstat(char *name) +{ + struct stat stat; + if (sys_newstat(name, &stat) != 0) + return 0; + if (!S_ISBLK(stat.st_mode)) + return 0; + return stat.st_rdev; +} +#endif + +#ifdef CONFIG_BLK_DEV_RAM + +int __init rd_load_disk(int n); +int __init rd_load_image(char *from); + +#else + +static inline int rd_load_disk(int n) { return 0; } +static inline int rd_load_image(char *from) { return 0; } + +#endif + +#ifdef CONFIG_BLK_DEV_INITRD + +int __init initrd_load(void); + +#else + +static inline int initrd_load(void) { return 0; } + +#endif + +#ifdef CONFIG_BLK_DEV_MD + +void md_run_setup(void); + +#else + +static inline void md_run_setup(void) {} + +#endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c new file mode 100644 index 00000000..9047330c --- /dev/null +++ b/init/do_mounts_initrd.c @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "do_mounts.h" + +unsigned long initrd_start, initrd_end; +int initrd_below_start_ok; +unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static int __initdata old_fd, root_fd; +static int __initdata mount_initrd = 1; + +static int __init no_initrd(char *str) +{ + mount_initrd = 0; + return 1; +} + +__setup("noinitrd", no_initrd); + +static int __init do_linuxrc(void *_shell) +{ + static const char *argv[] = { "linuxrc", NULL, }; + extern const char *envp_init[]; + const char *shell = _shell; + + sys_close(old_fd);sys_close(root_fd); + sys_setsid(); + return kernel_execve(shell, argv, envp_init); +} + +static void __init handle_initrd(void) +{ + int error; + int pid; + + real_root_dev = new_encode_dev(ROOT_DEV); + create_dev("/dev/root.old", Root_RAM0); + /* mount initrd on rootfs' /root */ + mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); + sys_mkdir("/old", 0700); + root_fd = sys_open("/", 0, 0); + old_fd = sys_open("/old", 0, 0); + /* move initrd over / and chdir/chroot in initrd root */ + sys_chdir("/root"); + sys_mount(".", "/", NULL, MS_MOVE, NULL); + sys_chroot("."); + + /* + * In case that a resume from disk is carried out by linuxrc or one of + * its children, we need to tell the freezer not to wait for us. + */ + current->flags |= PF_FREEZER_SKIP; + + pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD); + if (pid > 0) + while (pid != sys_wait4(-1, NULL, 0, NULL)) + yield(); + + current->flags &= ~PF_FREEZER_SKIP; + + /* move initrd to rootfs' /old */ + sys_fchdir(old_fd); + sys_mount("/", ".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ + sys_fchdir(root_fd); + sys_chroot("."); + sys_close(old_fd); + sys_close(root_fd); + + if (new_decode_dev(real_root_dev) == Root_RAM0) { + sys_chdir("/old"); + return; + } + + ROOT_DEV = new_decode_dev(real_root_dev); + mount_root(); + + printk(KERN_NOTICE "Trying to move old root to /initrd ... "); + error = sys_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); + if (!error) + printk("okay\n"); + else { + int fd = sys_open("/dev/root.old", O_RDWR, 0); + if (error == -ENOENT) + printk("/initrd does not exist. Ignored.\n"); + else + printk("failed\n"); + printk(KERN_NOTICE "Unmounting old root\n"); + sys_umount("/old", MNT_DETACH); + printk(KERN_NOTICE "Trying to free ramdisk memory ... "); + if (fd < 0) { + error = fd; + } else { + error = sys_ioctl(fd, BLKFLSBUF, 0); + sys_close(fd); + } + printk(!error ? "okay\n" : "failed\n"); + } +} + +int __init initrd_load(void) +{ + if (mount_initrd) { + create_dev("/dev/ram", Root_RAM0); + /* + * Load the initrd data into /dev/ram0. Execute it as initrd + * unless /dev/ram0 is supposed to be our actual root device, + * in that case the ram disk is just set up here, and gets + * mounted in the normal path. + */ + if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { + sys_unlink("/initrd.image"); + handle_initrd(); + return 1; + } + } + sys_unlink("/initrd.image"); + return 0; +} diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c new file mode 100644 index 00000000..32c4799b --- /dev/null +++ b/init/do_mounts_md.c @@ -0,0 +1,302 @@ +#include +#include +#include + +#include "do_mounts.h" + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. + */ + +#ifdef CONFIG_MD_AUTODETECT +static int __initdata raid_noautodetect; +#else +static int __initdata raid_noautodetect=1; +#endif +static int __initdata raid_autopart; + +static struct { + int minor; + int partitioned; + int level; + int chunk; + char *device_names; +} md_setup_args[256] __initdata; + +static int md_setup_ents __initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistent-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int __init md_setup(char *str) +{ + int minor, level, factor, fault, partitioned = 0; + char *pername = ""; + char *str1; + int ent; + + if (*str == 'd') { + partitioned = 1; + str++; + } + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + str1 = str; + for (ent=0 ; ent< md_setup_ents ; ent++) + if (md_setup_args[ent].minor == minor && + md_setup_args[ent].partitioned == partitioned) { + printk(KERN_WARNING "md: md=%s%d, Specified more than once. " + "Replacing previous definition.\n", partitioned?"d":"", minor); + break; + } + if (ent >= ARRAY_SIZE(md_setup_args)) { + printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); + return 0; + } + if (ent >= md_setup_ents) + md_setup_ents++; + switch (get_option(&str, &level)) { /* RAID level */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == LEVEL_LINEAR) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args[ent].level = level; + md_setup_args[ent].chunk = 1 << (factor+12); + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args[ent].level = LEVEL_NONE; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args[ent].device_names = str; + md_setup_args[ent].partitioned = partitioned; + md_setup_args[ent].minor = minor; + + return 1; +} + +static void __init md_setup_drive(void) +{ + int minor, i, ent, partitioned; + dev_t dev; + dev_t devices[MD_SB_DISKS+1]; + + for (ent = 0; ent < md_setup_ents ; ent++) { + int fd; + int err = 0; + char *devname; + mdu_disk_info_t dinfo; + char name[16]; + + minor = md_setup_args[ent].minor; + partitioned = md_setup_args[ent].partitioned; + devname = md_setup_args[ent].device_names; + + sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor); + if (partitioned) + dev = MKDEV(mdp_major, minor << MdpMinorShift); + else + dev = MKDEV(MD_MAJOR, minor); + create_dev(name, dev); + for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { + char *p; + char comp_name[64]; + u32 rdev; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_dev_t(devname); + if (strncmp(devname, "/dev/", 5) == 0) + devname += 5; + snprintf(comp_name, 63, "/dev/%s", devname); + rdev = bstat(comp_name); + if (rdev) + dev = new_decode_dev(rdev); + if (!dev) { + printk(KERN_WARNING "md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + + devname = p; + } + devices[i] = 0; + + if (!i) + continue; + + printk(KERN_INFO "md: Loading md%s%d: %s\n", + partitioned ? "_d" : "", minor, + md_setup_args[ent].device_names); + + fd = sys_open(name, 0, 0); + if (fd < 0) { + printk(KERN_ERR "md: open failed - cannot start " + "array %s\n", name); + continue; + } + if (sys_ioctl(fd, SET_ARRAY_INFO, 0) == -EBUSY) { + printk(KERN_WARNING + "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n", + minor); + sys_close(fd); + continue; + } + + if (md_setup_args[ent].level != LEVEL_NONE) { + /* non-persistent */ + mdu_array_info_t ainfo; + ainfo.level = md_setup_args[ent].level; + ainfo.size = 0; + ainfo.nr_disks =0; + ainfo.raid_disks =0; + while (devices[ainfo.raid_disks]) + ainfo.raid_disks++; + ainfo.md_minor =minor; + ainfo.not_persistent = 1; + + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.layout = 0; + ainfo.chunk_size = md_setup_args[ent].chunk; + err = sys_ioctl(fd, SET_ARRAY_INFO, (long)&ainfo); + for (i = 0; !err && i <= MD_SB_DISKS; i++) { + dev = devices[i]; + if (!dev) + break; + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = (1<= 0) { + sys_ioctl(fd, RAID_AUTORUN, raid_autopart); + sys_close(fd); + } +} + +void __init md_run_setup(void) +{ + create_dev("/dev/md0", MKDEV(MD_MAJOR, 0)); + + if (raid_noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); + else + autodetect_raid(); + md_setup_drive(); +} diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c new file mode 100644 index 00000000..6212586d --- /dev/null +++ b/init/do_mounts_rd.c @@ -0,0 +1,340 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "do_mounts.h" +#include "../fs/squashfs/squashfs_fs.h" + +#include + + +int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ + +static int __init prompt_ramdisk(char *str) +{ + rd_prompt = simple_strtol(str,NULL,0) & 1; + return 1; +} +__setup("prompt_ramdisk=", prompt_ramdisk); + +int __initdata rd_image_start; /* starting block # of image */ + +static int __init ramdisk_start_setup(char *str) +{ + rd_image_start = simple_strtol(str,NULL,0); + return 1; +} +__setup("ramdisk_start=", ramdisk_start_setup); + +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); + +/* + * This routine tries to find a RAM disk image to load, and returns the + * number of blocks to read for a non-compressed image, 0 if the image + * is a compressed image, and -1 if an image with the right magic + * numbers could not be found. + * + * We currently check for the following magic numbers: + * minix + * ext2 + * romfs + * cramfs + * squashfs + * gzip + */ +static int __init +identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) +{ + const int size = 512; + struct minix_super_block *minixsb; + struct romfs_super_block *romfsb; + struct cramfs_super *cramfsb; + struct squashfs_super_block *squashfsb; + int nblocks = -1; + unsigned char *buf; + const char *compress_name; + unsigned long n; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + minixsb = (struct minix_super_block *) buf; + romfsb = (struct romfs_super_block *) buf; + cramfsb = (struct cramfs_super *) buf; + squashfsb = (struct squashfs_super_block *) buf; + memset(buf, 0xe5, size); + + /* + * Read block 0 to test for compressed kernel + */ + sys_lseek(fd, start_block * BLOCK_SIZE, 0); + sys_read(fd, buf, size); + + *decompressor = decompress_method(buf, size, &compress_name); + if (compress_name) { + printk(KERN_NOTICE "RAMDISK: %s image found at block %d\n", + compress_name, start_block); + if (!*decompressor) + printk(KERN_EMERG + "RAMDISK: %s decompressor not configured!\n", + compress_name); + nblocks = 0; + goto done; + } + + /* romfs is at block zero too */ + if (romfsb->word0 == ROMSB_WORD0 && + romfsb->word1 == ROMSB_WORD1) { + printk(KERN_NOTICE + "RAMDISK: romfs filesystem found at block %d\n", + start_block); + nblocks = (ntohl(romfsb->size)+BLOCK_SIZE-1)>>BLOCK_SIZE_BITS; + goto done; + } + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* squashfs is at block zero too */ + if (le32_to_cpu(squashfsb->s_magic) == SQUASHFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: squashfs filesystem found at block %d\n", + start_block); + nblocks = (le64_to_cpu(squashfsb->bytes_used) + BLOCK_SIZE - 1) + >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read 512 bytes further to check if cramfs is padded + */ + sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); + sys_read(fd, buf, size); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + + /* + * Read block 1 to test for minix and ext2 superblock + */ + sys_lseek(fd, (start_block+1) * BLOCK_SIZE, 0); + sys_read(fd, buf, size); + + /* Try minix */ + if (minixsb->s_magic == MINIX_SUPER_MAGIC || + minixsb->s_magic == MINIX_SUPER_MAGIC2) { + printk(KERN_NOTICE + "RAMDISK: Minix filesystem found at block %d\n", + start_block); + nblocks = minixsb->s_nzones << minixsb->s_log_zone_size; + goto done; + } + + /* Try ext2 */ + n = ext2_image_size(buf); + if (n) { + printk(KERN_NOTICE + "RAMDISK: ext2 filesystem found at block %d\n", + start_block); + nblocks = n; + goto done; + } + + printk(KERN_NOTICE + "RAMDISK: Couldn't find valid RAM disk image starting at %d.\n", + start_block); + +done: + sys_lseek(fd, start_block * BLOCK_SIZE, 0); + kfree(buf); + return nblocks; +} + +int __init rd_load_image(char *from) +{ + int res = 0; + int in_fd, out_fd; + unsigned long rd_blocks, devblocks; + int nblocks, i, disk; + char *buf = NULL; + unsigned short rotate = 0; + decompress_fn decompressor = NULL; +#if !defined(CONFIG_S390) + char rotator[4] = { '|' , '/' , '-' , '\\' }; +#endif + + out_fd = sys_open((const char __user __force *) "/dev/ram", O_RDWR, 0); + if (out_fd < 0) + goto out; + + in_fd = sys_open(from, O_RDONLY, 0); + if (in_fd < 0) + goto noclose_input; + + nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor); + if (nblocks < 0) + goto done; + + if (nblocks == 0) { + if (crd_load(in_fd, out_fd, decompressor) == 0) + goto successful_load; + goto done; + } + + /* + * NOTE NOTE: nblocks is not actually blocks but + * the number of kibibytes of data to load into a ramdisk. + * So any ramdisk block size that is a multiple of 1KiB should + * work when the appropriate ramdisk_blocksize is specified + * on the command line. + * + * The default ramdisk_blocksize is 1KiB and it is generally + * silly to use anything else, so make sure to use 1KiB + * blocksize while generating ext2fs ramdisk-images. + */ + if (sys_ioctl(out_fd, BLKGETSIZE, (unsigned long)&rd_blocks) < 0) + rd_blocks = 0; + else + rd_blocks >>= 1; + + if (nblocks > rd_blocks) { + printk("RAMDISK: image too big! (%dKiB/%ldKiB)\n", + nblocks, rd_blocks); + goto done; + } + + /* + * OK, time to copy in the data + */ + if (sys_ioctl(in_fd, BLKGETSIZE, (unsigned long)&devblocks) < 0) + devblocks = 0; + else + devblocks >>= 1; + + if (strcmp(from, "/initrd.image") == 0) + devblocks = nblocks; + + if (devblocks == 0) { + printk(KERN_ERR "RAMDISK: could not determine device size\n"); + goto done; + } + + buf = kmalloc(BLOCK_SIZE, GFP_KERNEL); + if (!buf) { + printk(KERN_ERR "RAMDISK: could not allocate buffer\n"); + goto done; + } + + printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + for (i = 0, disk = 1; i < nblocks; i++) { + if (i && (i % devblocks == 0)) { + printk("done disk #%d.\n", disk++); + rotate = 0; + if (sys_close(in_fd)) { + printk("Error closing the disk.\n"); + goto noclose_input; + } + change_floppy("disk #%d", disk); + in_fd = sys_open(from, O_RDONLY, 0); + if (in_fd < 0) { + printk("Error opening disk.\n"); + goto noclose_input; + } + printk("Loading disk #%d... ", disk); + } + sys_read(in_fd, buf, BLOCK_SIZE); + sys_write(out_fd, buf, BLOCK_SIZE); +#if !defined(CONFIG_S390) + if (!(i % 16)) { + printk("%c\b", rotator[rotate & 0x3]); + rotate++; + } +#endif + } + printk("done.\n"); + +successful_load: + res = 1; +done: + sys_close(in_fd); +noclose_input: + sys_close(out_fd); +out: + kfree(buf); + sys_unlink((const char __user __force *) "/dev/ram"); + return res; +} + +int __init rd_load_disk(int n) +{ + if (rd_prompt) + change_floppy("root floppy disk to be loaded into RAM disk"); + create_dev("/dev/root", ROOT_DEV); + create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n)); + return rd_load_image("/dev/root"); +} + +static int exit_code; +static int decompress_error; +static int crd_infd, crd_outfd; + +static int __init compr_fill(void *buf, unsigned int len) +{ + int r = sys_read(crd_infd, buf, len); + if (r < 0) + printk(KERN_ERR "RAMDISK: error while reading compressed data"); + else if (r == 0) + printk(KERN_ERR "RAMDISK: EOF while reading compressed data"); + return r; +} + +static int __init compr_flush(void *window, unsigned int outcnt) +{ + int written = sys_write(crd_outfd, window, outcnt); + if (written != outcnt) { + if (decompress_error == 0) + printk(KERN_ERR + "RAMDISK: incomplete write (%d != %d)\n", + written, outcnt); + decompress_error = 1; + return -1; + } + return outcnt; +} + +static void __init error(char *x) +{ + printk(KERN_ERR "%s\n", x); + exit_code = 1; + decompress_error = 1; +} + +static int __init crd_load(int in_fd, int out_fd, decompress_fn deco) +{ + int result; + crd_infd = in_fd; + crd_outfd = out_fd; + result = deco(NULL, 0, compr_fill, compr_flush, NULL, NULL, error); + if (decompress_error) + result = 1; + return result; +} diff --git a/init/initramfs.c b/init/initramfs.c new file mode 100644 index 00000000..8216c303 --- /dev/null +++ b/init/initramfs.c @@ -0,0 +1,611 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static __initdata char *message; +static void __init error(char *x) +{ + if (!message) + message = x; +} + +/* link hash */ + +#define N_ALIGN(len) ((((len) + 1) & ~3) + 2) + +static __initdata struct hash { + int ino, minor, major; + umode_t mode; + struct hash *next; + char name[N_ALIGN(PATH_MAX)]; +} *head[32]; + +static inline int hash(int major, int minor, int ino) +{ + unsigned long tmp = ino + minor + (major << 3); + tmp += tmp >> 5; + return tmp & 31; +} + +static char __init *find_link(int major, int minor, int ino, + umode_t mode, char *name) +{ + struct hash **p, *q; + for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { + if ((*p)->ino != ino) + continue; + if ((*p)->minor != minor) + continue; + if ((*p)->major != major) + continue; + if (((*p)->mode ^ mode) & S_IFMT) + continue; + return (*p)->name; + } + q = kmalloc(sizeof(struct hash), GFP_KERNEL); + if (!q) + panic("can't allocate link hash entry"); + q->major = major; + q->minor = minor; + q->ino = ino; + q->mode = mode; + strcpy(q->name, name); + q->next = NULL; + *p = q; + return NULL; +} + +static void __init free_hash(void) +{ + struct hash **p, *q; + for (p = head; p < head + 32; p++) { + while (*p) { + q = *p; + *p = q->next; + kfree(q); + } + } +} + +static long __init do_utime(char __user *filename, time_t mtime) +{ + struct timespec t[2]; + + t[0].tv_sec = mtime; + t[0].tv_nsec = 0; + t[1].tv_sec = mtime; + t[1].tv_nsec = 0; + + return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); +} + +static __initdata LIST_HEAD(dir_list); +struct dir_entry { + struct list_head list; + char *name; + time_t mtime; +}; + +static void __init dir_add(const char *name, time_t mtime) +{ + struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + if (!de) + panic("can't allocate dir_entry buffer"); + INIT_LIST_HEAD(&de->list); + de->name = kstrdup(name, GFP_KERNEL); + de->mtime = mtime; + list_add(&de->list, &dir_list); +} + +static void __init dir_utime(void) +{ + struct dir_entry *de, *tmp; + list_for_each_entry_safe(de, tmp, &dir_list, list) { + list_del(&de->list); + do_utime(de->name, de->mtime); + kfree(de->name); + kfree(de); + } +} + +static __initdata time_t mtime; + +/* cpio header parsing */ + +static __initdata unsigned long ino, major, minor, nlink; +static __initdata umode_t mode; +static __initdata unsigned long body_len, name_len; +static __initdata uid_t uid; +static __initdata gid_t gid; +static __initdata unsigned rdev; + +static void __init parse_header(char *s) +{ + unsigned long parsed[12]; + char buf[9]; + int i; + + buf[8] = '\0'; + for (i = 0, s += 6; i < 12; i++, s += 8) { + memcpy(buf, s, 8); + parsed[i] = simple_strtoul(buf, NULL, 16); + } + ino = parsed[0]; + mode = parsed[1]; + uid = parsed[2]; + gid = parsed[3]; + nlink = parsed[4]; + mtime = parsed[5]; + body_len = parsed[6]; + major = parsed[7]; + minor = parsed[8]; + rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); + name_len = parsed[11]; +} + +/* FSM */ + +static __initdata enum state { + Start, + Collect, + GotHeader, + SkipIt, + GotName, + CopyFile, + GotSymlink, + Reset +} state, next_state; + +static __initdata char *victim; +static __initdata unsigned count; +static __initdata loff_t this_header, next_header; + +static inline void __init eat(unsigned n) +{ + victim += n; + this_header += n; + count -= n; +} + +static __initdata char *vcollected; +static __initdata char *collected; +static __initdata int remains; +static __initdata char *collect; + +static void __init read_into(char *buf, unsigned size, enum state next) +{ + if (count >= size) { + collected = victim; + eat(size); + state = next; + } else { + collect = collected = buf; + remains = size; + next_state = next; + state = Collect; + } +} + +static __initdata char *header_buf, *symlink_buf, *name_buf; + +static int __init do_start(void) +{ + read_into(header_buf, 110, GotHeader); + return 0; +} + +static int __init do_collect(void) +{ + unsigned n = remains; + if (count < n) + n = count; + memcpy(collect, victim, n); + eat(n); + collect += n; + if ((remains -= n) != 0) + return 1; + state = next_state; + return 0; +} + +static int __init do_header(void) +{ + if (memcmp(collected, "070707", 6)==0) { + error("incorrect cpio method used: use -H newc option"); + return 1; + } + if (memcmp(collected, "070701", 6)) { + error("no cpio magic"); + return 1; + } + parse_header(collected); + next_header = this_header + N_ALIGN(name_len) + body_len; + next_header = (next_header + 3) & ~3; + state = SkipIt; + if (name_len <= 0 || name_len > PATH_MAX) + return 0; + if (S_ISLNK(mode)) { + if (body_len > PATH_MAX) + return 0; + collect = collected = symlink_buf; + remains = N_ALIGN(name_len) + body_len; + next_state = GotSymlink; + state = Collect; + return 0; + } + if (S_ISREG(mode) || !body_len) + read_into(name_buf, N_ALIGN(name_len), GotName); + return 0; +} + +static int __init do_skip(void) +{ + if (this_header + count < next_header) { + eat(count); + return 1; + } else { + eat(next_header - this_header); + state = next_state; + return 0; + } +} + +static int __init do_reset(void) +{ + while(count && *victim == '\0') + eat(1); + if (count && (this_header & 3)) + error("broken padding"); + return 1; +} + +static int __init maybe_link(void) +{ + if (nlink >= 2) { + char *old = find_link(major, minor, ino, mode, collected); + if (old) + return (sys_link(old, collected) < 0) ? -1 : 1; + } + return 0; +} + +static void __init clean_path(char *path, umode_t mode) +{ + struct stat st; + + if (!sys_newlstat(path, &st) && (st.st_mode^mode) & S_IFMT) { + if (S_ISDIR(st.st_mode)) + sys_rmdir(path); + else + sys_unlink(path); + } +} + +static __initdata int wfd; + +static int __init do_name(void) +{ + state = SkipIt; + next_state = Reset; + if (strcmp(collected, "TRAILER!!!") == 0) { + free_hash(); + return 0; + } + clean_path(collected, mode); + if (S_ISREG(mode)) { + int ml = maybe_link(); + if (ml >= 0) { + int openflags = O_WRONLY|O_CREAT; + if (ml != 1) + openflags |= O_TRUNC; + wfd = sys_open(collected, openflags, mode); + + if (wfd >= 0) { + sys_fchown(wfd, uid, gid); + sys_fchmod(wfd, mode); + if (body_len) + sys_ftruncate(wfd, body_len); + vcollected = kstrdup(collected, GFP_KERNEL); + state = CopyFile; + } + } + } else if (S_ISDIR(mode)) { + sys_mkdir(collected, mode); + sys_chown(collected, uid, gid); + sys_chmod(collected, mode); + dir_add(collected, mtime); + } else if (S_ISBLK(mode) || S_ISCHR(mode) || + S_ISFIFO(mode) || S_ISSOCK(mode)) { + if (maybe_link() == 0) { + sys_mknod(collected, mode, rdev); + sys_chown(collected, uid, gid); + sys_chmod(collected, mode); + do_utime(collected, mtime); + } + } + return 0; +} + +static int __init do_copy(void) +{ + if (count >= body_len) { + sys_write(wfd, victim, body_len); + sys_close(wfd); + do_utime(vcollected, mtime); + kfree(vcollected); + eat(body_len); + state = SkipIt; + return 0; + } else { + sys_write(wfd, victim, count); + body_len -= count; + eat(count); + return 1; + } +} + +static int __init do_symlink(void) +{ + collected[N_ALIGN(name_len) + body_len] = '\0'; + clean_path(collected, 0); + sys_symlink(collected + N_ALIGN(name_len), collected); + sys_lchown(collected, uid, gid); + do_utime(collected, mtime); + state = SkipIt; + next_state = Reset; + return 0; +} + +static __initdata int (*actions[])(void) = { + [Start] = do_start, + [Collect] = do_collect, + [GotHeader] = do_header, + [SkipIt] = do_skip, + [GotName] = do_name, + [CopyFile] = do_copy, + [GotSymlink] = do_symlink, + [Reset] = do_reset, +}; + +static int __init write_buffer(char *buf, unsigned len) +{ + count = len; + victim = buf; + + while (!actions[state]()) + ; + return len - count; +} + +static int __init flush_buffer(void *bufv, unsigned len) +{ + char *buf = (char *) bufv; + int written; + int origLen = len; + if (message) + return -1; + while ((written = write_buffer(buf, len)) < len && !message) { + char c = buf[written]; + if (c == '0') { + buf += written; + len -= written; + state = Start; + } else if (c == 0) { + buf += written; + len -= written; + state = Reset; + } else + error("junk in compressed archive"); + } + return origLen; +} + +static unsigned my_inptr; /* index of next byte to be processed in inbuf */ + +#include + +static char * __init unpack_to_rootfs(char *buf, unsigned len) +{ + int written, res; + decompress_fn decompress; + const char *compress_name; + static __initdata char msg_buf[64]; + + header_buf = kmalloc(110, GFP_KERNEL); + symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); + name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); + + if (!header_buf || !symlink_buf || !name_buf) + panic("can't allocate buffers"); + + state = Start; + this_header = 0; + message = NULL; + while (!message && len) { + loff_t saved_offset = this_header; + if (*buf == '0' && !(this_header & 3)) { + state = Start; + written = write_buffer(buf, len); + buf += written; + len -= written; + continue; + } + if (!*buf) { + buf++; + len--; + this_header++; + continue; + } + this_header = 0; + decompress = decompress_method(buf, len, &compress_name); + if (decompress) { + res = decompress(buf, len, NULL, flush_buffer, NULL, + &my_inptr, error); + if (res) + error("decompressor failed"); + } else if (compress_name) { + if (!message) { + snprintf(msg_buf, sizeof msg_buf, + "compression method %s not configured", + compress_name); + message = msg_buf; + } + } else + error("junk in compressed archive"); + if (state != Reset) + error("junk in compressed archive"); + this_header = saved_offset + my_inptr; + buf += my_inptr; + len -= my_inptr; + } + dir_utime(); + kfree(name_buf); + kfree(symlink_buf); + kfree(header_buf); + return message; +} + +static int __initdata do_retain_initrd; + +static int __init retain_initrd_param(char *str) +{ + if (*str) + return 0; + do_retain_initrd = 1; + return 1; +} +__setup("retain_initrd", retain_initrd_param); + +extern char __initramfs_start[]; +extern unsigned long __initramfs_size; +#include +#include + +static void __init free_initrd(void) +{ +#ifdef CONFIG_KEXEC + unsigned long crashk_start = (unsigned long)__va(crashk_res.start); + unsigned long crashk_end = (unsigned long)__va(crashk_res.end); +#endif + if (do_retain_initrd) + goto skip; + +#ifdef CONFIG_KEXEC + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (initrd_start < crashk_end && initrd_end > crashk_start) { + /* + * Initialize initrd memory region since the kexec boot does + * not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + } else +#endif + free_initrd_mem(initrd_start, initrd_end); +skip: + initrd_start = 0; + initrd_end = 0; +} + +#ifdef CONFIG_BLK_DEV_RAM +#define BUF_SIZE 1024 +static void __init clean_rootfs(void) +{ + int fd; + void *buf; + struct linux_dirent64 *dirp; + int num; + + fd = sys_open((const char __user __force *) "/", O_RDONLY, 0); + WARN_ON(fd < 0); + if (fd < 0) + return; + buf = kzalloc(BUF_SIZE, GFP_KERNEL); + WARN_ON(!buf); + if (!buf) { + sys_close(fd); + return; + } + + dirp = buf; + num = sys_getdents64(fd, dirp, BUF_SIZE); + while (num > 0) { + while (num > 0) { + struct stat st; + int ret; + + ret = sys_newlstat(dirp->d_name, &st); + WARN_ON_ONCE(ret); + if (!ret) { + if (S_ISDIR(st.st_mode)) + sys_rmdir(dirp->d_name); + else + sys_unlink(dirp->d_name); + } + + num -= dirp->d_reclen; + dirp = (void *)dirp + dirp->d_reclen; + } + dirp = buf; + memset(buf, 0, BUF_SIZE); + num = sys_getdents64(fd, dirp, BUF_SIZE); + } + + sys_close(fd); + kfree(buf); +} +#endif + +static int __init populate_rootfs(void) +{ + char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); + if (err) + panic(err); /* Failed to decompress INTERNAL initramfs */ + if (initrd_start) { +#ifdef CONFIG_BLK_DEV_RAM + int fd; + printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start); + if (!err) { + free_initrd(); + return 0; + } else { + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); + } + printk(KERN_INFO "rootfs image is not initramfs (%s)" + "; looks like an initrd\n", err); + fd = sys_open((const char __user __force *) "/initrd.image", + O_WRONLY|O_CREAT, 0700); + if (fd >= 0) { + sys_write(fd, (char *)initrd_start, + initrd_end - initrd_start); + sys_close(fd); + free_initrd(); + } +#else + printk(KERN_INFO "Unpacking initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, + initrd_end - initrd_start); + if (err) + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); + free_initrd(); +#endif + } + return 0; +} +rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c new file mode 100644 index 00000000..b08c5f75 --- /dev/null +++ b/init/main.c @@ -0,0 +1,894 @@ +/* + * linux/init/main.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * GK 2/5/95 - Changed to support mounting root fs via NFS + * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 + * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 + * Simplified starting of init: Michael A. Griffith + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#endif + +static int kernel_init(void *); + +extern void init_IRQ(void); +extern void fork_init(unsigned long); +extern void mca_init(void); +extern void sbus_init(void); +extern void prio_tree_init(void); +extern void radix_tree_init(void); +#ifndef CONFIG_DEBUG_RODATA +static inline void mark_rodata_ro(void) { } +#endif + +#ifdef CONFIG_TC +extern void tc_init(void); +#endif + +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + +enum system_states system_state __read_mostly; +EXPORT_SYMBOL(system_state); + +/* + * Boot command-line arguments + */ +#define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT +#define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT + +extern void time_init(void); +/* Default late time init is NULL. archs can override this later. */ +void (*__initdata late_time_init)(void); +extern void softirq_init(void); + +/* Untouched command line saved by arch-specific code. */ +char __initdata boot_command_line[COMMAND_LINE_SIZE]; +/* Untouched saved command line (eg. for /proc) */ +char *saved_command_line; +/* Command line for parameter parsing */ +static char *static_command_line; + +static char *execute_command; +static char *ramdisk_execute_command; + +/* + * If set, this is an indication to the drivers that reset the underlying + * device before going ahead with the initialization otherwise driver might + * rely on the BIOS and skip the reset operation. + * + * This is useful if kernel is booting in an unreliable environment. + * For ex. kdump situaiton where previous kernel has crashed, BIOS has been + * skipped and devices will be in unknown state. + */ +unsigned int reset_devices; +EXPORT_SYMBOL(reset_devices); + +static int __init set_reset_devices(char *str) +{ + reset_devices = 1; + return 1; +} + +__setup("reset_devices", set_reset_devices); + +static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char *panic_later, *panic_param; + +extern const struct obs_kernel_param __setup_start[], __setup_end[]; + +static int __init obsolete_checksetup(char *line) +{ + const struct obs_kernel_param *p; + int had_early_param = 0; + + p = __setup_start; + do { + int n = strlen(p->str); + if (parameqn(line, p->str, n)) { + if (p->early) { + /* Already done in parse_early_param? + * (Needs exact match on param part). + * Keep iterating, as we can have early + * params and __setups of same names 8( */ + if (line[n] == '\0' || line[n] == '=') + had_early_param = 1; + } else if (!p->setup_func) { + printk(KERN_WARNING "Parameter %s is obsolete," + " ignored\n", p->str); + return 1; + } else if (p->setup_func(line + n)) + return 1; + } + p++; + } while (p < __setup_end); + + return had_early_param; +} + +/* + * This should be approx 2 Bo*oMips to start (note initial shift), and will + * still work even if initially too large, it will just take slightly longer + */ +unsigned long loops_per_jiffy = (1<<12); + +EXPORT_SYMBOL(loops_per_jiffy); + +static int __init debug_kernel(char *str) +{ + console_loglevel = 10; + return 0; +} + +static int __init quiet_kernel(char *str) +{ + console_loglevel = 4; + return 0; +} + +early_param("debug", debug_kernel); +early_param("quiet", quiet_kernel); + +static int __init loglevel(char *str) +{ + int newlevel; + + /* + * Only update loglevel value when a correct setting was passed, + * to prevent blind crashes (when loglevel being set to 0) that + * are quite hard to debug + */ + if (get_option(&str, &newlevel)) { + console_loglevel = newlevel; + return 0; + } + + return -EINVAL; +} + +early_param("loglevel", loglevel); + +/* Change NUL term back to "=", to make "param" the whole string. */ +static int __init repair_env_string(char *param, char *val) +{ + if (val) { + /* param=val or param="val"? */ + if (val == param+strlen(param)+1) + val[-1] = '='; + else if (val == param+strlen(param)+2) { + val[-2] = '='; + memmove(val-1, val, strlen(val)+1); + val--; + } else + BUG(); + } + return 0; +} + +/* + * Unknown boot options get handed to init, unless they look like + * unused parameters (modprobe will find them in /proc/cmdline). + */ +static int __init unknown_bootoption(char *param, char *val) +{ + repair_env_string(param, val); + + /* Handle obsolete-style parameters */ + if (obsolete_checksetup(param)) + return 0; + + /* Unused module parameter. */ + if (strchr(param, '.') && (!val || strchr(param, '.') < val)) + return 0; + + if (panic_later) + return 0; + + if (val) { + /* Environment option */ + unsigned int i; + for (i = 0; envp_init[i]; i++) { + if (i == MAX_INIT_ENVS) { + panic_later = "Too many boot env vars at `%s'"; + panic_param = param; + } + if (!strncmp(param, envp_init[i], val - param)) + break; + } + envp_init[i] = param; + } else { + /* Command line option */ + unsigned int i; + for (i = 0; argv_init[i]; i++) { + if (i == MAX_INIT_ARGS) { + panic_later = "Too many boot init vars at `%s'"; + panic_param = param; + } + } + argv_init[i] = param; + } + return 0; +} + +static int __init init_setup(char *str) +{ + unsigned int i; + + execute_command = str; + /* + * In case LILO is going to boot us with default command line, + * it prepends "auto" before the whole cmdline which makes + * the shell think it should execute a script with such name. + * So we ignore all arguments entered _before_ init=... [MJ] + */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("init=", init_setup); + +static int __init rdinit_setup(char *str) +{ + unsigned int i; + + ramdisk_execute_command = str; + /* See "auto" comment in init_setup */ + for (i = 1; i < MAX_INIT_ARGS; i++) + argv_init[i] = NULL; + return 1; +} +__setup("rdinit=", rdinit_setup); + +#ifndef CONFIG_SMP +static const unsigned int setup_max_cpus = NR_CPUS; +#ifdef CONFIG_X86_LOCAL_APIC +static void __init smp_init(void) +{ + APIC_init_uniprocessor(); +} +#else +#define smp_init() do { } while (0) +#endif + +static inline void setup_nr_cpu_ids(void) { } +static inline void smp_prepare_cpus(unsigned int maxcpus) { } +#endif + +/* + * We need to store the untouched command line for future reference. + * We also need to store the touched command line since the parameter + * parsing is performed in place, and we should allow a component to + * store reference of name/value for future reference. + */ +static void __init setup_command_line(char *command_line) +{ + saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); + static_command_line = alloc_bootmem(strlen (command_line)+1); + strcpy (saved_command_line, boot_command_line); + strcpy (static_command_line, command_line); +} + +/* + * We need to finalize in a non-__init function or else race conditions + * between the root thread and the init thread may cause start_kernel to + * be reaped by free_initmem before the root thread has proceeded to + * cpu_idle. + * + * gcc-3.4 accidentally inlines this function, so use noinline. + */ + +static __initdata DECLARE_COMPLETION(kthreadd_done); + +static noinline void __init_refok rest_init(void) +{ + int pid; + + rcu_scheduler_starting(); + /* + * We need to spawn init first so that it obtains pid 1, however + * the init task will end up wanting to create kthreads, which, if + * we schedule it before we create kthreadd, will OOPS. + */ + kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); + numa_default_policy(); + pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); + rcu_read_lock(); + kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); + rcu_read_unlock(); + complete(&kthreadd_done); + + /* + * The boot idle thread must execute schedule() + * at least once to get things moving: + */ + init_idle_bootup_task(current); + schedule_preempt_disabled(); + /* Call into cpu_idle with preempt disabled */ + cpu_idle(); +} + +/* Check for early params. */ +static int __init do_early_param(char *param, char *val) +{ + const struct obs_kernel_param *p; + + for (p = __setup_start; p < __setup_end; p++) { + if ((p->early && parameq(param, p->str)) || + (strcmp(param, "console") == 0 && + strcmp(p->str, "earlycon") == 0) + ) { + if (p->setup_func(val) != 0) + printk(KERN_WARNING + "Malformed early option '%s'\n", param); + } + } + /* We accept everything at this stage. */ + return 0; +} + +void __init parse_early_options(char *cmdline) +{ + parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); +} + +/* Arch code calls this early on, or if not, just before other parsing. */ +void __init parse_early_param(void) +{ + static __initdata int done = 0; + static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; + + if (done) + return; + + /* All fall through to do_early_param. */ + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_early_options(tmp_cmdline); + done = 1; +} + +/* + * Activate the first processor. + */ + +static void __init boot_cpu_init(void) +{ + int cpu = smp_processor_id(); + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); +} + +void __init __weak smp_setup_processor_id(void) +{ +} + +void __init __weak thread_info_cache_init(void) +{ +} + +/* + * Set up kernel memory allocators + */ +static void __init mm_init(void) +{ + /* + * page_cgroup requires contiguous pages, + * bigger than MAX_ORDER unless SPARSEMEM. + */ + page_cgroup_init_flatmem(); + mem_init(); + kmem_cache_init(); + percpu_init_late(); + pgtable_cache_init(); + vmalloc_init(); +} + +asmlinkage void __init start_kernel(void) +{ + char * command_line; + extern const struct kernel_param __start___param[], __stop___param[]; + + /* + * Need to run as early as possible, to initialize the + * lockdep hash: + */ + lockdep_init(); + smp_setup_processor_id(); + debug_objects_early_init(); + + /* + * Set up the the initial canary ASAP: + */ + boot_init_stack_canary(); + + cgroup_init_early(); + + local_irq_disable(); + early_boot_irqs_disabled = true; + +/* + * Interrupts are still disabled. Do necessary setups, then + * enable them + */ + tick_init(); + boot_cpu_init(); + page_address_init(); + printk(KERN_NOTICE "%s", linux_banner); + setup_arch(&command_line); + mm_init_owner(&init_mm, &init_task); + mm_init_cpumask(&init_mm); + setup_command_line(command_line); + setup_nr_cpu_ids(); + setup_per_cpu_areas(); + smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + + build_all_zonelists(NULL); + page_alloc_init(); + + printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); + parse_early_param(); + parse_args("Booting kernel", static_command_line, __start___param, + __stop___param - __start___param, + -1, -1, &unknown_bootoption); + + jump_label_init(); + + /* + * These use large bootmem allocations and must precede + * kmem_cache_init() + */ + setup_log_buf(0); + pidhash_init(); + vfs_caches_init_early(); + sort_main_extable(); + trap_init(); + mm_init(); + + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. + */ + sched_init(); + /* + * Disable preemption - early bootup scheduling is extremely + * fragile until we cpu_idle() for the first time. + */ + preempt_disable(); + if (!irqs_disabled()) { + printk(KERN_WARNING "start_kernel(): bug: interrupts were " + "enabled *very* early, fixing it\n"); + local_irq_disable(); + } + idr_init_cache(); + perf_event_init(); + rcu_init(); + radix_tree_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); + init_IRQ(); + prio_tree_init(); + init_timers(); + hrtimers_init(); + softirq_init(); + timekeeping_init(); + time_init(); + profile_init(); + call_function_init(); + if (!irqs_disabled()) + printk(KERN_CRIT "start_kernel(): bug: interrupts were " + "enabled early\n"); + early_boot_irqs_disabled = false; + local_irq_enable(); + + kmem_cache_init_late(); + + /* + * HACK ALERT! This is early. We're enabling the console before + * we've done PCI setups etc, and console_init() must be aware of + * this. But we do want output early, in case something goes wrong. + */ + console_init(); + if (panic_later) + panic(panic_later, panic_param); + + lockdep_info(); + + /* + * Need to run this when irqs are enabled, because it wants + * to self-test [hard/soft]-irqs on/off lock inversion bugs + * too: + */ + locking_selftest(); + +#ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start && !initrd_below_start_ok && + page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) { + printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " + "disabling it.\n", + page_to_pfn(virt_to_page((void *)initrd_start)), + min_low_pfn); + initrd_start = 0; + } +#endif + page_cgroup_init(); + debug_objects_mem_init(); + kmemleak_init(); + setup_per_cpu_pageset(); + numa_policy_init(); + if (late_time_init) + late_time_init(); + sched_clock_init(); + calibrate_delay(); + pidmap_init(); + anon_vma_init(); +#ifdef CONFIG_X86 + if (efi_enabled) + efi_enter_virtual_mode(); +#endif + thread_info_cache_init(); + cred_init(); + fork_init(totalram_pages); + proc_caches_init(); + buffer_init(); + key_init(); + security_init(); + dbg_late_init(); + vfs_caches_init(totalram_pages); + signals_init(); + /* rootfs populating might need page-writeback */ + page_writeback_init(); +#ifdef CONFIG_PROC_FS + proc_root_init(); +#endif + cgroup_init(); + cpuset_init(); + taskstats_init_early(); + delayacct_init(); + + check_bugs(); + + acpi_early_init(); /* before LAPIC and SMP init */ + sfi_init_late(); + + ftrace_init(); + + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); +} + +/* Call all constructor functions linked into the kernel. */ +static void __init do_ctors(void) +{ +#ifdef CONFIG_CONSTRUCTORS + ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; + + for (; fn < (ctor_fn_t *) __ctors_end; fn++) + (*fn)(); +#endif +} + +bool initcall_debug; +core_param(initcall_debug, initcall_debug, bool, 0644); + +static char msgbuf[64]; + +static int __init_or_module do_one_initcall_debug(initcall_t fn) +{ + ktime_t calltime, delta, rettime; + unsigned long long duration; + int ret; + + printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + calltime = ktime_get(); + ret = fn(); + rettime = ktime_get(); + delta = ktime_sub(rettime, calltime); + duration = (unsigned long long) ktime_to_ns(delta) >> 10; + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", fn, + ret, duration); + + return ret; +} + +int __init_or_module do_one_initcall(initcall_t fn) +{ + int count = preempt_count(); + int ret; + + if (initcall_debug) + ret = do_one_initcall_debug(fn); + else + ret = fn(); + + msgbuf[0] = 0; + + if (ret && ret != -ENODEV && initcall_debug) + sprintf(msgbuf, "error code %d ", ret); + + if (preempt_count() != count) { + strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); + preempt_count() = count; + } + if (irqs_disabled()) { + strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); + local_irq_enable(); + } + if (msgbuf[0]) { + printk("initcall %pF returned with %s\n", fn, msgbuf); + } + + return ret; +} + + +extern initcall_t __initcall_start[]; +extern initcall_t __initcall0_start[]; +extern initcall_t __initcall1_start[]; +extern initcall_t __initcall2_start[]; +extern initcall_t __initcall3_start[]; +extern initcall_t __initcall4_start[]; +extern initcall_t __initcall5_start[]; +extern initcall_t __initcall6_start[]; +extern initcall_t __initcall7_start[]; +extern initcall_t __initcall_end[]; + +static initcall_t *initcall_levels[] __initdata = { + __initcall0_start, + __initcall1_start, + __initcall2_start, + __initcall3_start, + __initcall4_start, + __initcall5_start, + __initcall6_start, + __initcall7_start, + __initcall_end, +}; + +static char *initcall_level_names[] __initdata = { + "early parameters", + "core parameters", + "postcore parameters", + "arch parameters", + "subsys parameters", + "fs parameters", + "device parameters", + "late parameters", +}; + +static void __init do_initcall_level(int level) +{ + extern const struct kernel_param __start___param[], __stop___param[]; + initcall_t *fn; + + strcpy(static_command_line, saved_command_line); + parse_args(initcall_level_names[level], + static_command_line, __start___param, + __stop___param - __start___param, + level, level, + repair_env_string); + + for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) + do_one_initcall(*fn); +} + +static void __init do_initcalls(void) +{ + int level; + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) + do_initcall_level(level); +} + +/* + * Ok, the machine is now initialized. None of the devices + * have been touched yet, but the CPU subsystem is up and + * running, and memory and process management works. + * + * Now we can finally start doing some real work.. + */ +static void __init do_basic_setup(void) +{ + cpuset_init_smp(); + usermodehelper_init(); + shmem_init(); + driver_init(); + init_irq_proc(); + do_ctors(); + usermodehelper_enable(); + do_initcalls(); +} + +static void __init do_pre_smp_initcalls(void) +{ + initcall_t *fn; + + for (fn = __initcall_start; fn < __initcall0_start; fn++) + do_one_initcall(*fn); +} + +static void run_init_process(const char *init_filename) +{ + argv_init[0] = init_filename; + kernel_execve(init_filename, argv_init, envp_init); +} + +/* This is a non __init function. Force it to be noinline otherwise gcc + * makes it inline to init() and it becomes part of init.text section + */ +static noinline int init_post(void) +{ + /* need to finish all async __init code before freeing the memory */ + async_synchronize_full(); + free_initmem(); + mark_rodata_ro(); + system_state = SYSTEM_RUNNING; + numa_default_policy(); + + + current->signal->flags |= SIGNAL_UNKILLABLE; + + if (ramdisk_execute_command) { + run_init_process(ramdisk_execute_command); + printk(KERN_WARNING "Failed to execute %s\n", + ramdisk_execute_command); + } + + /* + * We try each of these until one succeeds. + * + * The Bourne shell can be used instead of init if we are + * trying to recover a really broken machine. + */ + if (execute_command) { + run_init_process(execute_command); + printk(KERN_WARNING "Failed to execute %s. Attempting " + "defaults...\n", execute_command); + } + run_init_process("/sbin/init"); + run_init_process("/etc/init"); + run_init_process("/bin/init"); + run_init_process("/bin/sh"); + + panic("No init found. Try passing init= option to kernel. " + "See Linux Documentation/init.txt for guidance."); +} + +static int __init kernel_init(void * unused) +{ + /* + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); + + /* Now the scheduler is fully set up and can do blocking allocations */ + gfp_allowed_mask = __GFP_BITS_MASK; + + /* + * init can allocate pages on any node + */ + set_mems_allowed(node_states[N_HIGH_MEMORY]); + /* + * init can run on any cpu. + */ + set_cpus_allowed_ptr(current, cpu_all_mask); + + cad_pid = task_pid(current); + + smp_prepare_cpus(setup_max_cpus); + + do_pre_smp_initcalls(); + lockup_detector_init(); + + smp_init(); + sched_init_smp(); + + do_basic_setup(); + + /* Open the /dev/console on the rootfs, this should never fail */ + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + printk(KERN_WARNING "Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); + /* + * check if there is an early userspace init. If yes, let it do all + * the work + */ + + if (!ramdisk_execute_command) + ramdisk_execute_command = "/init"; + + if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { + ramdisk_execute_command = NULL; + prepare_namespace(); + } + + /* + * Ok, we have completed the initial bootup, and + * we're essentially up and running. Get rid of the + * initmem segments and start the user-mode stuff.. + */ + + init_post(); + return 0; +} diff --git a/init/noinitramfs.c b/init/noinitramfs.c new file mode 100644 index 00000000..267739d8 --- /dev/null +++ b/init/noinitramfs.c @@ -0,0 +1,52 @@ +/* + * init/noinitramfs.c + * + * Copyright (C) 2006, NXP Semiconductors, All Rights Reserved + * Author: Jean-Paul Saman + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include + +/* + * Create a simple rootfs that is similar to the default initramfs + */ +static int __init default_rootfs(void) +{ + int err; + + err = sys_mkdir((const char __user __force *) "/dev", 0755); + if (err < 0) + goto out; + + err = sys_mknod((const char __user __force *) "/dev/console", + S_IFCHR | S_IRUSR | S_IWUSR, + new_encode_dev(MKDEV(5, 1))); + if (err < 0) + goto out; + + err = sys_mkdir((const char __user __force *) "/root", 0700); + if (err < 0) + goto out; + + return 0; + +out: + printk(KERN_WARNING "Failed to create a rootfs\n"); + return err; +} +rootfs_initcall(default_rootfs); diff --git a/init/version.c b/init/version.c new file mode 100644 index 00000000..86fe0ccb --- /dev/null +++ b/init/version.c @@ -0,0 +1,48 @@ +/* + * linux/init/version.c + * + * Copyright (C) 1992 Theodore Ts'o + * + * May be freely distributed as part of Linux. + */ + +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_KALLSYMS +#define version(a) Version_ ## a +#define version_string(a) version(a) + +extern int version_string(LINUX_VERSION_CODE); +int version_string(LINUX_VERSION_CODE); +#endif + +struct uts_namespace init_uts_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, + .user_ns = &init_user_ns, +}; +EXPORT_SYMBOL_GPL(init_uts_ns); + +/* FIXED STRINGS! Don't touch! */ +const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + +const char linux_proc_banner[] = + "%s version %s" + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" + " (" LINUX_COMPILER ") %s\n"; -- cgit