diff options
Diffstat (limited to 'arch/x86/mm')
49 files changed, 14947 insertions, 0 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile new file mode 100644 index 00000000..23d8e5fe --- /dev/null +++ b/arch/x86/mm/Makefile @@ -0,0 +1,30 @@ +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ + pat.o pgtable.o physaddr.o gup.o setup_nx.o + +# Make sure __phys_addr has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_physaddr.o := $(nostackp) +CFLAGS_setup_nx.o := $(nostackp) + +obj-$(CONFIG_X86_PAT) += pat_rbtree.o +obj-$(CONFIG_SMP) += tlb.o + +obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o + +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o + +obj-$(CONFIG_HIGHMEM) += highmem_32.o + +obj-$(CONFIG_KMEMCHECK) += kmemcheck/ + +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + +obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_NUMA_EMU) += numa_emulation.o + +obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 00000000..5247d013 --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,197 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> + +#include <asm/io.h> +#include <linux/pci_ids.h> +#include <linux/acpi.h> +#include <asm/types.h> +#include <asm/mmzone.h> +#include <asm/proto.h> +#include <asm/e820.h> +#include <asm/pci-direct.h> +#include <asm/numa.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/amd_nb.h> + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in amd_scan_nodes() + */ +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif +} + +int __init amd_numa_init(void) +{ + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); + unsigned numnodes; + u64 prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + u64 base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %Lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %Lx/%Lx\n", + nodeid, (base >> 8) & 3, (limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %Lx-%Lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + pr_err("Node map not sorted %Lx,%Lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c new file mode 100644 index 00000000..0002a3a3 --- /dev/null +++ b/arch/x86/mm/dump_pagetables.c @@ -0,0 +1,375 @@ +/* + * Debug helper to dump the current kernel pagetables of the system + * so that we can see what the various memory ranges are set to. + * + * (C) Copyright 2008 Intel Corporation + * + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/debugfs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include <asm/pgtable.h> + +/* + * The dumper groups pagetable entries of the same type into one, and for + * that it needs to keep some state when walking, and flush this state + * when a "break" in the continuity is found. + */ +struct pg_state { + int level; + pgprot_t current_prot; + unsigned long start_address; + unsigned long current_address; + const struct addr_marker *marker; +}; + +struct addr_marker { + unsigned long start_address; + const char *name; +}; + +/* indices for address_markers; keep sync'd w/ address_markers below */ +enum address_markers_idx { + USER_SPACE_NR = 0, +#ifdef CONFIG_X86_64 + KERNEL_SPACE_NR, + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, +#else + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +# ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +# endif + FIXADDR_START_NR, +#endif +}; + +/* Address space markers hints */ +static struct addr_marker address_markers[] = { + { 0, "User Space" }, +#ifdef CONFIG_X86_64 + { 0x8000000000000000UL, "Kernel Space" }, + { PAGE_OFFSET, "Low Kernel Mapping" }, + { VMALLOC_START, "vmalloc() Area" }, + { VMEMMAP_START, "Vmemmap" }, + { __START_KERNEL_map, "High Kernel Mapping" }, + { MODULES_VADDR, "Modules" }, + { MODULES_END, "End Modules" }, +#else + { PAGE_OFFSET, "Kernel Mapping" }, + { 0/* VMALLOC_START */, "vmalloc() Area" }, + { 0/*VMALLOC_END*/, "vmalloc() End" }, +# ifdef CONFIG_HIGHMEM + { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, +# endif + { 0/*FIXADDR_START*/, "Fixmap Area" }, +#endif + { -1, NULL } /* End of list */ +}; + +/* Multipliers for offsets within the PTEs */ +#define PTE_LEVEL_MULT (PAGE_SIZE) +#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) +#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) + +/* + * Print a readable form of a pgprot_t to the seq_file + */ +static void printk_prot(struct seq_file *m, pgprot_t prot, int level) +{ + pgprotval_t pr = pgprot_val(prot); + static const char * const level_name[] = + { "cr3", "pgd", "pud", "pmd", "pte" }; + + if (!pgprot_val(prot)) { + /* Not present */ + seq_printf(m, " "); + } else { + if (pr & _PAGE_USER) + seq_printf(m, "USR "); + else + seq_printf(m, " "); + if (pr & _PAGE_RW) + seq_printf(m, "RW "); + else + seq_printf(m, "ro "); + if (pr & _PAGE_PWT) + seq_printf(m, "PWT "); + else + seq_printf(m, " "); + if (pr & _PAGE_PCD) + seq_printf(m, "PCD "); + else + seq_printf(m, " "); + + /* Bit 9 has a different meaning on level 3 vs 4 */ + if (level <= 3) { + if (pr & _PAGE_PSE) + seq_printf(m, "PSE "); + else + seq_printf(m, " "); + } else { + if (pr & _PAGE_PAT) + seq_printf(m, "pat "); + else + seq_printf(m, " "); + } + if (pr & _PAGE_GLOBAL) + seq_printf(m, "GLB "); + else + seq_printf(m, " "); + if (pr & _PAGE_NX) + seq_printf(m, "NX "); + else + seq_printf(m, "x "); + } + seq_printf(m, "%s\n", level_name[level]); +} + +/* + * On 64 bits, sign-extend the 48 bit address to 64 bit + */ +static unsigned long normalize_addr(unsigned long u) +{ +#ifdef CONFIG_X86_64 + return (signed long)(u << 16) >> 16; +#else + return u; +#endif +} + +/* + * This function gets called on a break in a continuous series + * of PTE entries; the next one is different so we need to + * print what we collected so far. + */ +static void note_page(struct seq_file *m, struct pg_state *st, + pgprot_t new_prot, int level) +{ + pgprotval_t prot, cur; + static const char units[] = "KMGTPE"; + + /* + * If we have a "break" in the series, we need to flush the state that + * we have now. "break" is either changing perms, levels or + * address space marker. + */ + prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; + cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; + + if (!st->level) { + /* First entry */ + st->current_prot = new_prot; + st->level = level; + st->marker = address_markers; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } else if (prot != cur || level != st->level || + st->current_address >= st->marker[1].start_address) { + const char *unit = units; + unsigned long delta; + int width = sizeof(unsigned long) * 2; + + /* + * Now print the actual finished series + */ + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = (st->current_address - st->start_address) >> 10; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + seq_printf(m, "%9lu%c ", delta, *unit); + printk_prot(m, st->current_prot, st->level); + + /* + * We print markers for special areas of address space, + * such as the start of vmalloc space etc. + * This helps in the interpretation. + */ + if (st->current_address >= st->marker[1].start_address) { + st->marker++; + seq_printf(m, "---[ %s ]---\n", st->marker->name); + } + + st->start_address = st->current_address; + st->current_prot = new_prot; + st->level = level; + } +} + +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, + unsigned long P) +{ + int i; + pte_t *start; + + start = (pte_t *) pmd_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PTE; i++) { + pgprot_t prot = pte_pgprot(*start); + + st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); + note_page(m, st, prot, 4); + start++; + } +} + +#if PTRS_PER_PMD > 1 + +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, + unsigned long P) +{ + int i; + pmd_t *start; + + start = (pmd_t *) pud_page_vaddr(addr); + for (i = 0; i < PTRS_PER_PMD; i++) { + st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); + if (!pmd_none(*start)) { + pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; + + if (pmd_large(*start) || !pmd_present(*start)) + note_page(m, st, __pgprot(prot), 3); + else + walk_pte_level(m, st, *start, + P + i * PMD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 3); + start++; + } +} + +#else +#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p) +#define pud_large(a) pmd_large(__pmd(pud_val(a))) +#define pud_none(a) pmd_none(__pmd(pud_val(a))) +#endif + +#if PTRS_PER_PUD > 1 + +static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, + unsigned long P) +{ + int i; + pud_t *start; + + start = (pud_t *) pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_PUD; i++) { + st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); + if (!pud_none(*start)) { + pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; + + if (pud_large(*start) || !pud_present(*start)) + note_page(m, st, __pgprot(prot), 2); + else + walk_pmd_level(m, st, *start, + P + i * PUD_LEVEL_MULT); + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) +#define pgd_large(a) pud_large(__pud(pgd_val(a))) +#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#endif + +static void walk_pgd_level(struct seq_file *m) +{ +#ifdef CONFIG_X86_64 + pgd_t *start = (pgd_t *) &init_level4_pgt; +#else + pgd_t *start = swapper_pg_dir; +#endif + int i; + struct pg_state st; + + memset(&st, 0, sizeof(st)); + + for (i = 0; i < PTRS_PER_PGD; i++) { + st.current_address = normalize_addr(i * PGD_LEVEL_MULT); + if (!pgd_none(*start)) { + pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; + + if (pgd_large(*start) || !pgd_present(*start)) + note_page(m, &st, __pgprot(prot), 1); + else + walk_pud_level(m, &st, *start, + i * PGD_LEVEL_MULT); + } else + note_page(m, &st, __pgprot(0), 1); + + start++; + } + + /* Flush out the last page */ + st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT); + note_page(m, &st, __pgprot(0), 0); +} + +static int ptdump_show(struct seq_file *m, void *v) +{ + walk_pgd_level(m); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int pt_dump_init(void) +{ + struct dentry *pe; + +#ifdef CONFIG_X86_32 + /* Not a compile-time constant on x86-32 */ + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +# ifdef CONFIG_HIGHMEM + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; +# endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; +#endif + + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +__initcall(pt_dump_init); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 00000000..1fb85dbe --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,37 @@ +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> + + +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fixup; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + fixup = search_exception_tables(regs->ip); + if (fixup) { + /* If fixup is less than 16, it means uaccess error */ + if (fixup->fixup < 16) { + current_thread_info()->uaccess_err = 1; + regs->ip += fixup->fixup; + return 1; + } + regs->ip = fixup->fixup; + return 1; + } + + return 0; +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 00000000..3ecfd1aa --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,1211 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar + */ +#include <linux/magic.h> /* STACK_END_MAGIC */ +#include <linux/sched.h> /* test_thread_flag(), ... */ +#include <linux/kdebug.h> /* oops_begin/end, ... */ +#include <linux/module.h> /* search_exception_table */ +#include <linux/bootmem.h> /* max_low_pfn */ +#include <linux/kprobes.h> /* __kprobes, ... */ +#include <linux/mmiotrace.h> /* kmmio_handler, ... */ +#include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <linux/prefetch.h> /* prefetchw */ + +#include <asm/traps.h> /* dotraplinkage, ... */ +#include <asm/pgalloc.h> /* pgd_*(), ... */ +#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ +#include <asm/fixmap.h> /* VSYSCALL_START */ + +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + */ +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: + */ +static inline int __kprobes +kmmio_fault(struct pt_regs *regs, unsigned long addr) +{ + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; + return 0; +} + +static inline int __kprobes notify_page_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (kprobes_built_in() && !user_mode_vm(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +} + +/* + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. + */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs) || user_64bit_mode(regs)); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; + unsigned char *instr; + int prefetch = 0; + + /* + * If it was a exec (instruction fetch) fault on NX page, then + * do not ignore the fault: + */ + if (error_code & PF_INSTR) + return 0; + + instr = (void *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (instr < max_instr) { + unsigned char opcode; + + if (probe_kernel_address(instr, opcode)) + break; + + instr++; + + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) + break; + } + return prefetch; +} + +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk, int fault) +{ + unsigned lsb = 0; + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; + + force_sig_info(si_signo, &info, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) + set_pmd(pmd, *pmd_k); + else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + struct page *page; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + spinlock_t *pgt_lock; + pmd_t *ret; + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) + break; + } + spin_unlock(&pgd_lock); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline __kprobes int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + WARN_ON_ONCE(in_nmi()); + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +static bool low_pfn(unsigned long pfn) +{ + return pfn < max_low_pfn; +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; + +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; +#endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already: + */ + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: + printk("\n"); +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline __kprobes int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + WARN_ON_ONCE(in_nmi()); + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +#ifdef CONFIG_CPU_SUP_AMD +static const char errata93_warning[] = +KERN_ERR +"******* Your BIOS seems to not contain a fix for K8 errata #93\n" +"******* Working around it, but it may cause SEGVs or burn power.\n" +"******* Please consider a BIOS update.\n" +"******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (bad_address(pgd)) + goto bad; + + printk("PGD %lx ", pgd_val(*pgd)); + + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) + goto bad; + + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud) || pud_large(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) + goto bad; + + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) + goto bad; + + printk("PTE %lx", pte_val(*pte)); +out: + printk("\n"); + return; +bad: + printk("BAD\n"); +} + +#endif /* CONFIG_X86_64 */ + +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD + || boot_cpu_data.x86 != 0xf) + return 0; + + if (address != regs->ip) + return 0; + + if ((address >> 32) != 0) + return 0; + + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + printk_once(errata93_warning); + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) + return 1; +#endif + return 0; +} + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + + /* + * Pentium F0 0F C7 C8 bug workaround: + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (!oops_may_print()) + return; + + if (error_code & PF_INSTR) { + unsigned int level; + + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); + } + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); + + printk(KERN_CONT " at %p\n", (void *) address); + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + + dump_pagetable(address); +} + +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + tsk->comm, address); + dump_pagetable(address); + + tsk->thread.cr2 = address; + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code; + + if (__die("Bad pagetable", regs, error_code)) + sig = 0; + + oops_end(flags, regs, sig); +} + +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int signal, int si_code) +{ + struct task_struct *tsk = current; + unsigned long *stackend; + unsigned long flags; + int sig; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) { + if (current_thread_info()->sig_on_uaccess_error && signal) { + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code | PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_info_fault(signal, si_code, address, tsk, 0); + } + return; + } + + /* + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: + * + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice: + */ + flags = oops_begin(); + + show_fault_oops(regs, error_code, address); + + stackend = end_of_stack(tsk); + if (tsk != &init_task && *stackend != STACK_END_MAGIC) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + + tsk->thread.cr2 = address; + tsk->thread.trap_nr = X86_TRAP_PF; + tsk->thread.error_code = error_code; + + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_DEFAULT "CR2: %016lx\n", address); + + oops_end(flags, regs, sig); +} + +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here: + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space: + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + +#ifdef CONFIG_X86_64 + /* + * Instruction fetch faults in the vsyscall page might need + * emulation. + */ + if (unlikely((error_code & PF_INSTR) && + ((address & ~0xfff) == VSYSCALL_START))) { + if (emulate_vsyscall(regs, address)) + return; + } +#endif + + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_nr = X86_TRAP_PF; + + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); + + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address, SIGSEGV, si_code); +} + +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed): + */ + up_read(¤t->mm->mmap_sem); + + pagefault_out_of_memory(); +} + +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, + unsigned int fault) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + int code = BUS_ADRERR; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); + return; + } + + /* User-space => ok to do another page fault: */ + if (is_prefetch(regs, error_code, address)) + return; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_PF; + +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + printk(KERN_ERR + "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + tsk->comm, tsk->pid, address); + code = BUS_MCEERR_AR; + } +#endif + force_sig_info_fault(SIGBUS, code, address, tsk, fault); +} + +static noinline int +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) +{ + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue pagefault. + */ + if (fatal_signal_pending(current)) { + if (!(fault & VM_FAULT_RETRY)) + up_read(¤t->mm->mmap_sem); + if (!(error_code & PF_USER)) + no_context(regs, error_code, address, 0, 0); + return 1; + } + if (!(fault & VM_FAULT_ERROR)) + return 0; + + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); + return 1; + } + + out_of_memory(regs, error_code, address); + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) + do_sigbus(regs, error_code, address, fault); + else + BUG(); + } + return 1; +} + +static int spurious_fault_check(unsigned long error_code, pte_t *pte) +{ + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static noinline __kprobes int +spurious_fault(unsigned long error_code, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + if (pud_large(*pud)) + return spurious_fault_check(error_code, (pte_t *) pud); + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return spurious_fault_check(error_code, (pte_t *) pmd); + + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. + */ + pte = pte_offset_kernel(pmd, address); + if (!(pte_flags(*pte) & _PAGE_PRESENT)) + return 0; + + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + + return ret; +} + +int show_unhandled_signals = 1; + +static inline int +access_error(unsigned long error_code, struct vm_area_struct *vma) +{ + if (error_code & PF_WRITE) { + /* write, present and write, not present: */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + + return 0; +} + +static int fault_in_kernel_space(unsigned long address) +{ + return address >= TASK_SIZE_MAX; +} + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + unsigned long address; + struct mm_struct *mm; + int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | + (write ? FAULT_FLAG_WRITE : 0); + + tsk = current; + mm = tsk->mm; + + /* Get the faulting address: */ + address = read_cr2(); + + /* + * Detect and handle instructions that would cause a page fault for + * both a tracked kernel page and a userspace page. + */ + if (kmemcheck_active(regs)) + kmemcheck_hide(regs); + prefetchw(&mm->mmap_sem); + + if (unlikely(kmmio_fault(regs, address))) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ + if (unlikely(fault_in_kernel_space(address))) { + if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { + if (vmalloc_fault(address) >= 0) + return; + + if (kmemcheck_fault(regs, address, error_code)) + return; + } + + /* Can handle a stale RO->RW TLB: */ + if (spurious_fault(error_code, address)) + return; + + /* kprobes don't want to hook the spurious faults: */ + if (notify_page_fault(regs)) + return; + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock: + */ + bad_area_nosemaphore(regs, error_code, address); + + return; + } + + /* kprobes don't want to hook the spurious faults: */ + if (unlikely(notify_page_fault(regs))) + return; + /* + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet: + */ + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(regs, error_code, address); + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + /* + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: + */ + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } + + /* + * When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: + */ + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } +retry: + down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); + } + + vma = find_vma(mm, address); + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) + goto good_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535, $31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } + } + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + if (unlikely(access_error(error_code, vma))) { + bad_area_access_error(regs, error_code, address); + return; + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault: + */ + fault = handle_mm_fault(mm, vma, address, flags); + + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + if (mm_fault_error(regs, error_code, address, fault)) + return; + } + + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } + } + + check_v8086_mode(regs, address, tsk); + + up_read(&mm->mmap_sem); +} diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c new file mode 100644 index 00000000..dd74e468 --- /dev/null +++ b/arch/x86/mm/gup.c @@ -0,0 +1,393 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmstat.h> +#include <linux/highmem.h> +#include <linux/swap.h> + +#include <asm/pgtable.h> + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return ACCESS_ONCE(*ptep); +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t *ptep; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + + ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; + } + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + SetPageReferenced(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + pte_unmap(ptep - 1); + + return 1; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + VM_BUG_ON(page != compound_head(page)); + VM_BUG_ON(page_count(page) == 0); + atomic_add(nr, &page->_count); + SetPageReferenced(page); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pmd; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_flags(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static noinline int gup_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pud; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_flags(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pud, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } + } while (pudp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + + end = start + len; + if (end < start) + goto slow_irqon; + +#ifdef CONFIG_X86_64 + if (end >> __VIRTUAL_MASK_SHIFT) + goto slow_irqon; +#endif + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c new file mode 100644 index 00000000..6f31ee56 --- /dev/null +++ b/arch/x86/mm/highmem_32.c @@ -0,0 +1,141 @@ +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/swap.h> /* for totalram_pages */ + +void *kmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_high(page); +} +EXPORT_SYMBOL(kmap); + +void kunmap(struct page *page) +{ + if (in_interrupt()) + BUG(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} +EXPORT_SYMBOL(kunmap); + +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap it is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + pagefault_disable(); + + if (!PageHighMem(page)) + return page_address(page); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte-idx))); + set_pte(kmap_pte-idx, mk_pte(page, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; +} +EXPORT_SYMBOL(kmap_atomic_prot); + +void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(kmap_atomic); + +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic_prot_pfn(pfn, kmap_prot); +} +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ + kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + arch_flush_lazy_mmu_mode(); + } +#ifdef CONFIG_DEBUG_HIGHMEM + else { + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); + } +#endif + + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); + +struct page *kmap_atomic_to_page(void *ptr) +{ + unsigned long idx, vaddr = (unsigned long)ptr; + pte_t *pte; + + if (vaddr < FIXADDR_START) + return virt_to_page(ptr); + + idx = virt_to_fix(vaddr); + pte = kmap_pte - (idx - FIX_KMAP_BEGIN); + return pte_page(*pte); +} +EXPORT_SYMBOL(kmap_atomic_to_page); + +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c new file mode 100644 index 00000000..f6679a7f --- /dev/null +++ b/arch/x86/mm/hugetlbpage.c @@ -0,0 +1,443 @@ +/* + * IA-32 Huge TLB Page Support for Kernel. + * + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/sysctl.h> +#include <asm/mman.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> + +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vm_flags != svm_flags || + sbase < svma->vm_start || svma->vm_end < s_end) + return 0; + + return saddr; +} + +static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & PUD_MASK; + unsigned long end = base + PUD_SIZE; + + /* + * check on proper vm_flags and page table alignment + */ + if (vma->vm_flags & VM_MAYSHARE && + vma->vm_start <= base && end <= vma->vm_end) + return 1; + return 0; +} + +/* + * search for a shareable pmd page for hugetlb. + */ +static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + struct prio_tree_iter iter; + struct vm_area_struct *svma; + unsigned long saddr; + pte_t *spte = NULL; + + if (!vma_shareable(vma, addr)) + return; + + mutex_lock(&mapping->i_mmap_mutex); + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + if (svma == vma) + continue; + + saddr = page_table_shareable(svma, vma, addr, idx); + if (saddr) { + spte = huge_pte_offset(svma->vm_mm, saddr); + if (spte) { + get_page(virt_to_page(spte)); + break; + } + } + } + + if (!spte) + goto out; + + spin_lock(&mm->page_table_lock); + if (pud_none(*pud)) + pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); + else + put_page(virt_to_page(spte)); + spin_unlock(&mm->page_table_lock); +out: + mutex_unlock(&mapping->i_mmap_mutex); +} + +/* + * unmap huge page backed by shared pte. + * + * Hugetlb pte page is ref counted at the time of mapping. If pte is shared + * indicated by page_count > 1, unmap is achieved by clearing pud and + * decrementing the ref count. If count == 1, the pte page is not shared. + * + * called with vma->vm_mm->page_table_lock held. + * + * returns: 1 successfully unmapped a shared pte page + * 0 the underlying pte page is not shared, or it is the last user + */ +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + pgd_t *pgd = pgd_offset(mm, *addr); + pud_t *pud = pud_offset(pgd, *addr); + + BUG_ON(page_count(virt_to_page(ptep)) == 0); + if (page_count(virt_to_page(ptep)) == 1) + return 0; + + pud_clear(pud); + put_page(virt_to_page(ptep)); + *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + return 1; +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (pud) { + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (pud_none(*pud)) + huge_pmd_share(mm, addr, pud); + pte = (pte_t *) pmd_alloc(mm, pud, addr); + } + } + BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); + + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, addr); + if (pgd_present(*pgd)) { + pud = pud_offset(pgd, addr); + if (pud_present(*pud)) { + if (pud_large(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + } + } + return (pte_t *) pmd; +} + +#if 0 /* This is just for testing */ +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + unsigned long start = address; + int length = 1; + int nr; + struct page *page; + struct vm_area_struct *vma; + + vma = find_vma(mm, addr); + if (!vma || !is_vm_hugetlb_page(vma)) + return ERR_PTR(-EINVAL); + + pte = huge_pte_offset(mm, address); + + /* hugetlb should be locked, and hence, prefaulted */ + WARN_ON(!pte || pte_none(*pte)); + + page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; + + WARN_ON(!PageHead(page)); + + return page; +} + +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +int pud_huge(pud_t pud) +{ + return 0; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + return NULL; +} + +#else + +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + return ERR_PTR(-EINVAL); +} + +int pmd_huge(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PSE); +} + +int pud_huge(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_PSE); +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pmd); + if (page) + page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + return page; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); + return page; +} + +#endif + +/* x86_64 also uses this file */ + +#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA +static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long start_addr; + + if (len > mm->cached_hole_size) { + start_addr = mm->free_area_cache; + } else { + start_addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + } + +full_search: + addr = ALIGN(start_addr, huge_page_size(h)); + + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). */ + if (TASK_SIZE - len < addr) { + /* + * Start a new search - just in case we missed + * some holes. + */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; + goto full_search; + } + return -ENOMEM; + } + if (!vma || addr + len <= vma->vm_start) { + mm->free_area_cache = addr + len; + return addr; + } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = ALIGN(vma->vm_end, huge_page_size(h)); + } +} + +static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, + unsigned long addr0, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long base = mm->mmap_base; + unsigned long addr = addr0; + unsigned long largest_hole = mm->cached_hole_size; + unsigned long start_addr; + + /* don't allow allocations above current base */ + if (mm->free_area_cache > base) + mm->free_area_cache = base; + + if (len <= largest_hole) { + largest_hole = 0; + mm->free_area_cache = base; + } +try_again: + start_addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (mm->free_area_cache < len) + goto fail; + + /* either no address requested or can't fit in requested address hole */ + addr = (mm->free_area_cache - len) & huge_page_mask(h); + do { + /* + * Lookup failure means no vma is above this address, + * i.e. return with success: + */ + vma = find_vma(mm, addr); + if (!vma) + return addr; + + if (addr + len <= vma->vm_start) { + /* remember the address as a hint for next time */ + mm->cached_hole_size = largest_hole; + return (mm->free_area_cache = addr); + } else if (mm->free_area_cache == vma->vm_end) { + /* pull free_area_cache down to the first hole */ + mm->free_area_cache = vma->vm_start; + mm->cached_hole_size = largest_hole; + } + + /* remember the largest hole we saw so far */ + if (addr + largest_hole < vma->vm_start) + largest_hole = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = (vma->vm_start - len) & huge_page_mask(h); + } while (len <= vma->vm_start); + +fail: + /* + * if hint left us with no space for the requested + * mapping then try again: + */ + if (start_addr != base) { + mm->free_area_cache = base; + largest_hole = 0; + goto try_again; + } + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; + addr = hugetlb_get_unmapped_area_bottomup(file, addr0, + len, pgoff, flags); + + /* + * Restore the topdown base: + */ + mm->free_area_cache = base; + mm->cached_hole_size = ~0UL; + + return addr; +} + +unsigned long +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + struct hstate *h = hstate_file(file); + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + if (mm->get_unmapped_area == arch_get_unmapped_area) + return hugetlb_get_unmapped_area_bottomup(file, addr, len, + pgoff, flags); + else + return hugetlb_get_unmapped_area_topdown(file, addr, len, + pgoff, flags); +} + +#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ + +#ifdef CONFIG_X86_64 +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (ps == PUD_SIZE && cpu_has_gbpages) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", + ps >> 20); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); +#endif diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 00000000..4f0cec7e --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,416 @@ +#include <linux/gfp.h> +#include <linux/initrd.h> +#include <linux/ioport.h> +#include <linux/swap.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> /* for max_low_pfn */ + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/init.h> +#include <asm/page.h> +#include <asm/page_types.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/proto.h> +#include <asm/dma.h> /* for MAX_DMA_PFN */ + +unsigned long __initdata pgt_buf_start; +unsigned long __meminitdata pgt_buf_end; +unsigned long __meminitdata pgt_buf_top; + +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; + phys_addr_t base; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + good_end = max_pfn_mapped << PAGE_SHIFT; + + base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); + if (!base) + panic("Cannot find space for the kernel page tables"); + + pgt_buf_start = base >> PAGE_SHIFT; + pgt_buf_end = pgt_buf_start; + pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); +} + +void __init native_pagetable_reserve(u64 start, u64 end) +{ + memblock_reserve(start, end - start); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<<PAGE_SHIFT; + mr[nr_range].end = end_pfn<<PAGE_SHIFT; + mr[nr_range].page_size_mask = page_size_mask; + nr_range++; + } + + return nr_range; +} + +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ +unsigned long __init_refok init_memory_mapping(unsigned long start, + unsigned long end) +{ + unsigned long page_size_mask = 0; + unsigned long start_pfn, end_pfn; + unsigned long ret = 0; + unsigned long pos; + + struct map_range mr[NR_RANGE_MR]; + int nr_range, i; + int use_pse, use_gbpages; + + printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } + + if (use_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; + if (use_pse) + page_size_mask |= 1 << PG_LEVEL_2M; + + memset(mr, 0, sizeof(mr)); + nr_range = 0; + + /* head if not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pos = end_pfn << PAGE_SHIFT; + } + +#ifdef CONFIG_X86_64 + /* big page (1G) range */ + start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); + pos = end_pfn << PAGE_SHIFT; + } + + /* tail is not big page (1G) alignment */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<<PG_LEVEL_2M)); + pos = end_pfn << PAGE_SHIFT; + } +#endif + + /* tail is not big page (2M) alignment */ + start_pfn = pos>>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( + (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); + + /* + * Find space for the kernel direct mapping tables. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. + */ + if (!after_bootmem) + find_early_table_space(end, use_pse, use_gbpages); + + for (i = 0; i < nr_range; i++) + ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, + mr[i].page_size_mask); + +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); + + load_cr3(swapper_pg_dir); +#endif + + __flush_tlb_all(); + + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_reserve, on Xen it also + * means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ + if (!after_bootmem && pgt_buf_end > pgt_buf_start) + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + + +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr; + unsigned long begin_aligned, end_aligned; + + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) + return; + + addr = begin; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, end); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable and non-executable first. + */ + set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + /* + * end could be not aligned, and We can not align that, + * decompresser could be confused by aligned initrd_end + * We already reserve the end partial page before in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * So here We can do PAGE_ALIGN() safely to get partial page to be freed + */ + free_init_pages("initrd memory", start, PAGE_ALIGN(end)); +} +#endif + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); +} + diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c new file mode 100644 index 00000000..575d86f8 --- /dev/null +++ b/arch/x86/mm/init_32.c @@ -0,0 +1,959 @@ +/* + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#include <linux/module.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/memory_hotplug.h> +#include <linux/initrd.h> +#include <linux/cpumask.h> +#include <linux/gfp.h> + +#include <asm/asm.h> +#include <asm/bios_ebda.h> +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/bugs.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/olpc_ofw.h> +#include <asm/pgalloc.h> +#include <asm/sections.h> +#include <asm/paravirt.h> +#include <asm/setup.h> +#include <asm/cacheflush.h> +#include <asm/page_types.h> +#include <asm/init.h> + +unsigned long highstart_pfn, highend_pfn; + +static noinline int do_test_wp_bit(void); + +bool __read_mostly __vmalloc_start_set = false; + +static __init void *alloc_low_page(void) +{ + unsigned long pfn = pgt_buf_end++; + void *adr; + + if (pfn >= pgt_buf_top) + panic("alloc_low_page: ran out of memory"); + + adr = __va(pfn * PAGE_SIZE); + clear_page(adr); + return adr; +} + +/* + * Creates a middle page table and puts a pointer to it in the + * given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t * __init one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { + if (after_bootmem) + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); + else + pmd_table = (pmd_t *)alloc_low_page(); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; + } +#endif + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); + + return pmd_table; +} + +/* + * Create a page table and place a pointer to it in a middle page + * directory entry: + */ +static pte_t * __init one_page_table_init(pmd_t *pmd) +{ + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { + pte_t *page_table = NULL; + + if (after_bootmem) { +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); +#endif + if (!page_table) + page_table = + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); + } else + page_table = (pte_t *)alloc_low_page(); + + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + } + + return pte_offset_kernel(pmd, 0); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; +} + +static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, + unsigned long vaddr, pte_t *lastpte) +{ +#ifdef CONFIG_HIGHMEM + /* + * Something (early fixmap) may already have put a pte + * page here, which causes the page table allocation + * to become nonlinear. Attempt to fix it, and if it + * is still nonlinear then we have to bug. + */ + int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT; + int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT; + + if (pmd_idx_kmap_begin != pmd_idx_kmap_end + && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin + && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end + && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start + || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { + pte_t *newpte; + int i; + + BUG_ON(after_bootmem); + newpte = alloc_low_page(); + for (i = 0; i < PTRS_PER_PTE; i++) + set_pte(newpte + i, pte[i]); + + paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE)); + BUG_ON(newpte != pte_offset_kernel(pmd, 0)); + __flush_tlb_all(); + + paravirt_release_pte(__pa(pte) >> PAGE_SHIFT); + pte = newpte; + } + BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1) + && vaddr > fix_to_virt(FIX_KMAP_END) + && lastpte && lastpte + PTRS_PER_PTE != pte); +#endif + return pte; +} + +/* + * This function initializes a certain range of kernel virtual memory + * with new bootmem page tables, everywhere page tables are missing in + * the given range. + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without + * checking the pgd every time. + */ +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) +{ + int pgd_idx, pmd_idx; + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte = NULL; + + vaddr = start; + pgd_idx = pgd_index(vaddr); + pmd_idx = pmd_index(vaddr); + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + pmd = pmd + pmd_index(vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { + pte = page_table_kmap_check(one_page_table_init(pmd), + pmd, vaddr, pte); + + vaddr += PMD_SIZE; + } + pmd_idx = 0; + } +} + +static inline int is_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) + return 1; + return 0; +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: + */ +unsigned long __init +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) +{ + int use_pse = page_size_mask == (1<<PG_LEVEL_2M); + unsigned long last_map_addr = end; + unsigned long start_pfn, end_pfn; + pgd_t *pgd_base = swapper_pg_dir; + int pgd_idx, pmd_idx, pte_ofs; + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + unsigned pages_2m, pages_4k; + int mapping_iter; + + start_pfn = start >> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; + + if (!cpu_has_pse) + use_pse = 0; + +repeat: + pages_2m = pages_4k = 0; + pfn = start_pfn; + pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = one_md_table_init(pgd); + + if (pfn >= end_pfn) + continue; +#ifdef CONFIG_X86_PAE + pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pmd += pmd_idx; +#else + pmd_idx = 0; +#endif + for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ + if (use_pse) { + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + /* + * first pass will use the same initial + * identity mapping attribute + _PAGE_PSE. + */ + pgprot_t init_prot = + __pgprot(PTE_IDENT_ATTR | + _PAGE_PSE); + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + pages_2m++; + if (mapping_iter == 1) + set_pmd(pmd, pfn_pmd(pfn, init_prot)); + else + set_pmd(pmd, pfn_pmd(pfn, prot)); + + pfn += PTRS_PER_PTE; + continue; + } + pte = one_page_table_init(pmd); + + pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + /* + * first pass will use the same initial + * identity mapping attribute. + */ + pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); + + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + pages_4k++; + if (mapping_iter == 1) { + set_pte(pte, pfn_pte(pfn, init_prot)); + last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE; + } else + set_pte(pte, pfn_pte(pfn, prot)); + } + } + } + if (mapping_iter == 1) { + /* + * update direct mapping page count only in the first + * iteration. + */ + update_page_count(PG_LEVEL_2M, pages_2m); + update_page_count(PG_LEVEL_4K, pages_4k); + + /* + * local global flush tlb, which will flush the previous + * mappings present in both small and large page TLB's. + */ + __flush_tlb_all(); + + /* + * Second iteration will set the actual desired PTE attributes. + */ + mapping_iter = 2; + goto repeat; + } + return last_map_addr; +} + +pte_t *kmap_pte; +pgprot_t kmap_prot; + +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} + +static void __init kmap_init(void) +{ + unsigned long kmap_vstart; + + /* + * Cache the first kmap pte: + */ + kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); + kmap_pte = kmap_get_fixmap_pte(kmap_vstart); + + kmap_prot = PAGE_KERNEL; +} + +#ifdef CONFIG_HIGHMEM +static void __init permanent_kmaps_init(pgd_t *pgd_base) +{ + unsigned long vaddr; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = PKMAP_BASE; + page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; +} + +static void __init add_one_highpage_init(struct page *page) +{ + ClearPageReserved(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; +} + +void __init add_highpages_with_active_regions(int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + phys_addr_t start, end; + u64 i; + + for_each_free_mem_range(i, nid, &start, &end, NULL) { + unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), + start_pfn, end_pfn); + unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), + start_pfn, end_pfn); + for ( ; pfn < e_pfn; pfn++) + if (pfn_valid(pfn)) + add_one_highpage_init(pfn_to_page(pfn)); + } +} +#else +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +#endif /* CONFIG_HIGHMEM */ + +void __init native_pagetable_setup_start(pgd_t *base) +{ + unsigned long pfn, va; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * Remove any mappings which extend past the end of physical + * memory from the boot time page table: + */ + for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) { + va = PAGE_OFFSET + (pfn<<PAGE_SHIFT); + pgd = base + pgd_index(va); + if (!pgd_present(*pgd)) + break; + + pud = pud_offset(pgd, va); + pmd = pmd_offset(pud, va); + if (!pmd_present(*pmd)) + break; + + pte = pte_offset_kernel(pmd, va); + if (!pte_present(*pte)) + break; + + pte_clear(NULL, va, pte); + } + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); +} + +void __init native_pagetable_setup_done(pgd_t *base) +{ +} + +/* + * Build a proper pagetable for the kernel mappings. Up until this + * point, we've been running on some set of pagetables constructed by + * the boot process. + * + * If we're booting on native hardware, this will be a pagetable + * constructed in arch/x86/kernel/head_32.S. The root of the + * pagetable will be swapper_pg_dir. + * + * If we're booting paravirtualized under a hypervisor, then there are + * more options: we may already be running PAE, and the pagetable may + * or may not be based in swapper_pg_dir. In any case, + * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * appropriately for the rest of the initialization to work. + * + * In general, pagetable_init() assumes that the pagetable may already + * be partially populated, and so it avoids stomping on any existing + * mappings. + */ +void __init early_ioremap_page_table_range_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; + + /* + * Fixed mappings, only the page table structure has to be + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; + page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); +} + +static void __init pagetable_init(void) +{ + pgd_t *pgd_base = swapper_pg_dir; + + permanent_kmaps_init(pgd_base); +} + +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +/* user-defined highmem size */ +static unsigned int highmem_pages = -1; + +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; + + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); + +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" +/* + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: + */ +void __init lowmem_pfn_init(void) +{ + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + max_low_pfn = MAXMEM_PFN; + + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } +#ifndef CONFIG_HIGHMEM + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; +#else /* !CONFIG_HIGHMEM */ +#ifndef CONFIG_HIGHMEM64G + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } +#endif /* !CONFIG_HIGHMEM64G */ +#endif /* !CONFIG_HIGHMEM */ +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + if (max_pfn <= MAXMEM_PFN) + lowmem_pfn_init(); + else + highmem_pfn_init(); +} + +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init initmem_init(void) +{ +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + sparse_memory_present_with_active_regions(0); + +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; +#endif + __vmalloc_start_set = true; + + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + + setup_bootmem_allocator(); +} +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ + +void __init setup_bootmem_allocator(void) +{ + printk(KERN_INFO " mapped low ram: 0 - %08lx\n", + max_pfn_mapped<<PAGE_SHIFT); + printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); + + after_bootmem = 1; +} + +/* + * paging_init() sets up the page tables - note that the first 8MB are + * already mapped by head.S. + * + * This routines also unmaps the page at virtual kernel address 0, so + * that we can trap those pesky NULL-reference errors in the kernel. + */ +void __init paging_init(void) +{ + pagetable_init(); + + __flush_tlb_all(); + + kmap_init(); + + /* + * NOTE: at this point the bootmem allocator is fully available. + */ + olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + zone_sizes_init(); +} + +/* + * Test if the WP bit works in supervisor mode. It isn't supported on 386's + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. + */ +static void __init test_wp_bit(void) +{ + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); + + /* Any page-aligned address will do, the test is non-destructive */ + __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); + boot_cpu_data.wp_works_ok = do_test_wp_bit(); + clear_fixmap(FIX_WP_TEST); + + if (!boot_cpu_data.wp_works_ok) { + printk(KERN_CONT "No.\n"); +#ifdef CONFIG_X86_WP_WORKS_OK + panic( + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); +#endif + } else { + printk(KERN_CONT "Ok.\n"); + } +} + +void __init mem_init(void) +{ + int codesize, reservedpages, datasize, initsize; + int tmp; + + pci_iommu_alloc(); + +#ifdef CONFIG_FLATMEM + BUG_ON(!mem_map); +#endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before free_all_bootmem(). Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + + /* this will put all low memory onto the freelists */ + totalram_pages += free_all_bootmem(); + + reservedpages = 0; + for (tmp = 0; tmp < max_low_pfn; tmp++) + /* + * Only count reserved RAM pages: + */ + if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) + reservedpages++; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", + nr_free_pages() << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), + codesize >> 10, + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10, + totalhigh_pages << (PAGE_SHIFT-10)); + + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, +#endif + + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, + + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + +#ifdef CONFIG_HIGHMEM + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif + BUG_ON(VMALLOC_START >= VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); + + if (boot_cpu_data.wp_works_ok < 0) + test_wp_bit(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(nid); + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(nid, zone, start_pfn, nr_pages); +} +#endif + +/* + * This function cannot be __init, since exceptions don't work in that + * section. Put this after the callers, so that it cannot be inlined. + */ +static noinline int do_test_wp_bit(void) +{ + char tmp_reg; + int flag; + + __asm__ __volatile__( + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" + "2: \n" + _ASM_EXTABLE(1b,2b) + :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), + "=q" (tmp_reg), + "=r" (flag) + :"2" (1) + :"memory"); + + return flag; +} + +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +int kernel_set_to_readonly __read_mostly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + +static void mark_nxdata_nx(void) +{ + /* + * When this called, init has already been executed and released, + * so everything past _etext should be NX. + */ + unsigned long start = PFN_ALIGN(_etext); + /* + * This comes from is_kernel_text upper limit. Also HPAGE where used: + */ + unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; + + if (__supported_pte_mask & _PAGE_NX) + printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); + set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + + kernel_set_to_readonly = 1; + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif + + start += size; + size = (unsigned long)__end_rodata - start; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif + mark_nxdata_nx(); +} +#endif + diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c new file mode 100644 index 00000000..fc18be0f --- /dev/null +++ b/arch/x86/mm/init_64.c @@ -0,0 +1,988 @@ +/* + * linux/arch/x86_64/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2000 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/proc_fs.h> +#include <linux/pci.h> +#include <linux/pfn.h> +#include <linux/poison.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/nmi.h> +#include <linux/gfp.h> + +#include <asm/processor.h> +#include <asm/bios_ebda.h> +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> +#include <asm/cacheflush.h> +#include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> + +static int __init parse_direct_gbpages_off(char *arg) +{ + direct_gbpages = 0; + return 0; +} +early_param("nogbpages", parse_direct_gbpages_off); + +static int __init parse_direct_gbpages_on(char *arg) +{ + direct_gbpages = 1; + return 0; +} +early_param("gbpages", parse_direct_gbpages_on); + +/* + * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the + * physical space so we can cache the place of the first one and move + * around without checking the pgd every time. + */ + +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +int force_personality32; + +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} + +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) +{ + void *ptr; + + if (after_bootmem) + ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + else + ptr = alloc_bootmem_pages(PAGE_SIZE); + + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + + return ptr; +} + +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +{ + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} + +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ + if (pud_none(*pud)) { + pmd_t *pmd = (pmd_t *) spp_getpage(); + pud_populate(&init_mm, pud, pmd); + if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); + } + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *) spp_getpage(); + pmd_populate_kernel(&init_mm, pmd, pte); + if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "PAGETABLE BUG #02!\n"); + } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); + + set_pte(pte, new_pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + pud_t *pud_page; + + pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); + return; + } + pud_page = (pud_t*)pgd_page_vaddr(*pgd); + set_pte_vaddr_pud(pud_page, vaddr, pteval); +} + +pmd_t * __init populate_extra_pmd(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; + + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); +} + +/* + * Create large page table mappings for a range of physical addresses. + */ +static void __init __init_extra_mapping(unsigned long phys, unsigned long size, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); + for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { + pgd = pgd_offset_k((unsigned long)__va(phys)); + if (pgd_none(*pgd)) { + pud = (pud_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + _PAGE_USER)); + } + pud = pud_offset(pgd, (unsigned long)__va(phys)); + if (pud_none(*pud)) { + pmd = (pmd_t *) spp_getpage(); + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | + _PAGE_USER)); + } + pmd = pmd_offset(pud, phys); + BUG_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(phys | pgprot_val(prot))); + } +} + +void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); +} + +void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) +{ + __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); +} + +/* + * The head.S code sets up the kernel high mapping: + * + * from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text) + * + * phys_addr holds the negative offset to the kernel, which is added + * to the compile time generated pmds. This results in invalid pmds up + * to the point where we hit the physaddr 0 mapping. + * + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. This catches the invalid pmds as + * well, as they are located before _text: + */ +void __init cleanup_highmap(void) +{ + unsigned long vaddr = __START_KERNEL_map; + unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; + pmd_t *pmd = level2_kernel_pgt; + + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { + if (pmd_none(*pmd)) + continue; + if (vaddr < (unsigned long) _text || vaddr > end) + set_pmd(pmd, __pmd(0)); + } +} + +static __ref void *alloc_low_page(unsigned long *phys) +{ + unsigned long pfn = pgt_buf_end++; + void *adr; + + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + *phys = __pa(adr); + + return adr; + } + + if (pfn >= pgt_buf_top) + panic("alloc_low_page: ran out of memory"); + + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); + clear_page(adr); + *phys = pfn * PAGE_SIZE; + return adr; +} + +static __ref void *map_low_page(void *virt) +{ + void *adr; + unsigned long phys, left; + + if (after_bootmem) + return virt; + + phys = __pa(virt); + left = phys & (PAGE_SIZE - 1); + adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); + adr = (void *)(((unsigned long)adr) | left); + + return adr; +} + +static __ref void unmap_low_page(void *adr) +{ + if (after_bootmem) + return; + + early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); +} + +static unsigned long __meminit +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) +{ + unsigned pages = 0; + unsigned long last_map_addr = end; + int i; + + pte_t *pte = pte_page + pte_index(addr); + + for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { + + if (addr >= end) { + if (!after_bootmem) { + for(; i < PTRS_PER_PTE; i++, pte++) + set_pte(pte, __pte(0)); + } + break; + } + + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ + if (pte_val(*pte)) { + pages++; + continue; + } + + if (0) + printk(" pte=%p addr=%lx pte=%016lx\n", + pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); + last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; + } + + update_page_count(PG_LEVEL_4K, pages); + + return last_map_addr; +} + +static unsigned long __meminit +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, + unsigned long page_size_mask, pgprot_t prot) +{ + unsigned long pages = 0; + unsigned long last_map_addr = end; + + int i = pmd_index(address); + + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + unsigned long pte_phys; + pmd_t *pmd = pmd_page + pmd_index(address); + pte_t *pte; + pgprot_t new_prot = prot; + + if (address >= end) { + if (!after_bootmem) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + } + break; + } + + if (pmd_val(*pmd)) { + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); + pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); + last_map_addr = phys_pte_init(pte, address, + end, prot); + unmap_low_page(pte); + spin_unlock(&init_mm.page_table_lock); + continue; + } + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) { + pages++; + continue; + } + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); + } + + if (page_size_mask & (1<<PG_LEVEL_2M)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = (address & PMD_MASK) + PMD_SIZE; + continue; + } + + pte = alloc_low_page(&pte_phys); + last_map_addr = phys_pte_init(pte, address, end, new_prot); + unmap_low_page(pte); + + spin_lock(&init_mm.page_table_lock); + pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + spin_unlock(&init_mm.page_table_lock); + } + update_page_count(PG_LEVEL_2M, pages); + return last_map_addr; +} + +static unsigned long __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, + unsigned long page_size_mask) +{ + unsigned long pages = 0; + unsigned long last_map_addr = end; + int i = pud_index(addr); + + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); + pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; + + if (addr >= end) + break; + + if (!after_bootmem && + !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); + continue; + } + + if (pud_val(*pud)) { + if (!pud_large(*pud)) { + pmd = map_low_page(pmd_offset(pud, 0)); + last_map_addr = phys_pmd_init(pmd, addr, end, + page_size_mask, prot); + unmap_low_page(pmd); + __flush_tlb_all(); + continue; + } + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) { + pages++; + continue; + } + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); + } + + if (page_size_mask & (1<<PG_LEVEL_1G)) { + pages++; + spin_lock(&init_mm.page_table_lock); + set_pte((pte_t *)pud, + pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); + last_map_addr = (addr & PUD_MASK) + PUD_SIZE; + continue; + } + + pmd = alloc_low_page(&pmd_phys); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); + unmap_low_page(pmd); + + spin_lock(&init_mm.page_table_lock); + pud_populate(&init_mm, pud, __va(pmd_phys)); + spin_unlock(&init_mm.page_table_lock); + } + __flush_tlb_all(); + + update_page_count(PG_LEVEL_1G, pages); + + return last_map_addr; +} + +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) +{ + bool pgd_changed = false; + unsigned long next, last_map_addr = end; + unsigned long addr; + + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); + addr = start; + + for (; start < end; start = next) { + pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; + pud_t *pud; + + next = (start + PGDIR_SIZE) & PGDIR_MASK; + if (next > end) + next = end; + + if (pgd_val(*pgd)) { + pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); + last_map_addr = phys_pud_init(pud, __pa(start), + __pa(end), page_size_mask); + unmap_low_page(pud); + continue; + } + + pud = alloc_low_page(&pud_phys); + last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + page_size_mask); + unmap_low_page(pud); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, __va(pud_phys)); + spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; + } + + if (pgd_changed) + sync_global_pgds(addr, end); + + __flush_tlb_all(); + + return last_map_addr; +} + +#ifndef CONFIG_NUMA +void __init initmem_init(void) +{ + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); +} +#endif + +void __init paging_init(void) +{ + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); + + /* + * clear the default setting with node 0 + * note: don't use nodes_clear here, that is really clearing when + * numa support is not compiled in, and later node_set_state + * will not set it back. + */ + node_clear_state(0, N_NORMAL_MEMORY); + + zone_sizes_init(); +} + +/* + * Memory hotplug specific functions + */ +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; + unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + last_mapped_pfn = init_memory_mapping(start, start + size); + if (last_mapped_pfn > max_pfn_mapped) + max_pfn_mapped = last_mapped_pfn; + + ret = __add_pages(nid, zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_add_memory); + +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static struct kcore_list kcore_vsyscall; + +void __init mem_init(void) +{ + long codesize, reservedpages, datasize, initsize; + unsigned long absent_pages; + + pci_iommu_alloc(); + + /* clear_bss() already clear the empty_zero_page */ + + reservedpages = 0; + + /* this will put all low memory onto the freelists */ +#ifdef CONFIG_NUMA + totalram_pages = numa_free_all_bootmem(); +#else + totalram_pages = free_all_bootmem(); +#endif + + absent_pages = absent_pages_in_range(0, max_pfn); + reservedpages = max_pfn - totalram_pages - absent_pages; + after_bootmem = 1; + + codesize = (unsigned long) &_etext - (unsigned long) &_text; + datasize = (unsigned long) &_edata - (unsigned long) &_etext; + initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; + + /* Register memory areas for /proc/kcore */ + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); + + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n", + nr_free_pages() << (PAGE_SHIFT-10), + max_pfn << (PAGE_SHIFT-10), + codesize >> 10, + absent_pages << (PAGE_SHIFT-10), + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); +} + +#ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); + +int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + /* + * Set the kernel identity mapping for text RO. + */ + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + +void mark_rodata_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + kernel_set_to_readonly = 1; + + /* + * The rodata section (but not the kernel text!) should also be + * not-executable. + */ + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); + + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); +} + +#endif + +int kern_addr_valid(unsigned long addr) +{ + unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (above != 0 && above != -1UL) + return 0; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + return 0; + + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return 0; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return 0; + + if (pmd_large(*pmd)) + return pfn_valid(pmd_pfn(*pmd)); + + pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) + return 0; + + return pfn_valid(pte_pfn(*pte)); +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + if (vma == &gate_vma) + return "[vsyscall]"; + return NULL; +} + +#ifdef CONFIG_X86_UV +unsigned long memory_block_size_bytes(void) +{ + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Initialise the sparsemem vmemmap using huge-pages at the PMD level. + */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + for (; addr < end; addr = next) { + void *p = NULL; + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + + if (!cpu_has_pse) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + pmd = vmemmap_pmd_populate(pud, addr, node); + + if (!pmd) + return -ENOMEM; + + p = vmemmap_pte_populate(pmd, addr, node); + + if (!p) + return -ENOMEM; + + addr_end = addr + PAGE_SIZE; + p_end = p + PAGE_SIZE; + } else { + next = pmd_addr_end(addr, end); + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node); + if (!p) + return -ENOMEM; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); + + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + } else + vmemmap_verify((pte_t *)pmd, node, addr, next); + } + + } + sync_global_pgds((unsigned long)start_page, end); + return 0; +} + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} +#endif diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c new file mode 100644 index 00000000..7b179b49 --- /dev/null +++ b/arch/x86/mm/iomap_32.c @@ -0,0 +1,119 @@ +/* + * Copyright © 2008 Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + */ + +#include <asm/iomap.h> +#include <asm/pat.h> +#include <linux/module.h> +#include <linux/highmem.h> + +static int is_io_mapping_possible(resource_size_t base, unsigned long size) +{ +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) + /* There is no way to map greater than 1 << 32 address without PAE */ + if (base + size > 0x100000000ULL) + return 0; +#endif + return 1; +} + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); + +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + pagefault_disable(); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR * smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; +} + +/* + * Map 'pfn' using protections 'prot' + */ +void __iomem * +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +{ + /* + * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. + * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the + * MTRR is UC or WC. UC_MINUS gets the real intention, of the + * user, which is "WC if the MTRR is WC, UC if you can't do that." + */ + if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) + prot = PAGE_KERNEL_UC_MINUS; + + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); +} +EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); + +void +iounmap_atomic(void __iomem *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ + kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + } + + pagefault_enable(); +} +EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 00000000..be1ef574 --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,628 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mmiotrace.h> + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/pat.h> + +#include "physaddr.h" + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +int ioremap_change_attr(unsigned long vaddr, unsigned long size, + unsigned long prot_val) +{ + unsigned long nrpages = size >> PAGE_SHIFT; + int err; + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + err = _set_memory_uc(vaddr, nrpages); + break; + case _PAGE_CACHE_WC: + err = _set_memory_wc(vaddr, nrpages); + break; + case _PAGE_CACHE_WB: + err = _set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) +{ + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; + struct vm_struct *area; + unsigned long new_prot_val; + pgprot_t prot; + int retval; + void __iomem *ret_addr; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + if (!phys_addr_valid(phys_addr)) { + printk(KERN_WARNING "ioremap: invalid physical address %llx\n", + (unsigned long long)phys_addr); + WARN_ON_ONCE(1); + return NULL; + } + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (is_ISA_range(phys_addr, last_addr)) + return (__force void __iomem *)phys_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { + int is_ram = page_is_ram(pfn); + + if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + return NULL; + WARN_ON_ONCE(is_ram); + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, + prot_val, &new_prot_val); + if (retval) { + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); + return NULL; + } + + if (prot_val != new_prot_val) { + if (!is_new_memtype_allowed(phys_addr, size, + prot_val, new_prot_val)) { + printk(KERN_ERR + "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", + (unsigned long long)phys_addr, + (unsigned long long)(phys_addr + size), + prot_val, new_prot_val); + goto err_free_memtype; + } + prot_val = new_prot_val; + } + + switch (prot_val) { + case _PAGE_CACHE_UC: + default: + prot = PAGE_KERNEL_IO_NOCACHE; + break; + case _PAGE_CACHE_UC_MINUS: + prot = PAGE_KERNEL_IO_UC_MINUS; + break; + case _PAGE_CACHE_WC: + prot = PAGE_KERNEL_IO_WC; + break; + case _PAGE_CACHE_WB: + prot = PAGE_KERNEL_IO; + break; + } + + /* + * Ok, go for it.. + */ + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (!area) + goto err_free_memtype; + area->phys_addr = phys_addr; + vaddr = (unsigned long) area->addr; + + if (kernel_map_sync_memtype(phys_addr, size, prot_val)) + goto err_free_area; + + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) + goto err_free_area; + + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + + return ret_addr; +err_free_area: + free_vm_area(area); +err_free_memtype: + free_memtype(phys_addr, phys_addr + size); + return NULL; +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) +{ + /* + * Ideally, this should be: + * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; + * + * Till we fix all X drivers to use ioremap_wc(), we will use + * UC MINUS. + */ + unsigned long val = _PAGE_CACHE_UC_MINUS; + + return __ioremap_caller(phys_addr, size, val, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_nocache); + +/** + * ioremap_wc - map memory into CPU space write combined + * @offset: bus address of the memory + * @size: size of the resource to map + * + * This version of ioremap ensures that the memory is marked write combining. + * Write combining allows faster writes to some hardware devices. + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) +{ + if (pat_enabled) + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); + else + return ioremap_nocache(phys_addr, size); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) +{ + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_cache); + +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + mmiotrace_iounmap(addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == (void __force *)addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); + + /* Finally remove it */ + o = remove_vm_area((void __force *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void __force *)ioremap_cache(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + +static int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; + +static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) +{ + /* Don't assume we're using swapper_pg_dir at this point */ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(addr)]; + pud_t *pud = pud_offset(pgd, addr); + pmd_t *pmd = pmd_offset(pud, addr); + + return pmd; +} + +static inline pte_t * __init early_ioremap_pte(unsigned long addr) +{ + return &bm_pte[pte_index(addr)]; +} + +bool __init is_early_ioremap_ptep(pte_t *ptep) +{ + return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; +} + +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_init(void) +{ + pmd_t *pmd; + int i; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + + /* + * The boot-ioremap range spans multiple pmds, for which + * we are not prepared: + */ +#define __FIXADDR_TOP (-PAGE_SIZE) + BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); +#undef __FIXADDR_TOP + if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pmd %p != %p\n", + pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init early_ioremap_reset(void) +{ + after_paging_init = 1; +} + +static void __init __early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *pte; + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + + if (pgprot_val(flags)) + set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + else + pte_clear(&init_mm, addr, pte); + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + if (after_paging_init) + __set_fixmap(idx, phys, prot); + else + __early_set_fixmap(idx, phys, prot); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + +void __init fixup_early_ioremap(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i]) { + WARN_ON(1); + break; + } + } + + early_ioremap_init(); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) + return 0; + WARN(1, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + count); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx0, idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n", + (u64)phys_addr, size); + WARN_ON(1); + return NULL; + } + + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ", + (u64)phys_addr, size, slot); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. + */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init __iomem * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL); +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); + return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); + return; + } + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, slot); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile new file mode 100644 index 00000000..520b3bce --- /dev/null +++ b/arch/x86/mm/kmemcheck/Makefile @@ -0,0 +1 @@ +obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c new file mode 100644 index 00000000..dab41876 --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.c @@ -0,0 +1,227 @@ +#include <linux/interrupt.h> +#include <linux/kdebug.h> +#include <linux/kmemcheck.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/stacktrace.h> +#include <linux/string.h> + +#include "error.h" +#include "shadow.h" + +enum kmemcheck_error_type { + KMEMCHECK_ERROR_INVALID_ACCESS, + KMEMCHECK_ERROR_BUG, +}; + +#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) + +struct kmemcheck_error { + enum kmemcheck_error_type type; + + union { + /* KMEMCHECK_ERROR_INVALID_ACCESS */ + struct { + /* Kind of access that caused the error */ + enum kmemcheck_shadow state; + /* Address and size of the erroneous read */ + unsigned long address; + unsigned int size; + }; + }; + + struct pt_regs regs; + struct stack_trace trace; + unsigned long trace_entries[32]; + + /* We compress it to a char. */ + unsigned char shadow_copy[SHADOW_COPY_SIZE]; + unsigned char memory_copy[SHADOW_COPY_SIZE]; +}; + +/* + * Create a ring queue of errors to output. We can't call printk() directly + * from the kmemcheck traps, since this may call the console drivers and + * result in a recursive fault. + */ +static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; +static unsigned int error_count; +static unsigned int error_rd; +static unsigned int error_wr; +static unsigned int error_missed_count; + +static struct kmemcheck_error *error_next_wr(void) +{ + struct kmemcheck_error *e; + + if (error_count == ARRAY_SIZE(error_fifo)) { + ++error_missed_count; + return NULL; + } + + e = &error_fifo[error_wr]; + if (++error_wr == ARRAY_SIZE(error_fifo)) + error_wr = 0; + ++error_count; + return e; +} + +static struct kmemcheck_error *error_next_rd(void) +{ + struct kmemcheck_error *e; + + if (error_count == 0) + return NULL; + + e = &error_fifo[error_rd]; + if (++error_rd == ARRAY_SIZE(error_fifo)) + error_rd = 0; + --error_count; + return e; +} + +void kmemcheck_error_recall(void) +{ + static const char *desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", + [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", + [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", + [KMEMCHECK_SHADOW_FREED] = "freed", + }; + + static const char short_desc[] = { + [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', + [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', + [KMEMCHECK_SHADOW_INITIALIZED] = 'i', + [KMEMCHECK_SHADOW_FREED] = 'f', + }; + + struct kmemcheck_error *e; + unsigned int i; + + e = error_next_rd(); + if (!e) + return; + + switch (e->type) { + case KMEMCHECK_ERROR_INVALID_ACCESS: + printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", + 8 * e->size, e->state < ARRAY_SIZE(desc) ? + desc[e->state] : "(invalid shadow state)", + (void *) e->address); + + printk(KERN_WARNING); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) + printk(KERN_CONT "%02x", e->memory_copy[i]); + printk(KERN_CONT "\n"); + + printk(KERN_WARNING); + for (i = 0; i < SHADOW_COPY_SIZE; ++i) { + if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) + printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); + else + printk(KERN_CONT " ?"); + } + printk(KERN_CONT "\n"); + printk(KERN_WARNING "%*c\n", 2 + 2 + * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); + break; + case KMEMCHECK_ERROR_BUG: + printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); + break; + } + + __show_regs(&e->regs, 1); + print_stack_trace(&e->trace, 0); +} + +static void do_wakeup(unsigned long data) +{ + while (error_count > 0) + kmemcheck_error_recall(); + + if (error_missed_count > 0) { + printk(KERN_WARNING "kmemcheck: Lost %d error reports because " + "the queue was too small\n", error_missed_count); + error_missed_count = 0; + } +} + +static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); + +/* + * Save the context of an error report. + */ +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs) +{ + static unsigned long prev_ip; + + struct kmemcheck_error *e; + void *shadow_copy; + void *memory_copy; + + /* Don't report several adjacent errors from the same EIP. */ + if (regs->ip == prev_ip) + return; + prev_ip = regs->ip; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_INVALID_ACCESS; + + e->state = state; + e->address = address; + e->size = size; + + /* Save regs */ + memcpy(&e->regs, regs, sizeof(*regs)); + + /* Save stack trace */ + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 0; + save_stack_trace_regs(regs, &e->trace); + + /* Round address down to nearest 16 bytes */ + shadow_copy = kmemcheck_shadow_lookup(address + & ~(SHADOW_COPY_SIZE - 1)); + BUG_ON(!shadow_copy); + + memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); + + kmemcheck_show_addr(address); + memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); + memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); + kmemcheck_hide_addr(address); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} + +/* + * Save the context of a kmemcheck bug. + */ +void kmemcheck_error_save_bug(struct pt_regs *regs) +{ + struct kmemcheck_error *e; + + e = error_next_wr(); + if (!e) + return; + + e->type = KMEMCHECK_ERROR_BUG; + + memcpy(&e->regs, regs, sizeof(*regs)); + + e->trace.nr_entries = 0; + e->trace.entries = e->trace_entries; + e->trace.max_entries = ARRAY_SIZE(e->trace_entries); + e->trace.skip = 1; + save_stack_trace(&e->trace); + + tasklet_hi_schedule_first(&kmemcheck_tasklet); +} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h new file mode 100644 index 00000000..0efc2e8d --- /dev/null +++ b/arch/x86/mm/kmemcheck/error.h @@ -0,0 +1,15 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H +#define ARCH__X86__MM__KMEMCHECK__ERROR_H + +#include <linux/ptrace.h> + +#include "shadow.h" + +void kmemcheck_error_save(enum kmemcheck_shadow state, + unsigned long address, unsigned int size, struct pt_regs *regs); + +void kmemcheck_error_save_bug(struct pt_regs *regs); + +void kmemcheck_error_recall(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c new file mode 100644 index 00000000..d87dd6d0 --- /dev/null +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -0,0 +1,653 @@ +/** + * kmemcheck - a heavyweight memory checker for the linux kernel + * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no> + * (With a lot of help from Ingo Molnar and Pekka Enberg.) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2) as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/kmemcheck.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/page-flags.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/types.h> + +#include <asm/cacheflush.h> +#include <asm/kmemcheck.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> + +#include "error.h" +#include "opcode.h" +#include "pte.h" +#include "selftest.h" +#include "shadow.h" + + +#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 0 +#endif + +#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT +# define KMEMCHECK_ENABLED 1 +#endif + +#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT +# define KMEMCHECK_ENABLED 2 +#endif + +int kmemcheck_enabled = KMEMCHECK_ENABLED; + +int __init kmemcheck_init(void) +{ +#ifdef CONFIG_SMP + /* + * Limit SMP to use a single CPU. We rely on the fact that this code + * runs before SMP is set up. + */ + if (setup_max_cpus > 1) { + printk(KERN_INFO + "kmemcheck: Limiting number of CPUs to 1.\n"); + setup_max_cpus = 1; + } +#endif + + if (!kmemcheck_selftest()) { + printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); + kmemcheck_enabled = 0; + return -EINVAL; + } + + printk(KERN_INFO "kmemcheck: Initialized\n"); + return 0; +} + +early_initcall(kmemcheck_init); + +/* + * We need to parse the kmemcheck= option before any memory is allocated. + */ +static int __init param_kmemcheck(char *str) +{ + if (!str) + return -EINVAL; + + sscanf(str, "%d", &kmemcheck_enabled); + return 0; +} + +early_param("kmemcheck", param_kmemcheck); + +int kmemcheck_show_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +int kmemcheck_hide_addr(unsigned long address) +{ + pte_t *pte; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return 0; + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + __flush_tlb_one(address); + return 1; +} + +struct kmemcheck_context { + bool busy; + int balance; + + /* + * There can be at most two memory operands to an instruction, but + * each address can cross a page boundary -- so we may need up to + * four addresses that must be hidden/revealed for each fault. + */ + unsigned long addr[4]; + unsigned long n_addrs; + unsigned long flags; + + /* Data size of the instruction that caused a fault. */ + unsigned int size; +}; + +static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); + +bool kmemcheck_active(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + return data->balance > 0; +} + +/* Save an address that needs to be shown/hidden */ +static void kmemcheck_save_addr(unsigned long addr) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); + data->addr[data->n_addrs++] = addr; +} + +static unsigned int kmemcheck_show_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_show_addr(data->addr[i]); + + return n; +} + +static unsigned int kmemcheck_hide_all(void) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + unsigned int i; + unsigned int n; + + n = 0; + for (i = 0; i < data->n_addrs; ++i) + n += kmemcheck_hide_addr(data->addr[i]); + + return n; +} + +/* + * Called from the #PF handler. + */ +void kmemcheck_show(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + BUG_ON(!irqs_disabled()); + + if (unlikely(data->balance != 0)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->balance = 0; + return; + } + + /* + * None of the addresses actually belonged to kmemcheck. Note that + * this is not an error. + */ + if (kmemcheck_show_all() == 0) + return; + + ++data->balance; + + /* + * The IF needs to be cleared as well, so that the faulting + * instruction can run "uninterrupted". Otherwise, we might take + * an interrupt and start executing that before we've had a chance + * to hide the page again. + * + * NOTE: In the rare case of multiple faults, we must not override + * the original flags: + */ + if (!(regs->flags & X86_EFLAGS_TF)) + data->flags = regs->flags; + + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; +} + +/* + * Called from the #DB handler. + */ +void kmemcheck_hide(struct pt_regs *regs) +{ + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + int n; + + BUG_ON(!irqs_disabled()); + + if (unlikely(data->balance != 1)) { + kmemcheck_show_all(); + kmemcheck_error_save_bug(regs); + data->n_addrs = 0; + data->balance = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; + return; + } + + if (kmemcheck_enabled) + n = kmemcheck_hide_all(); + else + n = kmemcheck_show_all(); + + if (n == 0) + return; + + --data->balance; + + data->n_addrs = 0; + + if (!(data->flags & X86_EFLAGS_TF)) + regs->flags &= ~X86_EFLAGS_TF; + if (data->flags & X86_EFLAGS_IF) + regs->flags |= X86_EFLAGS_IF; +} + +void kmemcheck_show_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +bool kmemcheck_page_is_tracked(struct page *p) +{ + /* This will also check the "hidden" flag of the PTE. */ + return kmemcheck_pte_lookup((unsigned long) page_address(p)); +} + +void kmemcheck_hide_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) { + unsigned long address; + pte_t *pte; + unsigned int level; + + address = (unsigned long) page_address(&p[i]); + pte = lookup_address(address, &level); + BUG_ON(!pte); + BUG_ON(level != PG_LEVEL_4K); + + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); + __flush_tlb_one(address); + } +} + +/* Access may NOT cross page boundary */ +static void kmemcheck_read_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + enum kmemcheck_shadow status; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; + + /* Don't warn about it again. */ + kmemcheck_shadow_set(shadow, size); +} + +bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) +{ + enum kmemcheck_shadow status; + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return true; + + status = kmemcheck_shadow_test_all(shadow, size); + + return status == KMEMCHECK_SHADOW_INITIALIZED; +} + +/* Access may cross page boundary */ +static void kmemcheck_read(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_read_strict(regs, addr, size); + return; + } + + /* + * What we do is basically to split the access across the + * two pages and handle each part separately. Yes, this means + * that we may now see reads that are 3 + 5 bytes, for + * example (and if both are uninitialized, there will be two + * reports), but it makes the code a lot simpler. + */ + kmemcheck_read_strict(regs, addr, next_page - addr); + kmemcheck_read_strict(regs, next_page, next_addr - next_page); +} + +static void kmemcheck_write_strict(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return; + + kmemcheck_save_addr(addr); + kmemcheck_shadow_set(shadow, size); +} + +static void kmemcheck_write(struct pt_regs *regs, + unsigned long addr, unsigned int size) +{ + unsigned long page = addr & PAGE_MASK; + unsigned long next_addr = addr + size - 1; + unsigned long next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + kmemcheck_write_strict(regs, addr, size); + return; + } + + /* See comment in kmemcheck_read(). */ + kmemcheck_write_strict(regs, addr, next_page - addr); + kmemcheck_write_strict(regs, next_page, next_addr - next_page); +} + +/* + * Copying is hard. We have two addresses, each of which may be split across + * a page (and each page will have different shadow addresses). + */ +static void kmemcheck_copy(struct pt_regs *regs, + unsigned long src_addr, unsigned long dst_addr, unsigned int size) +{ + uint8_t shadow[8]; + enum kmemcheck_shadow status; + + unsigned long page; + unsigned long next_addr; + unsigned long next_page; + + uint8_t *x; + unsigned int i; + unsigned int n; + + BUG_ON(size > sizeof(shadow)); + + page = src_addr & PAGE_MASK; + next_addr = src_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < size; ++i) + shadow[i] = x[i]; + } else { + for (i = 0; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } else { + n = next_page - src_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(src_addr); + if (x) { + kmemcheck_save_addr(src_addr); + for (i = 0; i < n; ++i) + shadow[i] = x[i]; + } else { + /* Not tracked */ + for (i = 0; i < n; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) + shadow[i] = x[i - n]; + } else { + /* Not tracked */ + for (i = n; i < size; ++i) + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + page = dst_addr & PAGE_MASK; + next_addr = dst_addr + size - 1; + next_page = next_addr & PAGE_MASK; + + if (likely(page == next_page)) { + /* Same page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < size; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } else { + n = next_page - dst_addr; + BUG_ON(n > sizeof(shadow)); + + /* First page */ + x = kmemcheck_shadow_lookup(dst_addr); + if (x) { + kmemcheck_save_addr(dst_addr); + for (i = 0; i < n; ++i) { + x[i] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + + /* Second page */ + x = kmemcheck_shadow_lookup(next_page); + if (x) { + kmemcheck_save_addr(next_page); + for (i = n; i < size; ++i) { + x[i - n] = shadow[i]; + shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; + } + } + } + + status = kmemcheck_shadow_test(shadow, size); + if (status == KMEMCHECK_SHADOW_INITIALIZED) + return; + + if (kmemcheck_enabled) + kmemcheck_error_save(status, src_addr, size, regs); + + if (kmemcheck_enabled == 2) + kmemcheck_enabled = 0; +} + +enum kmemcheck_method { + KMEMCHECK_READ, + KMEMCHECK_WRITE, +}; + +static void kmemcheck_access(struct pt_regs *regs, + unsigned long fallback_address, enum kmemcheck_method fallback_method) +{ + const uint8_t *insn; + const uint8_t *insn_primary; + unsigned int size; + + struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context); + + /* Recursive fault -- ouch. */ + if (data->busy) { + kmemcheck_show_addr(fallback_address); + kmemcheck_error_save_bug(regs); + return; + } + + data->busy = true; + + insn = (const uint8_t *) regs->ip; + insn_primary = kmemcheck_opcode_get_primary(insn); + + kmemcheck_opcode_decode(insn, &size); + + switch (insn_primary[0]) { +#ifdef CONFIG_KMEMCHECK_BITOPS_OK + /* AND, OR, XOR */ + /* + * Unfortunately, these instructions have to be excluded from + * our regular checking since they access only some (and not + * all) bits. This clears out "bogus" bitfield-access warnings. + */ + case 0x80: + case 0x81: + case 0x82: + case 0x83: + switch ((insn_primary[1] >> 3) & 7) { + /* OR */ + case 1: + /* AND */ + case 4: + /* XOR */ + case 6: + kmemcheck_write(regs, fallback_address, size); + goto out; + + /* ADD */ + case 0: + /* ADC */ + case 2: + /* SBB */ + case 3: + /* SUB */ + case 5: + /* CMP */ + case 7: + break; + } + break; +#endif + + /* MOVS, MOVSB, MOVSW, MOVSD */ + case 0xa4: + case 0xa5: + /* + * These instructions are special because they take two + * addresses, but we only get one page fault. + */ + kmemcheck_copy(regs, regs->si, regs->di, size); + goto out; + + /* CMPS, CMPSB, CMPSW, CMPSD */ + case 0xa6: + case 0xa7: + kmemcheck_read(regs, regs->si, size); + kmemcheck_read(regs, regs->di, size); + goto out; + } + + /* + * If the opcode isn't special in any way, we use the data from the + * page fault handler to determine the address and type of memory + * access. + */ + switch (fallback_method) { + case KMEMCHECK_READ: + kmemcheck_read(regs, fallback_address, size); + goto out; + case KMEMCHECK_WRITE: + kmemcheck_write(regs, fallback_address, size); + goto out; + } + +out: + data->busy = false; +} + +bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + pte_t *pte; + + /* + * XXX: Is it safe to assume that memory accesses from virtual 86 + * mode or non-kernel code segments will _never_ access kernel + * memory (e.g. tracked pages)? For now, we need this to avoid + * invoking kmemcheck for PnP BIOS calls. + */ + if (regs->flags & X86_VM_MASK) + return false; + if (regs->cs != __KERNEL_CS) + return false; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return false; + + WARN_ON_ONCE(in_nmi()); + + if (error_code & 2) + kmemcheck_access(regs, address, KMEMCHECK_WRITE); + else + kmemcheck_access(regs, address, KMEMCHECK_READ); + + kmemcheck_show(regs); + return true; +} + +bool kmemcheck_trap(struct pt_regs *regs) +{ + if (!kmemcheck_active(regs)) + return false; + + /* We're done. */ + kmemcheck_hide(regs); + return true; +} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c new file mode 100644 index 00000000..324aa3f0 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -0,0 +1,106 @@ +#include <linux/types.h> + +#include "opcode.h" + +static bool opcode_is_prefix(uint8_t b) +{ + return + /* Group 1 */ + b == 0xf0 || b == 0xf2 || b == 0xf3 + /* Group 2 */ + || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 + || b == 0x64 || b == 0x65 + /* Group 3 */ + || b == 0x66 + /* Group 4 */ + || b == 0x67; +} + +#ifdef CONFIG_X86_64 +static bool opcode_is_rex_prefix(uint8_t b) +{ + return (b & 0xf0) == 0x40; +} +#else +static bool opcode_is_rex_prefix(uint8_t b) +{ + return false; +} +#endif + +#define REX_W (1 << 3) + +/* + * This is a VERY crude opcode decoder. We only need to find the size of the + * load/store that caused our #PF and this should work for all the opcodes + * that we care about. Moreover, the ones who invented this instruction set + * should be shot. + */ +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) +{ + /* Default operand size */ + int operand_size_override = 4; + + /* prefixes */ + for (; opcode_is_prefix(*op); ++op) { + if (*op == 0x66) + operand_size_override = 2; + } + + /* REX prefix */ + if (opcode_is_rex_prefix(*op)) { + uint8_t rex = *op; + + ++op; + if (rex & REX_W) { + switch (*op) { + case 0x63: + *size = 4; + return; + case 0x0f: + ++op; + + switch (*op) { + case 0xb6: + case 0xbe: + *size = 1; + return; + case 0xb7: + case 0xbf: + *size = 2; + return; + } + + break; + } + + *size = 8; + return; + } + } + + /* escape opcode */ + if (*op == 0x0f) { + ++op; + + /* + * This is move with zero-extend and sign-extend, respectively; + * we don't have to think about 0xb6/0xbe, because this is + * already handled in the conditional below. + */ + if (*op == 0xb7 || *op == 0xbf) + operand_size_override = 2; + } + + *size = (*op & 1) ? operand_size_override : 1; +} + +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) +{ + /* skip prefixes */ + while (opcode_is_prefix(*op)) + ++op; + if (opcode_is_rex_prefix(*op)) + ++op; + return op; +} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h new file mode 100644 index 00000000..6956aad6 --- /dev/null +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -0,0 +1,9 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H +#define ARCH__X86__MM__KMEMCHECK__OPCODE_H + +#include <linux/types.h> + +void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); +const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); + +#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c new file mode 100644 index 00000000..4ead26ee --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.c @@ -0,0 +1,22 @@ +#include <linux/mm.h> + +#include <asm/pgtable.h> + +#include "pte.h" + +pte_t *kmemcheck_pte_lookup(unsigned long address) +{ + pte_t *pte; + unsigned int level; + + pte = lookup_address(address, &level); + if (!pte) + return NULL; + if (level != PG_LEVEL_4K) + return NULL; + if (!pte_hidden(*pte)) + return NULL; + + return pte; +} + diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h new file mode 100644 index 00000000..9f596645 --- /dev/null +++ b/arch/x86/mm/kmemcheck/pte.h @@ -0,0 +1,10 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H +#define ARCH__X86__MM__KMEMCHECK__PTE_H + +#include <linux/mm.h> + +#include <asm/pgtable.h> + +pte_t *kmemcheck_pte_lookup(unsigned long address); + +#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c new file mode 100644 index 00000000..aef7140c --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -0,0 +1,70 @@ +#include <linux/bug.h> +#include <linux/kernel.h> + +#include "opcode.h" +#include "selftest.h" + +struct selftest_opcode { + unsigned int expected_size; + const uint8_t *insn; + const char *desc; +}; + +static const struct selftest_opcode selftest_opcodes[] = { + /* REP MOVS */ + {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"}, + {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"}, + + /* MOVZX / MOVZXD */ + {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"}, + {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"}, + + /* MOVSX / MOVSXD */ + {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"}, + {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"}, + +#ifdef CONFIG_X86_64 + /* MOVZX / MOVZXD */ + {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"}, + + /* MOVSX / MOVSXD */ + {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"}, + {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"}, + {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"}, +#endif +}; + +static bool selftest_opcode_one(const struct selftest_opcode *op) +{ + unsigned size; + + kmemcheck_opcode_decode(op->insn, &size); + + if (size == op->expected_size) + return true; + + printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", + op->desc, op->expected_size, size); + return false; +} + +static bool selftest_opcodes_all(void) +{ + bool pass = true; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) + pass = pass && selftest_opcode_one(&selftest_opcodes[i]); + + return pass; +} + +bool kmemcheck_selftest(void) +{ + bool pass = true; + + pass = pass && selftest_opcodes_all(); + + return pass; +} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h new file mode 100644 index 00000000..8fed4fe1 --- /dev/null +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -0,0 +1,6 @@ +#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H +#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H + +bool kmemcheck_selftest(void); + +#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c new file mode 100644 index 00000000..aec12421 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -0,0 +1,173 @@ +#include <linux/kmemcheck.h> +#include <linux/module.h> +#include <linux/mm.h> + +#include <asm/page.h> +#include <asm/pgtable.h> + +#include "pte.h" +#include "shadow.h" + +/* + * Return the shadow address for the given address. Returns NULL if the + * address is not tracked. + * + * We need to be extremely careful not to follow any invalid pointers, + * because this function can be called for *any* possible address. + */ +void *kmemcheck_shadow_lookup(unsigned long address) +{ + pte_t *pte; + struct page *page; + + if (!virt_addr_valid(address)) + return NULL; + + pte = kmemcheck_pte_lookup(address); + if (!pte) + return NULL; + + page = virt_to_page(address); + if (!page->shadow) + return NULL; + return page->shadow + (address & (PAGE_SIZE - 1)); +} + +static void mark_shadow(void *address, unsigned int n, + enum kmemcheck_shadow status) +{ + unsigned long addr = (unsigned long) address; + unsigned long last_addr = addr + n - 1; + unsigned long page = addr & PAGE_MASK; + unsigned long last_page = last_addr & PAGE_MASK; + unsigned int first_n; + void *shadow; + + /* If the memory range crosses a page boundary, stop there. */ + if (page == last_page) + first_n = n; + else + first_n = page + PAGE_SIZE - addr; + + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, first_n); + + addr += first_n; + n -= first_n; + + /* Do full-page memset()s. */ + while (n >= PAGE_SIZE) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, PAGE_SIZE); + + addr += PAGE_SIZE; + n -= PAGE_SIZE; + } + + /* Do the remaining page, if any. */ + if (n > 0) { + shadow = kmemcheck_shadow_lookup(addr); + if (shadow) + memset(shadow, status, n); + } +} + +void kmemcheck_mark_unallocated(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); +} + +void kmemcheck_mark_uninitialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); +} + +/* + * Fill the shadow memory of the given address such that the memory at that + * address is marked as being initialized. + */ +void kmemcheck_mark_initialized(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); +} +EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); + +void kmemcheck_mark_freed(void *address, unsigned int n) +{ + mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); +} + +void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); +} + +void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) +{ + unsigned int i; + + for (i = 0; i < n; ++i) + kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); +} + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) +{ +#ifdef CONFIG_KMEMCHECK_PARTIAL_OK + uint8_t *x; + unsigned int i; + + x = shadow; + + /* + * Make sure _some_ bytes are initialized. Gcc frequently generates + * code to access neighboring bytes. + */ + for (i = 0; i < size; ++i) { + if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } + + return x[0]; +#else + return kmemcheck_shadow_test_all(shadow, size); +#endif +} + +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + + /* All bytes must be initialized. */ + for (i = 0; i < size; ++i) { + if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) + return x[i]; + } + + return x[0]; +} + +void kmemcheck_shadow_set(void *shadow, unsigned int size) +{ + uint8_t *x; + unsigned int i; + + x = shadow; + for (i = 0; i < size; ++i) + x[i] = KMEMCHECK_SHADOW_INITIALIZED; +} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h new file mode 100644 index 00000000..ff0b2f70 --- /dev/null +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -0,0 +1,18 @@ +#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H +#define ARCH__X86__MM__KMEMCHECK__SHADOW_H + +enum kmemcheck_shadow { + KMEMCHECK_SHADOW_UNALLOCATED, + KMEMCHECK_SHADOW_UNINITIALIZED, + KMEMCHECK_SHADOW_INITIALIZED, + KMEMCHECK_SHADOW_FREED, +}; + +void *kmemcheck_shadow_lookup(unsigned long address); + +enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); +enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, + unsigned int size); +void kmemcheck_shadow_set(void *shadow, unsigned int size); + +#endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 00000000..e5d5e2ce --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,590 @@ +/* Support for MMIO probes. + * Benfit many code from kprobes + * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. + * 2007 Alexander Eichner + * 2008 Pekka Paalanen <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/ptrace.h> +#include <linux/preempt.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <linux/errno.h> +#include <asm/debugreg.h> +#include <linux/mmiotrace.h> + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + pteval_t old_presence; /* page presence prior to arming */ + bool armed; + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU) and post_kmmio_handler(). + * Protected by kmmio_lock, when linked into kmmio_page_table. + */ + int count; + + bool scheduled_for_release; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +static DEFINE_SPINLOCK(kmmio_lock); + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr < (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. */ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head; + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + head = kmmio_page_list(page); + list_for_each_entry_rcu(f, head, list) { + if (f->page == page) + return f; + } + return NULL; +} + +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) +{ + pmdval_t v = pmd_val(*pmd); + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; + set_pmd(pmd, __pmd(v)); +} + +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) +{ + pteval_t v = pte_val(*pte); + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; + set_pte_atomic(pte, __pte(v)); +} + +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) +{ + unsigned int level; + pte_t *pte = lookup_address(f->page, &level); + + if (!pte) { + pr_err("no pte for page 0x%08lx\n", f->page); + return -1; + } + + switch (level) { + case PG_LEVEL_2M: + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); + break; + case PG_LEVEL_4K: + clear_pte_presence(pte, clear, &f->old_presence); + break; + default: + pr_err("unexpected page level 0x%x.\n", level); + return -1; + } + + __flush_tlb_one(f->page); + return 0; +} + +/* + * Mark the given page as not present. Access to it will trigger a fault. + * + * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the + * protection is ignored here. RCU read lock is assumed held, so the struct + * will not disappear unexpectedly. Furthermore, the caller must guarantee, + * that double arming the same virtual address (page) cannot occur. + * + * Double disarming on the other hand is allowed, and may occur when a fault + * and mmiotrace shutdown happen simultaneously. + */ +static int arm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret; + WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); + if (f->armed) { + pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", + f->page, f->count, !!f->old_presence); + } + ret = clear_page_presence(f, true); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), + f->page); + f->armed = true; + return ret; +} + +/** Restore the given page to saved presence state. */ +static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) +{ + int ret = clear_page_presence(f, false); + WARN_ONCE(ret < 0, + KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + f->armed = false; +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled throughout this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. + */ + preempt_disable(); + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); + if (ctx->active) { + if (addr == ctx->addr) { + /* + * A second fault on the same page means some other + * condition needs handling by do_page_fault(), the + * page really not being present is the most common. + */ + pr_debug("secondary hit for 0x%08lx CPU %d.\n", + addr, smp_processor_id()); + + if (!faultpage->old_presence) + pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n", + addr, smp_processor_id()); + } else { + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at + * least prevents a panic. + */ + pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr); + disarm_kmmio_fault_page(faultpage); + } + goto no_kmmio_ctx; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = addr; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set present bit in PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + put_cpu_var(kmmio_ctx); + return 1; /* fault handled */ + +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); +no_kmmio: + rcu_read_unlock(); + preempt_enable_no_resched(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled throughout this function. + * This must always get called as the pair to kmmio_handler(). + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("unexpected debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + /* Prevent racing against release_kmmio_fault_page(). */ + spin_lock(&kmmio_lock); + if (ctx->fpage->count) + arm_kmmio_fault_page(ctx->fpage); + spin_unlock(&kmmio_lock); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + put_cpu_var(kmmio_ctx); + return ret; +} + +/* You must be holding kmmio_lock. */ +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f); + f->count++; + return 0; + } + + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + + if (arm_kmmio_fault_page(f)) { + kfree(f); + return -1; + } + + list_add_rcu(&f->list, kmmio_page_list(f->page)); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f); + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } + } +} + +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. + */ +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + + spin_lock_irqsave(&kmmio_lock, flags); + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < size_lim) { + if (add_kmmio_fault_page(p->addr + size)) + pr_err("Unable to set page fault.\n"); + size += PAGE_SIZE; + } +out: + spin_unlock_irqrestore(&kmmio_lock, flags); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = + container_of(head, struct kmmio_delayed_release, rcu); + struct kmmio_fault_page *f = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + + spin_lock_irqsave(&kmmio_lock, flags); + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; + } else { + *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; + } + f = *prevp; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actually free the kmmio_fault_page structs as with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + + spin_lock_irqsave(&kmmio_lock, flags); + while (size < size_lim) { + release_kmmio_fault_page(p->addr + size, &release_list); + size += PAGE_SIZE; + } + list_del_rcu(&p->list); + kmmio_count--; + spin_unlock_irqrestore(&kmmio_lock, flags); + + if (!release_list) + return; + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) +{ + struct die_args *arg = args; + unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err); + + if (val == DIE_DEBUG && (*dr6_p & DR_STEP)) + if (post_kmmio_handler(*dr6_p, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + *dr6_p &= ~DR_STEP; + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int kmmio_init(void) +{ + int i; + + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + return register_die_notifier(&nb_die); +} + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c new file mode 100644 index 00000000..c80b9fb9 --- /dev/null +++ b/arch/x86/mm/memtest.c @@ -0,0 +1,124 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/memblock.h> + +static u64 patterns[] __initdata = { + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ +}; + +static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) +{ + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); + memblock_reserve(start_bad, end_bad - start_bad); +} + +static void __init memtest(u64 pattern, u64 start_phys, u64 size) +{ + u64 *p, *start, *end; + u64 start_bad, last_bad; + u64 start_phys_aligned; + const size_t incr = sizeof(pattern); + + start_phys_aligned = ALIGN(start_phys, incr); + start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; + start_bad = 0; + last_bad = 0; + + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); +} + +static void __init do_one_pass(u64 pattern, u64 start, u64 end) +{ + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { + this_start = clamp_t(phys_addr_t, this_start, start, end); + this_end = clamp_t(phys_addr_t, this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } + } +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(unsigned long start, unsigned long end) +{ + unsigned int i; + unsigned int idx = 0; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + for (i = 0; i < memtest_pattern; i++) { + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } + + if (idx > 0) { + printk(KERN_INFO "early_memtest: wipe out " + "test pattern from memory\n"); + /* additional test with pattern 0 will do this */ + do_one_pass(0, start, end); + } +} diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c new file mode 100644 index 00000000..845df683 --- /dev/null +++ b/arch/x86/mm/mmap.c @@ -0,0 +1,124 @@ +/* + * Flexible mmap layout support + * + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: + * + * Copyright 2003-2009 Red Hat Inc. + * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/limits.h> +#include <linux/sched.h> +#include <asm/elf.h> + +struct __read_mostly va_alignment va_align = { + .flags = -1, +}; + +static unsigned int stack_maxrandom_size(void) +{ + unsigned int max = 0; + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { + max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + } + + return max; +} + +/* + * Top of mmap area (just below the process stack). + * + * Leave an at least ~128 MB hole with possible stack randomization. + */ +#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) +#define MAX_GAP (TASK_SIZE/6*5) + +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long mmap_rnd(void) +{ + unsigned long rnd = 0; + + /* + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (current->flags & PF_RANDOMIZE) { + if (mmap_is_ia32()) + rnd = get_random_int() % (1<<8); + else + rnd = get_random_int() % (1<<28); + } + return rnd << PAGE_SHIFT; +} + +static unsigned long mmap_base(void) +{ + unsigned long gap = rlimit(RLIMIT_STACK); + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); +} + +/* + * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 + * does, but not when emulating X86_32 + */ +static unsigned long mmap_legacy_base(void) +{ + if (mmap_is_ia32()) + return TASK_UNMAPPED_BASE; + else + return TASK_UNMAPPED_BASE + mmap_rnd(); +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + if (mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; + } else { + mm->mmap_base = mmap_base(); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + mm->unmap_area = arch_unmap_area_topdown; + } +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 00000000..dc0b7277 --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,480 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 <pq@iki.fi> + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ + +#define pr_fmt(fmt) "mmiotrace: " fmt + +#define DEBUG 1 + +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/kallsyms.h> +#include <asm/pgtable.h> +#include <linux/mmiotrace.h> +#include <asm/e820.h> /* for ISA_START_ADDRESS */ +#include <linux/atomic.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include "pf_in.h" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + resource_size_t phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static bool nommiotrace; +static bool trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +static void print_pte(unsigned long address) +{ + unsigned int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err("Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg("4MB pages are not currently supported: 0x%08lx\n", + address); + BUG(); + } + pr_info("pte for 0x%lx: 0x%llx 0x%llx\n", + address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. + */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->private; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg("unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. */ + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err("kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .private = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) { + kfree(trace); + goto not_enabled; + } + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug("Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_var_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + if (downed_cpus == NULL && + !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { + pr_notice("Failed to allocate mask\n"); + goto out; + } + + get_online_cpus(); + cpumask_copy(downed_cpus, cpu_online_mask); + cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); + if (num_online_cpus() > 1) + pr_notice("Disabling non-boot CPUs...\n"); + put_online_cpus(); + + for_each_cpu(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) + pr_info("CPU%d is down.\n", cpu); + else + pr_err("Error taking CPU%d down: %d\n", cpu, err); + } +out: + if (num_online_cpus() > 1) + pr_warning("multiple CPUs still online, may miss events.\n"); +} + +/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, + but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ +static void __ref leave_uniprocessor(void) +{ + int cpu; + int err; + + if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) + return; + pr_notice("Re-enabling CPUs...\n"); + for_each_cpu(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info("enabled CPU%d.\n", cpu); + else + pr_err("cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warning("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + if (nommiotrace) + pr_info("MMIO tracing disabled.\n"); + kmmio_init(); + enter_uniprocessor(); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info("enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); + kmmio_cleanup(); + pr_info("disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c new file mode 100644 index 00000000..19d3fa08 --- /dev/null +++ b/arch/x86/mm/numa.c @@ -0,0 +1,834 @@ +/* Common code for 32 and 64-bit NUMA */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mmzone.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/nodemask.h> +#include <linux/sched.h> +#include <linux/topology.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/acpi.h> +#include <asm/amd_nb.h> + +#include "numa_internal.h" + +int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; + +static __init int numa_setup(char *opt) +{ + if (!opt) + return -EINVAL; + if (!strncmp(opt, "off", 3)) + numa_off = 1; +#ifdef CONFIG_NUMA_EMU + if (!strncmp(opt, "fake=", 5)) + numa_emu_cmdline(opt + 5); +#endif +#ifdef CONFIG_ACPI_NUMA + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; +#endif + return 0; +} +early_param("numa", numa_setup); + +/* + * apicid, cpu, node mappings + */ +s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; + +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + set_cpu_numa_node(cpu, node); +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); +} + +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + bool remapped = false; + u64 nd_pa; + void *nd; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + /* initialize remap allocator before aligning to ZONE_ALIGN */ + init_alloc_remap(nid, start, end); + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", + nid, start, end); + + /* + * Allocate node data. Try remap allocator first, node-local + * memory and then any node. Never allocate in DMA zone. + */ + nd = alloc_remap(nid, nd_size); + if (nd) { + nd_pa = __pa(nd); + remapped = true; + } else { + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + nd = __va(nd_pa); + } + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (!remapped && tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = PFN_PHYS(max_pfn); + int i, j, k; + + /* first, trim all entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) + numa_remove_memblk_from(i--, mi); + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + u64 start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. + */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + /* clear unused ones */ + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_free(__pa(numa_distance), size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + size, PAGE_SIZE); + if (!phys) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_reserve(phys, size); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. + * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + u64 numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((s64)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + unsigned long uninitialized_var(pfn_align); + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + } + + /* + * If sections array is gonna be used for pfn -> nid mapping, check + * whether its granularity is fine enough. + */ +#ifdef NODE_NOT_IN_PAGE_FLAGS + pfn_align = node_map_pfn_alignment(); + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } +#endif + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = PFN_PHYS(max_pfn); + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_data(nid, start, end); + } + + /* Dump memblock with node info and return. */ + memblock_dump_all(); + return 0; +} + +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ +static void __init numa_init_array(void) +{ + int rr, i; + + rr = first_node(node_online_map); + for (i = 0; i < nr_cpu_ids; i++) { + if (early_cpu_to_node(i) != NUMA_NO_NODE) + continue; + numa_set_node(i, rr); + rr = next_node(rr, node_online_map); + if (rr == MAX_NUMNODES) + rr = first_node(node_online_map); + } +} + +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", + 0LLU, PFN_PHYS(max_pfn)); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} + +static __init int find_near_online_node(int node) +{ + int n, val; + int min_val = INT_MAX; + int best_node = -1; + + for_each_online_node(n) { + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + return best_node; +} + +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + * + * Called before the per_cpu areas are setup. + */ +void __init init_cpu_to_node(void) +{ + int cpu; + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + + BUG_ON(cpu_to_apicid == NULL); + + for_each_possible_cpu(cpu) { + int node = numa_cpu_node(cpu); + + if (node == NUMA_NO_NODE) + continue; + if (!node_online(node)) + node = find_near_online_node(node); + numa_set_node(cpu, node); + } +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +# ifndef CONFIG_NUMA_EMU +void __cpuinit numa_add_cpu(int cpu) +{ + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} +# endif /* !CONFIG_NUMA_EMU */ + +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +int __cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(__cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!cpu_possible(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + +void debug_cpumask_set_cpu(int cpu, int node, bool enable) +{ + struct cpumask *mask; + char buf[64]; + + if (node == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + mask = node_to_cpumask_map[node]; + if (!mask) { + pr_err("node_to_cpumask_map[%i] NULL\n", node); + dump_stack(); + return; + } + + if (enable) + cpumask_set_cpu(cpu, mask); + else + cpumask_clear_cpu(cpu, mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", + cpu, node, buf); + return; +} + +# ifndef CONFIG_NUMA_EMU +static void __cpuinit numa_set_cpumask(int cpu, bool enable) +{ + debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +# endif /* !CONFIG_NUMA_EMU */ + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const struct cpumask *cpumask_of_node(int node) +{ + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_none_mask; + } + if (node_to_cpumask_map[node] == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c new file mode 100644 index 00000000..534255a3 --- /dev/null +++ b/arch/x86/mm/numa_32.c @@ -0,0 +1,265 @@ +/* + * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/module.h> + +#include "numa_internal.h" + +#ifdef CONFIG_DISCONTIGMEM +/* + * 4) physnode_map - the mapping between a pfn and owning node + * physnode_map keeps track of the physical memory layout of a generic + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, + * if the first gig is on node 0, and the second gig is on node 1 + * physnode_map will contain: + * + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; + */ +s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1}; +EXPORT_SYMBOL(physnode_map); + +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", + nid, start, end); + printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); + printk(KERN_DEBUG " "); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + physnode_map[pfn / PAGES_PER_SECTION] = nid; + printk(KERN_CONT "%lx ", pfn); + } + printk(KERN_CONT "\n"); +} + +unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long nr_pages = end_pfn - start_pfn; + + if (!nr_pages) + return 0; + + return (nr_pages + 1) * sizeof(struct page); +} +#endif + +extern unsigned long highend_pfn, highstart_pfn; + +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +static void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +/* + * Remap memory allocator + */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; + +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). + * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) + return NULL; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + +#ifdef CONFIG_HIBERNATION +/** + * resume_map_numa_kva - add KVA mapping to the temporary page tables created + * during resume from hibernation + * @pgd_base - temporary resume page directory + */ +void resume_map_numa_kva(pgd_t *pgd_base) +{ + int node; + + for_each_online_node(node) { + unsigned long start_va, start_pfn, nr_pages, pfn; + + start_va = (unsigned long)node_remap_start_vaddr[node]; + start_pfn = node_remap_start_pfn[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; + + printk(KERN_DEBUG "%s: node %d\n", __func__, node); + + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { + unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); + pgd_t *pgd = pgd_base + pgd_index(vaddr); + pud_t *pud = pud_offset(pgd, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + + set_pmd(pmd, pfn_pmd(start_pfn + pfn, + PAGE_KERNEL_LARGE_EXEC)); + + printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", + __func__, vaddr, start_pfn + pfn); + } + } +} +#endif + +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ +void __init init_alloc_remap(int nid, u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long size, pfn; + u64 node_pa, remap_pa; + void *remap_va; + + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, start_pfn, end_pfn); + + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, start_pfn, end_pfn); + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + size = ALIGN(size, LARGE_PAGE_BYTES); + + /* allocate node memory and the lowmem remap area */ + node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); + if (!node_pa) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return; + } + memblock_reserve(node_pa, size); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (!remap_pa) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_free(node_pa, size); + return; + } + memblock_reserve(remap_pa, size); + remap_va = phys_to_virt(remap_pa); + + /* perform actual remap */ + for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va; + + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); +} + +void __init initmem_init(void) +{ + x86_numa_init(); + +#ifdef CONFIG_HIGHMEM + highstart_pfn = highend_pfn = max_pfn; + if (max_pfn > max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); + + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + + setup_bootmem_allocator(); +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c new file mode 100644 index 00000000..92e27119 --- /dev/null +++ b/arch/x86/mm/numa_64.c @@ -0,0 +1,25 @@ +/* + * Generic VM initialization for x86-64 NUMA setups. + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include <linux/bootmem.h> + +#include "numa_internal.h" + +void __init initmem_init(void) +{ + x86_numa_init(); +} + +unsigned long __init numa_free_all_bootmem(void) +{ + unsigned long pages = 0; + int i; + + for_each_online_node(i) + pages += free_all_bootmem_node(NODE_DATA(i)); + + pages += free_low_memory_core_early(MAX_NUMNODES); + + return pages; +} diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 00000000..53489ff6 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c @@ -0,0 +1,498 @@ +/* + * NUMA emulation + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/topology.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <asm/dma.h> + +#include "numa_internal.h" + +static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; +static char *emu_cmdline __initdata; + +void __init numa_emu_cmdline(char *str) +{ + emu_cmdline = str; +} + +static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].nid == nid) + return i; + return -ENOENT; +} + +static u64 __init mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + +/* + * Sets up nid to range from @start to @end. The return value is -errno if + * something went wrong, 0 otherwise. + */ +static int __init emu_setup_memblk(struct numa_meminfo *ei, + struct numa_meminfo *pi, + int nid, int phys_blk, u64 size) +{ + struct numa_memblk *eb = &ei->blk[ei->nr_blks]; + struct numa_memblk *pb = &pi->blk[phys_blk]; + + if (ei->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: Too many emulated memblks, failing emulation\n"); + return -EINVAL; + } + + ei->nr_blks++; + eb->start = pb->start; + eb->end = pb->start + size; + eb->nid = nid; + + if (emu_nid_to_phys[nid] == NUMA_NO_NODE) + emu_nid_to_phys[nid] = nid; + + pb->start += size; + if (pb->start >= pb->end) { + WARN_ON_ONCE(pb->start > pb->end); + numa_remove_memblk_from(phys_blk, pi); + } + + printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, + eb->start, eb->end, (eb->end - eb->start) >> 20); + return 0; +} + +/* + * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr + * to max_addr. The return value is the number of nodes allocated. + */ +static int __init split_nodes_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, int nr_nodes) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 size; + int big; + int nid = 0; + int i, ret; + + if (nr_nodes <= 0) + return -1; + if (nr_nodes > MAX_NUMNODES) { + pr_info("numa=fake=%d too large, reducing to %d\n", + nr_nodes, MAX_NUMNODES); + nr_nodes = MAX_NUMNODES; + } + + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - mem_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + + /* + * Calculate the number of big nodes that can be allocated as a result + * of consolidating the remainder. + */ + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / + FAKE_NODE_MIN_SIZE; + + size &= FAKE_NODE_MIN_HASH_MASK; + if (!size) { + pr_err("Not enough memory for each node. " + "NUMA emulation disabled.\n"); + return -1; + } + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Continue to fill physical nodes with fake nodes until there is no + * memory left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + end = start + size; + + if (nid < big) + end += FAKE_NODE_MIN_SIZE; + + /* + * Continue to add memory to this fake node if its + * non-reserved memory is less than the per-node size. + */ + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > limit) { + end = limit; + break; + } + } + + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - mem_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int nid = 0; + int i, ret; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < pi->nr_blks; i++) + node_set(pi->blk[i].nid, physnode_mask); + + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); + u64 start, limit, end; + int phys_blk; + + phys_blk = emu_find_memblk_by_nid(i, pi); + if (phys_blk < 0) { + node_clear(i, physnode_mask); + continue; + } + start = pi->blk[phys_blk].start; + limit = pi->blk[phys_blk].end; + + end = find_end_of_node(start, limit, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (limit - end - mem_hole_size(end, limit) < size) + end = limit; + + ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, + phys_blk, + min(end, limit) - start); + if (ret < 0) + return ret; + } + } + return 0; +} + +/** + * numa_emulation - Emulate NUMA nodes + * @numa_meminfo: NUMA configuration to massage + * @numa_dist_cnt: The size of the physical NUMA distance table + * + * Emulate NUMA nodes according to the numa=fake kernel parameter. + * @numa_meminfo contains the physical memory configuration and is modified + * to reflect the emulated configuration on success. @numa_dist_cnt is + * used to determine the size of the physical distance table. + * + * On success, the following modifications are made. + * + * - @numa_meminfo is updated to reflect the emulated nodes. + * + * - __apicid_to_node[] is updated such that APIC IDs are mapped to the + * emulated nodes. + * + * - NUMA distance table is rebuilt to represent distances between emulated + * nodes. The distances are determined considering how emulated nodes + * are mapped to physical nodes and match the actual distances. + * + * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical + * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). + * + * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with + * identity mapping and no other modification is made. + */ +void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) +{ + static struct numa_meminfo ei __initdata; + static struct numa_meminfo pi __initdata; + const u64 max_addr = PFN_PHYS(max_pfn); + u8 *phys_dist = NULL; + size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); + int max_emu_nid, dfl_phys_nid; + int i, j, ret; + + if (!emu_cmdline) + goto no_emu; + + memset(&ei, 0, sizeof(ei)); + pi = *numa_meminfo; + + for (i = 0; i < MAX_NUMNODES; i++) + emu_nid_to_phys[i] = NUMA_NO_NODE; + + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. + */ + if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + u64 size; + + size = memparse(emu_cmdline, &emu_cmdline); + ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); + } else { + unsigned long n; + + n = simple_strtoul(emu_cmdline, NULL, 0); + ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); + } + + if (ret < 0) + goto no_emu; + + if (numa_cleanup_meminfo(&ei) < 0) { + pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + goto no_emu; + } + + /* copy the physical distance table */ + if (numa_dist_cnt) { + u64 phys; + + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + phys_size, PAGE_SIZE); + if (!phys) { + pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + goto no_emu; + } + memblock_reserve(phys, phys_size); + phys_dist = __va(phys); + + for (i = 0; i < numa_dist_cnt; i++) + for (j = 0; j < numa_dist_cnt; j++) + phys_dist[i * numa_dist_cnt + j] = + node_distance(i, j); + } + + /* + * Determine the max emulated nid and the default phys nid to use + * for unmapped nodes. + */ + max_emu_nid = 0; + dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (dfl_phys_nid == NUMA_NO_NODE) + dfl_phys_nid = emu_nid_to_phys[i]; + } + } + if (dfl_phys_nid == NUMA_NO_NODE) { + pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); + goto no_emu; + } + + /* commit */ + *numa_meminfo = ei; + + /* + * Transform __apicid_to_node table to use emulated nids by + * reverse-mapping phys_nid. The maps should always exist but fall + * back to zero just in case. + */ + for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { + if (__apicid_to_node[i] == NUMA_NO_NODE) + continue; + for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) + if (__apicid_to_node[i] == emu_nid_to_phys[j]) + break; + __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; + } + + /* make sure all emulated nodes are mapped to a physical node */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + if (emu_nid_to_phys[i] == NUMA_NO_NODE) + emu_nid_to_phys[i] = dfl_phys_nid; + + /* transform distance table */ + numa_reset_distance(); + for (i = 0; i < max_emu_nid + 1; i++) { + for (j = 0; j < max_emu_nid + 1; j++) { + int physi = emu_nid_to_phys[i]; + int physj = emu_nid_to_phys[j]; + int dist; + + if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + dist = physi == physj ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + else + dist = phys_dist[physi * numa_dist_cnt + physj]; + + numa_set_distance(i, j, dist); + } + } + + /* free the copied physical distance table */ + if (phys_dist) + memblock_free(__pa(phys_dist), phys_size); + return; + +no_emu: + /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) + emu_nid_to_phys[i] = i; +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS +void __cpuinit numa_add_cpu(int cpu) +{ + int physnid, nid; + + nid = early_cpu_to_node(cpu); + BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); + + physnid = emu_nid_to_phys[nid]; + + /* + * Map the cpu to each emulated node that is allocated on the physical + * node of the cpu's apic id. + */ + for_each_online_node(nid) + if (emu_nid_to_phys[nid] == physnid) + cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + int i; + + for_each_online_node(i) + cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); +} +#else /* !CONFIG_DEBUG_PER_CPU_MAPS */ +static void __cpuinit numa_set_cpumask(int cpu, bool enable) +{ + int nid, physnid; + + nid = early_cpu_to_node(cpu); + if (nid == NUMA_NO_NODE) { + /* early_cpu_to_node() already emits a warning and trace */ + return; + } + + physnid = emu_nid_to_phys[nid]; + + for_each_online_node(nid) { + if (emu_nid_to_phys[nid] != physnid) + continue; + + debug_cpumask_set_cpu(cpu, nid, enable); + } +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, true); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, false); +} +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 00000000..7178c3af --- /dev/null +++ b/arch/x86/mm/numa_internal.h @@ -0,0 +1,39 @@ +#ifndef __X86_MM_NUMA_INTERNAL_H +#define __X86_MM_NUMA_INTERNAL_H + +#include <linux/types.h> +#include <asm/numa.h> + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +void __init numa_reset_distance(void); + +void __init x86_numa_init(void); + +#ifdef CONFIG_X86_64 +static inline void init_alloc_remap(int nid, u64 start, u64 end) { } +#else +void __init init_alloc_remap(int nid, u64 start, u64 end); +#endif + +#ifdef CONFIG_NUMA_EMU +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +#endif + +#endif /* __X86_MM_NUMA_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c new file mode 100644 index 00000000..b0086567 --- /dev/null +++ b/arch/x86/mm/pageattr-test.c @@ -0,0 +1,261 @@ +/* + * self test for change_page_attr. + * + * Clears the a test pte bit on random pages in the direct mapping, + * then reverts and compares page tables forwards and afterwards. + */ +#include <linux/bootmem.h> +#include <linux/kthread.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +/* + * Only print the results of the first pass: + */ +static __read_mostly int print = 1; + +enum { + NTEST = 400, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) + +static int pte_testbit(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UNUSED1; +} + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + unsigned int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if (!(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + if (print) { + printk(KERN_INFO + " 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, + s->max_exec, missed); + } + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long addr[NTEST]; +static unsigned int len[NTEST]; + +/* Change the global bit on random pages in the direct mapping */ +static int pageattr_test(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + unsigned int level; + int i, k; + int err; + unsigned long test_addr; + + if (print) + printk(KERN_INFO "CPA self-test:\n"); + + bm = vzalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + + failed += print_split(&sa); + srandom32(100); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = random32() % max_pfn_mapped; + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = random32() % 100; + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; + } + __set_bit(pfn + k, bm); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + test_addr = addr[i]; + err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || pte_testbit(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); + return -EINVAL; + } else { + if (print) + printk(KERN_INFO "ok.\n"); + } + + return 0; +} + +static int do_pageattr_test(void *__unused) +{ + while (!kthread_should_stop()) { + schedule_timeout_interruptible(HZ*30); + if (pageattr_test() < 0) + break; + if (print) + print--; + } + return 0; +} + +static int start_pageattr_test(void) +{ + struct task_struct *p; + + p = kthread_create(do_pageattr_test, NULL, "pageattr-test"); + if (!IS_ERR(p)) + wake_up_process(p); + else + WARN_ON(1); + + return 0; +} + +module_init(start_pageattr_test); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c new file mode 100644 index 00000000..e1ebde31 --- /dev/null +++ b/arch/x86/mm/pageattr.c @@ -0,0 +1,1377 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/pfn.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/pci.h> + +#include <asm/e820.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/proto.h> +#include <asm/pat.h> + +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long *vaddr; + pgprot_t mask_set; + pgprot_t mask_clr; + int numpages; + int flags; + unsigned long pfn; + unsigned force_split : 1; + int curpage; + struct page **pages; +}; + +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 + +#ifdef CONFIG_PROC_FS +static unsigned long direct_pages_count[PG_LEVEL_NUM]; + +void update_page_count(int level, unsigned long pages) +{ + /* Protect against CPA */ + spin_lock(&pgd_lock); + direct_pages_count[level] += pages; + spin_unlock(&pgd_lock); +} + +static void split_page_count(int level) +{ + direct_pages_count[level]--; + direct_pages_count[level - 1] += PTRS_PER_PTE; +} + +void arch_report_meminfo(struct seq_file *m) +{ + seq_printf(m, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + seq_printf(m, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + seq_printf(m, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif +#ifdef CONFIG_X86_64 + if (direct_gbpages) + seq_printf(m, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); +#endif +} +#else +static inline void split_page_count(int level) { } +#endif + +#ifdef CONFIG_X86_64 + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; +} + +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} +EXPORT_SYMBOL_GPL(clflush_cache_range); + +static void __cpa_flush_all(void *arg) +{ + unsigned long cache = (unsigned long)arg; + + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (cache && boot_cpu_data.x86 >= 4) + wbinvd(); +} + +static void cpa_flush_all(unsigned long cache) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, (void *) cache, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +static void cpa_flush_array(unsigned long *start, int numpages, int cache, + int in_flags, struct page **pages) +{ + unsigned int i, level; + unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + + if (!cache || do_wbinvd) + return; + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0; i < numpages; i++) { + unsigned long addr; + pte_t *pte; + + if (in_flags & CPA_PAGES_ARRAY) + addr = (unsigned long)page_address(pages[i]); + else + addr = start[i]; + + pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *)addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) +{ + pgprot_t forbidden = __pgprot(0); + + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ +#ifdef CONFIG_PCI_BIOS + if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_NX; +#endif + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on. On + * 64bit we do not enforce !NX on the low mapping + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The .rodata section needs to be read-only. Using the pfn + * catches all aliases. + */ + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + pgprot_val(forbidden) |= _PAGE_RW; + +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus this don't enforce !RW mapping for small page kernel + * text mapping logic will help Linux Xen parvirt guest boot + * as well. + */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } +#endif + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ +pte_t *lookup_address(unsigned long address, unsigned int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + + return pte_offset_kernel(pmd, address); +} +EXPORT_SYMBOL_GPL(lookup_address); + +/* + * Set the new pmd in all the pgds we know about: + */ +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot, req_prot; + int i, do_split = 1; + unsigned int level; + + if (cpa->force_split) + return 1; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + psize = PUD_PAGE_SIZE; + pmask = PUD_PAGE_MASK; + break; +#endif + default: + do_split = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = req_prot = pte_pgprot(old_pte); + + pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); + + /* + * old_pte points to the large page base address. So we need + * to add the offset of the virtual address: + */ + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + new_prot = static_protections(req_prot, address, pfn); + + /* + * We need to check the full range, whether + * static_protection() requires a different pgprot for one of + * the pages in the range we try to preserve: + */ + addr = address & pmask; + pfn = pte_pfn(old_pte); + for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(req_prot, addr, pfn); + + if (pgprot_val(chk_prot) != pgprot_val(new_prot)) + goto out_unlock; + } + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, new_pte); + cpa->flags |= CPA_FLUSHTLB; + do_split = 0; + } + +out_unlock: + spin_unlock(&pgd_lock); + + return do_split; +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + unsigned long pfn, pfninc = 1; + unsigned int i, level; + pte_t *pbase, *tmp; + pgprot_t ref_prot; + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); + if (!base) + return -ENOMEM; + + spin_lock(&pgd_lock); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + pbase = (pte_t *)page_address(base); + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); + +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + } +#endif + + /* + * Get the target pfn from the original entry: + */ + pfn = pte_pfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + + if (address >= (unsigned long)__va(0) && + address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); + +#ifdef CONFIG_X86_64 + if (address >= (unsigned long)__va(1UL<<32) && + address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) + split_page_count(level); +#endif + + /* + * Install the new, split up pagetable. + * + * We use the standard kernel pagetable protections for the new + * pagetable protections, the actual ptes set above control the + * primary protection behavior: + */ + __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); + + /* + * Intel Atom errata AAH41 workaround. + * + * The real fix should be in hw or in a microcode update, but + * we also probabilistically try to reduce the window of having + * a large TLB mixed with 4K TLBs while instruction fetches are + * going on. + */ + __flush_tlb_all(); + + base = NULL; + +out_unlock: + /* + * If we dropped out via the lookup_address check under + * pgd_lock then stick the page back into the pool: + */ + if (base) + __free_page(base); + spin_unlock(&pgd_lock); + + return 0; +} + +static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, + int primary) +{ + /* + * Ignore all non primary paths. + */ + if (!primary) + return 0; + + /* + * Ignore the NULL PTE for kernel identity mapping, as it is expected + * to have holes. + * Also set numpages to '1' indicating that we processed cpa req for + * one virtual address page and its pfn. TBD: numpages can be set based + * on the initial value and the level returned by lookup_address(). + */ + if (within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { + cpa->numpages = 1; + cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; + return 0; + } else { + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", vaddr, + *cpa->vaddr); + + return -EFAULT; + } +} + +static int __change_page_attr(struct cpa_data *cpa, int primary) +{ + unsigned long address; + int do_split, err; + unsigned int level; + pte_t *kpte, old_pte; + + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + address = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return __cpa_process_fault(cpa, address, primary); + + old_pte = *kpte; + if (!pte_val(old_pte)) + return __cpa_process_fault(cpa, address, primary); + + if (level == PG_LEVEL_4K) { + pte_t new_pte; + pgprot_t new_prot = pte_pgprot(old_pte); + unsigned long pfn = pte_pfn(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + + new_prot = static_protections(new_prot, address, pfn); + + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + cpa->pfn = pfn; + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flags |= CPA_FLUSHTLB; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; + + /* + * We have to split the large page: + */ + err = split_large_page(kpte, address); + if (!err) { + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); + goto repeat; + } + + return err; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); + +static int cpa_process_alias(struct cpa_data *cpa) +{ + struct cpa_data alias_cpa; + unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); + unsigned long vaddr; + int ret; + + if (cpa->pfn >= max_pfn_mapped) + return 0; + +#ifdef CONFIG_X86_64 + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) + return 0; +#endif + /* + * No need to redo, when the primary call touched the direct + * mapping already: + */ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[cpa->curpage]; + if (unlikely(PageHighMem(page))) + return 0; + vaddr = (unsigned long)page_address(page); + } else if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { + + alias_cpa = *cpa; + alias_cpa.vaddr = &laddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + if (ret) + return ret; + } + +#ifdef CONFIG_X86_64 + /* + * If the primary call didn't touch the high mapping already + * and the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (!within(vaddr, (unsigned long)_text, _brk_end) && + within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { + unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + + __START_KERNEL_map - phys_base; + alias_cpa = *cpa; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + + /* + * The high mapping range is imprecise, so ignore the + * return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); + } +#endif + + return 0; +} + +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) +{ + int ret, numpages = cpa->numpages; + + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa->numpages = 1; + + if (!debug_pagealloc) + spin_lock(&cpa_lock); + ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + if (ret) + return ret; + + if (checkalias) { + ret = cpa_process_alias(cpa); + if (ret) + return ret; + } + + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + + } + return 0; +} + +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + +static int change_page_attr_set_clr(unsigned long *addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split, int in_flag, + struct page **pages) +{ + struct cpa_data cpa; + int ret, cache, checkalias; + unsigned long baddr = 0; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) + return 0; + + /* Ensure we are PAGE_SIZE aligned */ + if (in_flag & CPA_ARRAY) { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. + * No need to cehck in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + /* + * Save address for cache flush. *addr is modified in the call + * to __change_page_attr_set_clr() below. + */ + baddr = *addr; + } + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + + vm_unmap_aliases(); + + cpa.vaddr = addr; + cpa.pages = pages; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flags = 0; + cpa.curpage = 0; + cpa.force_split = force_split; + + if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa.flags |= in_flag; + + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + + ret = __change_page_attr_set_clr(&cpa, checkalias); + + /* + * Check whether we really changed something: + */ + if (!(cpa.flags & CPA_FLUSHTLB)) + goto out; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) { + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(addr, numpages, cache, + cpa.flags, pages); + } else + cpa_flush_range(baddr, numpages, cache); + } else + cpa_flush_all(cache); + +out: + return ret; +} + +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + (array ? CPA_ARRAY : 0), NULL); +} + +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); +} + +int _set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + int ret; + + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL); + if (ret) + goto out_err; + + ret = _set_memory_uc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_uc); + +static int _set_memory_array(unsigned long *addr, int addrinarray, + unsigned long new_type) +{ + int i, j; + int ret; + + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, + new_type, NULL); + if (ret) + goto out_free; + } + + ret = change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); + + if (!ret && new_type == _PAGE_CACHE_WC) + ret = change_page_attr_set_clr(addr, addrinarray, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_ARRAY, NULL); + if (ret) + goto out_free; + + return 0; + +out_free: + for (j = 0; j < i; j++) + free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); + + return ret; +} + +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS); +} +EXPORT_SYMBOL(set_memory_array_uc); + +int set_memory_array_wc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC); +} +EXPORT_SYMBOL(set_memory_array_wc); + +int _set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + unsigned long addr_copy = addr; + + ret = change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); + if (!ret) { + ret = change_page_attr_set_clr(&addr_copy, numpages, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, 0, NULL); + } + return ret; +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + int ret; + + if (!pat_enabled) + return set_memory_uc(addr, numpages); + + ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL); + if (ret) + goto out_err; + + ret = _set_memory_wc(addr, numpages); + if (ret) + goto out_free; + + return 0; + +out_free: + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); +out_err: + return ret; +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + int ret; + + ret = _set_memory_wb(addr, numpages); + if (ret) + return ret; + + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); + return 0; +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + int ret; + + ret = change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); + if (ret) + return ret; + + for (i = 0; i < addrinarray; i++) + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + + return 0; +} +EXPORT_SYMBOL(set_memory_array_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; + + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); +} +EXPORT_SYMBOL_GPL(set_memory_ro); + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); +} +EXPORT_SYMBOL_GPL(set_memory_rw); + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); +} + +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0, NULL); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +static int _set_pages_array(struct page **pages, int addrinarray, + unsigned long new_type) +{ + unsigned long start; + unsigned long end; + int i; + int free_idx; + int ret; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + if (reserve_memtype(start, end, new_type, NULL)) + goto err_out; + } + + ret = cpa_set_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS)); + if (!ret && new_type == _PAGE_CACHE_WC) + ret = change_page_attr_set_clr(NULL, addrinarray, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_PAGES_ARRAY, pages); + if (ret) + goto err_out; + return 0; /* Success */ +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + return -EINVAL; +} + +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS); +} +EXPORT_SYMBOL(set_pages_array_uc); + +int set_pages_array_wc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC); +} +EXPORT_SYMBOL(set_pages_array_wc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_array_wb(struct page **pages, int addrinarray) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + retval = cpa_clear_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_MASK)); + if (retval) + return retval; + + for (i = 0; i < addrinarray; i++) { + if (PageHighMem(pages[i])) + continue; + start = page_to_pfn(pages[i]) << PAGE_SHIFT; + end = start + PAGE_SIZE; + free_memtype(start, end); + } + + return 0; +} +EXPORT_SYMBOL(set_pages_array_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0), + .flags = 0}; + + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; + + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * The return value is ignored as the calls cannot fail. + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); +} + +#ifdef CONFIG_HIBERNATION + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); +} + +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c new file mode 100644 index 00000000..f6ff57b7 --- /dev/null +++ b/arch/x86/mm/pat.c @@ -0,0 +1,828 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. + */ + +#include <linux/seq_file.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/rbtree.h> + +#include <asm/cacheflush.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/x86_init.h> +#include <asm/pgtable.h> +#include <asm/fcntl.h> +#include <asm/e820.h> +#include <asm/mtrr.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/pat.h> +#include <asm/io.h> + +#include "pat_internal.h" + +#ifdef CONFIG_X86_PAT +int __read_mostly pat_enabled = 1; + +static inline void pat_disable(const char *reason) +{ + pat_enabled = 0; + printk(KERN_INFO "%s\n", reason); +} + +static int __init nopat(char *str) +{ + pat_disable("PAT support disabled."); + return 0; +} +early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} +#endif + + +int pat_debug_enable; + +static int __init pat_debug_setup(char *str) +{ + pat_debug_enable = 1; + return 0; +} +__setup("debugpat", pat_debug_setup); + +static u64 __read_mostly boot_pat_state; + +enum { + PAT_UC = 0, /* uncached */ + PAT_WC = 1, /* Write combining */ + PAT_WT = 4, /* Write Through */ + PAT_WP = 5, /* Write Protected */ + PAT_WB = 6, /* Write Back (default) */ + PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */ +}; + +#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) + +void pat_init(void) +{ + u64 pat; + bool boot_cpu = !boot_pat_state; + + if (!pat_enabled) + return; + + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } + } + + /* Set PWT to Write-Combining. All other bits stay the same */ + /* + * PTE encoding used in Linux: + * PAT + * |PCD + * ||PWT + * ||| + * 000 WB _PAGE_CACHE_WB + * 001 WC _PAGE_CACHE_WC + * 010 UC- _PAGE_CACHE_UC_MINUS + * 011 UC _PAGE_CACHE_UC + * PAT bit unused + */ + pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | + PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); + + /* Boot CPU check */ + if (!boot_pat_state) + rdmsrl(MSR_IA32_CR_PAT, boot_pat_state); + + wrmsrl(MSR_IA32_CR_PAT, pat); + + if (boot_cpu) + printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n", + smp_processor_id(), boot_pat_state, pat); +} + +#undef PAT + +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ + +/* + * Does intersection of PAT memory type and MTRR memory type and returns + * the resulting memory type as PAT understands it. + * (Type in pat and mtrr will not have same value) + * The intersection is based on "Effective Memory Type" tables in IA-32 + * SDM vol 3a + */ +static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) +{ + /* + * Look for MTRR hint to get the effective type in case where PAT + * request is for WB. + */ + if (req_type == _PAGE_CACHE_WB) { + u8 mtrr_type; + + mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type != MTRR_TYPE_WRBACK) + return _PAGE_CACHE_UC_MINUS; + + return _PAGE_CACHE_WB; + } + + return req_type; +} + +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && + page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + +/* + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range + */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + unsigned long type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, req_type); + } + return 0; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + set_page_memtype(page, -1); + } + return 0; +} + +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * If new_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If new_type is non-NULL, function will return + * available type in new_type in case of no error. In case of any error + * it will return a negative return value. + */ +int reserve_memtype(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct memtype *new; + unsigned long actual_type; + int is_range_ram; + int err = 0; + + BUG_ON(start >= end); /* end is exclusive */ + + if (!pat_enabled) { + /* This is identical to page table setting without PAT */ + if (new_type) { + if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; + else + *new_type = req_type & _PAGE_CACHE_MASK; + } + return 0; + } + + /* Low ISA region is always mapped WB in page table. No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) { + if (new_type) + *new_type = _PAGE_CACHE_WB; + return 0; + } + + /* + * Call mtrr_lookup to get the type hint. This is an + * optimization for /dev/mem mmap'ers into WB memory (BIOS + * tools and ACPI tools). Use WB request for WB memory and use + * UC_MINUS otherwise. + */ + actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + + if (new_type) + *new_type = actual_type; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = reserve_ram_pages_type(start, end, req_type, new_type); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + new = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!new) + return -ENOMEM; + + new->start = start; + new->end = end; + new->type = actual_type; + + spin_lock(&memtype_lock); + + err = rbt_memtype_check_insert(new, new_type); + if (err) { + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " + "track %s, req %s\n", + start, end, cattr_name(new->type), cattr_name(req_type)); + kfree(new); + spin_unlock(&memtype_lock); + + return err; + } + + spin_unlock(&memtype_lock); + + dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", + start, end, cattr_name(new->type), cattr_name(req_type), + new_type ? cattr_name(*new_type) : "-"); + + return err; +} + +int free_memtype(u64 start, u64 end) +{ + int err = -EINVAL; + int is_range_ram; + struct memtype *entry; + + if (!pat_enabled) + return 0; + + /* Low ISA region is always mapped WB. No need to track */ + if (x86_platform.is_untracked_pat_range(start, end)) + return 0; + + is_range_ram = pat_pagerange_is_ram(start, end); + if (is_range_ram == 1) { + + err = free_ram_pages_type(start, end); + + return err; + } else if (is_range_ram < 0) { + return -EINVAL; + } + + spin_lock(&memtype_lock); + entry = rbt_memtype_erase(start, end); + spin_unlock(&memtype_lock); + + if (!entry) { + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", + current->comm, current->pid, start, end); + return -EINVAL; + } + + kfree(entry); + + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + + return 0; +} + + +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = rbt_memtype_lookup(paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + resource_size_t size = end - start; + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, size)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(start, size, req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, size, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#else +/* This check is needed to avoid cache aliasing when PAT is enabled */ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + if (!pat_enabled) + return 1; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#endif /* CONFIG_STRICT_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + unsigned long flags = _PAGE_CACHE_WB; + + if (!range_is_allowed(pfn, size)) + return 0; + + if (file->f_flags & O_DSYNC) + flags = _PAGE_CACHE_UC_MINUS; + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_enabled && + !(boot_cpu_has(X86_FEATURE_MTRR) || + boot_cpu_has(X86_FEATURE_K6_MTRR) || + boot_cpu_has(X86_FEATURE_CYRIX_ARR) || + boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +{ + unsigned long id_sz; + + if (base >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + printk(KERN_INFO + "%s:%d ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + base, (unsigned long long)(base + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) +{ + int is_ram = 0; + int ret; + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); + unsigned long flags = want_flags; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + + /* + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. + */ + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + if (strict_prot || + !is_new_memtype_allowed(paddr, size, want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } + + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { + free_memtype(paddr, paddr + size); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pat_pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + resource_size_t paddr; + unsigned long prot; + unsigned long vma_size = vma->vm_end - vma->vm_start; + pgprot_t pgprot; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); + } + + return 0; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, + unsigned long pfn, unsigned long size) +{ + unsigned long flags; + resource_size_t paddr; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot, 0); + } + + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + + return 0; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + resource_size_t paddr; + unsigned long vma_size = vma->vm_end - vma->vm_start; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } +} + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) + +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *print_entry; + int ret; + + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); + if (!print_entry) + return NULL; + + spin_lock(&memtype_lock); + ret = rbt_memtype_copy_nth_element(print_entry, pos); + spin_unlock(&memtype_lock); + + if (!ret) { + return print_entry; + } else { + kfree(print_entry); + return NULL; + } +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_printf(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *print_entry = (struct memtype *)v; + + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), + print_entry->start, print_entry->end); + kfree(print_entry); + + return 0; +} + +static const struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); +} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + if (pat_enabled) { + debugfs_create_file("pat_memtype_list", S_IRUSR, + arch_debugfs_dir, NULL, &memtype_fops); + } + return 0; +} + +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h new file mode 100644 index 00000000..77e5ba15 --- /dev/null +++ b/arch/x86/mm/pat_internal.h @@ -0,0 +1,46 @@ +#ifndef __PAT_INTERNAL_H_ +#define __PAT_INTERNAL_H_ + +extern int pat_debug_enable; + +#define dprintk(fmt, arg...) \ + do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) + +struct memtype { + u64 start; + u64 end; + u64 subtree_max_end; + unsigned long type; + struct rb_node rb; +}; + +static inline char *cattr_name(unsigned long flags) +{ + switch (flags & _PAGE_CACHE_MASK) { + case _PAGE_CACHE_UC: return "uncached"; + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; + case _PAGE_CACHE_WB: return "write-back"; + case _PAGE_CACHE_WC: return "write-combining"; + default: return "broken"; + } +} + +#ifdef CONFIG_X86_PAT +extern int rbt_memtype_check_insert(struct memtype *new, + unsigned long *new_type); +extern struct memtype *rbt_memtype_erase(u64 start, u64 end); +extern struct memtype *rbt_memtype_lookup(u64 addr); +extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +#else +static inline int rbt_memtype_check_insert(struct memtype *new, + unsigned long *new_type) +{ return 0; } +static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +{ return NULL; } +static inline struct memtype *rbt_memtype_lookup(u64 addr) +{ return NULL; } +static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ return 0; } +#endif + +#endif /* __PAT_INTERNAL_H_ */ diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c new file mode 100644 index 00000000..8acaddd0 --- /dev/null +++ b/arch/x86/mm/pat_rbtree.c @@ -0,0 +1,253 @@ +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree (augmented rbtree) used to store the PAT memory type + * reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/pat.h> + +#include "pat_internal.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) with tree ordered + * on starting address. Tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course. + * + * memtype_lock protects the rbtree. + */ + +static struct rb_root memtype_rbroot = RB_ROOT; + +static int is_node_overlap(struct memtype *node, u64 start, u64 end) +{ + if (node->start >= end || node->end <= start) + return 0; + + return 1; +} + +static u64 get_subtree_max_end(struct rb_node *node) +{ + u64 ret = 0; + if (node) { + struct memtype *data = container_of(node, struct memtype, rb); + ret = data->subtree_max_end; + } + return ret; +} + +/* Update 'subtree_max_end' for a node, based on node and its children */ +static void memtype_rb_augment_cb(struct rb_node *node, void *__unused) +{ + struct memtype *data; + u64 max_end, child_max_end; + + if (!node) + return; + + data = container_of(node, struct memtype, rb); + max_end = data->end; + + child_max_end = get_subtree_max_end(node->rb_right); + if (child_max_end > max_end) + max_end = child_max_end; + + child_max_end = get_subtree_max_end(node->rb_left); + if (child_max_end > max_end) + max_end = child_max_end; + + data->subtree_max_end = max_end; +} + +/* Find the first (lowest start addr) overlapping range from rb tree */ +static struct memtype *memtype_rb_lowest_match(struct rb_root *root, + u64 start, u64 end) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (get_subtree_max_end(node->rb_left) > start) { + /* Lowest overlap if any must be on left side */ + node = node->rb_left; + } else if (is_node_overlap(data, start, end)) { + last_lower = data; + break; + } else if (start >= data->start) { + /* Lowest overlap if any must be on right side */ + node = node->rb_right; + } else { + break; + } + } + return last_lower; /* Returns NULL if there is no overlap */ +} + +static struct memtype *memtype_rb_exact_match(struct rb_root *root, + u64 start, u64 end) +{ + struct memtype *match; + + match = memtype_rb_lowest_match(root, start, end); + while (match != NULL && match->start < end) { + struct rb_node *node; + + if (match->start == start && match->end == end) + return match; + + node = rb_next(&match->rb); + if (node) + match = container_of(node, struct memtype, rb); + else + match = NULL; + } + + return NULL; /* Returns NULL if there is no exact match */ +} + +static int memtype_rb_check_conflict(struct rb_root *root, + u64 start, u64 end, + unsigned long reqtype, unsigned long *newtype) +{ + struct rb_node *node; + struct memtype *match; + int found_type = reqtype; + + match = memtype_rb_lowest_match(&memtype_rbroot, start, end); + if (match == NULL) + goto success; + + if (match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); + found_type = match->type; + + node = rb_next(&match->rb); + while (node) { + match = container_of(node, struct memtype, rb); + + if (match->start >= end) /* Checked all possible matches */ + goto success; + + if (is_node_overlap(match, start, end) && + match->type != found_type) { + goto failure; + } + + node = rb_next(&match->rb); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + printk(KERN_INFO "%s:%d conflicting memory types " + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, + end, cattr_name(found_type), cattr_name(match->type)); + return -EBUSY; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) +{ + struct rb_node **node = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*node) { + struct memtype *data = container_of(*node, struct memtype, rb); + + parent = *node; + if (newdata->start <= data->start) + node = &((*node)->rb_left); + else if (newdata->start > data->start) + node = &((*node)->rb_right); + } + + rb_link_node(&newdata->rb, parent, node); + rb_insert_color(&newdata->rb, root); + rb_augment_insert(&newdata->rb, memtype_rb_augment_cb, NULL); +} + +int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) +{ + int err = 0; + + err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, + new->type, ret_type); + + if (!err) { + if (ret_type) + new->type = *ret_type; + + new->subtree_max_end = new->end; + memtype_rb_insert(&memtype_rbroot, new); + } + return err; +} + +struct memtype *rbt_memtype_erase(u64 start, u64 end) +{ + struct rb_node *deepest; + struct memtype *data; + + data = memtype_rb_exact_match(&memtype_rbroot, start, end); + if (!data) + goto out; + + deepest = rb_augment_erase_begin(&data->rb); + rb_erase(&data->rb, &memtype_rbroot); + rb_augment_erase_end(deepest, memtype_rb_augment_cb, NULL); +out: + return data; +} + +struct memtype *rbt_memtype_lookup(u64 addr) +{ + struct memtype *data; + data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); + return data; +} + +#if defined(CONFIG_DEBUG_FS) +int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ + struct rb_node *node; + int i = 1; + + node = rb_first(&memtype_rbroot); + while (node && pos != i) { + node = rb_next(node); + i++; + } + + if (node) { /* pos == i */ + struct memtype *this = container_of(node, struct memtype, rb); + *out = *this; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 00000000..9f0614da --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,532 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include <linux/module.h> +#include <linux/ptrace.h> /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B, 0xAB }; +#endif /* not __i386__ */ + +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) +{ + int i; + unsigned char *p = addr; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + prf->shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + prf->enlarged = 1; + if ((*p & 0xf4) == 0x44) + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return prf.shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + break; + } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + int reg; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) + goto do_work; + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + /* for STOS, source register is fixed */ + if (opcode == 0xAA || opcode == 0xAB) { + reg = arg_AX; + } else { + unsigned char mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); + } + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, prf.rex, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + struct prefix_bits prf; + int i; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &prf); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) + goto do_work; + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 00000000..e05341a5 --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 00000000..8573b83a --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,447 @@ +#include <linux/mm.h> +#include <linux/gfp.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/fixmap.h> + +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(PGALLOC_GFP); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + + pte = alloc_pages(__userpte_alloc_gfp, 0); + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) +{ + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); + pgd_list_add(pgd); + } +} + +static void pgd_dtor(pgd_t *pgd) +{ + if (SHARED_KERNEL_PMD) + return; + + spin_lock(&pgd_lock); + pgd_list_del(pgd); + spin_unlock(&pgd_lock); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + flush_tlb_mm(mm); +} +#else /* !CONFIG_X86_PAE */ + +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +#define PREALLOCATED_PMDS 0 + +#endif /* CONFIG_X86_PAE */ + +static void free_pmds(pmd_t *pmds[]) +{ + int i; + + for(i = 0; i < PREALLOCATED_PMDS; i++) + if (pmds[i]) + free_page((unsigned long)pmds[i]); +} + +static int preallocate_pmds(pmd_t *pmds[]) +{ + int i; + bool failed = false; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); + if (pmd == NULL) + failed = true; + pmds[i] = pmd; + } + + if (failed) { + free_pmds(pmds); + return -ENOMEM; + } + + return 0; +} + +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < PREALLOCATED_PMDS; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) +{ + pud_t *pud; + unsigned long addr; + int i; + + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + + pud = pud_offset(pgd, 0); + + for (addr = i = 0; i < PREALLOCATED_PMDS; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmds[i]; + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd; + pmd_t *pmds[PREALLOCATED_PMDS]; + + pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); + + if (pgd == NULL) + goto out; + + mm->pgd = pgd; + + if (preallocate_pmds(pmds) != 0) + goto out_free_pgd; + + if (paravirt_pgd_alloc(mm) != 0) + goto out_free_pmds; + + /* + * Make sure that pre-populating the pmds is atomic with + * respect to anything walking the pgd_list, so that they + * never see a partially populated pgd. + */ + spin_lock(&pgd_lock); + + pgd_ctor(mm, pgd); + pgd_prepopulate_pmd(mm, pgd, pmds); + + spin_unlock(&pgd_lock); + + return pgd; + +out_free_pmds: + free_pmds(pmds); +out_free_pgd: + free_page((unsigned long)pgd); +out: + return NULL; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + paravirt_pgd_free(mm, pgd); + free_page((unsigned long)pgd); +} + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pmdp); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)pmdp); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; +#endif +} + +int fixmaps_set; + +void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_vaddr(address, pte); + fixmaps_set++; +} + +void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, + pgprot_t flags) +{ + __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c new file mode 100644 index 00000000..a69bcb8c --- /dev/null +++ b/arch/x86/mm/pgtable_32.c @@ -0,0 +1,133 @@ +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/nmi.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/io.h> + +unsigned int __VMALLOC_RESERVE = 128 << 20; + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + if (pte_val(pteval)) + set_pte_at(&init_mm, vaddr, pte, pteval); + else + pte_clear(&init_mm, vaddr, pte); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); + +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; + + if (!arg) + return -EINVAL; + + address = memparse(arg, &arg); + reserve_top_address(address); + fixup_early_ioremap(); + return 0; +} +early_param("reservetop", parse_reservetop); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 00000000..d2e27353 --- /dev/null +++ b/arch/x86/mm/physaddr.c @@ -0,0 +1,70 @@ +#include <linux/mmdebug.h> +#include <linux/module.h> +#include <linux/mm.h> + +#include <asm/page.h> + +#include "physaddr.h" + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(!phys_addr_valid(x)); + } + return x; +} +EXPORT_SYMBOL(__phys_addr); + +bool __virt_addr_valid(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (!phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#else + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 00000000..a3cd5a0c --- /dev/null +++ b/arch/x86/mm/physaddr.h @@ -0,0 +1,10 @@ +#include <asm/processor.h> + +static inline int phys_addr_valid(resource_size_t addr) +{ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif +} diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c new file mode 100644 index 00000000..410531d3 --- /dev/null +++ b/arch/x86/mm/setup_nx.c @@ -0,0 +1,60 @@ +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/init.h> + +#include <asm/pgtable.h> +#include <asm/proto.h> + +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + } + x86_configure_nx(); + return 0; +} +early_param("noexec", noexec_setup); + +void __cpuinit x86_configure_nx(void) +{ + if (cpu_has_nx && !disable_nx) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +void __init x86_report_nx(void) +{ + if (!cpu_has_nx) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + if (disable_nx) { + printk(KERN_INFO "NX (Execute Disable) protection: " + "disabled by kernel command line option\n"); + } else { + printk(KERN_INFO "NX (Execute Disable) protection: " + "active\n"); + } +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 00000000..efb5b4b9 --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,193 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/mmzone.h> +#include <linux/bitmap.h> +#include <linux/module.h> +#include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <asm/proto.h> +#include <asm/numa.h> +#include <asm/e820.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return acpi_numa < 0; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); +} + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + apic_id = pa->apic_id; + if (!apic->apic_id_valid(apic_id)) { + printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", + pxm, apic_id); + return; + } + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >= UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + u64 start, end; + int node, pxm; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + return; + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + + if (numa_add_memblk(node, start, end) < 0) { + bad_srat(); + return; + } + + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, + start, end); +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 00000000..38868adf --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,140 @@ +/* + * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/mmiotrace.h> + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " + "(or 8 MB if read_far is non-zero)."); + +static unsigned long read_far = 0x400100; +module_param(read_far, ulong, 0); +MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB " + "(default: 0x400100)."); + +static unsigned v16(unsigned i) +{ + return i * 12 + 7; +} + +static unsigned v32(unsigned i) +{ + return i * 212371 + 13; +} + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + pr_info("write test.\n"); + mmiotrace_printk("Write test.\n"); + + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(v16(i), p + i); + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(v32(i), p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + unsigned errs[3] = { 0 }; + pr_info("read test.\n"); + mmiotrace_printk("Read test.\n"); + + for (i = 0; i < 256; i++) + if (ioread8(p + i) != i) + ++errs[0]; + + for (i = 1024; i < (5 * 1024); i += 2) + if (ioread16(p + i) != v16(i)) + ++errs[1]; + + for (i = (5 * 1024); i < (16 * 1024); i += 4) + if (ioread32(p + i) != v32(i)) + ++errs[2]; + + mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n", + errs[0], errs[1], errs[2]); +} + +static void do_read_far_test(void __iomem *p) +{ + pr_info("read far test.\n"); + mmiotrace_printk("Read far test.\n"); + + ioread32(p + read_far); +} + +static void do_test(unsigned long size) +{ + void __iomem *p = ioremap_nocache(mmio_address, size); + if (!p) { + pr_err("could not ioremap, aborting.\n"); + return; + } + mmiotrace_printk("ioremap returned %p.\n", p); + do_write_test(p); + do_read_test(p); + if (read_far && read_far < size - 4) + do_read_far_test(p); + iounmap(p); +} + +/* + * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in + * a short time. We had a bug in deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap_nocache(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + +static int __init init(void) +{ + unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + + if (mmio_address == 0) { + pr_err("you have to use the module argument mmio_address.\n"); + pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); + do_test(size); + do_test_bulk_ioremapping(); + pr_info("All done.\n"); + return 0; +} + +static void __exit cleanup(void) +{ + pr_debug("unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 00000000..d6c0418c --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,332 @@ +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/cpu.h> + +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/cache.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right array slot for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + struct mm_struct *flush_mm; + unsigned long flush_va; + raw_spinlock_t tlbstate_lock; + DECLARE_BITMAP(flush_cpumask, NR_CPUS); + }; + char pad[INTERNODE_CACHE_BYTES]; +} ____cacheline_internodealigned_in_smp; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; + +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + cpumask_clear_cpu(cpu, + mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); + load_cr3(swapper_pg_dir); +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +/* + * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop + * but still used for documentation purpose but the usage is slightly + * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt + * entry calls in with the first parameter in %eax. Maybe define + * intrlinkage? + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned int cpu; + unsigned int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; + f = &flush_state[sender]; + + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); + smp_mb__after_clear_bit(); + inc_irq_stat(irq_tlb_count); +} + +static void flush_tlb_others_ipi(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + unsigned int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = this_cpu_read(tlb_vector_offset); + f = &flush_state[sender]; + + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { + /* + * We have to send the IPI only to + * CPUs affected. + */ + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + } + + f->flush_mm = NULL; + f->flush_va = 0; + if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) + raw_spin_unlock(&f->tlbstate_lock); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + if (is_uv_system()) { + unsigned int cpu; + + cpu = smp_processor_id(); + cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + if (cpumask) + flush_tlb_others_ipi(cpumask, mm, va); + return; + } + flush_tlb_others_ipi(cpumask, mm, va); +} + +static void __cpuinit calculate_tlb_offset(void) +{ + int cpu, node, nr_node_vecs, idx = 0; + /* + * we are changing tlb_vector_offset for each CPU in runtime, but this + * will not cause inconsistency, as the write is atomic under X86. we + * might see more lock contentions in a short time, but after all CPU's + * tlb_vector_offset are changed, everything should go normal + * + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might + * waste some vectors. + **/ + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) + nr_node_vecs = 1; + else + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; + + for_each_online_node(node) { + int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * + nr_node_vecs; + int cpu_offset = 0; + for_each_cpu(cpu, cpumask_of_node(node)) { + per_cpu(tlb_vector_offset, cpu) = node_offset + + cpu_offset; + cpu_offset++; + cpu_offset = cpu_offset % nr_node_vecs; + } + idx++; + } +} + +static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + switch (action & 0xf) { + case CPU_ONLINE: + case CPU_DEAD: + calculate_tlb_offset(); + } + return NOTIFY_OK; +} + +static int __cpuinit init_smp_flush(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(flush_state); i++) + raw_spin_lock_init(&flush_state[i].tlbstate_lock); + + calculate_tlb_offset(); + hotcpu_notifier(tlb_cpuhp_notify, 0); + return 0; +} +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + local_flush_tlb(); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(smp_processor_id()); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} |