Diffstat (limited to 'ANDROID_3.4.5/arch/x86/kvm')
27 files changed, 0 insertions, 33791 deletions
diff --git a/ANDROID_3.4.5/arch/x86/kvm/Kconfig b/ANDROID_3.4.5/arch/x86/kvm/Kconfig
deleted file mode 100644
index 1a7fe868..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/Kconfig
+++ /dev/null
@@ -1,87 +0,0 @@
-#
-# KVM configuration
-#
-
-source "virt/kvm/Kconfig"
-
-menuconfig VIRTUALIZATION
-	bool "Virtualization"
-	depends on HAVE_KVM || X86
-	default y
-	---help---
-	  Say Y here to get to see options for using your Linux host to run other
-	  operating systems inside virtual machines (guests).
-	  This option alone does not add any kernel code.
-
-	  If you say N, all options in this submenu will be skipped and disabled.
-
-if VIRTUALIZATION
-
-config KVM
-	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM
-	# for device assignment:
-	depends on PCI
-	# for TASKSTATS/TASK_DELAY_ACCT:
-	depends on NET
-	select PREEMPT_NOTIFIERS
-	select MMU_NOTIFIER
-	select ANON_INODES
-	select HAVE_KVM_IRQCHIP
-	select HAVE_KVM_EVENTFD
-	select KVM_APIC_ARCHITECTURE
-	select KVM_ASYNC_PF
-	select USER_RETURN_NOTIFIER
-	select KVM_MMIO
-	select TASKSTATS
-	select TASK_DELAY_ACCT
-	select PERF_EVENTS
-	---help---
-	  Support hosting fully virtualized guest machines using hardware
-	  virtualization extensions.  You will need a fairly recent
-	  processor equipped with virtualization extensions. You will also
-	  need to select one or more of the processor modules below.
-
-	  This module provides access to the hardware capabilities through
-	  a character device node named /dev/kvm.
-
-	  To compile this as a module, choose M here: the module
-	  will be called kvm.
-
-	  If unsure, say N.
-
-config KVM_INTEL
-	tristate "KVM for Intel processors support"
-	depends on KVM
-	# for perf_guest_get_msrs():
-	depends on CPU_SUP_INTEL
-	---help---
-	  Provides support for KVM on Intel processors equipped with the VT
-	  extensions.
-
-	  To compile this as a module, choose M here: the module
-	  will be called kvm-intel.
-
-config KVM_AMD
-	tristate "KVM for AMD processors support"
-	depends on KVM
-	---help---
-	  Provides support for KVM on AMD processors equipped with the AMD-V
-	  (SVM) extensions.
-
-	  To compile this as a module, choose M here: the module
-	  will be called kvm-amd.
-
-config KVM_MMU_AUDIT
-	bool "Audit KVM MMU"
-	depends on KVM && TRACEPOINTS
-	---help---
-	  This option adds a R/W kVM module parameter 'mmu_audit', which allows
-	  audit KVM MMU at runtime.
-
-# OK, it's a little counter-intuitive to do this, but it puts it neatly under
-# the virtualization menu.
-source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
-
-endif # VIRTUALIZATION
diff --git a/ANDROID_3.4.5/arch/x86/kvm/Makefile b/ANDROID_3.4.5/arch/x86/kvm/Makefile
deleted file mode 100644
index 4f579e8d..00000000
--- a/ANDROID_3.4.5/arch/x86/kvm/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-
-ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
-
-CFLAGS_x86.o := -I.
-CFLAGS_svm.o := -I.
-CFLAGS_vmx.o := -I.
- -kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) -kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) -kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) - -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o cpuid.o pmu.o -kvm-intel-y += vmx.o -kvm-amd-y += svm.o - -obj-$(CONFIG_KVM) += kvm.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/ANDROID_3.4.5/arch/x86/kvm/cpuid.c b/ANDROID_3.4.5/arch/x86/kvm/cpuid.c deleted file mode 100644 index 9fed5bed..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/cpuid.c +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * cpuid support routines - * - * derived from arch/x86/kvm/x86.c - * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. - * Copyright IBM Corporation, 2008 - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/uaccess.h> -#include <asm/user.h> -#include <asm/xsave.h> -#include "cpuid.h" -#include "lapic.h" -#include "mmu.h" -#include "trace.h" - -void kvm_update_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return; - - /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); - } - - if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - } - - kvm_pmu_cpuid_update(vcpu); -} - -static int is_efer_nx(void) -{ - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - -/* when an old userspace process fills a new kernel module */ -int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; 
- vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - kvm_update_cpuid(vcpu); - return 0; - -out: - return r; -} - -int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; - -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} - -static void cpuid_mask(u32 *word, int wordnum) -{ - *word &= boot_cpu_data.x86_capability[wordnum]; -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; -} - -#define F(x) bit(X86_FEATURE_##x) - -static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - int r; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? 
F(RDTSCP) : 0; - - /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = - F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); - - /* all calls to cpuid_count() should be made on the same cpu */ - get_cpu(); - - r = -E2BIG; - - if (*nent >= maxnent) - goto out; - - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xd); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); - /* we support x2apic emulation even if host does not support - * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times; ++t) { - if (*nent >= maxnent) - goto out; - - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 has additional index. 
*/ - case 4: { - int i, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (i = 1; ; ++i) { - if (*nent >= maxnent) - goto out; - - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ - if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); - } else - entry->ebx = 0; - entry->eax = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } - case 9: - break; - case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; - union cpuid10_eax eax; - union cpuid10_edx edx; - - perf_get_x86_pmu_capability(&cap); - - /* - * Only support guest architectural pmu on a host - * with architectural pmu. - */ - if (!cap.version) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = cap.num_counters_fixed; - edx.split.bit_width_fixed = cap.bit_width_fixed; - edx.split.reserved = 0; - - entry->eax = eax.full; - entry->ebx = cap.events_mask; - entry->ecx = 0; - entry->edx = edx.full; - break; - } - /* function 0xb has additional index. */ - case 0xb: { - int i, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { - if (*nent >= maxnent) - goto out; - - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xd: { - int idx, i; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (idx = 1, i = 1; idx < 64; ++idx) { - if (*nent >= maxnent) - goto out; - - do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) - continue; - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - ++i; - } - break; - } - case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; - entry->eax = 0; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); - - if (sched_info_on()) - entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); - - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); - break; - case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; - - if (!g_phys_as) - g_phys_as = phys_as; - entry->eax = g_phys_as | (virt_as << 8); - entry->ebx = entry->edx = 0; - break; - } - case 0x80000019: - entry->ecx = entry->edx = 0; - break; - case 0x8000001a: - break; - case 0x8000001d: - break; - /*Add support for Centaur's 
CPUID instruction*/ - case 0xC0000000: - /*Just support up to 0xC0000004 now*/ - entry->eax = min(entry->eax, 0xC0000004); - break; - case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); - break; - case 3: /* Processor serial number */ - case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ - case 0x80000007: /* Advanced power management */ - case 0xC0000002: - case 0xC0000003: - case 0xC0000004: - default: - entry->eax = entry->ebx = entry->ecx = entry->edx = 0; - break; - } - - kvm_x86_ops->set_supported_cpuid(function, entry); - - r = 0; - -out: - put_cpu(); - - return r; -} - -#undef F - -struct kvm_cpuid_param { - u32 func; - u32 idx; - bool has_leaf_count; - bool (*qualifier)(struct kvm_cpuid_param *param); -}; - -static bool is_centaur_cpu(struct kvm_cpuid_param *param) -{ - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; -} - -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG, i; - u32 func; - static struct kvm_cpuid_param param[] = { - { .func = 0, .has_leaf_count = true }, - { .func = 0x80000000, .has_leaf_count = true }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, - { .func = KVM_CPUID_SIGNATURE }, - { .func = KVM_CPUID_FEATURES }, - }; - - if (cpuid->nent < 1) - goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; - - r = 0; - for (i = 0; i < ARRAY_SIZE(param); i++) { - struct kvm_cpuid_param *ent = ¶m[i]; - - if (ent->qualifier && !ent->qualifier(ent)) - continue; - - r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, - &nent, cpuid->nent); - - if (r) - goto out_free; - - if (!ent->has_leaf_count) - continue; - - limit = cpuid_entries[nent - 1].eax; - for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) - r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, - &nent, cpuid->nent); - - if (r) - goto out_free; - } - - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) 
-{ - int i; - struct kvm_cpuid_entry2 *best = NULL; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; - - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - } - return best; -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) - return best->eax & 0xff; -not_found: - return 36; -} - -/* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. - */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - u32 function, index; - struct kvm_cpuid_entry2 *best; - - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) - best = check_cpuid_limit(vcpu, function, index); - - if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } - kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/ANDROID_3.4.5/arch/x86/kvm/cpuid.h b/ANDROID_3.4.5/arch/x86/kvm/cpuid.h deleted file mode 100644 index 26d1fb43..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/cpuid.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef ARCH_X86_KVM_CPUID_H -#define ARCH_X86_KVM_CPUID_H - -#include "x86.h" - -void kvm_update_cpuid(struct kvm_vcpu *vcpu); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); -int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); -int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries); -int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); -int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); - - -static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); -} - -static inline bool 
guest_cpuid_has_smep(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_SMEP)); -} - -static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); -} - -static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - return best && (best->ecx & bit(X86_FEATURE_OSVW)); -} - -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/emulate.c b/ANDROID_3.4.5/arch/x86/kvm/emulate.c deleted file mode 100644 index 83756223..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/emulate.c +++ /dev/null @@ -1,4360 +0,0 @@ -/****************************************************************************** - * emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#include <linux/kvm_host.h> -#include "kvm_cache_regs.h" -#include <linux/module.h> -#include <asm/kvm_emulate.h> - -#include "x86.h" -#include "tss.h" - -/* - * Operand types - */ -#define OpNone 0ull -#define OpImplicit 1ull /* No generic decode */ -#define OpReg 2ull /* Register */ -#define OpMem 3ull /* Memory */ -#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */ -#define OpDI 5ull /* ES:DI/EDI/RDI */ -#define OpMem64 6ull /* Memory, 64-bit */ -#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ -#define OpDX 8ull /* DX register */ -#define OpCL 9ull /* CL register (for shifts) */ -#define OpImmByte 10ull /* 8-bit sign extended immediate */ -#define OpOne 11ull /* Implied 1 */ -#define OpImm 12ull /* Sign extended immediate */ -#define OpMem16 13ull /* Memory operand (16-bit). */ -#define OpMem32 14ull /* Memory operand (32-bit). */ -#define OpImmU 15ull /* Immediate operand, zero extended */ -#define OpSI 16ull /* SI/ESI/RSI */ -#define OpImmFAddr 17ull /* Immediate far address */ -#define OpMemFAddr 18ull /* Far address in memory */ -#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ -#define OpES 20ull /* ES */ -#define OpCS 21ull /* CS */ -#define OpSS 22ull /* SS */ -#define OpDS 23ull /* DS */ -#define OpFS 24ull /* FS */ -#define OpGS 25ull /* GS */ -#define OpMem8 26ull /* 8-bit zero extended memory operand */ - -#define OpBits 5 /* Width of operand field */ -#define OpMask ((1ull << OpBits) - 1) - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references). We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov <imm>,<reg>' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. 
*/ -#define DstShift 1 -#define ImplicitOps (OpImplicit << DstShift) -#define DstReg (OpReg << DstShift) -#define DstMem (OpMem << DstShift) -#define DstAcc (OpAcc << DstShift) -#define DstDI (OpDI << DstShift) -#define DstMem64 (OpMem64 << DstShift) -#define DstImmUByte (OpImmUByte << DstShift) -#define DstDX (OpDX << DstShift) -#define DstMask (OpMask << DstShift) -/* Source operand type. */ -#define SrcShift 6 -#define SrcNone (OpNone << SrcShift) -#define SrcReg (OpReg << SrcShift) -#define SrcMem (OpMem << SrcShift) -#define SrcMem16 (OpMem16 << SrcShift) -#define SrcMem32 (OpMem32 << SrcShift) -#define SrcImm (OpImm << SrcShift) -#define SrcImmByte (OpImmByte << SrcShift) -#define SrcOne (OpOne << SrcShift) -#define SrcImmUByte (OpImmUByte << SrcShift) -#define SrcImmU (OpImmU << SrcShift) -#define SrcSI (OpSI << SrcShift) -#define SrcImmFAddr (OpImmFAddr << SrcShift) -#define SrcMemFAddr (OpMemFAddr << SrcShift) -#define SrcAcc (OpAcc << SrcShift) -#define SrcImmU16 (OpImmU16 << SrcShift) -#define SrcDX (OpDX << SrcShift) -#define SrcMem8 (OpMem8 << SrcShift) -#define SrcMask (OpMask << SrcShift) -#define BitOp (1<<11) -#define MemAbs (1<<12) /* Memory operand is absolute displacement */ -#define String (1<<13) /* String instruction (rep capable) */ -#define Stack (1<<14) /* Stack instruction (push/pop) */ -#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ -#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ -#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ -#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ -#define Sse (1<<18) /* SSE Vector instruction */ -/* Generic ModRM decode. */ -#define ModRM (1<<19) -/* Destination is only written; never read. */ -#define Mov (1<<20) -/* Misc flags */ -#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ -#define VendorSpecific (1<<22) /* Vendor specific instruction */ -#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ -#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ -#define Undefined (1<<25) /* No Such Instruction */ -#define Lock (1<<26) /* lock prefix is allowed for the instruction */ -#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ -#define No64 (1<<28) -#define PageTable (1 << 29) /* instruction used to write page table */ -/* Source 2 operand type */ -#define Src2Shift (30) -#define Src2None (OpNone << Src2Shift) -#define Src2CL (OpCL << Src2Shift) -#define Src2ImmByte (OpImmByte << Src2Shift) -#define Src2One (OpOne << Src2Shift) -#define Src2Imm (OpImm << Src2Shift) -#define Src2ES (OpES << Src2Shift) -#define Src2CS (OpCS << Src2Shift) -#define Src2SS (OpSS << Src2Shift) -#define Src2DS (OpDS << Src2Shift) -#define Src2FS (OpFS << Src2Shift) -#define Src2GS (OpGS << Src2Shift) -#define Src2Mask (OpMask << Src2Shift) - -#define X2(x...) x, x -#define X3(x...) X2(x), x -#define X4(x...) X2(x), X2(x) -#define X5(x...) X4(x), x -#define X6(x...) X4(x), X2(x) -#define X7(x...) X4(x), X3(x) -#define X8(x...) X4(x), X4(x) -#define X16(x...) 
X8(x), X8(x) - -struct opcode { - u64 flags : 56; - u64 intercept : 8; - union { - int (*execute)(struct x86_emulate_ctxt *ctxt); - struct opcode *group; - struct group_dual *gdual; - struct gprefix *gprefix; - } u; - int (*check_perm)(struct x86_emulate_ctxt *ctxt); -}; - -struct group_dual { - struct opcode mod012[8]; - struct opcode mod3[8]; -}; - -struct gprefix { - struct opcode pfx_no; - struct opcode pfx_66; - struct opcode pfx_f2; - struct opcode pfx_f3; -}; - -/* EFLAGS bit definitions. */ -#define EFLG_ID (1<<21) -#define EFLG_VIP (1<<20) -#define EFLG_VIF (1<<19) -#define EFLG_AC (1<<18) -#define EFLG_VM (1<<17) -#define EFLG_RF (1<<16) -#define EFLG_IOPL (3<<12) -#define EFLG_NT (1<<14) -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_IF (1<<9) -#define EFLG_TF (1<<8) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a -#define EFLG_RESERVED_ONE_MASK 2 - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -#ifdef CONFIG_X86_64 -#define ON64(x) x -#else -#define ON64(x) -#endif - -#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" ((ctxt)->eflags), \ - "+q" (*(_dsttype*)&(ctxt)->dst.val), \ - "=&r" (_tmp) \ - : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. 
*/ -#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ - break; \ - case 4: \ - ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ - break; \ - case 8: \ - ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((ctxt)->dst.bytes) { \ - case 1: \ - ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ - break; \ - default: \ - __emulate_2op_nobyte(ctxt, _op, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(ctxt, _op) \ - __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(ctxt, _op) \ - __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (ctxt)->src2.val; \ - _type _srcv = (ctxt)->src.val; \ - _type _dstv = (ctxt)->dst.val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (ctxt)->src2.val = (unsigned long) _clv; \ - (ctxt)->src2.val = (unsigned long) _srcv; \ - (ctxt)->dst.val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 2: \ - __emulate_2op_cl(ctxt, _op, "w", u16); \ - break; \ - case 4: \ - __emulate_2op_cl(ctxt, _op, "l", u32); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(ctxt, _op, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). 
*/ -#define emulate_1op(ctxt, _op) \ - do { \ - switch ((ctxt)->dst.bytes) { \ - case 1: __emulate_1op(ctxt, _op, "b"); break; \ - case 2: __emulate_1op(ctxt, _op, "w"); break; \ - case 4: __emulate_1op(ctxt, _op, "l"); break; \ - case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ - } \ - } while (0) - -#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ - do { \ - unsigned long _tmp; \ - ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ - ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "1") \ - "1: \n\t" \ - _op _suffix " %6; " \ - "2: \n\t" \ - _POST_EFLAGS("0", "5", "1") \ - ".pushsection .fixup,\"ax\" \n\t" \ - "3: movb $1, %4 \n\t" \ - "jmp 2b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ - "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ - } while (0) - -/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ - do { \ - switch((ctxt)->src.bytes) { \ - case 1: \ - __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ - break; \ - case 8: ON64( \ - __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ - break; \ - } \ - } while (0) - -static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, - enum x86_intercept intercept, - enum x86_intercept_stage stage) -{ - struct x86_instruction_info info = { - .intercept = intercept, - .rep_prefix = ctxt->rep_prefix, - .modrm_mod = ctxt->modrm_mod, - .modrm_reg = ctxt->modrm_reg, - .modrm_rm = ctxt->modrm_rm, - .src_val = ctxt->src.val64, - .src_bytes = ctxt->src.bytes, - .dst_bytes = ctxt->dst.bytes, - .ad_bytes = ctxt->ad_bytes, - .next_rip = ctxt->eip, - }; - - return ctxt->ops->intercept(ctxt, &info, stage); -} - -static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) -{ - return (1UL << (ctxt->ad_bytes << 3)) - 1; -} - -/* Access/update address held in a register, based on addressing mode. */ -static inline unsigned long -address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) -{ - if (ctxt->ad_bytes == sizeof(unsigned long)) - return reg; - else - return reg & ad_mask(ctxt); -} - -static inline unsigned long -register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) -{ - return address_mask(ctxt, reg); -} - -static inline void -register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) -{ - if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; - else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); -} - -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - register_address_increment(ctxt, &ctxt->_eip, rel); -} - -static u32 desc_limit_scaled(struct desc_struct *desc) -{ - u32 limit = get_desc_limit(desc); - - return desc->g ? 
(limit << 12) | 0xfff : limit; -} - -static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) -{ - ctxt->has_seg_override = true; - ctxt->seg_override = seg; -} - -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) -{ - if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) - return 0; - - return ctxt->ops->get_cached_segment_base(ctxt, seg); -} - -static unsigned seg_override(struct x86_emulate_ctxt *ctxt) -{ - if (!ctxt->has_seg_override) - return 0; - - return ctxt->seg_override; -} - -static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, - u32 error, bool valid) -{ - ctxt->exception.vector = vec; - ctxt->exception.error_code = error; - ctxt->exception.error_code_valid = valid; - return X86EMUL_PROPAGATE_FAULT; -} - -static int emulate_db(struct x86_emulate_ctxt *ctxt) -{ - return emulate_exception(ctxt, DB_VECTOR, 0, false); -} - -static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) -{ - return emulate_exception(ctxt, GP_VECTOR, err, true); -} - -static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) -{ - return emulate_exception(ctxt, SS_VECTOR, err, true); -} - -static int emulate_ud(struct x86_emulate_ctxt *ctxt) -{ - return emulate_exception(ctxt, UD_VECTOR, 0, false); -} - -static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err) -{ - return emulate_exception(ctxt, TS_VECTOR, err, true); -} - -static int emulate_de(struct x86_emulate_ctxt *ctxt) -{ - return emulate_exception(ctxt, DE_VECTOR, 0, false); -} - -static int emulate_nm(struct x86_emulate_ctxt *ctxt) -{ - return emulate_exception(ctxt, NM_VECTOR, 0, false); -} - -static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) -{ - u16 selector; - struct desc_struct desc; - - ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); - return selector; -} - -static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, - unsigned seg) -{ - u16 dummy; - u32 base3; - struct desc_struct desc; - - ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); - ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); -} - -static int __linearize(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - unsigned size, bool write, bool fetch, - ulong *linear) -{ - struct desc_struct desc; - bool usable; - ulong la; - u32 lim; - u16 sel; - unsigned cpl, rpl; - - la = seg_base(ctxt, addr.seg) + addr.ea; - switch (ctxt->mode) { - case X86EMUL_MODE_REAL: - break; - case X86EMUL_MODE_PROT64: - if (((signed long)la << 16) >> 16 != la) - return emulate_gp(ctxt, 0); - break; - default: - usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, - addr.seg); - if (!usable) - goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) - goto bad; - /* unreadable code segment */ - if (!fetch && (desc.type & 8) && !(desc.type & 2)) - goto bad; - lim = desc_limit_scaled(&desc); - if ((desc.type & 8) || !(desc.type & 4)) { - /* expand-up segment */ - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) - goto bad; - } else { - /* exapand-down segment */ - if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) - goto bad; - lim = desc.d ? 
0xffffffff : 0xffff; - if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) - goto bad; - } - cpl = ctxt->ops->cpl(ctxt); - rpl = sel & 3; - cpl = max(cpl, rpl); - if (!(desc.type & 8)) { - /* data segment */ - if (cpl > desc.dpl) - goto bad; - } else if ((desc.type & 8) && !(desc.type & 4)) { - /* nonconforming code segment */ - if (cpl != desc.dpl) - goto bad; - } else if ((desc.type & 8) && (desc.type & 4)) { - /* conforming code segment */ - if (cpl < desc.dpl) - goto bad; - } - break; - } - if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) - la &= (u32)-1; - *linear = la; - return X86EMUL_CONTINUE; -bad: - if (addr.seg == VCPU_SREG_SS) - return emulate_ss(ctxt, addr.seg); - else - return emulate_gp(ctxt, addr.seg); -} - -static int linearize(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - unsigned size, bool write, - ulong *linear) -{ - return __linearize(ctxt, addr, size, write, false, linear); -} - - -static int segmented_read_std(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - void *data, - unsigned size) -{ - int rc; - ulong linear; - - rc = linearize(ctxt, addr, size, false, &linear); - if (rc != X86EMUL_CONTINUE) - return rc; - return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); -} - -/* - * Fetch the next byte of the instruction being emulated which is pointed to - * by ctxt->_eip, then increment ctxt->_eip. - * - * Also prefetch the remaining bytes of the instruction without crossing page - * boundary if they are not in fetch_cache yet. - */ -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->fetch; - int rc; - int size, cur_size; - - if (ctxt->_eip == fc->end) { - unsigned long linear; - struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = ctxt->_eip }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(ctxt->_eip)); - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - fc->end += size; - } - *dest = fc->data[ctxt->_eip - fc->start]; - ctxt->_eip++; - return X86EMUL_CONTINUE; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *dest, unsigned size) -{ - int rc; - - /* x86 instructions are limited to 15 bytes. */ - if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) - return X86EMUL_UNHANDLEABLE; - while (size--) { - rc = do_insn_fetch_byte(ctxt, dest++); - if (rc != X86EMUL_CONTINUE) - return rc; - } - return X86EMUL_CONTINUE; -} - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _ctxt) \ -({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ - (_type)_x; \ -}) - -#define insn_fetch_arr(_arr, _size, _ctxt) \ -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ - if (rc != X86EMUL_CONTINUE) \ - goto done; \ -}) - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 
- */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = ®s[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = segmented_read_std(ctxt, addr, size, 2); - if (rc != X86EMUL_CONTINUE) - return rc; - addr.ea += 2; - rc = segmented_read_std(ctxt, addr, address, op_bytes); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void fetch_register_operand(struct operand *op) -{ - switch (op->bytes) { - case 1: - op->val = *(u8 *)op->addr.reg; - break; - case 2: - op->val = *(u16 *)op->addr.reg; - break; - case 4: - op->val = *(u32 *)op->addr.reg; - break; - case 8: - op->val = *(u64 *)op->addr.reg; - break; - } -} - -static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) -{ - ctxt->ops->get_fpu(ctxt); - switch (reg) { - case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; -#endif - default: BUG(); - } - ctxt->ops->put_fpu(ctxt); -} - -static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, - int reg) -{ - ctxt->ops->get_fpu(ctxt); - switch (reg) { - case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; - 
case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; -#endif - default: BUG(); - } - ctxt->ops->put_fpu(ctxt); -} - -static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) -{ - unsigned reg = ctxt->modrm_reg; - int highbyte_regs = ctxt->rex_prefix == 0; - - if (!(ctxt->d & ModRM)) - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); - - if (ctxt->d & Sse) { - op->type = OP_XMM; - op->bytes = 16; - op->addr.xmm = reg; - read_sse_reg(ctxt, &op->vec_val, reg); - return; - } - - op->type = OP_REG; - if (ctxt->d & ByteOp) { - op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); - op->bytes = 1; - } else { - op->addr.reg = decode_register(reg, ctxt->regs, 0); - op->bytes = ctxt->op_bytes; - } - fetch_register_operand(op); - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct operand *op) -{ - u8 sib; - int index_reg = 0, base_reg = 0, scale; - int rc = X86EMUL_CONTINUE; - ulong modrm_ea = 0; - - if (ctxt->rex_prefix) { - ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ - ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ - } - - ctxt->modrm = insn_fetch(u8, ctxt); - ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; - ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; - ctxt->modrm_rm |= (ctxt->modrm & 0x07); - ctxt->modrm_seg = VCPU_SREG_DS; - - if (ctxt->modrm_mod == 3) { - op->type = OP_REG; - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt->modrm_rm, - ctxt->regs, ctxt->d & ByteOp); - if (ctxt->d & Sse) { - op->type = OP_XMM; - op->bytes = 16; - op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); - return rc; - } - fetch_register_operand(op); - return rc; - } - - op->type = OP_MEM; - - if (ctxt->ad_bytes == 2) { - unsigned bx = ctxt->regs[VCPU_REGS_RBX]; - unsigned bp = ctxt->regs[VCPU_REGS_RBP]; - unsigned si = ctxt->regs[VCPU_REGS_RSI]; - unsigned di = ctxt->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode. */ - switch (ctxt->modrm_mod) { - case 0: - if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, ctxt); - break; - case 1: - modrm_ea += insn_fetch(s8, ctxt); - break; - case 2: - modrm_ea += insn_fetch(u16, ctxt); - break; - } - switch (ctxt->modrm_rm) { - case 0: - modrm_ea += bx + si; - break; - case 1: - modrm_ea += bx + di; - break; - case 2: - modrm_ea += bp + si; - break; - case 3: - modrm_ea += bp + di; - break; - case 4: - modrm_ea += si; - break; - case 5: - modrm_ea += di; - break; - case 6: - if (ctxt->modrm_mod != 0) - modrm_ea += bp; - break; - case 7: - modrm_ea += bx; - break; - } - if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 || - (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0)) - ctxt->modrm_seg = VCPU_SREG_SS; - modrm_ea = (u16)modrm_ea; - } else { - /* 32/64-bit ModR/M decode. 
*/ - if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, ctxt); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, ctxt); - else - modrm_ea += ctxt->regs[base_reg]; - if (index_reg != 4) - modrm_ea += ctxt->regs[index_reg] << scale; - } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { - if (ctxt->mode == X86EMUL_MODE_PROT64) - ctxt->rip_relative = 1; - } else - modrm_ea += ctxt->regs[ctxt->modrm_rm]; - switch (ctxt->modrm_mod) { - case 0: - if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, ctxt); - break; - case 1: - modrm_ea += insn_fetch(s8, ctxt); - break; - case 2: - modrm_ea += insn_fetch(s32, ctxt); - break; - } - } - op->addr.mem.ea = modrm_ea; -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct operand *op) -{ - int rc = X86EMUL_CONTINUE; - - op->type = OP_MEM; - switch (ctxt->ad_bytes) { - case 2: - op->addr.mem.ea = insn_fetch(u16, ctxt); - break; - case 4: - op->addr.mem.ea = insn_fetch(u32, ctxt); - break; - case 8: - op->addr.mem.ea = insn_fetch(u64, ctxt); - break; - } -done: - return rc; -} - -static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) -{ - long sv = 0, mask; - - if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { - mask = ~(ctxt->dst.bytes * 8 - 1); - - if (ctxt->src.bytes == 2) - sv = (s16)ctxt->src.val & (s16)mask; - else if (ctxt->src.bytes == 4) - sv = (s32)ctxt->src.val & (s32)mask; - - ctxt->dst.addr.mem.ea += (sv >> 3); - } - - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; -} - -static int read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *dest, unsigned size) -{ - int rc; - struct read_cache *mc = &ctxt->mem_read; - - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; - - rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n, - &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; - - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; - } - return X86EMUL_CONTINUE; -} - -static int segmented_read(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - void *data, - unsigned size) -{ - int rc; - ulong linear; - - rc = linearize(ctxt, addr, size, false, &linear); - if (rc != X86EMUL_CONTINUE) - return rc; - return read_emulated(ctxt, linear, data, size); -} - -static int segmented_write(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - const void *data, - unsigned size) -{ - int rc; - ulong linear; - - rc = linearize(ctxt, addr, size, true, &linear); - if (rc != X86EMUL_CONTINUE) - return rc; - return ctxt->ops->write_emulated(ctxt, linear, data, size, - &ctxt->exception); -} - -static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr, - const void *orig_data, const void *data, - unsigned size) -{ - int rc; - ulong linear; - - rc = linearize(ctxt, addr, size, true, &linear); - if (rc != X86EMUL_CONTINUE) - return rc; - return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, - size, &ctxt->exception); -} - -static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, - unsigned int size, unsigned short port, - void *dest) -{ - struct read_cache *rc = &ctxt->io_read; - - if (rc->pos == rc->end) { /* refill pio read ahead */ - unsigned int in_page, n; - unsigned int count = ctxt->rep_prefix ? 
- address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1; - in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(ctxt->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); - if (n == 0) - n = 1; - rc->pos = rc->end = 0; - if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n)) - return 0; - rc->end = n * size; - } - - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; - return 1; -} - -static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, - u16 index, struct desc_struct *desc) -{ - struct desc_ptr dt; - ulong addr; - - ctxt->ops->get_idt(ctxt, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, index << 3 | 0x2); - - addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, - &ctxt->exception); -} - -static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_ptr *dt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - - if (selector & 1 << 2) { - struct desc_struct desc; - u16 sel; - - memset (dt, 0, sizeof *dt); - if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) - return; - - dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); - } else - ops->get_gdt(ctxt, dt); -} - -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) -{ - struct desc_ptr dt; - u16 index = selector >> 3; - ulong addr; - - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); - - addr = dt.address + index * 8; - return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, - &ctxt->exception); -} - -/* allowed just for 8 bytes segments */ -static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) -{ - struct desc_ptr dt; - u16 index = selector >> 3; - ulong addr; - - get_descriptor_table_ptr(ctxt, selector, &dt); - - if (dt.size < index * 8 + 7) - return emulate_gp(ctxt, selector & 0xfffc); - - addr = dt.address + index * 8; - return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc, - &ctxt->exception); -} - -/* Does not support long mode */ -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg) -{ - struct desc_struct seg_desc; - u8 dpl, rpl, cpl; - unsigned err_vec = GP_VECTOR; - u32 err_code = 0; - bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ - int ret; - - memset(&seg_desc, 0, sizeof seg_desc); - - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ - set_desc_base(&seg_desc, selector << 4); - set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - if (ctxt->mode == X86EMUL_MODE_VM86) - seg_desc.dpl = 3; - goto load; - } - - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) - && null_selector) - goto exception; - - /* TR should be in GDT only */ - if (seg == VCPU_SREG_TR && (selector & (1 << 2))) - goto exception; - - if (null_selector) /* for NULL selector skip all following checks */ - goto load; - - ret = read_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - err_code = selector & 0xfffc; - err_vec = GP_VECTOR; - - /* can't load 
system descriptor into segment selecor */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) - goto exception; - - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; - goto exception; - } - - rpl = selector & 3; - dpl = seg_desc.dpl; - cpl = ctxt->ops->cpl(ctxt); - - switch (seg) { - case VCPU_SREG_SS: - /* - * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL - */ - if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) - goto exception; - break; - case VCPU_SREG_CS: - if (!(seg_desc.type & 8)) - goto exception; - - if (seg_desc.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) - goto exception; - } - /* CS(RPL) <- CPL */ - selector = (selector & 0xfffc) | cpl; - break; - case VCPU_SREG_TR: - if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) - goto exception; - break; - case VCPU_SREG_LDTR: - if (seg_desc.s || seg_desc.type != 2) - goto exception; - break; - default: /* DS, ES, FS, or GS */ - /* - * segment is not a data or readable code segment or - * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) - */ - if ((seg_desc.type & 0xa) == 0x8 || - (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) - goto exception; - break; - } - - if (seg_desc.s) { - /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - } -load: - ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); - return X86EMUL_CONTINUE; -exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; -} - -static void write_register_operand(struct operand *op) -{ - /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ - switch (op->bytes) { - case 1: - *(u8 *)op->addr.reg = (u8)op->val; - break; - case 2: - *(u16 *)op->addr.reg = (u16)op->val; - break; - case 4: - *op->addr.reg = (u32)op->val; - break; /* 64b: zero-extend */ - case 8: - *op->addr.reg = op->val; - break; - } -} - -static int writeback(struct x86_emulate_ctxt *ctxt) -{ - int rc; - - switch (ctxt->dst.type) { - case OP_REG: - write_register_operand(&ctxt->dst); - break; - case OP_MEM: - if (ctxt->lock_prefix) - rc = segmented_cmpxchg(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.orig_val, - &ctxt->dst.val, - ctxt->dst.bytes); - else - rc = segmented_write(ctxt, - ctxt->dst.addr.mem, - &ctxt->dst.val, - ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_XMM: - write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return X86EMUL_CONTINUE; -} - -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - struct segmented_address addr; - - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); - addr.seg = VCPU_SREG_SS; - - /* Disable writeback. 
*/ - ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); -} - -static int emulate_pop(struct x86_emulate_ctxt *ctxt, - void *dest, int len) -{ - int rc; - struct segmented_address addr; - - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); - addr.seg = VCPU_SREG_SS; - rc = segmented_read(ctxt, addr, dest, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); - return rc; -} - -static int em_pop(struct x86_emulate_ctxt *ctxt) -{ - return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); -} - -static int emulate_popf(struct x86_emulate_ctxt *ctxt, - void *dest, int len) -{ - int rc; - unsigned long val, change_mask; - int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ctxt->ops->cpl(ctxt); - - rc = emulate_pop(ctxt, &val, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; - - switch(ctxt->mode) { - case X86EMUL_MODE_PROT64: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT16: - if (cpl == 0) - change_mask |= EFLG_IOPL; - if (cpl <= iopl) - change_mask |= EFLG_IF; - break; - case X86EMUL_MODE_VM86: - if (iopl < 3) - return emulate_gp(ctxt, 0); - change_mask |= EFLG_IF; - break; - default: /* real mode */ - change_mask |= (EFLG_IOPL | EFLG_IF); - break; - } - - *(unsigned long *)dest = - (ctxt->eflags & ~change_mask) | (val & change_mask); - - return rc; -} - -static int em_popf(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->eflags; - ctxt->dst.bytes = ctxt->op_bytes; - return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); -} - -static int em_push_sreg(struct x86_emulate_ctxt *ctxt) -{ - int seg = ctxt->src2.val; - - ctxt->src.val = get_segment_selector(ctxt, seg); - - return em_push(ctxt); -} - -static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) -{ - int seg = ctxt->src2.val; - unsigned long selector; - int rc; - - rc = emulate_pop(ctxt, &selector, ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, (u16)selector, seg); - return rc; -} - -static int em_pusha(struct x86_emulate_ctxt *ctxt) -{ - unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP]; - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RAX; - - while (reg <= VCPU_REGS_RDI) { - (reg == VCPU_REGS_RSP) ? 
- (ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]); - - rc = em_push(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - ++reg; - } - - return rc; -} - -static int em_pushf(struct x86_emulate_ctxt *ctxt) -{ - ctxt->src.val = (unsigned long)ctxt->eflags; - return em_push(ctxt); -} - -static int em_popa(struct x86_emulate_ctxt *ctxt) -{ - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RDI; - - while (reg >= VCPU_REGS_RAX) { - if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); - --reg; - } - - rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - break; - --reg; - } - return rc; -} - -int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq) -{ - struct x86_emulate_ops *ops = ctxt->ops; - int rc; - struct desc_ptr dt; - gva_t cs_addr; - gva_t eip_addr; - u16 cs, eip; - - /* TODO: Add limit checks */ - ctxt->src.val = ctxt->eflags; - rc = em_push(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - - ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); - rc = em_push(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->src.val = ctxt->_eip; - rc = em_push(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - ops->get_idt(ctxt, &dt); - - eip_addr = dt.address + (irq << 2); - cs_addr = dt.address + (irq << 2) + 2; - - rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->_eip = eip; - - return rc; -} - -static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - return emulate_int_real(ctxt, irq); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* Protected mode interrupts unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - -static int emulate_iret_real(struct x86_emulate_ctxt *ctxt) -{ - int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | - EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | - EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ - unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - - /* TODO: Add stack limit check */ - - rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - if (temp_eip & ~0xffff) - return emulate_gp(ctxt, 0); - - rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); - - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->_eip = temp_eip; - - - if (ctxt->op_bytes == 4) - ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); - else if (ctxt->op_bytes == 2) { - ctxt->eflags &= ~0xffff; - ctxt->eflags |= temp_eflags; - } - - ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; - - return rc; -} - -static int em_iret(struct x86_emulate_ctxt *ctxt) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - 
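/*
 * Editor's sketch, not part of the deleted file (the emulate.c diff resumes
 * below): the real-mode IVT layout assumed by emulate_int_real() above.  Each
 * vector is four bytes, IP first and then CS, so vector N lives at
 * IDT base + 4 * N.  The helper names are invented for illustration.
 */
static unsigned long sketch_ivt_ip_addr(unsigned long idt_base, int irq)
{
	return idt_base + (irq << 2);		/* 16-bit IP */
}

static unsigned long sketch_ivt_cs_addr(unsigned long idt_base, int irq)
{
	return idt_base + (irq << 2) + 2;	/* 16-bit CS */
}
/* Example: INT 0x10 with a zero IDT base reads IP from 0x40 and CS from 0x42. */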
return emulate_iret_real(ctxt); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* iret from protected mode unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - -static int em_jmp_far(struct x86_emulate_ctxt *ctxt) -{ - int rc; - unsigned short sel; - - memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - - rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - return X86EMUL_CONTINUE; -} - -static int em_grp2(struct x86_emulate_ctxt *ctxt) -{ - switch (ctxt->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB(ctxt, "rol"); - break; - case 1: /* ror */ - emulate_2op_SrcB(ctxt, "ror"); - break; - case 2: /* rcl */ - emulate_2op_SrcB(ctxt, "rcl"); - break; - case 3: /* rcr */ - emulate_2op_SrcB(ctxt, "rcr"); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB(ctxt, "sal"); - break; - case 5: /* shr */ - emulate_2op_SrcB(ctxt, "shr"); - break; - case 7: /* sar */ - emulate_2op_SrcB(ctxt, "sar"); - break; - } - return X86EMUL_CONTINUE; -} - -static int em_not(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ~ctxt->dst.val; - return X86EMUL_CONTINUE; -} - -static int em_neg(struct x86_emulate_ctxt *ctxt) -{ - emulate_1op(ctxt, "neg"); - return X86EMUL_CONTINUE; -} - -static int em_mul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "mul", ex); - return X86EMUL_CONTINUE; -} - -static int em_imul_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 ex = 0; - - emulate_1op_rax_rdx(ctxt, "imul", ex); - return X86EMUL_CONTINUE; -} - -static int em_div_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "div", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) -{ - u8 de = 0; - - emulate_1op_rax_rdx(ctxt, "idiv", de); - if (de) - return emulate_de(ctxt); - return X86EMUL_CONTINUE; -} - -static int em_grp45(struct x86_emulate_ctxt *ctxt) -{ - int rc = X86EMUL_CONTINUE; - - switch (ctxt->modrm_reg) { - case 0: /* inc */ - emulate_1op(ctxt, "inc"); - break; - case 1: /* dec */ - emulate_1op(ctxt, "dec"); - break; - case 2: /* call near abs */ { - long int old_eip; - old_eip = ctxt->_eip; - ctxt->_eip = ctxt->src.val; - ctxt->src.val = old_eip; - rc = em_push(ctxt); - break; - } - case 4: /* jmp abs */ - ctxt->_eip = ctxt->src.val; - break; - case 5: /* jmp far */ - rc = em_jmp_far(ctxt); - break; - case 6: /* push */ - rc = em_push(ctxt); - break; - } - return rc; -} - -static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) -{ - u64 old = ctxt->dst.orig_val64; - - if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) { - ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - } else { - ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) | - (u32) ctxt->regs[VCPU_REGS_RBX]; - - ctxt->eflags |= EFLG_ZF; - } - return X86EMUL_CONTINUE; -} - -static int em_ret(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - return em_pop(ctxt); -} - -static int em_ret_far(struct x86_emulate_ctxt *ctxt) -{ - int rc; - unsigned long cs; - - rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - if 
(ctxt->op_bytes == 4) - ctxt->_eip = (u32)ctxt->_eip; - rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); - return rc; -} - -static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) -{ - /* Save real source value, then compare EAX against destination. */ - ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV(ctxt, "cmp"); - - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - ctxt->dst.val = ctxt->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; - } - return X86EMUL_CONTINUE; -} - -static int em_lseg(struct x86_emulate_ctxt *ctxt) -{ - int seg = ctxt->src2.val; - unsigned short sel; - int rc; - - memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - - rc = load_segment_descriptor(ctxt, sel, seg); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->dst.val = ctxt->src.val; - return rc; -} - -static void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct desc_struct *cs, struct desc_struct *ss) -{ - u16 selector; - - memset(cs, 0, sizeof(struct desc_struct)); - ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct desc_struct)); - - cs->l = 0; /* will be adjusted later */ - set_desc_base(cs, 0); /* flat segment */ - cs->g = 1; /* 4kb granularity */ - set_desc_limit(cs, 0xfffff); /* 4GB limit */ - cs->type = 0x0b; /* Read, Execute, Accessed */ - cs->s = 1; - cs->dpl = 0; /* will be adjusted later */ - cs->p = 1; - cs->d = 1; - - set_desc_base(ss, 0); /* flat segment */ - set_desc_limit(ss, 0xfffff); /* 4GB limit */ - ss->g = 1; /* 4kb granularity */ - ss->s = 1; - ss->type = 0x03; /* Read/Write, Accessed */ - ss->d = 1; /* 32bit stack segment */ - ss->dpl = 0; - ss->p = 1; -} - -static bool vendor_intel(struct x86_emulate_ctxt *ctxt) -{ - u32 eax, ebx, ecx, edx; - - eax = ecx = 0; - return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) - && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx - && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx - && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; -} - -static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - u32 eax, ebx, ecx, edx; - - /* - * syscall should always be enabled in longmode - so only become - * vendor specific (cpuid) if other modes are active... - */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return true; - - eax = 0x00000000; - ecx = 0x00000000; - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { - /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. 
- */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) - return false; - - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - } - - /* default: (not Intel, not AMD), apply Intel's stricter rules... */ - return false; -} - -static int em_syscall(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct desc_struct cs, ss; - u64 msr_data; - u16 cs_sel, ss_sel; - u64 efer = 0; - - /* syscall is not available in real mode */ - if (ctxt->mode == X86EMUL_MODE_REAL || - ctxt->mode == X86EMUL_MODE_VM86) - return emulate_ud(ctxt); - - if (!(em_syscall_is_enabled(ctxt))) - return emulate_ud(ctxt); - - ops->get_msr(ctxt, MSR_EFER, &efer); - setup_syscalls_segments(ctxt, &cs, &ss); - - if (!(efer & EFER_SCE)) - return emulate_ud(ctxt); - - ops->get_msr(ctxt, MSR_STAR, &msr_data); - msr_data >>= 32; - cs_sel = (u16)(msr_data & 0xfffc); - ss_sel = (u16)(msr_data + 8); - - if (efer & EFER_LMA) { - cs.d = 0; - cs.l = 1; - } - ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); - ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - - ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip; - if (efer & EFER_LMA) { -#ifdef CONFIG_X86_64 - ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - - ops->get_msr(ctxt, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); - ctxt->_eip = msr_data; - - ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); -#endif - } else { - /* legacy mode */ - ops->get_msr(ctxt, MSR_STAR, &msr_data); - ctxt->_eip = (u32)msr_data; - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - } - - return X86EMUL_CONTINUE; -} - -static int em_sysenter(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct desc_struct cs, ss; - u64 msr_data; - u16 cs_sel, ss_sel; - u64 efer = 0; - - ops->get_msr(ctxt, MSR_EFER, &efer); - /* inject #GP if in real mode */ - if (ctxt->mode == X86EMUL_MODE_REAL) - return emulate_gp(ctxt, 0); - - /* - * Not recognized on AMD in compat mode (but is recognized in legacy - * mode). - */ - if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) - && !vendor_intel(ctxt)) - return emulate_ud(ctxt); - - /* XXX sysenter/sysexit have not been tested in 64bit mode. - * Therefore, we inject an #UD. 
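/*
 * Editor's sketch, not part of the deleted file (the emulate.c diff resumes
 * below): how em_syscall() above derives its selectors from MSR_STAR.  Bits
 * 47:32 of STAR hold the SYSCALL CS selector with the RPL bits cleared, and
 * SS is that value plus 8.  The helper name is invented for illustration.
 */
static void sketch_star_to_selectors(unsigned long long star,
				     unsigned short *cs_sel,
				     unsigned short *ss_sel)
{
	unsigned int base = (unsigned int)(star >> 32);

	*cs_sel = (unsigned short)(base & 0xfffc);
	*ss_sel = (unsigned short)(base + 8);
}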
- */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return emulate_ud(ctxt); - - setup_syscalls_segments(ctxt, &cs, &ss); - - ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); - switch (ctxt->mode) { - case X86EMUL_MODE_PROT32: - if ((msr_data & 0xfffc) == 0x0) - return emulate_gp(ctxt, 0); - break; - case X86EMUL_MODE_PROT64: - if (msr_data == 0x0) - return emulate_gp(ctxt, 0); - break; - } - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs_sel = (u16)msr_data; - cs_sel &= ~SELECTOR_RPL_MASK; - ss_sel = cs_sel + 8; - ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { - cs.d = 0; - cs.l = 1; - } - - ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); - ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - - ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); - ctxt->_eip = msr_data; - - ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); - ctxt->regs[VCPU_REGS_RSP] = msr_data; - - return X86EMUL_CONTINUE; -} - -static int em_sysexit(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct desc_struct cs, ss; - u64 msr_data; - int usermode; - u16 cs_sel = 0, ss_sel = 0; - - /* inject #GP if in real mode or Virtual 8086 mode */ - if (ctxt->mode == X86EMUL_MODE_REAL || - ctxt->mode == X86EMUL_MODE_VM86) - return emulate_gp(ctxt, 0); - - setup_syscalls_segments(ctxt, &cs, &ss); - - if ((ctxt->rex_prefix & 0x8) != 0x0) - usermode = X86EMUL_MODE_PROT64; - else - usermode = X86EMUL_MODE_PROT32; - - cs.dpl = 3; - ss.dpl = 3; - ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); - switch (usermode) { - case X86EMUL_MODE_PROT32: - cs_sel = (u16)(msr_data + 16); - if ((msr_data & 0xfffc) == 0x0) - return emulate_gp(ctxt, 0); - ss_sel = (u16)(msr_data + 24); - break; - case X86EMUL_MODE_PROT64: - cs_sel = (u16)(msr_data + 32); - if (msr_data == 0x0) - return emulate_gp(ctxt, 0); - ss_sel = cs_sel + 8; - cs.d = 0; - cs.l = 1; - break; - } - cs_sel |= SELECTOR_RPL_MASK; - ss_sel |= SELECTOR_RPL_MASK; - - ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); - ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - - ctxt->_eip = ctxt->regs[VCPU_REGS_RDX]; - ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX]; - - return X86EMUL_CONTINUE; -} - -static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) -{ - int iopl; - if (ctxt->mode == X86EMUL_MODE_REAL) - return false; - if (ctxt->mode == X86EMUL_MODE_VM86) - return true; - iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ctxt->ops->cpl(ctxt) > iopl; -} - -static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, - u16 port, u16 len) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct desc_struct tr_seg; - u32 base3; - int r; - u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; - unsigned mask = (1 << len) - 1; - unsigned long base; - - ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); - if (!tr_seg.p) - return false; - if (desc_limit_scaled(&tr_seg) < 103) - return false; - base = get_desc_base(&tr_seg); -#ifdef CONFIG_X86_64 - base |= ((u64)base3) << 32; -#endif - r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); - if (r != X86EMUL_CONTINUE) - return false; - if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) - return false; - r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); - if (r != X86EMUL_CONTINUE) - return false; - if ((perm >> bit_idx) & mask) - return false; - return true; -} - -static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, - u16 port, u16 len) -{ - if 
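/*
 * Editor's sketch, not part of the deleted file (the emulate.c diff resumes
 * below): the TSS I/O-bitmap test made by emulator_io_port_access_allowed()
 * above.  Two bitmap bytes are read so an access may straddle a byte
 * boundary; an access of 'len' bytes at 'port' is permitted only when every
 * corresponding bit is clear.  Names are invented for illustration.
 */
static int sketch_io_bitmap_allows(const unsigned char *bitmap,
				   unsigned short port, unsigned short len)
{
	unsigned int perm = bitmap[port / 8] | (bitmap[port / 8 + 1] << 8);
	unsigned int mask = (1u << len) - 1;

	return !((perm >> (port & 7)) & mask);
}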
(ctxt->perm_ok) - return true; - - if (emulator_bad_iopl(ctxt)) - if (!emulator_io_port_access_allowed(ctxt, port, len)) - return false; - - ctxt->perm_ok = true; - - return true; -} - -static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, - struct tss_segment_16 *tss) -{ - tss->ip = ctxt->_eip; - tss->flag = ctxt->eflags; - tss->ax = ctxt->regs[VCPU_REGS_RAX]; - tss->cx = ctxt->regs[VCPU_REGS_RCX]; - tss->dx = ctxt->regs[VCPU_REGS_RDX]; - tss->bx = ctxt->regs[VCPU_REGS_RBX]; - tss->sp = ctxt->regs[VCPU_REGS_RSP]; - tss->bp = ctxt->regs[VCPU_REGS_RBP]; - tss->si = ctxt->regs[VCPU_REGS_RSI]; - tss->di = ctxt->regs[VCPU_REGS_RDI]; - - tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); - tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); - tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); - tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); - tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); -} - -static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, - struct tss_segment_16 *tss) -{ - int ret; - - ctxt->_eip = tss->ip; - ctxt->eflags = tss->flag | 2; - ctxt->regs[VCPU_REGS_RAX] = tss->ax; - ctxt->regs[VCPU_REGS_RCX] = tss->cx; - ctxt->regs[VCPU_REGS_RDX] = tss->dx; - ctxt->regs[VCPU_REGS_RBX] = tss->bx; - ctxt->regs[VCPU_REGS_RSP] = tss->sp; - ctxt->regs[VCPU_REGS_RBP] = tss->bp; - ctxt->regs[VCPU_REGS_RSI] = tss->si; - ctxt->regs[VCPU_REGS_RDI] = tss->di; - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); - set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); - set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); - set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); - set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); - - /* - * Now load segment descriptors. 
If fault happenes at this stage - * it is handled in a context of new task - */ - ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); - if (ret != X86EMUL_CONTINUE) - return ret; - - return X86EMUL_CONTINUE; -} - -static int task_switch_16(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, - ulong old_tss_base, struct desc_struct *new_desc) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct tss_segment_16 tss_seg; - int ret; - u32 new_tss_base = get_desc_base(new_desc); - - ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - save_state_to_tss16(ctxt, &tss_seg); - - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - if (old_tss_sel != 0xffff) { - tss_seg.prev_task_link = old_tss_sel; - - ret = ops->write_std(ctxt, new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - } - - return load_state_from_tss16(ctxt, &tss_seg); -} - -static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, - struct tss_segment_32 *tss) -{ - tss->cr3 = ctxt->ops->get_cr(ctxt, 3); - tss->eip = ctxt->_eip; - tss->eflags = ctxt->eflags; - tss->eax = ctxt->regs[VCPU_REGS_RAX]; - tss->ecx = ctxt->regs[VCPU_REGS_RCX]; - tss->edx = ctxt->regs[VCPU_REGS_RDX]; - tss->ebx = ctxt->regs[VCPU_REGS_RBX]; - tss->esp = ctxt->regs[VCPU_REGS_RSP]; - tss->ebp = ctxt->regs[VCPU_REGS_RBP]; - tss->esi = ctxt->regs[VCPU_REGS_RSI]; - tss->edi = ctxt->regs[VCPU_REGS_RDI]; - - tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); - tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); - tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); - tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); - tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); - tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); -} - -static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, - struct tss_segment_32 *tss) -{ - int ret; - - if (ctxt->ops->set_cr(ctxt, 3, tss->cr3)) - return emulate_gp(ctxt, 0); - ctxt->_eip = tss->eip; - ctxt->eflags = tss->eflags | 2; - - /* General purpose registers */ - ctxt->regs[VCPU_REGS_RAX] = tss->eax; - ctxt->regs[VCPU_REGS_RCX] = tss->ecx; - ctxt->regs[VCPU_REGS_RDX] = tss->edx; - ctxt->regs[VCPU_REGS_RBX] = tss->ebx; - ctxt->regs[VCPU_REGS_RSP] = tss->esp; - ctxt->regs[VCPU_REGS_RBP] = tss->ebp; - ctxt->regs[VCPU_REGS_RSI] = tss->esi; - ctxt->regs[VCPU_REGS_RDI] = tss->edi; - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - set_segment_selector(ctxt, tss->ldt_selector, 
VCPU_SREG_LDTR); - set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); - set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); - set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); - set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); - set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); - set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); - - /* - * If we're switching between Protected Mode and VM86, we need to make - * sure to update the mode before loading the segment descriptors so - * that the selectors are interpreted correctly. - * - * Need to get rflags to the vcpu struct immediately because it - * influences the CPL which is checked at least when loading the segment - * descriptors and when pushing an error code to the new kernel stack. - * - * TODO Introduce a separate ctxt->ops->set_cpl callback - */ - if (ctxt->eflags & X86_EFLAGS_VM) - ctxt->mode = X86EMUL_MODE_VM86; - else - ctxt->mode = X86EMUL_MODE_PROT32; - - ctxt->ops->set_rflags(ctxt, ctxt->eflags); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS); - if (ret != X86EMUL_CONTINUE) - return ret; - - return X86EMUL_CONTINUE; -} - -static int task_switch_32(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, u16 old_tss_sel, - ulong old_tss_base, struct desc_struct *new_desc) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct tss_segment_32 tss_seg; - int ret; - u32 new_tss_base = get_desc_base(new_desc); - - ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - save_state_to_tss32(ctxt, &tss_seg); - - ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - - if (old_tss_sel != 0xffff) { - tss_seg.prev_task_link = old_tss_sel; - - ret = ops->write_std(ctxt, new_tss_base, - &tss_seg.prev_task_link, - sizeof tss_seg.prev_task_link, - &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - /* FIXME: need to provide precise fault address */ - return ret; - } - - return load_state_from_tss32(ctxt, &tss_seg); -} - -static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int idt_index, int reason, - bool has_error_code, u32 error_code) -{ - struct x86_emulate_ops *ops = ctxt->ops; - struct desc_struct curr_tss_desc, next_tss_desc; - int ret; - u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); - ulong old_tss_base = - ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); - u32 
desc_limit; - - /* FIXME: old_tss_base == ~0 ? */ - - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - /* FIXME: check that next_tss_desc is tss */ - - /* - * Check privileges. The three cases are task switch caused by... - * - * 1. jmp/call/int to task gate: Check against DPL of the task gate - * 2. Exception/IRQ/iret: No check is performed - * 3. jmp/call to TSS: Check agains DPL of the TSS - */ - if (reason == TASK_SWITCH_GATE) { - if (idt_index != -1) { - /* Software interrupts */ - struct desc_struct task_gate_desc; - int dpl; - - ret = read_interrupt_descriptor(ctxt, idt_index, - &task_gate_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - dpl = task_gate_desc.dpl; - if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) - return emulate_gp(ctxt, (idt_index << 3) | 0x2); - } - } else if (reason != TASK_SWITCH_IRET) { - int dpl = next_tss_desc.dpl; - if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) - return emulate_gp(ctxt, tss_selector); - } - - - desc_limit = desc_limit_scaled(&next_tss_desc); - if (!next_tss_desc.p || - ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || - desc_limit < 0x2b)) { - emulate_ts(ctxt, tss_selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ - write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); - } - - if (reason == TASK_SWITCH_IRET) - ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; - - /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - - if (next_tss_desc.type & 8) - ret = task_switch_32(ctxt, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); - else - ret = task_switch_16(ctxt, tss_selector, old_tss_sel, - old_tss_base, &next_tss_desc); - if (ret != X86EMUL_CONTINUE) - return ret; - - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) - ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; - - if (reason != TASK_SWITCH_IRET) { - next_tss_desc.type |= (1 << 1); /* set busy flag */ - write_segment_descriptor(ctxt, tss_selector, &next_tss_desc); - } - - ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); - ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); - - if (has_error_code) { - ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; - ctxt->lock_prefix = 0; - ctxt->src.val = (unsigned long) error_code; - ret = em_push(ctxt); - } - - return ret; -} - -int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int idt_index, int reason, - bool has_error_code, u32 error_code) -{ - int rc; - - ctxt->_eip = ctxt->eip; - ctxt->dst.type = OP_NONE; - - rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, - has_error_code, error_code); - - if (rc == X86EMUL_CONTINUE) - ctxt->eip = ctxt->_eip; - - return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; -} - -static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, - int reg, struct operand *op) -{ - int df = (ctxt->eflags & EFLG_DF) ? 
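/*
 * Editor's sketch, not part of the deleted file (the emulate.c diff resumes
 * below): the limit check applied to the incoming TSS in
 * emulator_do_task_switch() above.  A 32-bit TSS must span at least 0x67
 * bytes and a 16-bit TSS at least 0x2b, otherwise the switch raises #TS with
 * the selector as the error code.  The helper name is invented.
 */
static int sketch_tss_limit_ok(unsigned int desc_limit, int tss_is_32bit)
{
	return desc_limit >= (tss_is_32bit ? 0x67u : 0x2bu);
}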
-1 : 1; - - register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes); - op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]); - op->addr.mem.seg = seg; -} - -static int em_das(struct x86_emulate_ctxt *ctxt) -{ - u8 al, old_al; - bool af, cf, old_cf; - - cf = ctxt->eflags & X86_EFLAGS_CF; - al = ctxt->dst.val; - - old_al = al; - old_cf = cf; - cf = false; - af = ctxt->eflags & X86_EFLAGS_AF; - if ((al & 0x0f) > 9 || af) { - al -= 6; - cf = old_cf | (al >= 250); - af = true; - } else { - af = false; - } - if (old_al > 0x99 || old_cf) { - al -= 0x60; - cf = true; - } - - ctxt->dst.val = al; - /* Set PF, ZF, SF */ - ctxt->src.type = OP_IMM; - ctxt->src.val = 0; - ctxt->src.bytes = 1; - emulate_2op_SrcV(ctxt, "or"); - ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); - if (cf) - ctxt->eflags |= X86_EFLAGS_CF; - if (af) - ctxt->eflags |= X86_EFLAGS_AF; - return X86EMUL_CONTINUE; -} - -static int em_call(struct x86_emulate_ctxt *ctxt) -{ - long rel = ctxt->src.val; - - ctxt->src.val = (unsigned long)ctxt->_eip; - jmp_rel(ctxt, rel); - return em_push(ctxt); -} - -static int em_call_far(struct x86_emulate_ctxt *ctxt) -{ - u16 sel, old_cs; - ulong old_eip; - int rc; - - old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); - old_eip = ctxt->_eip; - - memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) - return X86EMUL_CONTINUE; - - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - - ctxt->src.val = old_cs; - rc = em_push(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - ctxt->src.val = old_eip; - return em_push(ctxt); -} - -static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) -{ - int rc; - - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); - return X86EMUL_CONTINUE; -} - -static int em_add(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "add"); - return X86EMUL_CONTINUE; -} - -static int em_or(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "or"); - return X86EMUL_CONTINUE; -} - -static int em_adc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "adc"); - return X86EMUL_CONTINUE; -} - -static int em_sbb(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sbb"); - return X86EMUL_CONTINUE; -} - -static int em_and(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "and"); - return X86EMUL_CONTINUE; -} - -static int em_sub(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sub"); - return X86EMUL_CONTINUE; -} - -static int em_xor(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "xor"); - return X86EMUL_CONTINUE; -} - -static int em_cmp(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "cmp"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_test(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_xchg(struct x86_emulate_ctxt *ctxt) -{ - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.val; - write_register_operand(&ctxt->src); - - /* Write back the memory destination with implicit LOCK prefix. 
*/ - ctxt->dst.val = ctxt->src.orig_val; - ctxt->lock_prefix = 1; - return X86EMUL_CONTINUE; -} - -static int em_imul(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "imul"); - return X86EMUL_CONTINUE; -} - -static int em_imul_3op(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ctxt->src2.val; - return em_imul(ctxt); -} - -static int em_cwd(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = ctxt->src.bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1); - - return X86EMUL_CONTINUE; -} - -static int em_rdtsc(struct x86_emulate_ctxt *ctxt) -{ - u64 tsc = 0; - - ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); - ctxt->regs[VCPU_REGS_RAX] = (u32)tsc; - ctxt->regs[VCPU_REGS_RDX] = tsc >> 32; - return X86EMUL_CONTINUE; -} - -static int em_rdpmc(struct x86_emulate_ctxt *ctxt) -{ - u64 pmc; - - if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) - return emulate_gp(ctxt, 0); - ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; - ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; - return X86EMUL_CONTINUE; -} - -static int em_mov(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ctxt->src.val; - return X86EMUL_CONTINUE; -} - -static int em_cr_write(struct x86_emulate_ctxt *ctxt) -{ - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) - return emulate_gp(ctxt, 0); - - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_dr_write(struct x86_emulate_ctxt *ctxt) -{ - unsigned long val; - - if (ctxt->mode == X86EMUL_MODE_PROT64) - val = ctxt->src.val & ~0ULL; - else - val = ctxt->src.val & ~0U; - - /* #UD condition is already handled. */ - if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0) - return emulate_gp(ctxt, 0); - - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_wrmsr(struct x86_emulate_ctxt *ctxt) -{ - u64 msr_data; - - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) - return emulate_gp(ctxt, 0); - - return X86EMUL_CONTINUE; -} - -static int em_rdmsr(struct x86_emulate_ctxt *ctxt) -{ - u64 msr_data; - - if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) - return emulate_gp(ctxt, 0); - - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; - return X86EMUL_CONTINUE; -} - -static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) -{ - if (ctxt->modrm_reg > VCPU_SREG_GS) - return emulate_ud(ctxt); - - ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); - return X86EMUL_CONTINUE; -} - -static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) -{ - u16 sel = ctxt->src.val; - - if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS) - return emulate_ud(ctxt); - - if (ctxt->modrm_reg == VCPU_SREG_SS) - ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; - - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); -} - -static int em_movdqu(struct x86_emulate_ctxt *ctxt) -{ - memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); - return X86EMUL_CONTINUE; -} - -static int em_invlpg(struct x86_emulate_ctxt *ctxt) -{ - int rc; - ulong linear; - - rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear); - if (rc == X86EMUL_CONTINUE) - ctxt->ops->invlpg(ctxt, linear); - /* Disable writeback. 
*/ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_clts(struct x86_emulate_ctxt *ctxt) -{ - ulong cr0; - - cr0 = ctxt->ops->get_cr(ctxt, 0); - cr0 &= ~X86_CR0_TS; - ctxt->ops->set_cr(ctxt, 0, cr0); - return X86EMUL_CONTINUE; -} - -static int em_vmcall(struct x86_emulate_ctxt *ctxt) -{ - int rc; - - if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1) - return X86EMUL_UNHANDLEABLE; - - rc = ctxt->ops->fix_hypercall(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* Let the processor re-execute the fixed hypercall */ - ctxt->_eip = ctxt->eip; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_lgdt(struct x86_emulate_ctxt *ctxt) -{ - struct desc_ptr desc_ptr; - int rc; - - rc = read_descriptor(ctxt, ctxt->src.addr.mem, - &desc_ptr.size, &desc_ptr.address, - ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - ctxt->ops->set_gdt(ctxt, &desc_ptr); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_vmmcall(struct x86_emulate_ctxt *ctxt) -{ - int rc; - - rc = ctxt->ops->fix_hypercall(ctxt); - - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return rc; -} - -static int em_lidt(struct x86_emulate_ctxt *ctxt) -{ - struct desc_ptr desc_ptr; - int rc; - - rc = read_descriptor(ctxt, ctxt->src.addr.mem, - &desc_ptr.size, &desc_ptr.address, - ctxt->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - ctxt->ops->set_idt(ctxt, &desc_ptr); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_smsw(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.bytes = 2; - ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); - return X86EMUL_CONTINUE; -} - -static int em_lmsw(struct x86_emulate_ctxt *ctxt) -{ - ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) - | (ctxt->src.val & 0x0f)); - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_loop(struct x86_emulate_ctxt *ctxt) -{ - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); - if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) && - (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) - jmp_rel(ctxt, ctxt->src.val); - - return X86EMUL_CONTINUE; -} - -static int em_jcxz(struct x86_emulate_ctxt *ctxt) -{ - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) - jmp_rel(ctxt, ctxt->src.val); - - return X86EMUL_CONTINUE; -} - -static int em_in(struct x86_emulate_ctxt *ctxt) -{ - if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, - &ctxt->dst.val)) - return X86EMUL_IO_NEEDED; - - return X86EMUL_CONTINUE; -} - -static int em_out(struct x86_emulate_ctxt *ctxt) -{ - ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, - &ctxt->src.val, 1); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_cli(struct x86_emulate_ctxt *ctxt) -{ - if (emulator_bad_iopl(ctxt)) - return emulate_gp(ctxt, 0); - - ctxt->eflags &= ~X86_EFLAGS_IF; - return X86EMUL_CONTINUE; -} - -static int em_sti(struct x86_emulate_ctxt *ctxt) -{ - if (emulator_bad_iopl(ctxt)) - return emulate_gp(ctxt, 0); - - ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; - ctxt->eflags |= X86_EFLAGS_IF; - return X86EMUL_CONTINUE; -} - -static int em_bt(struct x86_emulate_ctxt *ctxt) -{ - /* Disable writeback. 
*/ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - - emulate_2op_SrcV_nobyte(ctxt, "bt"); - return X86EMUL_CONTINUE; -} - -static int em_bts(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bts"); - return X86EMUL_CONTINUE; -} - -static int em_btr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - return X86EMUL_CONTINUE; -} - -static int em_btc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btc"); - return X86EMUL_CONTINUE; -} - -static int em_bsf(struct x86_emulate_ctxt *ctxt) -{ - u8 zf; - - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } - return X86EMUL_CONTINUE; -} - -static int em_bsr(struct x86_emulate_ctxt *ctxt) -{ - u8 zf; - - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } - return X86EMUL_CONTINUE; -} - -static bool valid_cr(int nr) -{ - switch (nr) { - case 0: - case 2 ... 4: - case 8: - return true; - default: - return false; - } -} - -static int check_cr_read(struct x86_emulate_ctxt *ctxt) -{ - if (!valid_cr(ctxt->modrm_reg)) - return emulate_ud(ctxt); - - return X86EMUL_CONTINUE; -} - -static int check_cr_write(struct x86_emulate_ctxt *ctxt) -{ - u64 new_val = ctxt->src.val64; - int cr = ctxt->modrm_reg; - u64 efer = 0; - - static u64 cr_reserved_bits[] = { - 0xffffffff00000000ULL, - 0, 0, 0, /* CR3 checked later */ - CR4_RESERVED_BITS, - 0, 0, 0, - CR8_RESERVED_BITS, - }; - - if (!valid_cr(cr)) - return emulate_ud(ctxt); - - if (new_val & cr_reserved_bits[cr]) - return emulate_gp(ctxt, 0); - - switch (cr) { - case 0: { - u64 cr4; - if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || - ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) - return emulate_gp(ctxt, 0); - - cr4 = ctxt->ops->get_cr(ctxt, 4); - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - - if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && - !(cr4 & X86_CR4_PAE)) - return emulate_gp(ctxt, 0); - - break; - } - case 3: { - u64 rsvd = 0; - - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - rsvd = CR3_L_MODE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) - rsvd = CR3_PAE_RESERVED_BITS; - else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) - rsvd = CR3_NONPAE_RESERVED_BITS; - - if (new_val & rsvd) - return emulate_gp(ctxt, 0); - - break; - } - case 4: { - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - - if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) - return emulate_gp(ctxt, 0); - - break; - } - } - - return X86EMUL_CONTINUE; -} - -static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) -{ - unsigned long dr7; - - ctxt->ops->get_dr(ctxt, 7, &dr7); - - /* Check if DR7.Global_Enable is set */ - return dr7 & (1 << 13); -} - -static int check_dr_read(struct x86_emulate_ctxt *ctxt) -{ - int dr = ctxt->modrm_reg; - u64 cr4; - - if (dr > 7) - return emulate_ud(ctxt); - - cr4 = ctxt->ops->get_cr(ctxt, 4); - if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) - return emulate_ud(ctxt); - - if (check_dr7_gd(ctxt)) - return emulate_db(ctxt); - - return X86EMUL_CONTINUE; -} - -static int check_dr_write(struct x86_emulate_ctxt *ctxt) -{ - u64 new_val = ctxt->src.val64; - int dr = ctxt->modrm_reg; - - if ((dr == 
6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) - return emulate_gp(ctxt, 0); - - return check_dr_read(ctxt); -} - -static int check_svme(struct x86_emulate_ctxt *ctxt) -{ - u64 efer; - - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - - if (!(efer & EFER_SVME)) - return emulate_ud(ctxt); - - return X86EMUL_CONTINUE; -} - -static int check_svme_pa(struct x86_emulate_ctxt *ctxt) -{ - u64 rax = ctxt->regs[VCPU_REGS_RAX]; - - /* Valid physical address? */ - if (rax & 0xffff000000000000ULL) - return emulate_gp(ctxt, 0); - - return check_svme(ctxt); -} - -static int check_rdtsc(struct x86_emulate_ctxt *ctxt) -{ - u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - - if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) - return emulate_ud(ctxt); - - return X86EMUL_CONTINUE; -} - -static int check_rdpmc(struct x86_emulate_ctxt *ctxt) -{ - u64 cr4 = ctxt->ops->get_cr(ctxt, 4); - u64 rcx = ctxt->regs[VCPU_REGS_RCX]; - - if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || - (rcx > 3)) - return emulate_gp(ctxt, 0); - - return X86EMUL_CONTINUE; -} - -static int check_perm_in(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes)) - return emulate_gp(ctxt, 0); - - return X86EMUL_CONTINUE; -} - -static int check_perm_out(struct x86_emulate_ctxt *ctxt) -{ - ctxt->src.bytes = min(ctxt->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes)) - return emulate_gp(ctxt, 0); - - return X86EMUL_CONTINUE; -} - -#define D(_y) { .flags = (_y) } -#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } -#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } -#define N D(0) -#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } -#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } -#define II(_f, _e, _i) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } -#define IIP(_f, _e, _i, _p) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } -#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } - -#define D2bv(_f) D((_f) | ByteOp), D(_f) -#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) -#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) -#define I2bvIP(_f, _e, _i, _p) \ - IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) - -#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ - I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ - I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) - -static struct opcode group7_rm1[] = { - DI(SrcNone | ModRM | Priv, monitor), - DI(SrcNone | ModRM | Priv, mwait), - N, N, N, N, N, N, -}; - -static struct opcode group7_rm3[] = { - DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), - DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), -}; - -static struct opcode group7_rm7[] = { - N, - DIP(SrcNone | ModRM, rdtscp, check_rdtsc), - N, N, N, N, N, N, -}; - -static struct opcode group1[] = { - I(Lock, em_add), - I(Lock 
| PageTable, em_or), - I(Lock, em_adc), - I(Lock, em_sbb), - I(Lock | PageTable, em_and), - I(Lock, em_sub), - I(Lock, em_xor), - I(0, em_cmp), -}; - -static struct opcode group1A[] = { - I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, -}; - -static struct opcode group3[] = { - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcNone | ModRM | Lock, em_not), - I(DstMem | SrcNone | ModRM | Lock, em_neg), - I(SrcMem | ModRM, em_mul_ex), - I(SrcMem | ModRM, em_imul_ex), - I(SrcMem | ModRM, em_div_ex), - I(SrcMem | ModRM, em_idiv_ex), -}; - -static struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - N, N, N, N, N, N, -}; - -static struct opcode group5[] = { - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), N, -}; - -static struct opcode group6[] = { - DI(ModRM | Prot, sldt), - DI(ModRM | Prot, str), - DI(ModRM | Prot | Priv, lldt), - DI(ModRM | Prot | Priv, ltr), - N, N, N, N, -}; - -static struct group_dual group7 = { { - DI(ModRM | Mov | DstMem | Priv, sgdt), - DI(ModRM | Mov | DstMem | Priv, sidt), - II(ModRM | SrcMem | Priv, em_lgdt, lgdt), - II(ModRM | SrcMem | Priv, em_lidt, lidt), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), - II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), -}, { - I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), - EXT(0, group7_rm1), - N, EXT(0, group7_rm3), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), -} }; - -static struct opcode group8[] = { - N, N, N, N, - I(DstMem | SrcImmByte | ModRM, em_bt), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | ModRM | Lock, em_btr), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), -}; - -static struct group_dual group9 = { { - N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, -}, { - N, N, N, N, N, N, N, N, -} }; - -static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), - X7(D(Undefined)), -}; - -static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse, em_movdqu), -}; - -static struct opcode opcode_table[256] = { - /* 0x00 - 0x07 */ - I6ALU(Lock, em_add), - I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), - I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), - /* 0x08 - 0x0F */ - I6ALU(Lock | PageTable, em_or), - I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), - N, - /* 0x10 - 0x17 */ - I6ALU(Lock, em_adc), - I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), - I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), - /* 0x18 - 0x1F */ - I6ALU(Lock, em_sbb), - I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), - I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), - /* 0x20 - 0x27 */ - I6ALU(Lock | PageTable, em_and), N, N, - /* 0x28 - 0x2F */ - I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), - /* 0x30 - 0x37 */ - I6ALU(Lock, em_xor), N, N, - /* 0x38 - 0x3F */ - I6ALU(0, em_cmp), N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg)), - /* 0x50 - 0x57 */ - X8(I(SrcReg | Stack, em_push)), - /* 0x58 - 0x5F */ - 
X8(I(DstReg | Stack, em_pop)), - /* 0x60 - 0x67 */ - I(ImplicitOps | Stack | No64, em_pusha), - I(ImplicitOps | Stack | No64, em_popa), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , - N, N, N, N, - /* 0x68 - 0x6F */ - I(SrcImm | Mov | Stack, em_push), - I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), - I(SrcImmByte | Mov | Stack, em_push), - I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ - I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ - /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), - /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), - I2bv(DstMem | SrcReg | ModRM, em_test), - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), - /* 0x88 - 0x8F */ - I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), - I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), - I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), - D(ModRM | SrcMem | NoAccess | DstReg), - I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), - G(0, group1A), - /* 0x90 - 0x97 */ - DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), - /* 0x98 - 0x9F */ - D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64, em_call_far), N, - II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, N, - /* 0xA0 - 0xA7 */ - I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), - I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstDI | String, em_cmp), - /* 0xA8 - 0xAF */ - I2bv(DstAcc | SrcImm, em_test), - I2bv(SrcAcc | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstAcc | Mov | String, em_mov), - I2bv(SrcAcc | DstDI | String, em_cmp), - /* 0xB0 - 0xB7 */ - X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), - /* 0xB8 - 0xBF */ - X8(I(DstReg | SrcImm | Mov, em_mov)), - /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcImmByte | ModRM), - I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), - I(ImplicitOps | Stack, em_ret), - I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), - I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), - G(ByteOp, group11), G(0, group11), - /* 0xC8 - 0xCF */ - N, N, N, I(ImplicitOps | Stack, em_ret_far), - D(ImplicitOps), DI(SrcImmByte, intn), - D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), - /* 0xD0 - 0xD7 */ - D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), - N, N, N, N, - /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, - /* 0xE0 - 0xE7 */ - X3(I(SrcImmByte, em_loop)), - I(SrcImmByte, em_jcxz), - I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), - I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), - /* 0xE8 - 0xEF */ - I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), - I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), - I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), - I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), - /* 0xF0 - 0xF7 */ - N, DI(ImplicitOps, icebp), N, N, - DI(ImplicitOps | Priv, hlt), D(ImplicitOps), - G(ByteOp, group3), G(0, group3), - /* 0xF8 - 0xFF */ - D(ImplicitOps), D(ImplicitOps), - I(ImplicitOps, em_cli), I(ImplicitOps, em_sti), - D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), -}; - -static struct opcode twobyte_table[256] = { - /* 0x00 - 0x0F */ - G(0, group6), GD(0, &group7), N, N, - N, 
I(ImplicitOps | VendorSpecific, em_syscall), - II(ImplicitOps | Priv, em_clts, clts), N, - DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, - N, D(ImplicitOps | ModRM), N, N, - /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, - /* 0x20 - 0x2F */ - DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), - DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), - IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), - N, N, N, N, - N, N, N, N, N, N, N, N, - /* 0x30 - 0x3F */ - II(ImplicitOps | Priv, em_wrmsr, wrmsr), - IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), - II(ImplicitOps | Priv, em_rdmsr, rdmsr), - IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | VendorSpecific, em_sysenter), - I(ImplicitOps | Priv | VendorSpecific, em_sysexit), - N, N, - N, N, N, N, N, N, N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), - /* 0x50 - 0x5F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x60 - 0x6F */ - N, N, N, N, - N, N, N, N, - N, N, N, N, - N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), - /* 0x70 - 0x7F */ - N, N, N, N, - N, N, N, N, - N, N, N, N, - N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), - /* 0x80 - 0x8F */ - X16(D(SrcImm)), - /* 0x90 - 0x9F */ - X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), - /* 0xA0 - 0xA7 */ - I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, - /* 0xA8 - 0xAF */ - I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), - /* 0xB0 - 0xB7 */ - I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), - I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), - I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), - I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), - D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xB8 - 0xBF */ - N, N, - G(BitOp, group8), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), - D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ - D2bv(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, - /* 0xD0 - 0xDF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xF0 - 0xFF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N -}; - -#undef D -#undef N -#undef G -#undef GD -#undef I -#undef GP -#undef EXT - -#undef D2bv -#undef D2bvIP -#undef I2bv -#undef I2bvIP -#undef I6ALU - -static unsigned imm_size(struct x86_emulate_ctxt *ctxt) -{ - unsigned size; - - size = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; - if (size == 8) - size = 4; - return size; -} - -static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, - unsigned size, bool sign_extension) -{ - int rc = X86EMUL_CONTINUE; - - op->type = OP_IMM; - op->bytes = size; - op->addr.mem.ea = ctxt->_eip; - /* NB. Immediates are sign-extended as necessary. */ - switch (op->bytes) { - case 1: - op->val = insn_fetch(s8, ctxt); - break; - case 2: - op->val = insn_fetch(s16, ctxt); - break; - case 4: - op->val = insn_fetch(s32, ctxt); - break; - } - if (!sign_extension) { - switch (op->bytes) { - case 1: - op->val &= 0xff; - break; - case 2: - op->val &= 0xffff; - break; - case 4: - op->val &= 0xffffffff; - break; - } - } -done: - return rc; -} - -static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, - unsigned d) -{ - int rc = X86EMUL_CONTINUE; - - switch (d) { - case OpReg: - decode_register_operand(ctxt, op); - break; - case OpImmUByte: - rc = decode_imm(ctxt, op, 1, false); - break; - case OpMem: - ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - mem_common: - *op = ctxt->memop; - ctxt->memopp = op; - if ((ctxt->d & BitOp) && op == &ctxt->dst) - fetch_bit_operand(ctxt); - op->orig_val = op->val; - break; - case OpMem64: - ctxt->memop.bytes = 8; - goto mem_common; - case OpAcc: - op->type = OP_REG; - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(op); - op->orig_val = op->val; - break; - case OpDI: - op->type = OP_MEM; - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); - op->addr.mem.seg = VCPU_SREG_ES; - op->val = 0; - break; - case OpDX: - op->type = OP_REG; - op->bytes = 2; - op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(op); - break; - case OpCL: - op->bytes = 1; - op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; - break; - case OpImmByte: - rc = decode_imm(ctxt, op, 1, true); - break; - case OpOne: - op->bytes = 1; - op->val = 1; - break; - case OpImm: - rc = decode_imm(ctxt, op, imm_size(ctxt), true); - break; - case OpMem8: - ctxt->memop.bytes = 1; - goto mem_common; - case OpMem16: - ctxt->memop.bytes = 2; - goto mem_common; - case OpMem32: - ctxt->memop.bytes = 4; - goto mem_common; - case OpImmU16: - rc = decode_imm(ctxt, op, 2, false); - break; - case OpImmU: - rc = decode_imm(ctxt, op, imm_size(ctxt), false); - break; - case OpSI: - op->type = OP_MEM; - op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - op->addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); - op->addr.mem.seg = seg_override(ctxt); - op->val = 0; - break; - case OpImmFAddr: - op->type = OP_IMM; - op->addr.mem.ea = ctxt->_eip; - op->bytes = ctxt->op_bytes + 2; - insn_fetch_arr(op->valptr, op->bytes, ctxt); - break; - case OpMemFAddr: - ctxt->memop.bytes = ctxt->op_bytes + 2; - goto mem_common; - case OpES: - op->val = VCPU_SREG_ES; - break; - case OpCS: - op->val = VCPU_SREG_CS; - break; - case OpSS: - op->val = VCPU_SREG_SS; - break; - case OpDS: - op->val = VCPU_SREG_DS; - break; - case OpFS: - op->val = VCPU_SREG_FS; - break; - case OpGS: - op->val = VCPU_SREG_GS; - break; - case OpImplicit: - /* Special instructions do their own operand decoding. */ - default: - op->type = OP_NONE; /* Disable writeback. 
*/ - break; - } - -done: - return rc; -} - -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) -{ - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, goffset, simd_prefix; - bool op_prefix = false; - struct opcode opcode; - - ctxt->memop.type = OP_NONE; - ctxt->memopp = NULL; - ctxt->_eip = ctxt->eip; - ctxt->fetch.start = ctxt->_eip; - ctxt->fetch.end = ctxt->fetch.start + insn_len; - if (insn_len > 0) - memcpy(ctxt->fetch.data, insn, insn_len); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return EMULATION_FAILED; - } - - ctxt->op_bytes = def_op_bytes; - ctxt->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (ctxt->b = insn_fetch(u8, ctxt)) { - case 0x66: /* operand-size override */ - op_prefix = true; - /* switch between 2/4 bytes */ - ctxt->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - ctxt->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - ctxt->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(ctxt, (ctxt->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(ctxt, ctxt->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - ctxt->rex_prefix = ctxt->b; - continue; - case 0xf0: /* LOCK */ - ctxt->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - case 0xf3: /* REP/REPE/REPZ */ - ctxt->rep_prefix = ctxt->b; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - ctxt->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (ctxt->rex_prefix & 8) - ctxt->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - opcode = opcode_table[ctxt->b]; - /* Two-byte opcode? */ - if (ctxt->b == 0x0f) { - ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, ctxt); - opcode = twobyte_table[ctxt->b]; - } - ctxt->d = opcode.flags; - - while (ctxt->d & GroupMask) { - switch (ctxt->d & GroupMask) { - case Group: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; - goffset = (ctxt->modrm >> 3) & 7; - opcode = opcode.u.group[goffset]; - break; - case GroupDual: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; - goffset = (ctxt->modrm >> 3) & 7; - if ((ctxt->modrm >> 6) == 3) - opcode = opcode.u.gdual->mod3[goffset]; - else - opcode = opcode.u.gdual->mod012[goffset]; - break; - case RMExt: - goffset = ctxt->modrm & 7; - opcode = opcode.u.group[goffset]; - break; - case Prefix: - if (ctxt->rep_prefix && op_prefix) - return EMULATION_FAILED; - simd_prefix = op_prefix ? 
0x66 : ctxt->rep_prefix; - switch (simd_prefix) { - case 0x00: opcode = opcode.u.gprefix->pfx_no; break; - case 0x66: opcode = opcode.u.gprefix->pfx_66; break; - case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; - case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; - } - break; - default: - return EMULATION_FAILED; - } - - ctxt->d &= ~(u64)GroupMask; - ctxt->d |= opcode.flags; - } - - ctxt->execute = opcode.u.execute; - ctxt->check_perm = opcode.check_perm; - ctxt->intercept = opcode.intercept; - - /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & Undefined)) - return EMULATION_FAILED; - - if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) - return EMULATION_FAILED; - - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; - - if (ctxt->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) - ctxt->op_bytes = 8; - else - ctxt->op_bytes = 4; - } - - if (ctxt->d & Sse) - ctxt->op_bytes = 16; - - /* ModRM and SIB bytes. */ - if (ctxt->d & ModRM) { - rc = decode_modrm(ctxt, &ctxt->memop); - if (!ctxt->has_seg_override) - set_seg_override(ctxt, ctxt->modrm_seg); - } else if (ctxt->d & MemAbs) - rc = decode_abs(ctxt, &ctxt->memop); - if (rc != X86EMUL_CONTINUE) - goto done; - - if (!ctxt->has_seg_override) - set_seg_override(ctxt, VCPU_SREG_DS); - - ctxt->memop.addr.mem.seg = seg_override(ctxt); - - if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) - ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; - - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Decode and fetch the destination operand: register or memory. */ - rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); - -done: - if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) - ctxt->memopp->addr.mem.ea += ctxt->_eip; - - return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; -} - -bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt) -{ - return ctxt->d & PageTable; -} - -static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) -{ - /* The second termination condition only applies for REPE - * and REPNE. 
Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) || - (ctxt->b == 0xae) || (ctxt->b == 0xaf)) - && (((ctxt->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - || ((ctxt->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) - return true; - - return false; -} - -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) -{ - struct x86_emulate_ops *ops = ctxt->ops; - int rc = X86EMUL_CONTINUE; - int saved_dst_type = ctxt->dst.type; - - ctxt->mem_read.pos = 0; - - if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) { - rc = emulate_ud(ctxt); - goto done; - } - - /* LOCK prefix is allowed only with some instructions */ - if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & Sse) - && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) - || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { - rc = emulate_nm(ctxt); - goto done; - } - - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_PRE_EXCEPT); - if (rc != X86EMUL_CONTINUE) - goto done; - } - - /* Privileged instruction can be executed only in CPL=0 */ - if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } - - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { - rc = emulate_ud(ctxt); - goto done; - } - - /* Do instruction specific permission checks */ - if (ctxt->check_perm) { - rc = ctxt->check_perm(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; - } - - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_EXCEPT); - if (rc != X86EMUL_CONTINUE) - goto done; - } - - if (ctxt->rep_prefix && (ctxt->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) { - ctxt->eip = ctxt->_eip; - goto done; - } - } - - if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) { - rc = segmented_read(ctxt, ctxt->src.addr.mem, - ctxt->src.valptr, ctxt->src.bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - ctxt->src.orig_val64 = ctxt->src.val64; - } - - if (ctxt->src2.type == OP_MEM) { - rc = segmented_read(ctxt, ctxt->src2.addr.mem, - &ctxt->src2.val, ctxt->src2.bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - } - - if ((ctxt->d & DstMask) == ImplicitOps) - goto special_insn; - - - if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) { - /* optimisation - avoid slow emulated read if Mov */ - rc = segmented_read(ctxt, ctxt->dst.addr.mem, - &ctxt->dst.val, ctxt->dst.bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - } - ctxt->dst.orig_val = ctxt->dst.val; - -special_insn: - - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_MEMACCESS); - if (rc != X86EMUL_CONTINUE) - goto done; - } - - if (ctxt->execute) { - rc = ctxt->execute(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; - goto writeback; - } 
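/*
 * Annotation (not part of the deleted file): the dispatch just above --
 * "if (ctxt->execute) rc = ctxt->execute(ctxt);" -- is driven by the opcode
 * tables earlier in emulate.c, where macros such as I(SrcImm | Stack, em_push)
 * pair operand-decoding flags with an em_*-style handler. The stand-alone
 * sketch below models only that table-driven decode/dispatch idea; every name
 * in it is invented for illustration and none of it is kernel API.
 */
#include <stdint.h>
#include <stdio.h>

#define SRC_IMM (1u << 0)		/* source operand is an immediate */
#define DST_REG (1u << 1)		/* destination operand is a register */

struct toy_ctxt {
	uint64_t src;			/* decoded source operand */
	uint64_t dst;			/* decoded destination (a "register") */
};

struct toy_opcode {
	uint32_t flags;				/* operand-decoding hints */
	int (*execute)(struct toy_ctxt *);	/* em_*-style handler */
};

static int toy_em_mov(struct toy_ctxt *c) { c->dst = c->src; return 0; }
static int toy_em_add(struct toy_ctxt *c) { c->dst += c->src; return 0; }

static const struct toy_opcode toy_table[] = {
	{ SRC_IMM | DST_REG, toy_em_mov },	/* "opcode" 0: mov reg, imm */
	{ SRC_IMM | DST_REG, toy_em_add },	/* "opcode" 1: add reg, imm */
};

int main(void)
{
	struct toy_ctxt c = { 0, 0 };
	uint64_t imm = 5;
	unsigned int b;

	for (b = 0; b < 2; b++) {
		if (toy_table[b].flags & SRC_IMM)	/* "decode" step */
			c.src = imm;
		toy_table[b].execute(&c);		/* "dispatch" step */
	}
	printf("dst = %llu\n", (unsigned long long)c.dst);	/* prints 10 */
	return 0;
}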
- - if (ctxt->twobyte) - goto twobyte_insn; - - switch (ctxt->b) { - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op(ctxt, "inc"); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op(ctxt, "dec"); - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - ctxt->dst.val = (s32) ctxt->src.val; - break; - case 0x70 ... 0x7f: /* jcc (short) */ - if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); - break; - case 0x8d: /* lea r16/r32, m */ - ctxt->dst.val = ctxt->src.addr.mem.ea; - break; - case 0x90 ... 0x97: /* nop / xchg reg, rax */ - if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) - break; - rc = em_xchg(ctxt); - break; - case 0x98: /* cbw/cwde/cdqe */ - switch (ctxt->op_bytes) { - case 2: ctxt->dst.val = (s8)ctxt->dst.val; break; - case 4: ctxt->dst.val = (s16)ctxt->dst.val; break; - case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; - } - break; - case 0xc0 ... 0xc1: - rc = em_grp2(ctxt); - break; - case 0xcc: /* int3 */ - rc = emulate_int(ctxt, 3); - break; - case 0xcd: /* int n */ - rc = emulate_int(ctxt, ctxt->src.val); - break; - case 0xce: /* into */ - if (ctxt->eflags & EFLG_OF) - rc = emulate_int(ctxt, 4); - break; - case 0xd0 ... 0xd1: /* Grp2 */ - rc = em_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; - rc = em_grp2(ctxt); - break; - case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ - jmp_rel(ctxt, ctxt->src.val); - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf4: /* hlt */ - ctxt->ops->halt(ctxt); - break; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - break; - case 0xf9: /* stc */ - ctxt->eflags |= EFLG_CF; - break; - case 0xfc: /* cld */ - ctxt->eflags &= ~EFLG_DF; - break; - case 0xfd: /* std */ - ctxt->eflags |= EFLG_DF; - break; - default: - goto cannot_emulate; - } - - if (rc != X86EMUL_CONTINUE) - goto done; - -writeback: - rc = writeback(ctxt); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* - * restore dst type in case the decoding will be reused - * (happens for string instruction ) - */ - ctxt->dst.type = saved_dst_type; - - if ((ctxt->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt), - VCPU_REGS_RSI, &ctxt->src); - - if ((ctxt->d & DstMask) == DstDI) - string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI, - &ctxt->dst); - - if (ctxt->rep_prefix && (ctxt->d & String)) { - struct read_cache *r = &ctxt->io_read; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1); - - if (!string_insn_completed(ctxt)) { - /* - * Re-enter guest when pio read ahead buffer is empty - * or, if it is not used, after each 1024 iteration. - */ - if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) && - (r->end == 0 || r->end != r->pos)) { - /* - * Reset read cache. Usually happens before - * decode, but since instruction is restarted - * we have to do it here. - */ - ctxt->mem_read.end = 0; - return EMULATION_RESTART; - } - goto done; /* skip rip writeback */ - } - } - - ctxt->eip = ctxt->_eip; - -done: - if (rc == X86EMUL_PROPAGATE_FAULT) - ctxt->have_exception = true; - if (rc == X86EMUL_INTERCEPTED) - return EMULATION_INTERCEPTED; - - return (rc == X86EMUL_UNHANDLEABLE) ? 
EMULATION_FAILED : EMULATION_OK; - -twobyte_insn: - switch (ctxt->b) { - case 0x09: /* wbinvd */ - (ctxt->ops->wbinvd)(ctxt); - break; - case 0x08: /* invd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - break; - case 0x20: /* mov cr, reg */ - ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); - break; - case 0x21: /* mov from dr to reg */ - ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); - break; - case 0x40 ... 0x4f: /* cmov */ - ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; - if (!test_cc(ctxt->b, ctxt->eflags)) - ctxt->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ - if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); - break; - case 0x90 ... 0x9f: /* setcc r/m8 */ - ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); - break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl(ctxt, "shld"); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl(ctxt, "shrd"); - break; - case 0xae: /* clflush */ - break; - case 0xb6 ... 0xb7: /* movzx */ - ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val - : (u16) ctxt->src.val; - break; - case 0xbe ... 0xbf: /* movsx */ - ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : - (s16) ctxt->src.val; - break; - case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV(ctxt, "add"); - /* Write back the register source. */ - ctxt->src.val = ctxt->dst.orig_val; - write_register_operand(&ctxt->src); - break; - case 0xc3: /* movnti */ - ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : - (u64) ctxt->src.val; - break; - default: - goto cannot_emulate; - } - - if (rc != X86EMUL_CONTINUE) - goto done; - - goto writeback; - -cannot_emulate: - return EMULATION_FAILED; -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8254.c b/ANDROID_3.4.5/arch/x86/kvm/i8254.c deleted file mode 100644 index d68f99df..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/i8254.c +++ /dev/null @@ -1,765 +0,0 @@ -/* - * 8253/8254 interval timer emulation - * - * Copyright (c) 2003-2004 Fabrice Bellard - * Copyright (c) 2006 Intel Corporation - * Copyright (c) 2007 Keir Fraser, XenSource Inc - * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- * - * Authors: - * Sheng Yang <sheng.yang@intel.com> - * Based on QEMU and Xen. - */ - -#define pr_fmt(fmt) "pit: " fmt - -#include <linux/kvm_host.h> -#include <linux/slab.h> -#include <linux/workqueue.h> - -#include "irq.h" -#include "i8254.h" - -#ifndef CONFIG_X86_64 -#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) -#else -#define mod_64(x, y) ((x) % (y)) -#endif - -#define RW_STATE_LSB 1 -#define RW_STATE_MSB 2 -#define RW_STATE_WORD0 3 -#define RW_STATE_WORD1 4 - -/* Compute with 96 bit intermediate result: (a*b)/c */ -static u64 muldiv64(u64 a, u32 b, u32 c) -{ - union { - u64 ll; - struct { - u32 low, high; - } l; - } u, res; - u64 rl, rh; - - u.ll = a; - rl = (u64)u.l.low * (u64)b; - rh = (u64)u.l.high * (u64)b; - rh += (rl >> 32); - res.l.high = div64_u64(rh, c); - res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); - return res.ll; -} - -static void pit_set_gate(struct kvm *kvm, int channel, u32 val) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - switch (c->mode) { - default: - case 0: - case 4: - /* XXX: just disable/enable counting */ - break; - case 1: - case 2: - case 3: - case 5: - /* Restart counting on rising edge. */ - if (c->gate < val) - c->count_load_time = ktime_get(); - break; - } - - c->gate = val; -} - -static int pit_get_gate(struct kvm *kvm, int channel) -{ - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - return kvm->arch.vpit->pit_state.channels[channel].gate; -} - -static s64 __kpit_elapsed(struct kvm *kvm) -{ - s64 elapsed; - ktime_t remaining; - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - - if (!ps->pit_timer.period) - return 0; - - /* - * The Counter does not stop when it reaches zero. In - * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to - * the highest count, either FFFF hex for binary counting - * or 9999 for BCD counting, and continues counting. - * Modes 2 and 3 are periodic; the Counter reloads - * itself with the initial count and continues counting - * from there. 
- */ - remaining = hrtimer_get_remaining(&ps->pit_timer.timer); - elapsed = ps->pit_timer.period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->pit_timer.period); - - return elapsed; -} - -static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, - int channel) -{ - if (channel == 0) - return __kpit_elapsed(kvm); - - return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); -} - -static int pit_get_count(struct kvm *kvm, int channel) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - s64 d, t; - int counter; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - case 0: - case 1: - case 4: - case 5: - counter = (c->count - d) & 0xffff; - break; - case 3: - /* XXX: may be incorrect for odd counts */ - counter = c->count - (mod_64((2 * d), c->count)); - break; - default: - counter = c->count - mod_64(d, c->count); - break; - } - return counter; -} - -static int pit_get_out(struct kvm *kvm, int channel) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - s64 d, t; - int out; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); - - switch (c->mode) { - default: - case 0: - out = (d >= c->count); - break; - case 1: - out = (d < c->count); - break; - case 2: - out = ((mod_64(d, c->count) == 0) && (d != 0)); - break; - case 3: - out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); - break; - case 4: - case 5: - out = (d == c->count); - break; - } - - return out; -} - -static void pit_latch_count(struct kvm *kvm, int channel) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - if (!c->count_latched) { - c->latched_count = pit_get_count(kvm, channel); - c->count_latched = c->rw_mode; - } -} - -static void pit_latch_status(struct kvm *kvm, int channel) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - if (!c->status_latched) { - /* TODO: Return NULL COUNT (bit 6). */ - c->status = ((pit_get_out(kvm, channel) << 7) | - (c->rw_mode << 4) | - (c->mode << 1) | - c->bcd); - c->status_latched = 1; - } -} - -static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, - irq_ack_notifier); - int value; - - spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pit_timer.pending); - if (value < 0) - /* spurious acks can be generated if, for example, the - * PIC is being reset. Handle it gracefully here - */ - atomic_inc(&ps->pit_timer.pending); - else if (value > 0) - /* in this case, we had multiple outstanding pit interrupts - * that we needed to inject. 
Reinject - */ - queue_work(ps->pit->wq, &ps->pit->expired); - ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); -} - -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct hrtimer *timer; - - if (!kvm_vcpu_is_bsp(vcpu) || !pit) - return; - - timer = &pit->pit_state.pit_timer.timer; - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); -} - -static void destroy_pit_timer(struct kvm_pit *pit) -{ - hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); -} - -static bool kpit_is_periodic(struct kvm_timer *ktimer) -{ - struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state, - pit_timer); - return ps->is_periodic; -} - -static struct kvm_timer_ops kpit_ops = { - .is_periodic = kpit_is_periodic, -}; - -static void pit_do_work(struct work_struct *work) -{ - struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); - struct kvm *kvm = pit->kvm; - struct kvm_vcpu *vcpu; - int i; - struct kvm_kpit_state *ps = &pit->pit_state; - int inject = 0; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - spin_lock(&ps->inject_lock); - if (ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); - } -} - -static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_pit *pt = ktimer->kvm->arch.vpit; - - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); - } - - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} - -static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) -{ - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - struct kvm_timer *pt = &ps->pit_timer; - s64 interval; - - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) - return; - - interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); - - pr_debug("create pit timer, interval is %llu nsec\n", interval); - - /* TODO The new value only affected after the retriggered */ - hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); - pt->period = interval; - ps->is_periodic = is_period; - - pt->timer.function = pit_timer_fn; - pt->t_ops = &kpit_ops; - pt->kvm = ps->pit->kvm; - - atomic_set(&pt->pending, 0); - ps->irq_ack = 1; - - hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), - HRTIMER_MODE_ABS); -} - -static void pit_load_count(struct kvm *kvm, int channel, u32 val) -{ - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - - WARN_ON(!mutex_is_locked(&ps->lock)); - - pr_debug("load_count val is %d, channel is %d\n", val, channel); - - /* - * The largest possible initial count is 0; this is equivalent - * to 216 for binary counting 
and 104 for BCD counting. - */ - if (val == 0) - val = 0x10000; - - ps->channels[channel].count = val; - - if (channel != 0) { - ps->channels[channel].count_load_time = ktime_get(); - return; - } - - /* Two types of timer - * mode 1 is one shot, mode 2 is period, otherwise del timer */ - switch (ps->channels[0].mode) { - case 0: - case 1: - /* FIXME: enhance mode 4 precision */ - case 4: - create_pit_timer(kvm, val, 0); - break; - case 2: - case 3: - create_pit_timer(kvm, val, 1); - break; - default: - destroy_pit_timer(kvm->arch.vpit); - } -} - -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) -{ - u8 saved_mode; - if (hpet_legacy_start) { - /* save existing mode for later reenablement */ - saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; - kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(kvm, channel, val); - kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; - } else { - pit_load_count(kvm, channel, val); - } -} - -static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, dev); -} - -static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pit, speaker_dev); -} - -static inline int pit_in_range(gpa_t addr) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static int pit_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; - int channel, access; - struct kvm_kpit_channel_state *s; - u32 val = *(u32 *) data; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - val &= 0xff; - addr &= KVM_PIT_CHANNEL_MASK; - - mutex_lock(&pit_state->lock); - - if (val != 0) - pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n", - (unsigned int)addr, len, val); - - if (addr == 3) { - channel = val >> 6; - if (channel == 3) { - /* Read-Back Command. */ - for (channel = 0; channel < 3; channel++) { - s = &pit_state->channels[channel]; - if (val & (2 << channel)) { - if (!(val & 0x20)) - pit_latch_count(kvm, channel); - if (!(val & 0x10)) - pit_latch_status(kvm, channel); - } - } - } else { - /* Select Counter <channel>. */ - s = &pit_state->channels[channel]; - access = (val >> 4) & KVM_PIT_CHANNEL_MASK; - if (access == 0) { - pit_latch_count(kvm, channel); - } else { - s->rw_mode = access; - s->read_state = access; - s->write_state = access; - s->mode = (val >> 1) & 7; - if (s->mode > 5) - s->mode -= 4; - s->bcd = val & 1; - } - } - } else { - /* Write Count. 
*/ - s = &pit_state->channels[addr]; - switch (s->write_state) { - default: - case RW_STATE_LSB: - pit_load_count(kvm, addr, val); - break; - case RW_STATE_MSB: - pit_load_count(kvm, addr, val << 8); - break; - case RW_STATE_WORD0: - s->write_latch = val; - s->write_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - pit_load_count(kvm, addr, s->write_latch | (val << 8)); - s->write_state = RW_STATE_WORD0; - break; - } - } - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int pit_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = dev_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; - int ret, count; - struct kvm_kpit_channel_state *s; - if (!pit_in_range(addr)) - return -EOPNOTSUPP; - - addr &= KVM_PIT_CHANNEL_MASK; - if (addr == 3) - return 0; - - s = &pit_state->channels[addr]; - - mutex_lock(&pit_state->lock); - - if (s->status_latched) { - s->status_latched = 0; - ret = s->status; - } else if (s->count_latched) { - switch (s->count_latched) { - default: - case RW_STATE_LSB: - ret = s->latched_count & 0xff; - s->count_latched = 0; - break; - case RW_STATE_MSB: - ret = s->latched_count >> 8; - s->count_latched = 0; - break; - case RW_STATE_WORD0: - ret = s->latched_count & 0xff; - s->count_latched = RW_STATE_MSB; - break; - } - } else { - switch (s->read_state) { - default: - case RW_STATE_LSB: - count = pit_get_count(kvm, addr); - ret = count & 0xff; - break; - case RW_STATE_MSB: - count = pit_get_count(kvm, addr); - ret = (count >> 8) & 0xff; - break; - case RW_STATE_WORD0: - count = pit_get_count(kvm, addr); - ret = count & 0xff; - s->read_state = RW_STATE_WORD1; - break; - case RW_STATE_WORD1: - count = pit_get_count(kvm, addr); - ret = (count >> 8) & 0xff; - s->read_state = RW_STATE_WORD0; - break; - } - } - - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; - u32 val = *(u32 *) data; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(kvm, 2, val & 1); - mutex_unlock(&pit_state->lock); - return 0; -} - -static int speaker_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) -{ - struct kvm_pit *pit = speaker_to_pit(this); - struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; - unsigned int refresh_clock; - int ret; - if (addr != KVM_SPEAKER_BASE_ADDRESS) - return -EOPNOTSUPP; - - /* Refresh clock toggles at about 15us. We approximate as 2^14ns. 
*/ - refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; - - mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | - (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); - if (len > sizeof(ret)) - len = sizeof(ret); - memcpy(data, (char *)&ret, len); - mutex_unlock(&pit_state->lock); - return 0; -} - -void kvm_pit_reset(struct kvm_pit *pit) -{ - int i; - struct kvm_kpit_channel_state *c; - - mutex_lock(&pit->pit_state.lock); - pit->pit_state.flags = 0; - for (i = 0; i < 3; i++) { - c = &pit->pit_state.channels[i]; - c->mode = 0xff; - c->gate = (i != 2); - pit_load_count(pit->kvm, i, 0); - } - mutex_unlock(&pit->pit_state.lock); - - atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.irq_ack = 1; -} - -static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) -{ - struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - - if (!mask) { - atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.irq_ack = 1; - } -} - -static const struct kvm_io_device_ops pit_dev_ops = { - .read = pit_ioport_read, - .write = pit_ioport_write, -}; - -static const struct kvm_io_device_ops speaker_dev_ops = { - .read = speaker_ioport_read, - .write = speaker_ioport_write, -}; - -/* Caller must hold slots_lock */ -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) -{ - struct kvm_pit *pit; - struct kvm_kpit_state *pit_state; - int ret; - - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); - if (!pit) - return NULL; - - pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) { - kfree(pit); - return NULL; - } - - mutex_init(&pit->pit_state.lock); - mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); - - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { - mutex_unlock(&pit->pit_state.lock); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - return NULL; - } - INIT_WORK(&pit->expired, pit_do_work); - - kvm->arch.vpit = pit; - pit->kvm = kvm; - - pit_state = &pit->pit_state; - pit_state->pit = pit; - hrtimer_init(&pit_state->pit_timer.timer, - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->irq_ack_notifier.gsi = 0; - pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->pit_timer.reinject = true; - mutex_unlock(&pit->pit_state.lock); - - kvm_pit_reset(pit); - - pit->mask_notifier.func = pit_mask_notifer; - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - - kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, - KVM_PIT_MEM_LENGTH, &pit->dev); - if (ret < 0) - goto fail; - - if (flags & KVM_PIT_SPEAKER_DUMMY) { - kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - KVM_SPEAKER_BASE_ADDRESS, 4, - &pit->speaker_dev); - if (ret < 0) - goto fail_unregister; - } - - return pit; - -fail_unregister: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - -fail: - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); - kfree(pit); - return NULL; -} - -void kvm_free_pit(struct kvm *kvm) -{ - struct hrtimer *timer; - - if (kvm->arch.vpit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); - 
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &kvm->arch.vpit->speaker_dev); - kvm_unregister_irq_mask_notifier(kvm, 0, - &kvm->arch.vpit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, - &kvm->arch.vpit->pit_state.irq_ack_notifier); - mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.pit_timer.timer; - hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); - kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); - kfree(kvm->arch.vpit); - } -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8254.h b/ANDROID_3.4.5/arch/x86/kvm/i8254.h deleted file mode 100644 index 51a97426..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/i8254.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __I8254_H -#define __I8254_H - -#include "iodev.h" - -struct kvm_kpit_channel_state { - u32 count; /* can be 65536 */ - u16 latched_count; - u8 count_latched; - u8 status_latched; - u8 status; - u8 read_state; - u8 write_state; - u8 write_latch; - u8 rw_mode; - u8 mode; - u8 bcd; /* not supported */ - u8 gate; /* timer start */ - ktime_t count_load_time; -}; - -struct kvm_kpit_state { - struct kvm_kpit_channel_state channels[3]; - u32 flags; - struct kvm_timer pit_timer; - bool is_periodic; - u32 speaker_data_on; - struct mutex lock; - struct kvm_pit *pit; - spinlock_t inject_lock; - unsigned long irq_ack; - struct kvm_irq_ack_notifier irq_ack_notifier; -}; - -struct kvm_pit { - struct kvm_io_device dev; - struct kvm_io_device speaker_dev; - struct kvm *kvm; - struct kvm_kpit_state pit_state; - int irq_source_id; - struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; -}; - -#define KVM_PIT_BASE_ADDRESS 0x40 -#define KVM_SPEAKER_BASE_ADDRESS 0x61 -#define KVM_PIT_MEM_LENGTH 4 -#define KVM_PIT_FREQ 1193181 -#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 -#define KVM_PIT_CHANNEL_MASK 0x3 - -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); -struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); -void kvm_free_pit(struct kvm *kvm); -void kvm_pit_reset(struct kvm_pit *pit); - -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/i8259.c b/ANDROID_3.4.5/arch/x86/kvm/i8259.c deleted file mode 100644 index 81cf4fa4..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/i8259.c +++ /dev/null @@ -1,664 +0,0 @@ -/* - * 8259 interrupt controller emulation - * - * Copyright (c) 2003-2004 Fabrice Bellard - * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * Port from Qemu. - */ -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/bitops.h> -#include "irq.h" - -#include <linux/kvm_host.h> -#include "trace.h" - -#define pr_pic_unimpl(fmt, ...) \ - pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) - -static void pic_irq_request(struct kvm *kvm, int level); - -static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) -{ - spin_lock(&s->lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) -{ - bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; - int i; - - s->wakeup_needed = false; - - spin_unlock(&s->lock); - - if (wakeup) { - kvm_for_each_vcpu(i, vcpu, s->kvm) { - if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; - } - } - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); - } -} - -static void pic_clear_isr(struct kvm_kpic_state *s, int irq) -{ - s->isr &= ~(1 << irq); - if (s != &s->pics_state->pics[0]) - irq += 8; - /* - * We are dropping lock while calling ack notifiers since ack - * notifier callbacks for assigned devices call into PIC recursively. - * Other interrupt may be delivered to PIC while lock is dropped but - * it should be safe since PIC state is already updated at this stage. - */ - pic_unlock(s->pics_state); - kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - pic_lock(s->pics_state); -} - -/* - * set irq level. If an edge is detected, then the IRR is set to 1 - */ -static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) -{ - int mask, ret = 1; - mask = 1 << irq; - if (s->elcr & mask) /* level triggered */ - if (level) { - ret = !(s->irr & mask); - s->irr |= mask; - s->last_irr |= mask; - } else { - s->irr &= ~mask; - s->last_irr &= ~mask; - } - else /* edge triggered */ - if (level) { - if ((s->last_irr & mask) == 0) { - ret = !(s->irr & mask); - s->irr |= mask; - } - s->last_irr |= mask; - } else - s->last_irr &= ~mask; - - return (s->imr & mask) ? -1 : ret; -} - -/* - * return the highest priority found in mask (highest = smallest - * number). Return 8 if no irq - */ -static inline int get_priority(struct kvm_kpic_state *s, int mask) -{ - int priority; - if (mask == 0) - return 8; - priority = 0; - while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) - priority++; - return priority; -} - -/* - * return the pic wanted interrupt. return -1 if none - */ -static int pic_get_irq(struct kvm_kpic_state *s) -{ - int mask, cur_priority, priority; - - mask = s->irr & ~s->imr; - priority = get_priority(s, mask); - if (priority == 8) - return -1; - /* - * compute current priority. If special fully nested mode on the - * master, the IRQ coming from the slave is not taken into account - * for the priority computation. - */ - mask = s->isr; - if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) - mask &= ~(1 << 2); - cur_priority = get_priority(s, mask); - if (priority < cur_priority) - /* - * higher priority found: an irq should be generated - */ - return (priority + s->priority_add) & 7; - else - return -1; -} - -/* - * raise irq to CPU if necessary. 
must be called every time the active - * irq may change - */ -static void pic_update_irq(struct kvm_pic *s) -{ - int irq2, irq; - - irq2 = pic_get_irq(&s->pics[1]); - if (irq2 >= 0) { - /* - * if irq request by slave pic, signal master PIC - */ - pic_set_irq1(&s->pics[0], 2, 1); - pic_set_irq1(&s->pics[0], 2, 0); - } - irq = pic_get_irq(&s->pics[0]); - pic_irq_request(s->kvm, irq >= 0); -} - -void kvm_pic_update_irq(struct kvm_pic *s) -{ - pic_lock(s); - pic_update_irq(s); - pic_unlock(s); -} - -int kvm_pic_set_irq(void *opaque, int irq, int level) -{ - struct kvm_pic *s = opaque; - int ret = -1; - - pic_lock(s); - if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); - pic_update_irq(s); - trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, - s->pics[irq >> 3].imr, ret == 0); - } - pic_unlock(s); - - return ret; -} - -/* - * acknowledge interrupt 'irq' - */ -static inline void pic_intack(struct kvm_kpic_state *s, int irq) -{ - s->isr |= 1 << irq; - /* - * We don't clear a level sensitive interrupt here - */ - if (!(s->elcr & (1 << irq))) - s->irr &= ~(1 << irq); - - if (s->auto_eoi) { - if (s->rotate_on_auto_eoi) - s->priority_add = (irq + 1) & 7; - pic_clear_isr(s, irq); - } - -} - -int kvm_pic_read_irq(struct kvm *kvm) -{ - int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); - - pic_lock(s); - irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) { - pic_intack(&s->pics[0], irq); - if (irq == 2) { - irq2 = pic_get_irq(&s->pics[1]); - if (irq2 >= 0) - pic_intack(&s->pics[1], irq2); - else - /* - * spurious IRQ on slave controller - */ - irq2 = 7; - intno = s->pics[1].irq_base + irq2; - irq = irq2 + 8; - } else - intno = s->pics[0].irq_base + irq; - } else { - /* - * spurious IRQ on host controller - */ - irq = 7; - intno = s->pics[0].irq_base + irq; - } - pic_update_irq(s); - pic_unlock(s); - - return intno; -} - -void kvm_pic_reset(struct kvm_kpic_state *s) -{ - int irq, i; - struct kvm_vcpu *vcpu; - u8 irr = s->irr, isr = s->imr; - bool found = false; - - s->last_irr = 0; - s->irr = 0; - s->imr = 0; - s->isr = 0; - s->priority_add = 0; - s->irq_base = 0; - s->read_reg_select = 0; - s->poll = 0; - s->special_mask = 0; - s->init_state = 0; - s->auto_eoi = 0; - s->rotate_on_auto_eoi = 0; - s->special_fully_nested_mode = 0; - s->init4 = 0; - - kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) - if (kvm_apic_accept_pic_intr(vcpu)) { - found = true; - break; - } - - - if (!found) - return; - - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (irr & (1 << irq) || isr & (1 << irq)) - pic_clear_isr(s, irq); -} - -static void pic_ioport_write(void *opaque, u32 addr, u32 val) -{ - struct kvm_kpic_state *s = opaque; - int priority, cmd, irq; - - addr &= 1; - if (addr == 0) { - if (val & 0x10) { - s->init4 = val & 1; - s->last_irr = 0; - s->irr &= s->elcr; - s->imr = 0; - s->priority_add = 0; - s->special_mask = 0; - s->read_reg_select = 0; - if (!s->init4) { - s->special_fully_nested_mode = 0; - s->auto_eoi = 0; - } - s->init_state = 1; - if (val & 0x02) - pr_pic_unimpl("single mode not supported"); - if (val & 0x08) - pr_pic_unimpl( - "level sensitive irq not supported"); - } else if (val & 0x08) { - if (val & 0x04) - s->poll = 1; - if (val & 0x02) - s->read_reg_select = val & 1; - if (val & 0x40) - s->special_mask = (val >> 5) & 1; - } else { - cmd = val >> 5; - switch (cmd) { - case 0: - case 4: - s->rotate_on_auto_eoi = cmd >> 2; - break; - case 1: /* end of interrupt */ - case 5: - priority = get_priority(s, s->isr); - if (priority != 8) 
{ - irq = (priority + s->priority_add) & 7; - if (cmd == 5) - s->priority_add = (irq + 1) & 7; - pic_clear_isr(s, irq); - pic_update_irq(s->pics_state); - } - break; - case 3: - irq = val & 7; - pic_clear_isr(s, irq); - pic_update_irq(s->pics_state); - break; - case 6: - s->priority_add = (val + 1) & 7; - pic_update_irq(s->pics_state); - break; - case 7: - irq = val & 7; - s->priority_add = (irq + 1) & 7; - pic_clear_isr(s, irq); - pic_update_irq(s->pics_state); - break; - default: - break; /* no operation */ - } - } - } else - switch (s->init_state) { - case 0: { /* normal mode */ - u8 imr_diff = s->imr ^ val, - off = (s == &s->pics_state->pics[0]) ? 0 : 8; - s->imr = val; - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) - if (imr_diff & (1 << irq)) - kvm_fire_mask_notifiers( - s->pics_state->kvm, - SELECT_PIC(irq + off), - irq + off, - !!(s->imr & (1 << irq))); - pic_update_irq(s->pics_state); - break; - } - case 1: - s->irq_base = val & 0xf8; - s->init_state = 2; - break; - case 2: - if (s->init4) - s->init_state = 3; - else - s->init_state = 0; - break; - case 3: - s->special_fully_nested_mode = (val >> 4) & 1; - s->auto_eoi = (val >> 1) & 1; - s->init_state = 0; - break; - } -} - -static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) -{ - int ret; - - ret = pic_get_irq(s); - if (ret >= 0) { - if (addr1 >> 7) { - s->pics_state->pics[0].isr &= ~(1 << 2); - s->pics_state->pics[0].irr &= ~(1 << 2); - } - s->irr &= ~(1 << ret); - pic_clear_isr(s, ret); - if (addr1 >> 7 || ret != 2) - pic_update_irq(s->pics_state); - } else { - ret = 0x07; - pic_update_irq(s->pics_state); - } - - return ret; -} - -static u32 pic_ioport_read(void *opaque, u32 addr1) -{ - struct kvm_kpic_state *s = opaque; - unsigned int addr; - int ret; - - addr = addr1; - addr &= 1; - if (s->poll) { - ret = pic_poll_read(s, addr1); - s->poll = 0; - } else - if (addr == 0) - if (s->read_reg_select) - ret = s->isr; - else - ret = s->irr; - else - ret = s->imr; - return ret; -} - -static void elcr_ioport_write(void *opaque, u32 addr, u32 val) -{ - struct kvm_kpic_state *s = opaque; - s->elcr = val & s->elcr_mask; -} - -static u32 elcr_ioport_read(void *opaque, u32 addr1) -{ - struct kvm_kpic_state *s = opaque; - return s->elcr; -} - -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - -static int picdev_write(struct kvm_pic *s, - gpa_t addr, int len, const void *val) -{ - unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; - - if (len != 1) { - pr_pic_unimpl("non byte write\n"); - return 0; - } - pic_lock(s); - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - pic_ioport_write(&s->pics[addr >> 7], addr, data); - break; - case 0x4d0: - case 0x4d1: - elcr_ioport_write(&s->pics[addr & 1], addr, data); - break; - } - pic_unlock(s); - return 0; -} - -static int picdev_read(struct kvm_pic *s, - gpa_t addr, int len, void *val) -{ - unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; - - if (len != 1) { - pr_pic_unimpl("non byte read\n"); - return 0; - } - pic_lock(s); - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - data = pic_ioport_read(&s->pics[addr >> 7], addr); - break; - case 0x4d0: - case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); - break; - } - *(unsigned char *)val = data; - pic_unlock(s); - return 0; -} - -static int picdev_master_write(struct kvm_io_device *dev, - 
gpa_t addr, int len, const void *val) -{ - return picdev_write(container_of(dev, struct kvm_pic, dev_master), - addr, len, val); -} - -static int picdev_master_read(struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - return picdev_read(container_of(dev, struct kvm_pic, dev_master), - addr, len, val); -} - -static int picdev_slave_write(struct kvm_io_device *dev, - gpa_t addr, int len, const void *val) -{ - return picdev_write(container_of(dev, struct kvm_pic, dev_slave), - addr, len, val); -} - -static int picdev_slave_read(struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - return picdev_read(container_of(dev, struct kvm_pic, dev_slave), - addr, len, val); -} - -static int picdev_eclr_write(struct kvm_io_device *dev, - gpa_t addr, int len, const void *val) -{ - return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), - addr, len, val); -} - -static int picdev_eclr_read(struct kvm_io_device *dev, - gpa_t addr, int len, void *val) -{ - return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), - addr, len, val); -} - -/* - * callback when PIC0 irq status changed - */ -static void pic_irq_request(struct kvm *kvm, int level) -{ - struct kvm_pic *s = pic_irqchip(kvm); - - if (!s->output) - s->wakeup_needed = true; - s->output = level; -} - -static const struct kvm_io_device_ops picdev_master_ops = { - .read = picdev_master_read, - .write = picdev_master_write, -}; - -static const struct kvm_io_device_ops picdev_slave_ops = { - .read = picdev_slave_read, - .write = picdev_slave_write, -}; - -static const struct kvm_io_device_ops picdev_eclr_ops = { - .read = picdev_eclr_read, - .write = picdev_eclr_write, -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm) -{ - struct kvm_pic *s; - int ret; - - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); - if (!s) - return NULL; - spin_lock_init(&s->lock); - s->kvm = kvm; - s->pics[0].elcr_mask = 0xf8; - s->pics[1].elcr_mask = 0xde; - s->pics[0].pics_state = s; - s->pics[1].pics_state = s; - - /* - * Initialize PIO device - */ - kvm_iodevice_init(&s->dev_master, &picdev_master_ops); - kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); - kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); - mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, - &s->dev_master); - if (ret < 0) - goto fail_unlock; - - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); - if (ret < 0) - goto fail_unreg_2; - - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); - if (ret < 0) - goto fail_unreg_1; - - mutex_unlock(&kvm->slots_lock); - - return s; - -fail_unreg_1: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); - -fail_unreg_2: - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); - -fail_unlock: - mutex_unlock(&kvm->slots_lock); - - kfree(s); - - return NULL; -} - -void kvm_destroy_pic(struct kvm *kvm) -{ - struct kvm_pic *vpic = kvm->arch.vpic; - - if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); - kvm->arch.vpic = NULL; - kfree(vpic); - } -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/irq.c b/ANDROID_3.4.5/arch/x86/kvm/irq.c deleted file mode 100644 index 7e06ba16..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/irq.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * irq.c: API for in kernel interrupt controller - * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. 
and/or its affiliates. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - */ - -#include <linux/module.h> -#include <linux/kvm_host.h> - -#include "irq.h" -#include "i8254.h" -#include "x86.h" - -/* - * check if there are pending timer events - * to be processed. - */ -int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) -{ - return apic_has_pending_timer(vcpu); -} -EXPORT_SYMBOL(kvm_cpu_has_pending_timer); - -/* - * check if there is pending interrupt without - * intack. - */ -int kvm_cpu_has_interrupt(struct kvm_vcpu *v) -{ - struct kvm_pic *s; - - if (!irqchip_in_kernel(v->kvm)) - return v->arch.interrupt.pending; - - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); /* PIC */ - return s->output; - } else - return 0; - } - return 1; -} -EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); - -/* - * Read pending interrupt vector and intack. - */ -int kvm_cpu_get_interrupt(struct kvm_vcpu *v) -{ - struct kvm_pic *s; - int vector; - - if (!irqchip_in_kernel(v->kvm)) - return v->arch.interrupt.nr; - - vector = kvm_get_apic_interrupt(v); /* APIC */ - if (vector == -1) { - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); - s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(v->kvm); - } - } - return vector; -} -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); - -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) -{ - kvm_inject_apic_timer_irqs(vcpu); - /* TODO: PIT, RTC etc. */ -} -EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); - -void __kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - __kvm_migrate_apic_timer(vcpu); - __kvm_migrate_pit_timer(vcpu); -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/irq.h b/ANDROID_3.4.5/arch/x86/kvm/irq.h deleted file mode 100644 index 2086f2bf..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/irq.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2007, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. 
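kvm_cpu_has_interrupt()/kvm_cpu_get_interrupt() above encode a fixed priority: the local APIC is consulted first, and the PIC only supplies a vector when the APIC has nothing pending and it is allowed to inject (kvm_apic_accept_pic_intr()). A deliberately simplified model of that ordering (no locking, made-up vector numbers; illustration only):

#include <stdio.h>

/* Stand-ins for the real state; the values are made up for the example. */
static int apic_vector = -1;   /* -1: the LAPIC has nothing deliverable */
static int pic_output  = 1;    /* the PIC is asserting INTR */
static int pic_vector  = 0x30; /* vector the PIC would return on INTA */
static int accept_pic  = 1;    /* stands in for kvm_apic_accept_pic_intr() */

static int cpu_get_interrupt(void)
{
    int vector = apic_vector;                 /* LAPIC is asked first */

    if (vector == -1 && accept_pic && pic_output)
        vector = pic_vector;                  /* only then the PIC */

    return vector;
}

int main(void)
{
    printf("delivered vector: %#x\n", cpu_get_interrupt());
    return 0;
}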
- * Authors: - * Yaozu (Eddie) Dong <Eddie.dong@intel.com> - * - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include <linux/mm_types.h> -#include <linux/hrtimer.h> -#include <linux/kvm_host.h> -#include <linux/spinlock.h> - -#include "iodev.h" -#include "ioapic.h" -#include "lapic.h" - -#define PIC_NUM_PINS 16 -#define SELECT_PIC(irq) \ - ((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE) - -struct kvm; -struct kvm_vcpu; - -struct kvm_kpic_state { - u8 last_irr; /* edge detection */ - u8 irr; /* interrupt request register */ - u8 imr; /* interrupt mask register */ - u8 isr; /* interrupt service register */ - u8 priority_add; /* highest irq priority */ - u8 irq_base; - u8 read_reg_select; - u8 poll; - u8 special_mask; - u8 init_state; - u8 auto_eoi; - u8 rotate_on_auto_eoi; - u8 special_fully_nested_mode; - u8 init4; /* true if 4 byte init */ - u8 elcr; /* PIIX edge/trigger selection */ - u8 elcr_mask; - u8 isr_ack; /* interrupt ack detection */ - struct kvm_pic *pics_state; -}; - -struct kvm_pic { - spinlock_t lock; - bool wakeup_needed; - unsigned pending_acks; - struct kvm *kvm; - struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - int output; /* intr from master PIC */ - struct kvm_io_device dev_master; - struct kvm_io_device dev_slave; - struct kvm_io_device dev_eclr; - void (*ack_notifier)(void *opaque, int irq); - unsigned long irq_states[16]; -}; - -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm *kvm); -int kvm_pic_read_irq(struct kvm *kvm); -void kvm_pic_update_irq(struct kvm_pic *s); - -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - int ret; - - ret = (pic_irqchip(kvm) != NULL); - smp_rmb(); - return ret; -} - -void kvm_pic_reset(struct kvm_kpic_state *s); - -void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); -void __kvm_migrate_timers(struct kvm_vcpu *vcpu); - -int apic_has_pending_timer(struct kvm_vcpu *vcpu); - -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h b/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h deleted file mode 100644 index 544076c4..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/kvm_cache_regs.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef ASM_KVM_CACHE_REGS_H -#define ASM_KVM_CACHE_REGS_H - -#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS -#define KVM_POSSIBLE_CR4_GUEST_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) - -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, reg); - - return vcpu->arch.regs[reg]; -} - -static inline void kvm_register_write(struct kvm_vcpu *vcpu, - enum kvm_reg reg, - unsigned long val) -{ - vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); -} - -static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) -{ - return kvm_register_read(vcpu, VCPU_REGS_RIP); -} - -static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) -{ - kvm_register_write(vcpu, VCPU_REGS_RIP, val); -} - -static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 
-{ - might_sleep(); /* on svm */ - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - - return vcpu->arch.walk_mmu->pdptrs[index]; -} - -static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) -{ - ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; - if (tmask & vcpu->arch.cr0_guest_owned_bits) - kvm_x86_ops->decache_cr0_guest_bits(vcpu); - return vcpu->arch.cr0 & mask; -} - -static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, ~0UL); -} - -static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) -{ - ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; - if (tmask & vcpu->arch.cr4_guest_owned_bits) - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - return vcpu->arch.cr4 & mask; -} - -static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - kvm_x86_ops->decache_cr3(vcpu); - return vcpu->arch.cr3; -} - -static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, ~0UL); -} - -static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) -{ - return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) - | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); -} - -static inline void enter_guest_mode(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hflags |= HF_GUEST_MASK; -} - -static inline void leave_guest_mode(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hflags &= ~HF_GUEST_MASK; -} - -static inline bool is_guest_mode(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hflags & HF_GUEST_MASK; -} - -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h b/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h deleted file mode 100644 index 497dbaa3..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/kvm_timer.h +++ /dev/null @@ -1,18 +0,0 @@ - -struct kvm_timer { - struct hrtimer timer; - s64 period; /* unit: ns */ - u32 timer_mode_mask; - u64 tscdeadline; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm_timer_ops *t_ops; - struct kvm *kvm; - struct kvm_vcpu *vcpu; -}; - -struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); -}; - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); diff --git a/ANDROID_3.4.5/arch/x86/kvm/lapic.c b/ANDROID_3.4.5/arch/x86/kvm/lapic.c deleted file mode 100644 index 85843228..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/lapic.c +++ /dev/null @@ -1,1387 +0,0 @@ - -/* - * Local APIC virtualization - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2007 Novell - * Copyright (C) 2007 Intel - * Copyright 2009 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Dor Laor <dor.laor@qumranet.com> - * Gregory Haskins <ghaskins@novell.com> - * Yaozu (Eddie) Dong <eddie.dong@intel.com> - * - * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
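kvm_read_edx_eax() above packs the EDX:EAX register pair into one 64-bit value, as returned by instructions such as RDMSR and RDTSC. A small stand-alone check of the same arithmetic (user-space sketch, fixed example values):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t read_edx_eax(uint64_t rax, uint64_t rdx)
{
    /* Mask each register down to 32 bits, then splice EDX into the top half,
     * exactly as the deleted helper does with "& -1u". */
    return (rax & 0xffffffffULL) | ((rdx & 0xffffffffULL) << 32);
}

int main(void)
{
    uint64_t v = read_edx_eax(0x89abcdefULL, 0x01234567ULL);

    printf("EDX:EAX -> 0x%016" PRIx64 "\n", v);   /* 0x0123456789abcdef */
    return 0;
}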
- */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/smp.h> -#include <linux/hrtimer.h> -#include <linux/io.h> -#include <linux/module.h> -#include <linux/math64.h> -#include <linux/slab.h> -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/page.h> -#include <asm/current.h> -#include <asm/apicdef.h> -#include <linux/atomic.h> -#include "kvm_cache_regs.h" -#include "irq.h" -#include "trace.h" -#include "x86.h" -#include "cpuid.h" - -#ifndef CONFIG_X86_64 -#define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) -#else -#define mod_64(x, y) ((x) % (y)) -#endif - -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - -#define APIC_BUS_CYCLE_NS 1 - -/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) - -#define APIC_LVT_NUM 6 -/* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((APIC_LVT_NUM - 1) << 16)) -#define LAPIC_MMIO_LENGTH (1 << 12) -/* followed define is not in apicdef.h */ -#define APIC_SHORT_MASK 0xc0000 -#define APIC_DEST_NOSHORT 0x0 -#define APIC_DEST_MASK 0x800 -#define MAX_APIC_VECTOR 256 - -#define VEC_POS(v) ((v) & (32 - 1)) -#define REG_POS(v) (((v) >> 5) << 4) - -static unsigned int min_timer_period_us = 500; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - -static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) -{ - return *((u32 *) (apic->regs + reg_off)); -} - -static inline void apic_set_reg(struct kvm_lapic *apic, int reg_off, u32 val) -{ - *((u32 *) (apic->regs + reg_off)) = val; -} - -static inline int apic_test_and_set_vector(int vec, void *bitmap) -{ - return test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_test_and_clear_vector(int vec, void *bitmap) -{ - return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline void apic_set_vector(int vec, void *bitmap) -{ - set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline void apic_clear_vector(int vec, void *bitmap) -{ - clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); -} - -static inline int apic_hw_enabled(struct kvm_lapic *apic) -{ - return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; -} - -static inline int apic_sw_enabled(struct kvm_lapic *apic) -{ - return apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED; -} - -static inline int apic_enabled(struct kvm_lapic *apic) -{ - return apic_sw_enabled(apic) && apic_hw_enabled(apic); -} - -#define LVT_MASK \ - (APIC_LVT_MASKED | APIC_SEND_PENDING | APIC_VECTOR_MASK) - -#define LINT_MASK \ - (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ - APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) - -static inline int kvm_apic_id(struct kvm_lapic *apic) -{ - return (apic_get_reg(apic, APIC_ID) >> 24) & 0xff; -} - -static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) -{ - return !(apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); -} - -static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) -{ - return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; -} - -static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) -{ - return ((apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); -} - -static inline int apic_lvtt_period(struct kvm_lapic *apic) -{ - return ((apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); -} - -static inline int 
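The VEC_POS()/REG_POS() macros defined above encode how the 256 APIC vectors are spread over the IRR/ISR/TMR register banks: eight 32-bit words spaced 0x10 bytes apart. A stand-alone demonstration of the mapping (macros copied from the deleted file, everything else illustrative):

#include <stdio.h>

/* Copied from the deleted lapic.c. */
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)

int main(void)
{
    int vectors[] = { 0, 31, 32, 48, 255 };

    for (unsigned i = 0; i < sizeof(vectors) / sizeof(vectors[0]); i++) {
        int v = vectors[i];

        /* Each group of 32 vectors occupies one 32-bit word, and consecutive
         * words sit 0x10 bytes apart in the register page. */
        printf("vector %3d -> word at +0x%02x, bit %2d\n",
               v, (unsigned)REG_POS(v), VEC_POS(v));
    }
    return 0;
}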
apic_lvtt_tscdeadline(struct kvm_lapic *apic) -{ - return ((apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) == - APIC_LVT_TIMER_TSCDEADLINE); -} - -static inline int apic_lvt_nmi_mode(u32 lvt_val) -{ - return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; -} - -void kvm_apic_set_version(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; - u32 v = APIC_VERSION; - - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) - v |= APIC_LVR_DIRECTED_EOI; - apic_set_reg(apic, APIC_LVR, v); -} - -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - -static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ -}; - -static int find_highest_vector(void *bitmap) -{ - u32 *word = bitmap; - int word_offset = MAX_APIC_VECTOR >> 5; - - while ((word_offset != 0) && (word[(--word_offset) << 2] == 0)) - continue; - - if (likely(!word_offset && !word[0])) - return -1; - else - return fls(word[word_offset << 2]) - 1 + (word_offset << 5); -} - -static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) -{ - apic->irr_pending = true; - return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); -} - -static inline int apic_search_irr(struct kvm_lapic *apic) -{ - return find_highest_vector(apic->regs + APIC_IRR); -} - -static inline int apic_find_highest_irr(struct kvm_lapic *apic) -{ - int result; - - if (!apic->irr_pending) - return -1; - - result = apic_search_irr(apic); - ASSERT(result == -1 || result >= 16); - - return result; -} - -static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) -{ - apic->irr_pending = false; - apic_clear_vector(vec, apic->regs + APIC_IRR); - if (apic_search_irr(apic) != -1) - apic->irr_pending = true; -} - -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; - - /* This may race with setting of irr in __apic_accept_irq() and - * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq - * will cause vmexit immediately and the value will be recalculated - * on the next vmentry. - */ - if (!apic) - return 0; - highest_irr = apic_find_highest_irr(apic); - - return highest_irr; -} - -static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode); - -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, - irq->level, irq->trig_mode); -} - -static inline int apic_find_highest_isr(struct kvm_lapic *apic) -{ - int result; - - result = find_highest_vector(apic->regs + APIC_ISR); - ASSERT(result == -1 || result >= 16); - - return result; -} - -static void apic_update_ppr(struct kvm_lapic *apic) -{ - u32 tpr, isrv, ppr, old_ppr; - int isr; - - old_ppr = apic_get_reg(apic, APIC_PROCPRI); - tpr = apic_get_reg(apic, APIC_TASKPRI); - isr = apic_find_highest_isr(apic); - isrv = (isr != -1) ? 
isr : 0; - - if ((tpr & 0xf0) >= (isrv & 0xf0)) - ppr = tpr & 0xff; - else - ppr = isrv & 0xf0; - - apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", - apic, ppr, isr, isrv); - - if (old_ppr != ppr) { - apic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } -} - -static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) -{ - apic_set_reg(apic, APIC_TASKPRI, tpr); - apic_update_ppr(apic); -} - -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) -{ - return dest == 0xff || kvm_apic_id(apic) == dest; -} - -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) -{ - int result = 0; - u32 logical_id; - - if (apic_x2apic_mode(apic)) { - logical_id = apic_get_reg(apic, APIC_LDR); - return logical_id & mda; - } - - logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); - - switch (apic_get_reg(apic, APIC_DFR)) { - case APIC_DFR_FLAT: - if (logical_id & mda) - result = 1; - break; - case APIC_DFR_CLUSTER: - if (((logical_id >> 4) == (mda >> 0x4)) - && (logical_id & mda & 0xf)) - result = 1; - break; - default: - apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); - break; - } - - return result; -} - -int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, - int short_hand, int dest, int dest_mode) -{ - int result = 0; - struct kvm_lapic *target = vcpu->arch.apic; - - apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x\n", - target, source, dest, dest_mode, short_hand); - - ASSERT(target); - switch (short_hand) { - case APIC_DEST_NOSHORT: - if (dest_mode == 0) - /* Physical mode. */ - result = kvm_apic_match_physical_addr(target, dest); - else - /* Logical mode. */ - result = kvm_apic_match_logical_addr(target, dest); - break; - case APIC_DEST_SELF: - result = (target == source); - break; - case APIC_DEST_ALLINC: - result = 1; - break; - case APIC_DEST_ALLBUT: - result = (target != source); - break; - default: - apic_debug("kvm: apic: Bad dest shorthand value %x\n", - short_hand); - break; - } - - return result; -} - -/* - * Add a pending IRQ into lapic. - * Return 1 if successfully added and 0 if discarded. 
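The processor-priority rule in apic_update_ppr() reduces to: use the TPR unless the highest in-service vector belongs to a higher priority class (upper nibble). A worked user-space example of just that comparison (illustration only):

#include <stdint.h>
#include <stdio.h>

static uint32_t compute_ppr(uint32_t tpr, int isr_vector)
{
    uint32_t isrv = (isr_vector != -1) ? (uint32_t)isr_vector : 0;

    if ((tpr & 0xf0) >= (isrv & 0xf0))
        return tpr & 0xff;      /* the task priority dominates */
    return isrv & 0xf0;         /* the in-service class dominates */
}

int main(void)
{
    printf("tpr=0x30 isr=0x51 -> ppr=0x%02x\n", compute_ppr(0x30, 0x51)); /* 0x50 */
    printf("tpr=0x60 isr=0x51 -> ppr=0x%02x\n", compute_ppr(0x60, 0x51)); /* 0x60 */
    printf("tpr=0x60 isr=-1   -> ppr=0x%02x\n", compute_ppr(0x60, -1));   /* 0x60 */
    return 0;
}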
- */ -static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, - int vector, int level, int trig_mode) -{ - int result = 0; - struct kvm_vcpu *vcpu = apic->vcpu; - - switch (delivery_mode) { - case APIC_DM_LOWEST: - vcpu->arch.apic_arb_prio++; - case APIC_DM_FIXED: - /* FIXME add logic for vcpu on reset */ - if (unlikely(!apic_enabled(apic))) - break; - - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); - - result = !apic_test_and_set_irr(vector, apic); - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); - if (!result) { - if (trig_mode) - apic_debug("level trig mode repeatedly for " - "vector %d", vector); - break; - } - - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - break; - - case APIC_DM_REMRD: - apic_debug("Ignoring delivery mode 3\n"); - break; - - case APIC_DM_SMI: - apic_debug("Ignoring guest SMI\n"); - break; - - case APIC_DM_NMI: - result = 1; - kvm_inject_nmi(vcpu); - kvm_vcpu_kick(vcpu); - break; - - case APIC_DM_INIT: - if (!trig_mode || level) { - result = 1; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } else { - apic_debug("Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); - } - break; - - case APIC_DM_STARTUP: - apic_debug("SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { - result = 1; - vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } - break; - - case APIC_DM_EXTINT: - /* - * Should only be called by kvm_apic_local_deliver() with LVT0, - * before NMI watchdog was enabled. Already handled by - * kvm_apic_accept_pic_intr(). 
- */ - break; - - default: - printk(KERN_ERR "TODO: unsupported delivery mode %x\n", - delivery_mode); - break; - } - return result; -} - -int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) -{ - return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; -} - -static void apic_set_eoi(struct kvm_lapic *apic) -{ - int vector = apic_find_highest_isr(apic); - int trigger_mode; - /* - * Not every write EOI will has corresponding ISR, - * one example is when Kernel check timer on setup_IO_APIC - */ - if (vector == -1) - return; - - apic_clear_vector(vector, apic->regs + APIC_ISR); - apic_update_ppr(apic); - - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); -} - -static void apic_send_ipi(struct kvm_lapic *apic) -{ - u32 icr_low = apic_get_reg(apic, APIC_ICR); - u32 icr_high = apic_get_reg(apic, APIC_ICR2); - struct kvm_lapic_irq irq; - - irq.vector = icr_low & APIC_VECTOR_MASK; - irq.delivery_mode = icr_low & APIC_MODE_MASK; - irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = icr_low & APIC_INT_ASSERT; - irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; - irq.shorthand = icr_low & APIC_SHORT_MASK; - if (apic_x2apic_mode(apic)) - irq.dest_id = icr_high; - else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); - - trace_kvm_apic_ipi(icr_low, irq.dest_id); - - apic_debug("icr_high 0x%x, icr_low 0x%x, " - "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", - icr_high, icr_low, irq.shorthand, irq.dest_id, - irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector); - - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); -} - -static u32 apic_get_tmcct(struct kvm_lapic *apic) -{ - ktime_t remaining; - s64 ns; - u32 tmcct; - - ASSERT(apic != NULL); - - /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) - return 0; - - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); - if (ktime_to_ns(remaining) < 0) - remaining = ktime_set(0, 0); - - ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); - tmcct = div64_u64(ns, - (APIC_BUS_CYCLE_NS * apic->divide_count)); - - return tmcct; -} - -static void __report_tpr_access(struct kvm_lapic *apic, bool write) -{ - struct kvm_vcpu *vcpu = apic->vcpu; - struct kvm_run *run = vcpu->run; - - kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); - run->tpr_access.rip = kvm_rip_read(vcpu); - run->tpr_access.is_write = write; -} - -static inline void report_tpr_access(struct kvm_lapic *apic, bool write) -{ - if (apic->vcpu->arch.tpr_access_reporting) - __report_tpr_access(apic, write); -} - -static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) -{ - u32 val = 0; - - if (offset >= LAPIC_MMIO_LENGTH) - return 0; - - switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; - case APIC_ARBPRI: - apic_debug("Access APIC ARBPRI register which is for P6\n"); - break; - - case APIC_TMCCT: /* Timer CCR */ - if (apic_lvtt_tscdeadline(apic)) - return 0; - - val = apic_get_tmcct(apic); - break; - - case APIC_TASKPRI: - report_tpr_access(apic, false); - /* fall thru */ - default: - apic_update_ppr(apic); - val = apic_get_reg(apic, offset); - break; - } - 
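apic_get_tmcct() above converts the time left on the hrtimer back into timer ticks by dividing by the bus cycle length times the configured divider. A rough stand-alone model (it assumes APIC_BUS_CYCLE_NS == 1 as in the deleted file and omits the periodic-mode modulo step):

#include <stdint.h>
#include <stdio.h>

#define APIC_BUS_CYCLE_NS 1   /* same constant as the deleted file */

static uint32_t get_tmcct(int64_t remaining_ns, uint32_t divide_count)
{
    if (remaining_ns < 0)                    /* timer already expired */
        remaining_ns = 0;
    return (uint32_t)(remaining_ns / (APIC_BUS_CYCLE_NS * divide_count));
}

int main(void)
{
    /* 1.5 ms left on the hrtimer with a divide-by-16 timer. */
    printf("TMCCT = %u\n", get_tmcct(1500000, 16));   /* 93750 */
    return 0;
}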
- return val; -} - -static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_lapic, dev); -} - -static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, - void *data) -{ - unsigned char alignment = offset & 0xf; - u32 result; - /* this bitmask has a bit cleared for each reserver register */ - static const u64 rmask = 0x43ff01ffffffe70cULL; - - if ((alignment + len) > 4) { - apic_debug("KVM_APIC_READ: alignment error %x %d\n", - offset, len); - return 1; - } - - if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { - apic_debug("KVM_APIC_READ: read reserved register %x\n", - offset); - return 1; - } - - result = __apic_read(apic, offset & ~0xf); - - trace_kvm_apic_read(offset, result); - - switch (len) { - case 1: - case 2: - case 4: - memcpy(data, (char *)&result + alignment, len); - break; - default: - printk(KERN_ERR "Local APIC read with len = %x, " - "should be 1,2, or 4 instead\n", len); - break; - } - return 0; -} - -static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) -{ - return apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; -} - -static int apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) -{ - struct kvm_lapic *apic = to_lapic(this); - u32 offset = address - apic->base_address; - - if (!apic_mmio_in_range(apic, address)) - return -EOPNOTSUPP; - - apic_reg_read(apic, offset, len, data); - - return 0; -} - -static void update_divide_count(struct kvm_lapic *apic) -{ - u32 tmp1, tmp2, tdcr; - - tdcr = apic_get_reg(apic, APIC_TDCR); - tmp1 = tdcr & 0xf; - tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; - apic->divide_count = 0x1 << (tmp2 & 0x7); - - apic_debug("timer divide count is 0x%x\n", - apic->divide_count); -} - -static void start_apic_timer(struct kvm_lapic *apic) -{ - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); - - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or peroidic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. 
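update_divide_count() above decodes the architectural TDCR encoding, in which bits 0, 1 and 3 select the divider and 0xb means divide by 1. The decode can be checked in a few lines of user-space C (illustration only):

#include <stdio.h>

static unsigned divide_count(unsigned tdcr)
{
    /* Same bit gymnastics as the deleted update_divide_count(). */
    unsigned tmp1 = tdcr & 0xf;
    unsigned tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;

    return 1u << (tmp2 & 0x7);
}

int main(void)
{
    /* Architectural encodings: 0 -> /2, 1 -> /4, 2 -> /8, 3 -> /16,
     * 8 -> /32, 9 -> /64, 0xa -> /128, 0xb -> /1. */
    unsigned encodings[] = { 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb };

    for (unsigned i = 0; i < sizeof(encodings) / sizeof(encodings[0]); i++)
        printf("TDCR=0x%x -> divide by %u\n",
               encodings[i], divide_count(encodings[i]));
    return 0;
}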
- */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - apic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); - } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, ns), HRTIMER_MODE_ABS); - - local_irq_restore(flags); - } -} - -static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) -{ - int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); - - if (apic_lvt_nmi_mode(lvt0_val)) { - if (!nmi_wd_enabled) { - apic_debug("Receive NMI setting on APIC_LVT0 " - "for cpu %d\n", apic->vcpu->vcpu_id); - apic->vcpu->kvm->arch.vapics_in_nmi_mode++; - } - } else if (nmi_wd_enabled) - apic->vcpu->kvm->arch.vapics_in_nmi_mode--; -} - -static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) -{ - int ret = 0; - - trace_kvm_apic_write(reg, val); - - switch (reg) { - case APIC_ID: /* Local APIC ID */ - if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_ID, val); - else - ret = 1; - break; - - case APIC_TASKPRI: - report_tpr_access(apic, true); - apic_set_tpr(apic, val & 0xff); - break; - - case APIC_EOI: - apic_set_eoi(apic); - break; - - case APIC_LDR: - if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); - else - ret = 1; - break; - - case APIC_DFR: - if (!apic_x2apic_mode(apic)) - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - else - ret = 1; - break; - - case APIC_SPIV: { - u32 mask = 0x3ff; - if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) - mask |= APIC_SPIV_DIRECTED_EOI; - apic_set_reg(apic, APIC_SPIV, val & mask); - if (!(val & APIC_SPIV_APIC_ENABLED)) { - int i; - u32 lvt_val; - - for (i = 0; i < APIC_LVT_NUM; i++) { - lvt_val = apic_get_reg(apic, - APIC_LVTT + 0x10 * i); - apic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); - } - atomic_set(&apic->lapic_timer.pending, 0); - - } - break; - } - case APIC_ICR: - /* No delay here, so we always clear the pending bit */ - apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); - apic_send_ipi(apic); - break; - - case APIC_ICR2: - if (!apic_x2apic_mode(apic)) - val &= 0xff000000; - apic_set_reg(apic, APIC_ICR2, val); - break; - - case APIC_LVT0: - apic_manage_nmi_watchdog(apic, val); - case APIC_LVTTHMR: - case APIC_LVTPC: - case APIC_LVT1: - case APIC_LVTERR: - /* TODO: Check vector */ - if 
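For the TSC-deadline case, start_apic_timer() converts the distance to the deadline from TSC ticks into nanoseconds with ns = delta * 1000000 / tsc_khz before arming the hrtimer. A worked example with an assumed 2 GHz (2,000,000 kHz) guest TSC:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t guest_tsc   = 1000000000ULL;
    uint64_t tscdeadline = guest_tsc + 6000000ULL;   /* 6e6 ticks ahead */
    uint64_t tsc_khz     = 2000000ULL;               /* assumed 2 GHz TSC */
    uint64_t ns = 0;

    if (tscdeadline > guest_tsc)                     /* deadline in the future */
        ns = (tscdeadline - guest_tsc) * 1000000ULL / tsc_khz;

    /* 6,000,000 ticks at 2 GHz is 3 ms, so the hrtimer is armed 3e6 ns out. */
    printf("hrtimer armed %llu ns ahead\n", (unsigned long long)ns);
    return 0;
}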
(!apic_sw_enabled(apic)) - val |= APIC_LVT_MASKED; - - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; - apic_set_reg(apic, reg, val); - - break; - - case APIC_LVTT: - if ((apic_get_reg(apic, APIC_LVTT) & - apic->lapic_timer.timer_mode_mask) != - (val & apic->lapic_timer.timer_mode_mask)) - hrtimer_cancel(&apic->lapic_timer.timer); - - if (!apic_sw_enabled(apic)) - val |= APIC_LVT_MASKED; - val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); - apic_set_reg(apic, APIC_LVTT, val); - break; - - case APIC_TMICT: - if (apic_lvtt_tscdeadline(apic)) - break; - - hrtimer_cancel(&apic->lapic_timer.timer); - apic_set_reg(apic, APIC_TMICT, val); - start_apic_timer(apic); - break; - - case APIC_TDCR: - if (val & 4) - apic_debug("KVM_WRITE:TDCR %x\n", val); - apic_set_reg(apic, APIC_TDCR, val); - update_divide_count(apic); - break; - - case APIC_ESR: - if (apic_x2apic_mode(apic) && val != 0) { - apic_debug("KVM_WRITE:ESR not zero %x\n", val); - ret = 1; - } - break; - - case APIC_SELF_IPI: - if (apic_x2apic_mode(apic)) { - apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); - } else - ret = 1; - break; - default: - ret = 1; - break; - } - if (ret) - apic_debug("Local APIC Write to read-only register %x\n", reg); - return ret; -} - -static int apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) -{ - struct kvm_lapic *apic = to_lapic(this); - unsigned int offset = address - apic->base_address; - u32 val; - - if (!apic_mmio_in_range(apic, address)) - return -EOPNOTSUPP; - - /* - * APIC register must be aligned on 128-bits boundary. - * 32/64/128 bits registers must be accessed thru 32 bits. - * Refer SDM 8.4.1 - */ - if (len != 4 || (offset & 0xf)) { - /* Don't shout loud, $infamous_os would cause only noise. */ - apic_debug("apic write: bad size=%d %lx\n", len, (long)address); - return 0; - } - - val = *(u32*)data; - - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); - - apic_reg_write(apic, offset & 0xff0, val); - - return 0; -} - -void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic) - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); -} -EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); - -void kvm_free_lapic(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.apic) - return; - - hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); - - if (vcpu->arch.apic->regs) - free_page((unsigned long)vcpu->arch.apic->regs); - - kfree(vcpu->arch.apic); -} - -/* - *---------------------------------------------------------------------- - * LAPIC interface - *---------------------------------------------------------------------- - */ - -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return 0; - - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - return; - - if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) - return; - - hrtimer_cancel(&apic->lapic_timer.timer); - apic->lapic_timer.tscdeadline = data; - start_apic_timer(apic); -} - -void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic) - return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) - | (apic_get_reg(apic, APIC_TASKPRI) & 4)); -} - -u64 
kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u64 tpr; - - if (!apic) - return 0; - tpr = (u64) apic_get_reg(apic, APIC_TASKPRI); - - return (tpr & 0xf0) >> 4; -} - -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic) { - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } - - if (!kvm_vcpu_is_bsp(apic->vcpu)) - value &= ~MSR_IA32_APICBASE_BSP; - - vcpu->arch.apic_base = value; - if (apic_x2apic_mode(apic)) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); - apic_set_reg(apic, APIC_LDR, ldr); - } - apic->base_address = apic->vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; - - /* with FSB delivery interrupt, we can restart APIC functionality */ - apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); - -} - -void kvm_lapic_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic; - int i; - - apic_debug("%s\n", __func__); - - ASSERT(vcpu); - apic = vcpu->arch.apic; - ASSERT(apic != NULL); - - /* Stop the timer in case it's a reset to an active apic */ - hrtimer_cancel(&apic->lapic_timer.timer); - - apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - kvm_apic_set_version(apic->vcpu); - - for (i = 0; i < APIC_LVT_NUM; i++) - apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); - - apic_set_reg(apic, APIC_DFR, 0xffffffffU); - apic_set_reg(apic, APIC_SPIV, 0xff); - apic_set_reg(apic, APIC_TASKPRI, 0); - apic_set_reg(apic, APIC_LDR, 0); - apic_set_reg(apic, APIC_ESR, 0); - apic_set_reg(apic, APIC_ICR, 0); - apic_set_reg(apic, APIC_ICR2, 0); - apic_set_reg(apic, APIC_TDCR, 0); - apic_set_reg(apic, APIC_TMICT, 0); - for (i = 0; i < 8; i++) { - apic_set_reg(apic, APIC_IRR + 0x10 * i, 0); - apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); - apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); - } - apic->irr_pending = false; - update_divide_count(apic); - atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - apic_update_ppr(apic); - - vcpu->arch.apic_arb_prio = 0; - - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" - "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), - vcpu->arch.apic_base, apic->base_address); -} - -bool kvm_apic_present(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic); -} - -int kvm_lapic_enabled(struct kvm_vcpu *vcpu) -{ - return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); -} - -/* - *---------------------------------------------------------------------- - * timer interface - *---------------------------------------------------------------------- - */ - -static bool lapic_is_periodic(struct kvm_timer *ktimer) -{ - struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, - lapic_timer); - return apic_lvtt_period(apic); -} - -int apic_has_pending_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *lapic = vcpu->arch.apic; - - if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT)) - return atomic_read(&lapic->lapic_timer.pending); - - return 0; -} - -int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) -{ - u32 reg = apic_get_reg(apic, lvt_type); - int vector, mode, trig_mode; - - if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { - vector = reg & APIC_VECTOR_MASK; - 
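kvm_lapic_set_tpr() and kvm_lapic_get_cr8() above translate between CR8 and the task priority register: CR8 carries the priority class, i.e. the upper nibble of the 8-bit TPR, and bit 2 of the old TPR is preserved on writes. A small round-trip check (user-space sketch):

#include <stdint.h>
#include <stdio.h>

static uint32_t cr8_to_tpr(uint64_t cr8, uint32_t old_tpr)
{
    /* Same expression as the deleted kvm_lapic_set_tpr(): CR8[3:0] becomes
     * TPR[7:4], and bit 2 of the previous TPR value is kept. */
    return (uint32_t)((cr8 & 0x0f) << 4) | (old_tpr & 4);
}

static uint64_t tpr_to_cr8(uint32_t tpr)
{
    return (tpr & 0xf0) >> 4;      /* kvm_lapic_get_cr8() */
}

int main(void)
{
    uint32_t tpr = cr8_to_tpr(0x9, 0);

    printf("cr8=0x9 -> tpr=0x%02x -> cr8=0x%llx\n",
           tpr, (unsigned long long)tpr_to_cr8(tpr));   /* tpr=0x90, cr8=0x9 */
    return 0;
}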
mode = reg & APIC_MODE_MASK; - trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode); - } - return 0; -} - -void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic) - kvm_apic_local_deliver(apic, APIC_LVT0); -} - -static struct kvm_timer_ops lapic_timer_ops = { - .is_periodic = lapic_is_periodic, -}; - -static const struct kvm_io_device_ops apic_mmio_ops = { - .read = apic_mmio_read, - .write = apic_mmio_write, -}; - -int kvm_create_lapic(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic; - - ASSERT(vcpu != NULL); - apic_debug("apic_init %d\n", vcpu->vcpu_id); - - apic = kzalloc(sizeof(*apic), GFP_KERNEL); - if (!apic) - goto nomem; - - vcpu->arch.apic = apic; - - apic->regs = (void *)get_zeroed_page(GFP_KERNEL); - if (!apic->regs) { - printk(KERN_ERR "malloc apic regs error for vcpu %x\n", - vcpu->vcpu_id); - goto nomem_free_apic; - } - apic->vcpu = vcpu; - - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - apic->lapic_timer.timer.function = kvm_timer_fn; - apic->lapic_timer.t_ops = &lapic_timer_ops; - apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu = vcpu; - - apic->base_address = APIC_DEFAULT_PHYS_BASE; - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; - - kvm_lapic_reset(vcpu); - kvm_iodevice_init(&apic->dev, &apic_mmio_ops); - - return 0; -nomem_free_apic: - kfree(apic); -nomem: - return -ENOMEM; -} - -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; - - if (!apic || !apic_enabled(apic)) - return -1; - - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= apic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; -} - -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) -{ - u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); - int r = 0; - - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - return r; -} - -void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (apic && atomic_read(&apic->lapic_timer.pending) > 0) { - if (kvm_apic_local_deliver(apic, APIC_LVTT)) - atomic_dec(&apic->lapic_timer.pending); - } -} - -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) -{ - int vector = kvm_apic_has_interrupt(vcpu); - struct kvm_lapic *apic = vcpu->arch.apic; - - if (vector == -1) - return -1; - - apic_set_vector(vector, apic->regs + APIC_ISR); - apic_update_ppr(apic); - apic_clear_irr(vector, apic); - return vector; -} - -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - apic->base_address = vcpu->arch.apic_base & - MSR_IA32_APICBASE_BASE; - kvm_apic_set_version(vcpu); - - apic_update_ppr(apic); - hrtimer_cancel(&apic->lapic_timer.timer); - update_divide_count(apic); - start_apic_timer(apic); - apic->irr_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - -void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct hrtimer *timer; - - if (!apic) - return; - - timer = &apic->lapic_timer.timer; - if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); -} - -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) -{ - u32 data; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - 
return; - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic); - - apic_set_tpr(vcpu->arch.apic, data & 0xff); -} - -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) -{ - u32 data, tpr; - int max_irr, max_isr; - struct kvm_lapic *apic; - void *vapic; - - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; - - apic = vcpu->arch.apic; - tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; - max_irr = apic_find_highest_irr(apic); - if (max_irr < 0) - max_irr = 0; - max_isr = apic_find_highest_isr(apic); - if (max_isr < 0) - max_isr = 0; - data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - - vapic = kmap_atomic(vcpu->arch.apic->vapic_page); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic); -} - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) -{ - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - vcpu->arch.apic->vapic_addr = vapic_addr; -} - -int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4; - - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) - return 1; - - /* if this is ICR write vector before command */ - if (msr == 0x830) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); -} - -int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) - return 1; - - if (apic_reg_read(apic, reg, 4, &low)) - return 1; - if (msr == 0x830) - apic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; -} - -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 1; - - /* if this is ICR write vector before command */ - if (reg == APIC_ICR) - apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); - return apic_reg_write(apic, reg, (u32)data); -} - -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - u32 low, high = 0; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 1; - - if (apic_reg_read(apic, reg, 4, &low)) - return 1; - if (reg == APIC_ICR) - apic_reg_read(apic, APIC_ICR2, 4, &high); - - *data = (((u64)high) << 32) | low; - - return 0; -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/lapic.h b/ANDROID_3.4.5/arch/x86/kvm/lapic.h deleted file mode 100644 index 6f4ce257..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/lapic.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __KVM_X86_LAPIC_H -#define __KVM_X86_LAPIC_H - -#include "iodev.h" -#include "kvm_timer.h" - -#include <linux/kvm_host.h> - -struct kvm_lapic { - unsigned long base_address; - struct kvm_io_device dev; - struct kvm_timer lapic_timer; - u32 divide_count; - struct kvm_vcpu *vcpu; - bool irr_pending; - void *regs; - gpa_t vapic_addr; - struct page *vapic_page; -}; -int kvm_create_lapic(struct kvm_vcpu *vcpu); -void kvm_free_lapic(struct kvm_vcpu *vcpu); - -int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); -int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); -u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); -void 
kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); -void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); -void kvm_apic_set_version(struct kvm_vcpu *vcpu); - -int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); -int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); -int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); -int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu); -int kvm_lapic_enabled(struct kvm_vcpu *vcpu); -bool kvm_apic_present(struct kvm_vcpu *vcpu); -int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); - -u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); -void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); - -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); -void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); - -int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); -int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); - -int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); -int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); - -static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; -} -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu.c b/ANDROID_3.4.5/arch/x86/kvm/mmu.c deleted file mode 100644 index 4cb16426..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/mmu.c +++ /dev/null @@ -1,4027 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "irq.h" -#include "mmu.h" -#include "x86.h" -#include "kvm_cache_regs.h" - -#include <linux/kvm_host.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/hugetlb.h> -#include <linux/compiler.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -#include <asm/page.h> -#include <asm/cmpxchg.h> -#include <asm/io.h> -#include <asm/vmx.h> - -/* - * When setting this variable to true it enables Two-Dimensional-Paging - * where the hardware walks 2 page tables: - * 1. the guest-virtual to guest-physical - * 2. while doing 1. it walks guest-physical to host-physical - * If the hardware supports that we don't need to do shadow paging. - */ -bool tdp_enabled = false; - -enum { - AUDIT_PRE_PAGE_FAULT, - AUDIT_POST_PAGE_FAULT, - AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE, - AUDIT_PRE_SYNC, - AUDIT_POST_SYNC -}; - -#undef MMU_DEBUG - -#ifdef MMU_DEBUG - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) - -#else - -#define pgprintk(x...) 
do { } while (0) -#define rmap_printk(x...) do { } while (0) - -#endif - -#ifdef MMU_DEBUG -static bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - -#ifndef MMU_DEBUG -#define ASSERT(x) do { } while (0) -#else -#define ASSERT(x) \ - if (!(x)) { \ - printk(KERN_WARNING "assertion failed %s:%d: %s\n", \ - __FILE__, __LINE__, #x); \ - } -#endif - -#define PTE_PREFETCH_NUM 8 - -#define PT_FIRST_AVAIL_BITS_SHIFT 9 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 - -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) - - -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ - | PT64_NX_MASK) - -#define PTE_LIST_EXT 4 - -#define ACC_EXEC_MASK 1 -#define ACC_WRITE_MASK PT_WRITABLE_MASK -#define ACC_USER_MASK PT_USER_MASK -#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) - -#include <trace/events/kvm.h> - -#define CREATE_TRACE_POINTS -#include "mmutrace.h" - -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) - -struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; - struct pte_list_desc *more; -}; - -struct kvm_shadow_walk_iterator { - u64 addr; - hpa_t shadow_addr; - u64 *sptep; - int level; - unsigned index; -}; - -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ - for (shadow_walk_init(&(_walker), _vcpu, _addr); \ - shadow_walk_okay(&(_walker)); \ - shadow_walk_next(&(_walker))) - -#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ - for (shadow_walk_init(&(_walker), _vcpu, _addr); \ - shadow_walk_okay(&(_walker)) && \ - ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ - __shadow_walk_next(&(_walker), spte)) - -static struct kmem_cache *pte_list_desc_cache; -static struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; - -static u64 __read_mostly shadow_nx_mask; -static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ -static u64 __read_mostly shadow_user_mask; -static u64 __read_mostly shadow_accessed_mask; -static u64 __read_mostly shadow_dirty_mask; -static u64 __read_mostly shadow_mmio_mask; - -static void mmu_spte_set(u64 *sptep, u64 spte); - -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) -{ - shadow_mmio_mask = mmio_mask; -} 
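The PT64_LEVEL_SHIFT()/PT64_INDEX() macros above split a guest-physical address into four 9-bit page-table indices on top of the 12-bit page offset. A stand-alone demonstration (macros copied from the deleted file, example address chosen arbitrarily):

#include <stdint.h>
#include <stdio.h>

/* Copied from the deleted mmu.c. */
#define PAGE_SHIFT      12
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) (PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)
#define PT64_INDEX(address, level) \
    (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))

int main(void)
{
    uint64_t gpa = 0x0000123456789abcULL;   /* arbitrary example address */

    for (int level = 4; level >= 1; level--)
        printf("level %d: shift %2d, index %3llu\n",
               level, PT64_LEVEL_SHIFT(level),
               (unsigned long long)PT64_INDEX(gpa, level));
    printf("page offset: 0x%llx\n", (unsigned long long)(gpa & 0xfffULL));
    return 0;
}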
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); - -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) -{ - access &= ACC_WRITE_MASK | ACC_USER_MASK; - - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); -} - -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_mask; -} - -static gfn_t get_mmio_spte_gfn(u64 spte) -{ - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; -} - -static unsigned get_mmio_spte_access(u64 spte) -{ - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; -} - -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) -{ - if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); - return true; - } - - return false; -} - -static inline u64 rsvd_bits(int s, int e) -{ - return ((1ULL << (e - s + 1)) - 1) << s; -} - -void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) -{ - shadow_user_mask = user_mask; - shadow_accessed_mask = accessed_mask; - shadow_dirty_mask = dirty_mask; - shadow_nx_mask = nx_mask; - shadow_x_mask = x_mask; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); - -static int is_cpuid_PSE36(void) -{ - return 1; -} - -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.efer & EFER_NX; -} - -static int is_shadow_present_pte(u64 pte) -{ - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); -} - -static int is_large_pte(u64 pte) -{ - return pte & PT_PAGE_SIZE_MASK; -} - -static int is_dirty_gpte(unsigned long pte) -{ - return pte & PT_DIRTY_MASK; -} - -static int is_rmap_spte(u64 pte) -{ - return is_shadow_present_pte(pte); -} - -static int is_last_spte(u64 pte, int level) -{ - if (level == PT_PAGE_TABLE_LEVEL) - return 1; - if (is_large_pte(pte)) - return 1; - return 0; -} - -static pfn_t spte_to_pfn(u64 pte) -{ - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - -#ifdef CONFIG_X86_64 -static void __set_spte(u64 *sptep, u64 spte) -{ - *sptep = spte; -} - -static void __update_clear_spte_fast(u64 *sptep, u64 spte) -{ - *sptep = spte; -} - -static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) -{ - return xchg(sptep, spte); -} - -static u64 __get_spte_lockless(u64 *sptep) -{ - return ACCESS_ONCE(*sptep); -} - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - /* It is valid if the spte is zapped. */ - return spte == 0ull; -} -#else -union split_spte { - struct { - u32 spte_low; - u32 spte_high; - }; - u64 spte; -}; - -static void count_spte_clear(u64 *sptep, u64 spte) -{ - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - if (is_shadow_present_pte(spte)) - return; - - /* Ensure the spte is completely set before we increase the count */ - smp_wmb(); - sp->clear_spte_count++; -} - -static void __set_spte(u64 *sptep, u64 spte) -{ - union split_spte *ssptep, sspte; - - ssptep = (union split_spte *)sptep; - sspte = (union split_spte)spte; - - ssptep->spte_high = sspte.spte_high; - - /* - * If we map the spte from nonpresent to present, We should store - * the high bits firstly, then set present bit, so cpu can not - * fetch this spte while we are setting the spte. 
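mark_mmio_spte()/get_mmio_spte_gfn()/get_mmio_spte_access() above pack the guest frame number and the access bits into a spte tagged with shadow_mmio_mask, so MMIO faults can be recognised without a memslot lookup. A user-space sketch of the packing and unpacking (the mask value below is made up purely for the example; the real one is supplied via kvm_mmu_set_mmio_spte_mask()):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* NOTE: example value only; not the mask the kernel actually installs. */
static const uint64_t shadow_mmio_mask = 0x3ULL << 51;

static uint64_t make_mmio_spte(uint64_t gfn, unsigned access)
{
    /* mark_mmio_spte(): tag the entry and stash gfn plus access bits in it. */
    return shadow_mmio_mask | access | (gfn << PAGE_SHIFT);
}

int main(void)
{
    uint64_t spte = make_mmio_spte(0xfee00ULL, 0x2 /* write-access bit */);

    /* The reverse operations mirror is_mmio_spte(), get_mmio_spte_gfn()
     * and get_mmio_spte_access(). */
    int      is_mmio = (spte & shadow_mmio_mask) == shadow_mmio_mask;
    uint64_t gfn     = (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
    unsigned acc     = (unsigned)((spte & ~shadow_mmio_mask) & 0xfffULL);

    printf("mmio=%d gfn=0x%llx access=0x%x\n",
           is_mmio, (unsigned long long)gfn, acc);
    return 0;
}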
- */ - smp_wmb(); - - ssptep->spte_low = sspte.spte_low; -} - -static void __update_clear_spte_fast(u64 *sptep, u64 spte) -{ - union split_spte *ssptep, sspte; - - ssptep = (union split_spte *)sptep; - sspte = (union split_spte)spte; - - ssptep->spte_low = sspte.spte_low; - - /* - * If we map the spte from present to nonpresent, we should clear - * present bit firstly to avoid vcpu fetch the old high bits. - */ - smp_wmb(); - - ssptep->spte_high = sspte.spte_high; - count_spte_clear(sptep, spte); -} - -static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) -{ - union split_spte *ssptep, sspte, orig; - - ssptep = (union split_spte *)sptep; - sspte = (union split_spte)spte; - - /* xchg acts as a barrier before the setting of the high bits */ - orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); - orig.spte_high = ssptep->spte_high; - ssptep->spte_high = sspte.spte_high; - count_spte_clear(sptep, spte); - - return orig.spte; -} - -/* - * The idea using the light way get the spte on x86_32 guest is from - * gup_get_pte(arch/x86/mm/gup.c). - * The difference is we can not catch the spte tlb flush if we leave - * guest mode, so we emulate it by increase clear_spte_count when spte - * is cleared. - */ -static u64 __get_spte_lockless(u64 *sptep) -{ - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - union split_spte spte, *orig = (union split_spte *)sptep; - int count; - -retry: - count = sp->clear_spte_count; - smp_rmb(); - - spte.spte_low = orig->spte_low; - smp_rmb(); - - spte.spte_high = orig->spte_high; - smp_rmb(); - - if (unlikely(spte.spte_low != orig->spte_low || - count != sp->clear_spte_count)) - goto retry; - - return spte.spte; -} - -static bool __check_direct_spte_mmio_pf(u64 spte) -{ - union split_spte sspte = (union split_spte)spte; - u32 high_mmio_mask = shadow_mmio_mask >> 32; - - /* It is valid if the spte is zapped. */ - if (spte == 0ull) - return true; - - /* It is valid if the spte is being zapped. */ - if (sspte.spte_low == 0ull && - (sspte.spte_high & high_mmio_mask) == high_mmio_mask) - return true; - - return false; -} -#endif - -static bool spte_has_volatile_bits(u64 spte) -{ - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; - - return true; -} - -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) -{ - return (old_spte & bit_mask) && !(new_spte & bit_mask); -} - -/* Rules for using mmu_spte_set: - * Set the sptep from nonpresent to present. - * Note: the sptep being assigned *must* be either not present - * or in a state where the hardware will not attempt to update - * the spte. - */ -static void mmu_spte_set(u64 *sptep, u64 new_spte) -{ - WARN_ON(is_shadow_present_pte(*sptep)); - __set_spte(sptep, new_spte); -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changged. 
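On 32-bit hosts the code above updates a 64-bit spte as two halves, writing the high word before the low word (which holds the present bit) when setting, and in the opposite order when clearing, so a lockless reader never sees a present entry with stale high bits. A single-threaded, little-endian user-space sketch of that layout (no real barriers; illustration only):

#include <stdint.h>
#include <stdio.h>

union split_spte {
    struct {
        uint32_t spte_low;    /* holds the present bit on x86 (little endian) */
        uint32_t spte_high;
    };
    uint64_t spte;
};

static void set_spte(volatile union split_spte *p, uint64_t val)
{
    union split_spte s = { .spte = val };

    p->spte_high = s.spte_high;   /* publish the high bits first... */
    /* ...an smp_wmb() sits here in the kernel... */
    p->spte_low = s.spte_low;     /* ...so the present bit appears last */
}

int main(void)
{
    union split_spte spte = { .spte = 0 };

    set_spte(&spte, 0x0000000fee001003ULL);
    printf("spte = 0x%016llx\n", (unsigned long long)spte.spte);
    return 0;
}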
- */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) -{ - u64 mask, old_spte = *sptep; - - WARN_ON(!is_rmap_spte(new_spte)); - - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); - - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; - - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) - __update_clear_spte_fast(sptep, new_spte); - else - old_spte = __update_clear_spte_slow(sptep, new_spte); - - if (!shadow_accessed_mask) - return; - - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); -} - -/* - * Rules for using mmu_spte_clear_track_bits: - * It sets the sptep from present to nonpresent, and track the - * state bits, it is used to clear the last level sptep. - */ -static int mmu_spte_clear_track_bits(u64 *sptep) -{ - pfn_t pfn; - u64 old_spte = *sptep; - - if (!spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); - else - old_spte = __update_clear_spte_slow(sptep, 0ull); - - if (!is_rmap_spte(old_spte)) - return 0; - - pfn = spte_to_pfn(old_spte); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) - kvm_set_pfn_dirty(pfn); - return 1; -} - -/* - * Rules for using mmu_spte_clear_no_track: - * Directly clear spte without caring the state bits of sptep, - * it is used to set the upper level spte. - */ -static void mmu_spte_clear_no_track(u64 *sptep) -{ - __update_clear_spte_fast(sptep, 0ull); -} - -static u64 mmu_spte_get_lockless(u64 *sptep) -{ - return __get_spte_lockless(sptep); -} - -static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) -{ - rcu_read_lock(); - atomic_inc(&vcpu->kvm->arch.reader_counter); - - /* Increase the counter before walking shadow page table */ - smp_mb__after_atomic_inc(); -} - -static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) -{ - /* Decrease the counter after walking shadow page table finished */ - smp_mb__before_atomic_dec(); - atomic_dec(&vcpu->kvm->arch.reader_counter); - rcu_read_unlock(); -} - -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); - if (!obj) - return -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) -{ - return cache->nobjs; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) -{ - while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) -{ - int r; - - r = 
mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); - if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); - if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; -} - -static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) -{ - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); -} - -static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) -{ - kmem_cache_free(pte_list_desc_cache, pte_list_desc); -} - -static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) -{ - if (!sp->role.direct) - return sp->gfns[index]; - - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); -} - -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) -{ - if (sp->role.direct) - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); - else - sp->gfns[index] = gfn; -} - -/* - * Return the pointer to the large page information for a given gfn, - * handling slots that are not large page aligned. - */ -static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) -{ - unsigned long idx; - - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.lpage_info[level - 2][idx]; -} - -static void account_shadowed(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; - int i; - - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; - } - kvm->arch.indirect_shadow_pages++; -} - -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; - int i; - - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); - } - kvm->arch.indirect_shadow_pages--; -} - -static int has_wrprotected_page(struct kvm *kvm, - gfn_t gfn, - int level) -{ - struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; - - slot = gfn_to_memslot(kvm, gfn); - if (slot) { - linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; - } - - return 1; -} - -static int host_mapping_level(struct kvm *kvm, gfn_t gfn) -{ - unsigned long page_size; - int i, ret = 0; - - page_size = kvm_host_page_size(kvm, gfn); - - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { - if (page_size >= KVM_HPAGE_SIZE(i)) - ret = i; - else - break; - } - - return ret; -} - -static struct kvm_memory_slot * -gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && 
slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - int host_level, level, max_level; - - host_level = host_mapping_level(vcpu->kvm, large_gfn); - - if (host_level == PT_PAGE_TABLE_LEVEL) - return host_level; - - max_level = kvm_x86_ops->get_lpage_level() < host_level ? - kvm_x86_ops->get_lpage_level() : host_level; - - for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) - break; - - return level - 1; -} - -/* - * Pte mapping structures: - * - * If pte_list bit zero is zero, then pte_list point to the spte. - * - * If pte_list bit zero is one, (then pte_list & ~1) points to a struct - * pte_list_desc containing more mappings. - * - * Returns the number of pte entries before the spte was added or zero if - * the spte was not added. - * - */ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, - unsigned long *pte_list) -{ - struct pte_list_desc *desc; - int i, count = 0; - - if (!*pte_list) { - rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); - *pte_list = (unsigned long)spte; - } else if (!(*pte_list & 1)) { - rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); - desc->sptes[0] = (u64 *)*pte_list; - desc->sptes[1] = spte; - *pte_list = (unsigned long)desc | 1; - ++count; - } else { - rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { - desc = desc->more; - count += PTE_LIST_EXT; - } - if (desc->sptes[PTE_LIST_EXT-1]) { - desc->more = mmu_alloc_pte_list_desc(vcpu); - desc = desc->more; - } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; - } - return count; -} - -static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) -{ - struct pte_list_desc *desc; - u64 *prev_spte; - int i; - - if (!*pte_list) - return NULL; - else if (!(*pte_list & 1)) { - if (!spte) - return (u64 *)*pte_list; - return NULL; - } - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - -static void -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, - int i, struct pte_list_desc *prev_desc) -{ - int j; - - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; - desc->sptes[i] = desc->sptes[j]; - desc->sptes[j] = NULL; - if (j != 0) - return; - if (!prev_desc && !desc->more) - *pte_list = (unsigned long)desc->sptes[0]; - else - if (prev_desc) - prev_desc->more = desc->more; - else - *pte_list = (unsigned long)desc->more | 1; - mmu_free_pte_list_desc(desc); -} - -static void pte_list_remove(u64 *spte, unsigned long *pte_list) -{ - struct pte_list_desc *desc; - struct pte_list_desc *prev_desc; - int i; - - if (!*pte_list) { - printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); - BUG(); - } else if (!(*pte_list & 1)) { - rmap_printk("pte_list_remove: %p 1->0\n", spte); - if ((u64 *)*pte_list != spte) { - printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); - BUG(); - } - *pte_list = 0; - } else { - rmap_printk("pte_list_remove: %p many->many\n", spte); - desc = (struct 
pte_list_desc *)(*pte_list & ~1ul); - prev_desc = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) - if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(pte_list, - desc, i, - prev_desc); - return; - } - prev_desc = desc; - desc = desc->more; - } - pr_err("pte_list_remove: %p many->many\n", spte); - BUG(); - } -} - -typedef void (*pte_list_walk_fn) (u64 *spte); -static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) -{ - struct pte_list_desc *desc; - int i; - - if (!*pte_list) - return; - - if (!(*pte_list & 1)) - return fn((u64 *)*pte_list); - - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) - fn(desc->sptes[i]); - desc = desc->more; - } -} - -static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) -{ - struct kvm_lpage_info *linfo; - - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; - - linfo = lpage_info_slot(gfn, slot, level); - return &linfo->rmap_pde; -} - -/* - * Take gfn and return the reverse mapping to it. - */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); -} - -static bool rmap_can_add(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_memory_cache *cache; - - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); -} - -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - unsigned long *rmapp; - - sp = page_header(__pa(spte)); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - return pte_list_add(vcpu, spte, rmapp); -} - -static u64 *rmap_next(unsigned long *rmapp, u64 *spte) -{ - return pte_list_next(rmapp, spte); -} - -static void rmap_remove(struct kvm *kvm, u64 *spte) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - unsigned long *rmapp; - - sp = page_header(__pa(spte)); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); - pte_list_remove(spte, rmapp); -} - -static void drop_spte(struct kvm *kvm, u64 *sptep) -{ - if (mmu_spte_clear_track_bits(sptep)) - rmap_remove(kvm, sptep); -} - -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) -{ - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; - - rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { - mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; - } - spte = rmap_next(rmapp, spte); - } - - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - --kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(rmapp, spte); - } - } - - return write_protected; -} - -static int rmap_write_protect(struct kvm *kvm, u64 gfn) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(kvm, gfn); - 
return kvm_mmu_rmap_write_protect(kvm, gfn, slot); -} - -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) -{ - u64 *spte; - int need_tlb_flush = 0; - - while ((spte = rmap_next(rmapp, NULL))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte); - need_tlb_flush = 1; - } - return need_tlb_flush; -} - -static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) -{ - int need_flush = 0; - u64 *spte, new_spte; - pte_t *ptep = (pte_t *)data; - pfn_t new_pfn; - - WARN_ON(pte_huge(*ptep)); - new_pfn = pte_pfn(*ptep); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!is_shadow_present_pte(*spte)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); - need_flush = 1; - if (pte_write(*ptep)) { - drop_spte(kvm, spte); - spte = rmap_next(rmapp, NULL); - } else { - new_spte = *spte &~ (PT64_BASE_ADDR_MASK); - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; - mmu_spte_clear_track_bits(spte); - mmu_spte_set(spte, new_spte); - spte = rmap_next(rmapp, spte); - } - } - if (need_flush) - kvm_flush_remote_tlbs(kvm); - - return 0; -} - -static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - unsigned long data, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long data)) -{ - int j; - int ret; - int retval = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long start = memslot->userspace_addr; - unsigned long end; - - end = start + (memslot->npages << PAGE_SHIFT); - if (hva >= start && hva < end) { - gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - gfn_t gfn = memslot->base_gfn + gfn_offset; - - ret = handler(kvm, &memslot->rmap[gfn_offset], data); - - for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - struct kvm_lpage_info *linfo; - - linfo = lpage_info_slot(gfn, memslot, - PT_DIRECTORY_LEVEL + j); - ret |= handler(kvm, &linfo->rmap_pde, data); - } - trace_kvm_age_page(hva, memslot, ret); - retval |= ret; - } - } - - return retval; -} - -int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); -} - -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) -{ - kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); -} - -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) -{ - u64 *spte; - int young = 0; - - /* - * Emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. 
- */ - if (!shadow_accessed_mask) - return kvm_unmap_rmapp(kvm, rmapp, data); - - spte = rmap_next(rmapp, NULL); - while (spte) { - int _young; - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; - if (_young) { - young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); - } - spte = rmap_next(rmapp, spte); - } - return young; -} - -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, - unsigned long data) -{ - u64 *spte; - int young = 0; - - /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. - */ - if (!shadow_accessed_mask) - goto out; - - spte = rmap_next(rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { - young = 1; - break; - } - spte = rmap_next(rmapp, spte); - } -out: - return young; -} - -#define RMAP_RECYCLE_THRESHOLD 1000 - -static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - unsigned long *rmapp; - struct kvm_mmu_page *sp; - - sp = page_header(__pa(spte)); - - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); - - kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); - kvm_flush_remote_tlbs(vcpu->kvm); -} - -int kvm_age_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); -} - -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) -{ - return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); -} - -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) -{ - u64 *pos; - u64 *end; - - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (is_shadow_present_pte(*pos)) { - printk(KERN_ERR "%s: %p %llx\n", __func__, - pos, *pos); - return 0; - } - return 1; -} -#endif - -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - -/* - * Remove the sp from shadow page cache, after call it, - * we can not find this sp from the cache, and the shadow - * page table is still valid. - * It should be under the protection of mmu lock. - */ -static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) -{ - ASSERT(is_empty_shadow_page(sp->spt)); - hlist_del(&sp->hash_link); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); -} - -/* - * Free the shadow page table and the sp, we can do it - * out of the protection of mmu lock. 
- */ -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) -{ - list_del(&sp->link); - free_page((unsigned long)sp->spt); - kmem_cache_free(mmu_page_header_cache, sp); -} - -static unsigned kvm_page_table_hashfn(gfn_t gfn) -{ - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); -} - -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *parent_pte) -{ - if (!parent_pte) - return; - - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); -} - -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, - u64 *parent_pte) -{ - pte_list_remove(parent_pte, &sp->parent_ptes); -} - -static void drop_parent_pte(struct kvm_mmu_page *sp, - u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); - mmu_spte_clear_no_track(parent_pte); -} - -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte, int direct) -{ - struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); - sp->parent_ptes = 0; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - -static void mark_unsync(u64 *spte); -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - pte_list_walk(&sp->parent_ptes, mark_unsync); -} - -static void mark_unsync(u64 *spte) -{ - struct kvm_mmu_page *sp; - unsigned int index; - - sp = page_header(__pa(spte)); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) - return; - if (sp->unsync_children++) - return; - kvm_mmu_mark_parents_unsync(sp); -} - -static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - return 1; -} - -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) -{ -} - -static void nonpaging_update_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *pte) -{ - WARN_ON(1); -} - -#define KVM_PAGE_ARRAY_NR 16 - -struct kvm_mmu_pages { - struct mmu_page_and_offset { - struct kvm_mmu_page *sp; - unsigned int idx; - } page[KVM_PAGE_ARRAY_NR]; - unsigned int nr; -}; - -static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, - int idx) -{ - int i; - - if (sp->unsync) - for (i=0; i < pvec->nr; i++) - if (pvec->page[i].sp == sp) - return 0; - - pvec->page[pvec->nr].sp = sp; - pvec->page[pvec->nr].idx = idx; - pvec->nr++; - return (pvec->nr == KVM_PAGE_ARRAY_NR); -} - -static int __mmu_unsync_walk(struct kvm_mmu_page *sp, - struct kvm_mmu_pages *pvec) -{ - int i, ret, nr_unsync_leaf = 0; - - for_each_set_bit(i, sp->unsync_child_bitmap, 512) { - struct kvm_mmu_page *child; - u64 ent = sp->spt[i]; - - if (!is_shadow_present_pte(ent) || is_large_pte(ent)) - goto clear_child_bitmap; - - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) - goto clear_child_bitmap; - else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } else if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } else - goto clear_child_bitmap; - - continue; - -clear_child_bitmap: - __clear_bit(i, 
sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); - } - - - return nr_unsync_leaf; -} - -static int mmu_unsync_walk(struct kvm_mmu_page *sp, - struct kvm_mmu_pages *pvec) -{ - if (!sp->unsync_children) - return 0; - - mmu_pages_add(pvec, sp, 0); - return __mmu_unsync_walk(sp, pvec); -} - -static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - WARN_ON(!sp->unsync); - trace_kvm_mmu_sync_page(sp); - sp->unsync = 0; - --kvm->stat.mmu_unsync; -} - -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); -static void kvm_mmu_commit_zap_page(struct kvm *kvm, - struct list_head *invalid_list); - -#define for_each_gfn_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn)) {} else - -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ - &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ - if ((sp)->gfn != (gfn) || (sp)->role.direct || \ - (sp)->role.invalid) {} else - -/* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) -{ - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; - } - - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; - } - - kvm_mmu_flush_tlb(vcpu); - return 0; -} - -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - - return ret; -} - -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } -static void mmu_audit_disable(void) { } -#endif - -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - return __kvm_sync_page(vcpu, sp, invalid_list, true); -} - -/* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *s; - struct hlist_node *node; - LIST_HEAD(invalid_list); - bool flush = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (!s->unsync) - continue; - - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; - } - - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_mmu_flush_tlb(vcpu); -} - -struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; - unsigned int idx[PT64_ROOT_LEVEL-1]; -}; - -#define for_each_sp(pvec, sp, parents, i) \ - for (i = mmu_pages_next(&pvec, &parents, -1), \ - sp = pvec.page[i].sp; \ - i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ - i = mmu_pages_next(&pvec, &parents, i)) - -static int mmu_pages_next(struct kvm_mmu_pages *pvec, - struct mmu_page_path *parents, - int i) -{ - int n; - - for (n = i+1; n < pvec->nr; n++) { - struct kvm_mmu_page *sp = pvec->page[n].sp; - - if (sp->role.level == 
PT_PAGE_TABLE_LEVEL) { - parents->idx[0] = pvec->page[n].idx; - return n; - } - - parents->parent[sp->role.level-2] = sp; - parents->idx[sp->role.level-1] = pvec->page[n].idx; - } - - return n; -} - -static void mmu_pages_clear_parents(struct mmu_page_path *parents) -{ - struct kvm_mmu_page *sp; - unsigned int level = 0; - - do { - unsigned int idx = parents->idx[level]; - - sp = parents->parent[level]; - if (!sp) - return; - - --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); - __clear_bit(idx, sp->unsync_child_bitmap); - level++; - } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); -} - -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, - struct mmu_page_path *parents, - struct kvm_mmu_pages *pvec) -{ - parents->parent[parent->role.level-1] = NULL; - pvec->nr = 0; -} - -static void mmu_sync_children(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *parent) -{ - int i; - struct kvm_mmu_page *sp; - struct mmu_page_path parents; - struct kvm_mmu_pages pages; - LIST_HEAD(invalid_list); - - kvm_mmu_pages_init(parent, &parents, &pages); - while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; - - for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); - - if (protected) - kvm_flush_remote_tlbs(vcpu->kvm); - - for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); - mmu_pages_clear_parents(&parents); - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - cond_resched_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_pages_init(parent, &parents, &pages); - } -} - -static void init_shadow_page_table(struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = 0ull; -} - -static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) -{ - sp->write_flooding_count = 0; -} - -static void clear_sp_write_flooding_count(u64 *spte) -{ - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); -} - -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned access, - u64 *parent_pte) -{ - union kvm_mmu_page_role role; - unsigned quadrant; - struct kvm_mmu_page *sp; - struct hlist_node *node; - bool need_sync = false; - - role = vcpu->arch.mmu.base_role; - role.level = level; - role.direct = direct; - if (role.direct) - role.cr4_pae = 0; - role.access = access; - if (!vcpu->arch.mmu.direct_map - && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { - if (!need_sync && sp->unsync) - need_sync = true; - - if (sp->role.word != role.word) - continue; - - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; - - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); - - __clear_sp_write_flooding_count(sp); - trace_kvm_mmu_get_page(sp, false); - return sp; - } - ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); - if (!sp) - return sp; - sp->gfn = gfn; - sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); - if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - if (level > 
PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); - - account_shadowed(vcpu->kvm, gfn); - } - init_shadow_page_table(sp); - trace_kvm_mmu_get_page(sp, true); - return sp; -} - -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) -{ - iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; - iterator->level = vcpu->arch.mmu.shadow_root_level; - - if (iterator->level == PT64_ROOT_LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && - !vcpu->arch.mmu.direct_map) - --iterator->level; - - if (iterator->level == PT32E_ROOT_LEVEL) { - iterator->shadow_addr - = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; - --iterator->level; - if (!iterator->shadow_addr) - iterator->level = 0; - } -} - -static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) -{ - if (iterator->level < PT_PAGE_TABLE_LEVEL) - return false; - - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); - iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; - return true; -} - -static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, - u64 spte) -{ - if (is_last_spte(spte, iterator->level)) { - iterator->level = 0; - return; - } - - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; - --iterator->level; -} - -static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) -{ - return __shadow_walk_next(iterator, *iterator->sptep); -} - -static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) -{ - u64 spte; - - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - mmu_spte_set(sptep, spte); -} - -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - --vcpu->kvm->stat.lpages; - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - -static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned direct_access) -{ - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - - /* - * For the direct sp, if the guest pte's dirty bit - * changed form clean to dirty, it will corrupt the - * sp's access: allow writable in the read-only sp, - * so we should update the spte at this point to get - * a new sp with the correct access. 
- */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); - if (child->role.access == direct_access) - return; - - drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - -static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *spte) -{ - u64 pte; - struct kvm_mmu_page *child; - - pte = *spte; - if (is_shadow_present_pte(pte)) { - if (is_last_spte(pte, sp->role.level)) { - drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; - } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - drop_parent_pte(child, spte); - } - return true; - } - - if (is_mmio_spte(pte)) - mmu_spte_clear_no_track(spte); - - return false; -} - -static void kvm_mmu_page_unlink_children(struct kvm *kvm, - struct kvm_mmu_page *sp) -{ - unsigned i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - mmu_page_zap_pte(kvm, sp, sp->spt + i); -} - -static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) -{ - mmu_page_remove_parent_pte(sp, parent_pte); -} - -static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - u64 *parent_pte; - - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) - drop_parent_pte(sp, parent_pte); -} - -static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent, - struct list_head *invalid_list) -{ - int i, zapped = 0; - struct mmu_page_path parents; - struct kvm_mmu_pages pages; - - if (parent->role.level == PT_PAGE_TABLE_LEVEL) - return 0; - - kvm_mmu_pages_init(parent, &parents, &pages); - while (mmu_unsync_walk(parent, &pages)) { - struct kvm_mmu_page *sp; - - for_each_sp(pages, sp, parents, i) { - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - mmu_pages_clear_parents(&parents); - zapped++; - } - kvm_mmu_pages_init(parent, &parents, &pages); - } - - return zapped; -} - -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - int ret; - - trace_kvm_mmu_prepare_zap_page(sp); - ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); - kvm_mmu_page_unlink_children(kvm, sp); - kvm_mmu_unlink_parents(kvm, sp); - if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); - if (sp->unsync) - kvm_unlink_unsync_page(kvm, sp); - if (!sp->root_count) { - /* Count self */ - ret++; - list_move(&sp->link, invalid_list); - kvm_mod_used_mmu_pages(kvm, -1); - } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); - } - - sp->role.invalid = 1; - return ret; -} - -static void kvm_mmu_isolate_pages(struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, invalid_list, link) - kvm_mmu_isolate_page(sp); -} - -static void free_pages_rcu(struct rcu_head *head) -{ - struct kvm_mmu_page *next, *sp; - - sp = container_of(head, struct kvm_mmu_page, rcu); - while (sp) { - if (!list_empty(&sp->link)) - next = list_first_entry(&sp->link, - struct kvm_mmu_page, link); - else - next = NULL; - kvm_mmu_free_page(sp); - sp = next; - } -} - -static void kvm_mmu_commit_zap_page(struct kvm *kvm, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - if (list_empty(invalid_list)) - return; - - kvm_flush_remote_tlbs(kvm); - - if (atomic_read(&kvm->arch.reader_counter)) { - kvm_mmu_isolate_pages(invalid_list); - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - list_del_init(invalid_list); - - trace_kvm_mmu_delay_free_pages(sp); - call_rcu(&sp->rcu, free_pages_rcu); - return; - } - - do { - sp = 
list_first_entry(invalid_list, struct kvm_mmu_page, link); - WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_isolate_page(sp); - kvm_mmu_free_page(sp); - } while (!list_empty(invalid_list)); - -} - -/* - * Changing the number of mmu pages allocated to the vm - * Note: if goal_nr_mmu_pages is too small, you will get dead lock - */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) -{ - LIST_HEAD(invalid_list); - /* - * If we set the number of mmu pages to be smaller be than the - * number of actived pages , we must to free some mmu pages before we - * change the value - */ - - if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && - !list_empty(&kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *page; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; - } - - kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; -} - -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); - int r; - - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); - r = 0; - spin_lock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); - - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); - -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, sp->slot_bitmap); -} - -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, - u64 start, u64 end) -{ - int i; - u64 base, mask; - u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; - - if (!mtrr_state->enabled) - return 0xFF; - - /* Make end inclusive end, instead of exclusive */ - end--; - - /* Look in fixed ranges. 
Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - - prev_match = 0xFF; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; - - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) - continue; - - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - - if ((start & mask) != (base & mask)) - continue; - - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; - continue; - } - - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; - } - - if (prev_match != 0xFF) - return prev_match; - - return mtrr_state->def_type; -} - -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} -EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); - -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ - trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; - sp->unsync = 1; - - kvm_mmu_mark_parents_unsync(sp); -} - -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mmu_page *s; - struct hlist_node *node; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } -} - -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) -{ - struct kvm_mmu_page *s; - struct hlist_node *node; - bool need_unsync = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { - if (!can_unsync) - return 1; - - if (s->role.level != PT_PAGE_TABLE_LEVEL) - return 1; - - if (!need_unsync && !s->unsync) { - need_unsync = true; - } - } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); - return 0; -} - -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int user_fault, - int write_fault, int level, - gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) -{ - u64 spte, entry = *sptep; - int ret = 0; - - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) - return 0; - - spte = PT_PRESENT_MASK; - if (!speculative) - spte |= 
shadow_accessed_mask; - - if (pte_access & ACC_EXEC_MASK) - spte |= shadow_x_mask; - else - spte |= shadow_nx_mask; - if (pte_access & ACC_USER_MASK) - spte |= shadow_user_mask; - if (level > PT_PAGE_TABLE_LEVEL) - spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); - - if (host_writable) - spte |= SPTE_HOST_WRITEABLE; - else - pte_access &= ~ACC_WRITE_MASK; - - spte |= (u64)pfn << PAGE_SHIFT; - - if ((pte_access & ACC_WRITE_MASK) - || (!vcpu->arch.mmu.direct_map && write_fault - && !is_write_protection(vcpu) && !user_fault)) { - - if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); - goto done; - } - - spte |= PT_WRITABLE_MASK; - - if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) { - spte &= ~PT_USER_MASK; - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. - */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - spte |= PT64_NX_MASK; - } - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. - * Same reasoning can be applied to dirty page accounting. - */ - if (!can_unsync && is_writable_pte(*sptep)) - goto set_pte; - - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); - ret = 1; - pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; - } - } - - if (pte_access & ACC_WRITE_MASK) - mark_page_dirty(vcpu->kvm, gfn); - -set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. - */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); -done: - return ret; -} - -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, - int *emulate, int level, gfn_t gfn, - pfn_t pfn, bool speculative, - bool host_writable) -{ - int was_rmapped = 0; - int rmap_count; - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %llx\n", - __func__, *sptep, pt_access, - write_fault, user_fault, gfn); - - if (is_rmap_spte(*sptep)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. 
- */ - if (level > PT_PAGE_TABLE_LEVEL && - !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - u64 pte = *sptep; - - child = page_header(pte & PT64_BASE_ADDR_MASK); - drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); - } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); - drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); - } else - was_rmapped = 1; - } - - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - level, gfn, pfn, speculative, true, - host_writable)) { - if (write_fault) - *emulate = 1; - kvm_mmu_flush_tlb(vcpu); - } - - if (unlikely(is_mmio_spte(*sptep) && emulate)) - *emulate = 1; - - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", - is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, - *sptep, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - - if (is_shadow_present_pte(*sptep)) { - page_header_update_slot(vcpu->kvm, sptep, gfn); - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } - } - kvm_release_pfn_clean(pfn); -} - -static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) -{ -} - -static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - unsigned long hva; - - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(fault_page); - return page_to_pfn(fault_page); - } - - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(vcpu->kvm, hva); -} - -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) -{ - struct page *pages[PTE_PREFETCH_NUM]; - unsigned access = sp->role.access; - int i, ret; - gfn_t gfn; - - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) - return -1; - - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); - if (ret <= 0) - return -1; - - for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, NULL, - sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); - - return 0; -} - -static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *sptep) -{ - u64 *spte, *start = NULL; - int i; - - WARN_ON(!sp->role.direct); - - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - spte = sp->spt + i; - - for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - if (is_shadow_present_pte(*spte) || spte == sptep) { - if (!start) - continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; - start = NULL; - } else if (!start) - start = spte; - } -} - -static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) -{ - struct kvm_mmu_page *sp; - - /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. 
- */ - if (!shadow_accessed_mask) - return; - - sp = page_header(__pa(sptep)); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return; - - __direct_pte_prefetch(vcpu, sp, sptep); -} - -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int map_writable, int level, gfn_t gfn, pfn_t pfn, - bool prefault) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - int emulate = 0; - gfn_t pseudo_gfn; - - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == level) { - unsigned pte_access = ACC_ALL; - - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, &emulate, - level, gfn, pfn, prefault, map_writable); - direct_pte_prefetch(vcpu, iterator.sptep); - ++vcpu->stat.pf_fixed; - break; - } - - if (!is_shadow_present_pte(*iterator.sptep)) { - u64 base_addr = iterator.addr; - - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); - pseudo_gfn = base_addr >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, - iterator.level - 1, - 1, ACC_ALL, iterator.sptep); - if (!sp) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - - mmu_spte_set(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); - } - } - return emulate; -} - -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_MCEERR_AR; - info.si_addr = (void __user *)address; - info.si_addr_lsb = PAGE_SHIFT; - - send_sig_info(SIGBUS, &info, tsk); -} - -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) -{ - kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); - return 0; - } - - return -EFAULT; -} - -static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, - gfn_t *gfnp, pfn_t *pfnp, int *levelp) -{ - pfn_t pfn = *pfnp; - gfn_t gfn = *gfnp; - int level = *levelp; - - /* - * Check if it's a transparent hugepage. If this would be an - * hugetlbfs page, level wouldn't be set to - * PT_PAGE_TABLE_LEVEL and there would be no adjustment done - * here. - */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && - PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { - unsigned long mask; - /* - * mmu_notifier_retry was successful and we hold the - * mmu_lock here, so the pmd can't become splitting - * from under us, and in turn - * __split_huge_page_refcount() can't run from under - * us and we can safely transfer the refcount from - * PG_tail to PG_head as we switch the pfn to tail to - * head. - */ - *levelp = level = PT_DIRECTORY_LEVEL; - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - if (pfn & mask) { - gfn &= ~mask; - *gfnp = gfn; - kvm_release_pfn_clean(pfn); - pfn &= ~mask; - if (!get_page_unless_zero(pfn_to_page(pfn))) - BUG(); - *pfnp = pfn; - } - } -} - -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - pfn_t pfn, unsigned access, int *ret_val) -{ - bool ret = true; - - /* The pfn is invalid, report the error! 
*/ - if (unlikely(is_invalid_pfn(pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); - goto exit; - } - - if (unlikely(is_noslot_pfn(pfn))) - vcpu_cache_mmio_info(vcpu, gva, gfn, access); - - ret = false; -exit: - return ret; -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable); - -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) -{ - int r; - int level; - int force_pt_level; - pfn_t pfn; - unsigned long mmu_seq; - bool map_writable; - - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); - if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); - /* - * This path builds a PAE pagetable - so we can map - * 2mb pages at maximum. Therefore check if the level - * is larger than that. - */ - if (level > PT_DIRECTORY_LEVEL) - level = PT_DIRECTORY_LEVEL; - - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) - return 0; - - if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) - return r; - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) - goto out_unlock; - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, - prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - - return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; -} - - -static void mmu_free_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *sp; - LIST_HEAD(invalid_list); - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && - (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || - vcpu->arch.mmu.direct_map)) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp = page_header(root); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - spin_unlock(&vcpu->kvm->mmu_lock); - return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); - } - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; -} - -static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) -{ - int ret = 0; - - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - ret = 1; - } - - return ret; -} - -static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - unsigned i; - - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, - 1, ACC_ALL, NULL); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { - for (i = 0; i < 4; ++i) { - hpa_t root = 
vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), - i << 30, - PT32_ROOT_LEVEL, 1, ACC_ALL, - NULL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); - } else - BUG(); - - return 0; -} - -static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - u64 pdptr, pm_mask; - gfn_t root_gfn; - int i; - - root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; - - if (mmu_check_root(vcpu, root_gfn)) - return 1; - - /* - * Do we shadow a long mode page table? If so we need to - * write-protect the guests page table root. - */ - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - ASSERT(!VALID_PAGE(root)); - - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = root; - return 0; - } - - /* - * We shadow a 32 bit page table. This may be a legacy 2-level - * or a PAE 3-level page table. In either case we need to be aware that - * the shadow page table may be a PAE or a long mode page table. - */ - pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) - pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - ASSERT(!VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { - vcpu->arch.mmu.pae_root[i] = 0; - continue; - } - root_gfn = pdptr >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; - } - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, 0, - ACC_ALL, NULL); - root = __pa(sp->spt); - ++sp->root_count; - spin_unlock(&vcpu->kvm->mmu_lock); - - vcpu->arch.mmu.pae_root[i] = root | pm_mask; - } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); - - /* - * If we shadow a 32 bit page table with a long mode page - * table we enter this path. - */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - if (vcpu->arch.mmu.lm_root == NULL) { - /* - * The additional page necessary for this is only - * allocated on demand. 
- */ - - u64 *lm_root; - - lm_root = (void*)get_zeroed_page(GFP_KERNEL); - if (lm_root == NULL) - return 1; - - lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; - - vcpu->arch.mmu.lm_root = lm_root; - } - - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); - } - - return 0; -} - -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.mmu.direct_map) - return mmu_alloc_direct_roots(vcpu); - else - return mmu_alloc_shadow_roots(vcpu); -} - -static void mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_mmu_page *sp; - - if (vcpu->arch.mmu.direct_map) - return; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - - vcpu_clear_mmio_info(vcpu, ~0ul); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - sp = page_header(root); - mmu_sync_children(vcpu, sp); - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); - return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - mmu_sync_children(vcpu, sp); - } - } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); -} - -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); -} - -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, struct x86_exception *exception) -{ - if (exception) - exception->error_code = 0; - return vaddr; -} - -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, - struct x86_exception *exception) -{ - if (exception) - exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); -} - -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) -{ - if (direct) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - - -/* - * On direct hosts, the last spte is only allows two states - * for mmio page fault: - * - It is the mmio spte - * - It is zapped or it is being zapped. - * - * This function completely checks the spte when the last spte - * is not the mmio spte. - */ -static bool check_direct_spte_mmio_pf(u64 spte) -{ - return __check_direct_spte_mmio_pf(spte); -} - -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; - walk_shadow_page_lockless_end(vcpu); - - return spte; -} - -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. - */ -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) -{ - u64 spte; - - if (quickly_check_mmio_pf(vcpu, addr, direct)) - return 1; - - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); - - if (is_mmio_spte(spte)) { - gfn_t gfn = get_mmio_spte_gfn(spte); - unsigned access = get_mmio_spte_access(spte); - - if (direct) - addr = 0; - - trace_handle_mmio_page_fault(addr, gfn, access); - vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return 1; - } - - /* - * It's ok if the gva is remapped by other cpus on shadow guest, - * it's a BUG if the gfn is not a mmio page. 
- */ - if (direct && !check_direct_spte_mmio_pf(spte)) - return -1; - - /* - * If the page table is zapped by other cpus, let CPU fault again on - * the address. - */ - return 0; -} -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret < 0); - return ret; -} - -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code, bool prefault) -{ - gfn_t gfn; - int r; - - pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gva, error_code, true); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - gfn = gva >> PAGE_SHIFT; - - return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); -} - -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) -{ - struct kvm_arch_async_pf arch; - - arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; - arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - - return kvm_setup_async_pf(vcpu, gva, gfn, &arch); -} - -static bool can_do_async_pf(struct kvm_vcpu *vcpu) -{ - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || - kvm_event_needs_reinjection(vcpu))) - return false; - - return kvm_x86_ops->interrupt_allowed(vcpu); -} - -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gva_t gva, pfn_t *pfn, bool write, bool *writable) -{ - bool async; - - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - - if (!async) - return false; /* *pfn has correct page already */ - - put_page(pfn_to_page(*pfn)); - - if (!prefault && can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(gva, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(gva, gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; - } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) - return true; - } - - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - - return false; -} - -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, - bool prefault) -{ - pfn_t pfn; - int r; - int level; - int force_pt_level; - gfn_t gfn = gpa >> PAGE_SHIFT; - unsigned long mmu_seq; - int write = error_code & PFERR_WRITE_MASK; - bool map_writable; - - ASSERT(vcpu); - ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gpa, error_code, true); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); - if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); - gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) - return 0; - - if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) - return r; - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) - goto out_unlock; - kvm_mmu_free_some_pages(vcpu); - if (likely(!force_pt_level)) - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, - level, gfn, pfn, prefault); - spin_unlock(&vcpu->kvm->mmu_lock); - - 
return r; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; -} - -static void nonpaging_free(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} - -static int nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = nonpaging_page_fault; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->free = nonpaging_free; - context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; - context->update_pte = nonpaging_update_pte; - context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->direct_map = true; - context->nx = false; - return 0; -} - -void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} - -static void paging_new_cr3(struct kvm_vcpu *vcpu) -{ - pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); - mmu_free_roots(vcpu); -} - -static unsigned long get_cr3(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr3(vcpu); -} - -static void inject_page_fault(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - vcpu->arch.mmu.inject_page_fault(vcpu, fault); -} - -static void paging_free(struct kvm_vcpu *vcpu) -{ - nonpaging_free(vcpu); -} - -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, - int *nr_present) -{ - if (unlikely(is_mmio_spte(*sptep))) { - if (gfn != get_mmio_spte_gfn(*sptep)) { - mmu_spte_clear_no_track(sptep); - return true; - } - - (*nr_present)++; - mark_mmio_spte(sptep, gfn, access); - return true; - } - - return false; -} - -#define PTTYPE 64 -#include "paging_tmpl.h" -#undef PTTYPE - -#define PTTYPE 32 -#include "paging_tmpl.h" -#undef PTTYPE - -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - int maxphyaddr = cpuid_maxphyaddr(vcpu); - u64 exb_bit_rsvd = 0; - - if (!context->nx) - exb_bit_rsvd = rsvd_bits(63, 63); - switch (context->root_level) { - case PT32_ROOT_LEVEL: - /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; - break; - } - - if (is_cpuid_PSE36()) - /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); - else - /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - break; - case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = - rsvd_bits(maxphyaddr, 63) | - rsvd_bits(7, 8) | rsvd_bits(1, 2); /* PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 62) | - rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - break; - case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd 
| - rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | - rsvd_bits(maxphyaddr, 51) | - rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - break; - } -} - -static int paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) -{ - context->nx = is_nx(vcpu); - context->root_level = level; - - reset_rsvds_bits_mask(vcpu, context); - - ASSERT(is_pae(vcpu)); - context->new_cr3 = paging_new_cr3; - context->page_fault = paging64_page_fault; - context->gva_to_gpa = paging64_gva_to_gpa; - context->sync_page = paging64_sync_page; - context->invlpg = paging64_invlpg; - context->update_pte = paging64_update_pte; - context->free = paging_free; - context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - context->direct_map = false; - return 0; -} - -static int paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); -} - -static int paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - - reset_rsvds_bits_mask(vcpu, context); - - context->new_cr3 = paging_new_cr3; - context->page_fault = paging32_page_fault; - context->gva_to_gpa = paging32_gva_to_gpa; - context->free = paging_free; - context->sync_page = paging32_sync_page; - context->invlpg = paging32_invlpg; - context->update_pte = paging32_update_pte; - context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->direct_map = false; - return 0; -} - -static int paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) -{ - return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); -} - -static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *context = vcpu->arch.walk_mmu; - - context->base_role.word = 0; - context->new_cr3 = nonpaging_new_cr3; - context->page_fault = tdp_page_fault; - context->free = nonpaging_free; - context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; - context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops->get_tdp_level(); - context->root_hpa = INVALID_PAGE; - context->direct_map = true; - context->set_cr3 = kvm_x86_ops->set_tdp_cr3; - context->get_cr3 = get_cr3; - context->get_pdptr = kvm_pdptr_read; - context->inject_page_fault = kvm_inject_page_fault; - - if (!is_paging(vcpu)) { - context->nx = false; - context->gva_to_gpa = nonpaging_gva_to_gpa; - context->root_level = 0; - } else if (is_long_mode(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = PT64_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); - context->gva_to_gpa = paging64_gva_to_gpa; - } else if (is_pae(vcpu)) { - context->nx = is_nx(vcpu); - context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); - context->gva_to_gpa = paging64_gva_to_gpa; - } else { - context->nx = false; - context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); - context->gva_to_gpa = paging32_gva_to_gpa; - } - - return 0; -} - -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) -{ - int r; - bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - if 
(!is_paging(vcpu)) - r = nonpaging_init_context(vcpu, context); - else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu, context); - else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu, context); - else - r = paging32_init_context(vcpu, context); - - vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); - vcpu->arch.mmu.base_role.smep_andnot_wp - = smep && !is_write_protection(vcpu); - - return r; -} -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); - -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) -{ - int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - - vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.walk_mmu->get_cr3 = get_cr3; - vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; - vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; - - return r; -} - -static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; - - g_context->get_cr3 = get_cr3; - g_context->get_pdptr = kvm_pdptr_read; - g_context->inject_page_fault = kvm_inject_page_fault; - - /* - * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The - * translation of l2_gpa to l1_gpa addresses is done using the - * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa - * functions between mmu and nested_mmu are swapped. - */ - if (!is_paging(vcpu)) { - g_context->nx = false; - g_context->root_level = 0; - g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; - } else if (is_long_mode(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = PT64_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else if (is_pae(vcpu)) { - g_context->nx = is_nx(vcpu); - g_context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else { - g_context->nx = false; - g_context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, g_context); - g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - } - - return 0; -} - -static int init_kvm_mmu(struct kvm_vcpu *vcpu) -{ - if (mmu_is_nested(vcpu)) - return init_kvm_nested_mmu(vcpu); - else if (tdp_enabled) - return init_kvm_tdp_mmu(vcpu); - else - return init_kvm_softmmu(vcpu); -} - -static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - /* mmu.free() should set root_hpa = INVALID_PAGE */ - vcpu->arch.mmu.free(vcpu); -} - -int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -{ - destroy_kvm_mmu(vcpu); - return init_kvm_mmu(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); - -int kvm_mmu_load(struct kvm_vcpu *vcpu) -{ - int r; - - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - r = mmu_alloc_roots(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); - if (r) - goto out; - /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_load); - -void kvm_mmu_unload(struct kvm_vcpu *vcpu) -{ - mmu_free_roots(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_mmu_unload); - -static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - const void *new) -{ - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } - - ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); -} - -static bool 
need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) - return true; - old ^= PT64_NX_MASK; - new ^= PT64_NX_MASK; - return (old & ~new & PT64_PERM_MASK) != 0; -} - -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) -{ - if (zap_page) - return; - - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) - kvm_mmu_flush_tlb(vcpu); -} - -static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, - const u8 *new, int *bytes) -{ - u64 gentry; - int r; - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode since we update the sptes only - * when they have the same mode. - */ - if (is_pae(vcpu) && *bytes == 4) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - *gpa &= ~(gpa_t)7; - *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); - if (r) - gentry = 0; - new = (const u8 *)&gentry; - } - - switch (*bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; - } - - return gentry; -} - -/* - * If we're seeing too many writes to a page, it may no longer be a page table, - * or we may be forking, in which case it is better to unmap the page. - */ -static bool detect_write_flooding(struct kvm_mmu_page *sp) -{ - /* - * Skip write-flooding detected for the sp whose level is 1, because - * it can become unsync, then the guest page is not write-protected. - */ - if (sp->role.level == 1) - return false; - - return ++sp->write_flooding_count >= 3; -} - -/* - * Misaligned accesses are too much trouble to fix up; also, they usually - * indicate a page is not used as a page table. - */ -static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, - int bytes) -{ - unsigned offset, pte_size, misaligned; - - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - - offset = offset_in_page(gpa); - pte_size = sp->role.cr4_pae ? 8 : 4; - - /* - * Sometimes, the OS only writes the last one bytes to update status - * bits, for example, in linux, andb instruction is used in clear_bit(). - */ - if (!(offset & (pte_size - 1)) && bytes == 1) - return false; - - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; - - return misaligned; -} - -static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) -{ - unsigned page_offset, quadrant; - u64 *spte; - int level; - - page_offset = offset_in_page(gpa); - level = sp->role.level; - *nspte = 1; - if (!sp->role.cr4_pae) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. 
- */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - *nspte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - return NULL; - } - - spte = &sp->spt[page_offset / sizeof(*spte)]; - return spte; -} - -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) -{ - gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; - struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); - u64 entry, gentry, *spte; - int npte; - bool remote_flush, local_flush, zap_page; - - /* - * If we don't have indirect shadow pages, it means no page is - * write-protected, so we can exit simply. - */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) - return; - - zap_page = remote_flush = local_flush = false; - - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - - gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); - - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skiped if it does not have - * enough objects in the cache. - */ - mmu_topup_memory_caches(vcpu); - - spin_lock(&vcpu->kvm->mmu_lock); - ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - if (detect_write_misaligned(sp, gpa, bytes) || - detect_write_flooding(sp)) { - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); - ++vcpu->kvm->stat.mmu_flooded; - continue; - } - - spte = get_written_sptes(sp, gpa, &npte); - if (!spte) - continue; - - local_flush = true; - while (npte--) { - entry = *spte; - mmu_page_zap_pte(vcpu->kvm, sp, spte); - if (gentry && - !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word) && rmap_can_add(vcpu)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) - remote_flush = true; - ++spte; - } - } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); - spin_unlock(&vcpu->kvm->mmu_lock); -} - -int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu.direct_map) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); - -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - LIST_HEAD(invalid_list); - - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && - !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { - struct kvm_mmu_page *sp; - - sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); -} - -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) -{ - if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, - void *insn, int insn_len) -{ - int r, emulation_type = EMULTYPE_RETRY; - enum emulation_result er; - - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); - if (r < 0) - 
goto out; - - if (!r) { - r = 1; - goto out; - } - - if (is_mmio_page_fault(vcpu, cr2)) - emulation_type = 0; - - er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_DO_MMIO: - ++vcpu->stat.mmio_exits; - /* fall through */ - case EMULATE_FAIL: - return 0; - default: - BUG(); - } -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); - -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) -{ - vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_mmu_flush_tlb(vcpu); - ++vcpu->stat.invlpg; -} -EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); - -void kvm_enable_tdp(void) -{ - tdp_enabled = true; -} -EXPORT_SYMBOL_GPL(kvm_enable_tdp); - -void kvm_disable_tdp(void) -{ - tdp_enabled = false; -} -EXPORT_SYMBOL_GPL(kvm_disable_tdp); - -static void free_mmu_pages(struct kvm_vcpu *vcpu) -{ - free_page((unsigned long)vcpu->arch.mmu.pae_root); - if (vcpu->arch.mmu.lm_root != NULL) - free_page((unsigned long)vcpu->arch.mmu.lm_root); -} - -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -{ - struct page *page; - int i; - - ASSERT(vcpu); - - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. - */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - return -ENOMEM; - - vcpu->arch.mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; - - return 0; -} - -int kvm_mmu_create(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - vcpu->arch.walk_mmu = &vcpu->arch.mmu; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - - return alloc_mmu_pages(vcpu); -} - -int kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - - return init_kvm_mmu(vcpu); -} - -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; - - if (!test_bit(slot, sp->slot_bitmap)) - continue; - - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(pt[i]) || - !is_last_spte(pt[i], sp->role.level)) - continue; - - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); - } - } - kvm_flush_remote_tlbs(kvm); -} - -void kvm_mmu_zap_all(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - -static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *page; - - page = container_of(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, invalid_list); -} - -static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) -{ - struct kvm *kvm; - struct kvm *kvm_freed = NULL; - int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; - - raw_spin_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - 
LIST_HEAD(invalid_list); - - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); - - raw_spin_unlock(&kvm_lock); - -out: - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker mmu_shrinker = { - .shrink = mmu_shrink, - .seeks = DEFAULT_SEEKS * 10, -}; - -static void mmu_destroy_caches(void) -{ - if (pte_list_desc_cache) - kmem_cache_destroy(pte_list_desc_cache); - if (mmu_page_header_cache) - kmem_cache_destroy(mmu_page_header_cache); -} - -int kvm_mmu_module_init(void) -{ - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, 0, NULL); - if (!pte_list_desc_cache) - goto nomem; - - mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", - sizeof(struct kvm_mmu_page), - 0, 0, NULL); - if (!mmu_page_header_cache) - goto nomem; - - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) - goto nomem; - - register_shrinker(&mmu_shrinker); - - return 0; - -nomem: - mmu_destroy_caches(); - return -ENOMEM; -} - -/* - * Caculate mmu pages needed for kvm. - */ -unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) -{ - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - -void kvm_mmu_module_exit(void) -{ - mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); - mmu_audit_disable(); -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu.h b/ANDROID_3.4.5/arch/x86/kvm/mmu.h deleted file mode 100644 index e374db9a..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/mmu.h +++ /dev/null @@ -1,104 +0,0 @@ -#ifndef __KVM_X86_MMU_H -#define __KVM_X86_MMU_H - -#include <linux/kvm_host.h> -#include "kvm_cache_regs.h" - -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - -#define PT_WRITABLE_SHIFT 1 - -#define PT_PRESENT_MASK (1ULL << 0) -#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) -#define PT_PWT_MASK (1ULL << 3) -#define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_SHIFT 5 -#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) -#define PT_DIRTY_MASK (1ULL << 6) -#define PT_PAGE_SIZE_MASK (1ULL << 7) -#define PT_PAT_MASK (1ULL << 
7) -#define PT_GLOBAL_MASK (1ULL << 8) -#define PT64_NX_SHIFT 63 -#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) - -#define PT_PAT_SHIFT 7 -#define PT_DIR_PAT_SHIFT 12 -#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) - -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - -#define PT64_ROOT_LEVEL 4 -#define PT32_ROOT_LEVEL 2 -#define PT32E_ROOT_LEVEL 3 - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); -int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); - -static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; -} - -static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -{ - if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) - __kvm_mmu_free_some_pages(vcpu); -} - -static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) -{ - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) - return 0; - - return kvm_mmu_load(vcpu); -} - -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - -static inline int is_writable_pte(unsigned long pte) -{ - return pte & PT_WRITABLE_MASK; -} - -static inline bool is_write_protection(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_WP); -} - -static inline bool check_write_user_access(struct kvm_vcpu *vcpu, - bool write_fault, bool user_fault, - unsigned long pte) -{ - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - return false; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) - return false; - - return true; -} -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c b/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c deleted file mode 100644 index 715da5a1..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/mmu_audit.c +++ /dev/null @@ -1,303 +0,0 @@ -/* - * mmu_audit.c: - * - * Audit code for KVM MMU - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * Marcelo Tosatti <mtosatti@redhat.com> - * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/ratelimit.h> - -char const *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - -#define audit_printk(kvm, fmt, args...) 
\ - printk(KERN_ERR "audit: (%s) error: " \ - fmt, audit_point_name[kvm->arch.audit_point], ##args) - -typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); - -static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - inspect_spte_fn fn, int level) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 *ent = sp->spt; - - fn(vcpu, ent + i, level); - - if (is_shadow_present_pte(ent[i]) && - !is_last_spte(ent[i], level)) { - struct kvm_mmu_page *child; - - child = page_header(ent[i] & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(vcpu, child, fn, level - 1); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - - sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); - return; - } - - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, 2); - } - } - - return; -} - -typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); - -static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) - fn(kvm, sp); -} - -static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - audit_printk(vcpu->kvm, "unsync sp: %p " - "level = %d\n", sp, level); - return; - } - } - - if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - hpa = pfn << PAGE_SHIFT; - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu.root_level, pfn, - hpa, *sptep); -} - -static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no memslot for gfn %llx\n", gfn); - audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", - (long int)(sptep - rev_sp->spt), rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!__ratelimit(&ratelimit_state)) - return; - audit_printk(kvm, "no rmap for writable spte %llx\n", - *sptep); - dump_stack(); - } -} - -static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) - inspect_spte_has_rmap(vcpu->kvm, sptep); -} - -static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) - audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " - "root.\n", sp); -} - -static void check_mappings_rmap(struct kvm *kvm, 
struct kvm_mmu_page *sp) -{ - int i; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - return; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(sp->spt[i])) - continue; - - inspect_spte_has_rmap(kvm, sp->spt + i); - } -} - -static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - - if (sp->role.direct || sp->unsync || sp->role.invalid) - return; - - slot = gfn_to_memslot(kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - - spte = rmap_next(rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - audit_printk(kvm, "shadow page has writable " - "mappings: gfn %llx role %x\n", - sp->gfn, sp->role.word); - spte = rmap_next(rmapp, spte); - } -} - -static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - check_mappings_rmap(kvm, sp); - audit_write_protection(kvm, sp); -} - -static void audit_all_active_sps(struct kvm *kvm) -{ - walk_all_active_sps(kvm, audit_sp); -} - -static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) -{ - audit_sptes_have_rmaps(vcpu, sptep, level); - audit_mappings(vcpu, sptep, level); - audit_spte_after_sync(vcpu, sptep, level); -} - -static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, audit_spte); -} - -static bool mmu_audit; -static struct static_key mmu_audit_key; - -static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - - if (!__ratelimit(&ratelimit_state)) - return; - - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); -} - -static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) -{ - if (static_key_false((&mmu_audit_key))) - __kvm_mmu_audit(vcpu, point); -} - -static void mmu_audit_enable(void) -{ - if (mmu_audit) - return; - - static_key_slow_inc(&mmu_audit_key); - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - static_key_slow_dec(&mmu_audit_key); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = strict_strtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h b/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h deleted file mode 100644 index 89fb0e81..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/mmutrace.h +++ /dev/null @@ -1,254 +0,0 @@ -#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVMMMU_H - -#include <linux/tracepoint.h> -#include <linux/ftrace_event.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvmmmu - -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ - __field(bool, unsync) - -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ - __entry->unsync = sp->unsync; - -#define KVM_MMU_PAGE_PRINTK() ({ \ - const char *ret = p->buffer + p->len; \ - static const char *access_str[] = { \ - "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ - }; \ - union kvm_mmu_page_role 
role; \ - \ - role.word = __entry->role; \ - \ - trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", \ - __entry->gfn, role.level, \ - role.cr4_pae ? " pae" : "", \ - role.quadrant, \ - role.direct ? " direct" : "", \ - access_str[role.access], \ - role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ - __entry->root_count, \ - __entry->unsync ? "unsync" : "sync", 0); \ - ret; \ - }) - -#define kvm_mmu_trace_pferr_flags \ - { PFERR_PRESENT_MASK, "P" }, \ - { PFERR_WRITE_MASK, "W" }, \ - { PFERR_USER_MASK, "U" }, \ - { PFERR_RSVD_MASK, "RSVD" }, \ - { PFERR_FETCH_MASK, "F" } - -/* - * A pagetable walk has started - */ -TRACE_EVENT( - kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), - - TP_STRUCT__entry( - __field(__u64, addr) - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) - | (!!fetch_fault << 4); - ), - - TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - - -/* We just walked a paging element */ -TRACE_EVENT( - kvm_mmu_paging_element, - TP_PROTO(u64 pte, int level), - TP_ARGS(pte, level), - - TP_STRUCT__entry( - __field(__u64, pte) - __field(__u32, level) - ), - - TP_fast_assign( - __entry->pte = pte; - __entry->level = level; - ), - - TP_printk("pte %llx level %u", __entry->pte, __entry->level) -); - -DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size), - - TP_STRUCT__entry( - __field(__u64, gpa) - ), - - TP_fast_assign( - __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) - + index * size; - ), - - TP_printk("gpa %llx", __entry->gpa) -); - -/* We set a pte accessed bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -/* We set a pte dirty bit */ -DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, - - TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - - TP_ARGS(table_gfn, index, size) -); - -TRACE_EVENT( - kvm_mmu_walker_error, - TP_PROTO(u32 pferr), - TP_ARGS(pferr), - - TP_STRUCT__entry( - __field(__u32, pferr) - ), - - TP_fast_assign( - __entry->pferr = pferr; - ), - - TP_printk("pferr %x %s", __entry->pferr, - __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) -); - -TRACE_EVENT( - kvm_mmu_get_page, - TP_PROTO(struct kvm_mmu_page *sp, bool created), - TP_ARGS(sp, created), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - __field(bool, created) - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - __entry->created = created; - ), - - TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), - __entry->created ? 
"new" : "existing") -); - -DECLARE_EVENT_CLASS(kvm_mmu_page_class, - - TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), - - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), - - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - -TRACE_EVENT( - mark_mmio_spte, - TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), - TP_ARGS(sptep, gfn, access), - - TP_STRUCT__entry( - __field(void *, sptep) - __field(gfn_t, gfn) - __field(unsigned, access) - ), - - TP_fast_assign( - __entry->sptep = sptep; - __entry->gfn = gfn; - __entry->access = access; - ), - - TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn, - __entry->access) -); - -TRACE_EVENT( - handle_mmio_page_fault, - TP_PROTO(u64 addr, gfn_t gfn, unsigned access), - TP_ARGS(addr, gfn, access), - - TP_STRUCT__entry( - __field(u64, addr) - __field(gfn_t, gfn) - __field(unsigned, access) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->gfn = gfn; - __entry->access = access; - ), - - TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, - __entry->access) -); -#endif /* _TRACE_KVMMMU_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE mmutrace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h b/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h deleted file mode 100644 index df5a7031..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/paging_tmpl.h +++ /dev/null @@ -1,837 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * MMU support - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -/* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. 
- */ - -#if PTTYPE == 64 - #define pt_element_t u64 - #define guest_walker guest_walker64 - #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS - #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 - #define CMPXCHG cmpxchg - #else - #define CMPXCHG cmpxchg64 - #define PT_MAX_FULL_LEVELS 2 - #endif -#elif PTTYPE == 32 - #define pt_element_t u32 - #define guest_walker guest_walker32 - #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS - #define PT_MAX_FULL_LEVELS 2 - #define CMPXCHG cmpxchg -#else - #error Invalid PTTYPE value -#endif - -#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) -#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) - -/* - * The guest_walker structure emulates the behavior of the hardware page - * table walker. - */ -struct guest_walker { - int level; - gfn_t table_gfn[PT_MAX_FULL_LEVELS]; - pt_element_t ptes[PT_MAX_FULL_LEVELS]; - pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; - gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; - gfn_t gfn; - struct x86_exception fault; -}; - -static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) -{ - return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; -} - -static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t __user *ptep_user, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) -{ - int npages; - pt_element_t ret; - pt_element_t *table; - struct page *page; - - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); - /* Check if the user is doing something meaningless. 
*/ - if (unlikely(npages != 1)) - return -EFAULT; - - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); - - return (ret != orig_pte); -} - -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, - bool last) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - if (last && !is_dirty_gpte(gpte)) - access &= ~ACC_WRITE_MASK; - -#if PTTYPE == 64 - if (vcpu->arch.mmu.nx) - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - -static bool FNAME(is_last_gpte)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t gpte) -{ - if (walker->level == PT_PAGE_TABLE_LEVEL) - return true; - - if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && - (PTTYPE == 64 || is_pse(vcpu))) - return true; - - if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && - (mmu->root_level == PT64_ROOT_LEVEL)) - return true; - - return false; -} - -/* - * Fetch a guest pte for a guest virtual address - */ -static int FNAME(walk_addr_generic)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, u32 access) -{ - pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); - gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); - gpa_t pte_gpa; - bool eperm, last_gpte; - int offset; - const int write_fault = access & PFERR_WRITE_MASK; - const int user_fault = access & PFERR_USER_MASK; - const int fetch_fault = access & PFERR_FETCH_MASK; - u16 errcode = 0; - - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, - fetch_fault); -retry_walk: - eperm = false; - walker->level = mmu->root_level; - pte = mmu->get_cr3(vcpu); - -#if PTTYPE == 64 - if (walker->level == PT32E_ROOT_LEVEL) { - pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); - trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto error; - --walker->level; - } -#endif - ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - - pt_access = ACC_ALL; - - for (;;) { - gfn_t real_gfn; - unsigned long host_addr; - - index = PT_INDEX(addr, walker->level); - - table_gfn = gpte_to_gfn(pte); - offset = index * sizeof(pt_element_t); - pte_gpa = gfn_to_gpa(table_gfn) + offset; - walker->table_gfn[walker->level - 1] = table_gfn; - walker->pte_gpa[walker->level - 1] = pte_gpa; - - real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (unlikely(real_gfn == UNMAPPED_GVA)) - goto error; - real_gfn = gpa_to_gfn(real_gfn); - - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); - if (unlikely(kvm_is_error_hva(host_addr))) - goto error; - - ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) - goto error; - - trace_kvm_mmu_paging_element(pte, walker->level); - - if (unlikely(!is_present_gpte(pte))) - goto error; - - if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, - walker->level))) { - errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; - goto error; - } - - if (!check_write_user_access(vcpu, write_fault, user_fault, - pte)) - eperm = true; - -#if PTTYPE == 64 - if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) - eperm = true; -#endif - - last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - if (last_gpte) { - pte_access = pt_access & - FNAME(gpte_access)(vcpu, pte, true); - /* check if the kernel is 
fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - } - - if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { - int ret; - trace_kvm_mmu_set_accessed_bit(table_gfn, index, - sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_ACCESSED_MASK; - } - - walker->ptes[walker->level - 1] = pte; - - if (last_gpte) { - int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; - - if (PTTYPE == 32 && - walker->level == PT_DIRECTORY_LEVEL && - is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; - - walker->gfn = real_gpa >> PAGE_SHIFT; - - break; - } - - pt_access &= FNAME(gpte_access)(vcpu, pte, false); - --walker->level; - } - - if (unlikely(eperm)) { - errcode |= PFERR_PRESENT_MASK; - goto error; - } - - if (write_fault && unlikely(!is_dirty_gpte(pte))) { - int ret; - - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_DIRTY_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_DIRTY_MASK; - walker->ptes[walker->level - 1] = pte; - } - - walker->pt_access = pt_access; - walker->pte_access = pte_access; - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pte_access, pt_access); - return 1; - -error: - errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) - errcode |= PFERR_FETCH_MASK; - - walker->fault.vector = PF_VECTOR; - walker->fault.error_code_valid = true; - walker->fault.error_code = errcode; - walker->fault.address = addr; - walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; - - trace_kvm_mmu_walker_error(walker->fault.error_code); - return 0; -} - -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, - access); -} - -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} - -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) -{ - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; - - if (!is_present_gpte(gpte)) - goto no_present; - - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; - - return false; - -no_present: - drop_spte(vcpu->kvm, spte); - return true; -} - -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte; - unsigned pte_access; - pfn_t pfn; - - gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; - - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); - pfn = gfn_to_pfn_atomic(vcpu->kvm, 
gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). - */ - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); -} - -static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, - struct guest_walker *gw, int level) -{ - pt_element_t curr_pte; - gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; - u64 mask; - int r, index; - - if (level == PT_PAGE_TABLE_LEVEL) { - mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; - base_gpa = pte_gpa & ~mask; - index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, - gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); - curr_pte = gw->prefetch_ptes[index]; - } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, - &curr_pte, sizeof(curr_pte)); - - return r || curr_pte != gw->ptes[level - 1]; -} - -static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, - u64 *sptep) -{ - struct kvm_mmu_page *sp; - pt_element_t *gptep = gw->prefetch_ptes; - u64 *spte; - int i; - - sp = page_header(__pa(sptep)); - - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - return; - - if (sp->role.direct) - return __direct_pte_prefetch(vcpu, sp, sptep); - - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - spte = sp->spt + i; - - for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - - if (spte == sptep) - continue; - - if (is_shadow_present_pte(*spte)) - continue; - - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - break; - } - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); - } -} - -/* - * Fetch a shadow pte for a specific level in the paging hierarchy. - */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *gw, - int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) -{ - unsigned access = gw->pt_access; - struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; - struct kvm_shadow_walk_iterator it; - - if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; - - direct_access = gw->pte_access; - - top_level = vcpu->arch.mmu.root_level; - if (top_level == PT32E_ROOT_LEVEL) - top_level = PT32_ROOT_LEVEL; - /* - * Verify that the top-level gpte is still there. Since the page - * is a root page, it is either write protected (and cannot be - * changed from now on) or it is invalid (in which case, we don't - * really care if it changes underneath us after this point). 
- */ - if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; - - for (shadow_walk_init(&it, vcpu, addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { - gfn_t table_gfn; - - clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, - false, access, it.sptep); - } - - /* - * Verify that the gpte in the page we've just write - * protected is still there. - */ - if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; - - if (sp) - link_shadow_page(it.sptep, sp); - } - - for (; - shadow_walk_okay(&it) && it.level > hlevel; - shadow_walk_next(&it)) { - gfn_t direct_gfn; - - clear_sp_write_flooding_count(it.sptep); - validate_direct_spte(vcpu, it.sptep, direct_access); - - drop_large_spte(vcpu, it.sptep); - - if (is_shadow_present_pte(*it.sptep)) - continue; - - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, - true, direct_access, it.sptep); - link_shadow_page(it.sptep, sp); - } - - clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, - gw->gfn, pfn, prefault, map_writable); - FNAME(pte_prefetch)(vcpu, gw, it.sptep); - - return it.sptep; - -out_gpte_changed: - if (sp) - kvm_mmu_put_page(sp, it.sptep); - kvm_release_pfn_clean(pfn); - return NULL; -} - -/* - * Page fault handler. There are several causes for a page fault: - * - there is no shadow pte for the guest pte - * - write access through a shadow pte marked read only so that we can set - * the dirty bit - * - write access to a shadow pte marked read only so we can update the page - * dirty bitmap, when userspace requests it - * - mmio access; in this case we will never install a present shadow pte - * - normal guest page fault due to the guest pte marked not present, not - * writable, or not executable - * - * Returns: 1 if we need to emulate the instruction, 0 otherwise, or - * a negative value on error. - */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, - bool prefault) -{ - int write_fault = error_code & PFERR_WRITE_MASK; - int user_fault = error_code & PFERR_USER_MASK; - struct guest_walker walker; - u64 *sptep; - int emulate = 0; - int r; - pfn_t pfn; - int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; - unsigned long mmu_seq; - bool map_writable; - - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); - - /* - * The page is not mapped by the guest. Let the guest handle it. 
- */ - if (!r) { - pgprintk("%s: guest page fault\n", __func__); - if (!prefault) - inject_page_fault(vcpu, &walker.fault); - - return 0; - } - - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - else - force_pt_level = 1; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, - &map_writable)) - return 0; - - if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, - walker.gfn, pfn, walker.pte_access, &r)) - return r; - - spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) - goto out_unlock; - - kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - kvm_mmu_free_some_pages(vcpu); - if (!force_pt_level) - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - sptep, *sptep, emulate); - - ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); - spin_unlock(&vcpu->kvm->mmu_lock); - - return emulate; - -out_unlock: - spin_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); - return 0; -} - -static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) -{ - int offset = 0; - - WARN_ON(sp->role.level != 1); - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); -} - -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. 
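The page-fault handler above dispatches on the architectural x86 #PF error-code bits (present, write, user, reserved, fetch), which is also what the PFERR_* masks encode; reserved-bit faults in particular are routed to the MMIO path. A small stand-alone sketch that decodes such an error code follows; the bit positions are the architectural ones, while the helper name describe_fault and the sample codes are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* architectural x86 #PF error-code bits (same layout the PFERR_* masks follow) */
#define PF_PRESENT (1u << 0)  /* fault on a present page (protection violation) */
#define PF_WRITE   (1u << 1)  /* faulting access was a write                    */
#define PF_USER    (1u << 2)  /* faulting access came from user mode            */
#define PF_RSVD    (1u << 3)  /* reserved bit set in a paging-structure entry   */
#define PF_FETCH   (1u << 4)  /* faulting access was an instruction fetch       */

static void describe_fault(uint32_t ec)
{
    printf("%s access from %s mode on a %s page%s\n",
           ec & PF_FETCH   ? "fetch" : (ec & PF_WRITE ? "write" : "read"),
           ec & PF_USER    ? "user" : "supervisor",
           ec & PF_PRESENT ? "present" : "not-present",
           ec & PF_RSVD    ? " (reserved bit set -> MMIO handling)" : "");
}

int main(void)
{
    describe_fault(PF_WRITE | PF_USER);    /* user write to a not-present page      */
    describe_fault(PF_PRESENT | PF_FETCH); /* supervisor fetch blocked on a present page */
    describe_fault(PF_PRESENT | PF_RSVD);  /* reserved-bit fault                     */
    return 0;
}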
- */ - mmu_topup_memory_caches(vcpu); - - spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = page_header(__pa(sptep)); - if (is_last_spte(*sptep, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - - if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) - kvm_flush_remote_tlbs(vcpu->kvm); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(update_pte)(vcpu, sp, sptep, &gpte); - } - - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) - break; - } - spin_unlock(&vcpu->kvm->mmu_lock); -} - -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -/* - * Using the cached information from sp->gfns is safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. - * - * Note: - * We should flush all tlbs if spte is dropped even though guest is - * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page - * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't - * used by guest then tlbs are not flushed, so guest is allowed to access the - * freed pages. - * And we increase kvm->tlbs_dirty to delay tlbs flush in this case. - */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ - int i, nr_present = 0; - bool host_writable; - gpa_t first_pte_gpa; - - /* direct kvm_mmu_page can not be unsync. 
*/ - BUG_ON(sp->role.direct); - - first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; - - if (!sp->spt[i]) - continue; - - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return -EINVAL; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; - continue; - } - - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte, true); - - if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) - continue; - - if (gfn != sp->gfns[i]) { - drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; - continue; - } - - nr_present++; - - host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - - set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false, - host_writable); - } - - return !nr_present; -} - -#undef pt_element_t -#undef guest_walker -#undef FNAME -#undef PT_BASE_ADDR_MASK -#undef PT_INDEX -#undef PT_LVL_ADDR_MASK -#undef PT_LVL_OFFSET_MASK -#undef PT_LEVEL_BITS -#undef PT_MAX_FULL_LEVELS -#undef gpte_to_gfn -#undef gpte_to_gfn_lvl -#undef CMPXCHG diff --git a/ANDROID_3.4.5/arch/x86/kvm/pmu.c b/ANDROID_3.4.5/arch/x86/kvm/pmu.c deleted file mode 100644 index 2e88438f..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/pmu.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * Kernel-based Virtual Machine -- Performane Monitoring Unit support - * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity <avi@redhat.com> - * Gleb Natapov <gleb@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include <linux/types.h> -#include <linux/kvm_host.h> -#include <linux/perf_event.h> -#include "x86.h" -#include "cpuid.h" -#include "lapic.h" - -static struct kvm_arch_event_perf_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; - bool inexact; -} arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, -}; - -/* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 7}; - -static bool pmc_is_gp(struct kvm_pmc *pmc) -{ - return pmc->type == KVM_PMC_GP; -} - -static inline u64 pmc_bitmask(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - - return pmu->counter_bitmask[pmc->type]; -} - -static inline bool pmc_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, - u32 base) -{ - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) -{ - int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) -{ - return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); -} - -static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) -{ - if (idx < X86_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); - else - return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); -} - -void kvm_deliver_pmi(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.apic) - kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); -} - -static void trigger_pmi(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, - irq_work); - struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, - arch.pmu); - - kvm_deliver_pmi(vcpu); -} - -static void kvm_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); -} - -static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, struct pt_regs *regs) -{ - struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { - kvm_perf_overflow(perf_event, data, regs); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until, - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. 
- */ - if (!kvm_is_in_guest()) - irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); - else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); - } -} - -static u64 read_pmc(struct kvm_pmc *pmc) -{ - u64 counter, enabled, running; - - counter = pmc->counter; - - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, - &enabled, &running); - - /* FIXME: Scaling needed? */ - - return counter & pmc_bitmask(pmc); -} - -static void stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = read_pmc(pmc); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -static void reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, bool exclude_kernel, - bool intr) -{ - struct perf_event *event; - struct perf_event_attr attr = { - .type = type, - .size = sizeof(attr), - .pinned = true, - .exclude_idle = true, - .exclude_host = 1, - .exclude_user = exclude_user, - .exclude_kernel = exclude_kernel, - .config = config, - }; - - attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); - - event = perf_event_create_kernel_counter(&attr, -1, current, - intr ? kvm_perf_overflow_intr : - kvm_perf_overflow, pmc); - if (IS_ERR(event)) { - printk_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); - return; - } - - pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); -} - -static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(arch_events); i++) - if (arch_events[i].eventsel == event_select - && arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(arch_events)) - return PERF_COUNT_HW_MAX; - - return arch_events[i].event_type; -} - -static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) -{ - unsigned config, type = PERF_TYPE_RAW; - u8 event_select, unit_mask; - - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; - - stop_counter(pmc); - - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) - return; - - event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK))) { - config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, - unit_mask); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & X86_RAW_EVENT_MASK; - - reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); -} - -static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) -{ - unsigned en = en_pmi & 0x3; - bool pmi = en_pmi & 0x8; - - stop_counter(pmc); - - if (!en || !pmc_enabled(pmc)) - return; - - reprogram_counter(pmc, PERF_TYPE_HARDWARE, - arch_events[fixed_pmc_events[idx]].event_type, - !(en & 0x2), /* exclude user */ - !(en & 0x1), /* exclude kernel */ - pmi); -} - -static inline u8 fixed_en_pmi(u64 ctrl, int idx) -{ - return (ctrl >> (idx * 4)) & 0xf; -} - -static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) -{ - int i; - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 en_pmi = fixed_en_pmi(data, i); - struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - - if 
(fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) - continue; - - reprogram_fixed_counter(pmc, en_pmi, i); - } - - pmu->fixed_ctr_ctrl = data; -} - -static void reprogram_idx(struct kvm_pmu *pmu, int idx) -{ - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); - - if (!pmc) - return; - - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int fidx = idx - X86_PMC_IDX_FIXED; - reprogram_fixed_counter(pmc, - fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); - } -} - -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) -{ - int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_idx(pmu, bit); -} - -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int ret; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; - break; - default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) - || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) - || get_fixed_pmc(pmu, msr); - break; - } - return ret; -} - -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; - return 0; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - *data = read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; - return 0; - } - } - return 1; -} - -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & 0xfffffffffffff444ull)) { - reprogram_fixed_counters(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_STATUS: - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (!(data & pmu->global_ctrl_mask)) { - global_ctrl_changed(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } - break; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - data = (s64)(s32)data; - pmc->counter += data - read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; - if (!(data & 0xffffffff00200000ull)) { - reprogram_gp_counter(pmc, data); - return 0; - } - } - } - return 1; -} - -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool fast_mode = pmc & (1u << 31); - bool fixed = pmc & (1u << 30); - struct kvm_pmc *counters; - u64 ctr; - - pmc &= ~(3u << 30); - if (!fixed && pmc >= pmu->nr_arch_gp_counters) - return 1; - if (fixed && pmc >= pmu->nr_arch_fixed_counters) - return 1; - counters = fixed ? 
pmu->fixed_counters : pmu->gp_counters; - ctr = read_pmc(&counters[pmc]); - if (fast_mode) - ctr = (u32)ctr; - *data = ctr; - - return 0; -} - -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_cpuid_entry2 *entry; - unsigned bitmap_len; - - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return; - - pmu->version = entry->eax & 0xff; - if (!pmu->version) - return; - - pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = - ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; - bitmap_len = (entry->eax >> 24) & 0xff; - pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; - } - - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; -} - -void kvm_pmu_init(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; - - memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; - } - init_irq_work(&pmu->irq_work, trigger_pmi); - kvm_pmu_cpuid_update(vcpu); -} - -void kvm_pmu_reset(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; - - irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; - } - - for (i = 0; i < X86_PMC_MAX_FIXED; i++) - stop_counter(&pmu->fixed_counters[i]); - - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; -} - -void kvm_pmu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_pmu_reset(vcpu); -} - -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - u64 bitmask; - int bit; - - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); - - if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); - continue; - } - - reprogram_idx(pmu, bit); - } -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/svm.c b/ANDROID_3.4.5/arch/x86/kvm/svm.c deleted file mode 100644 index e334389e..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/svm.c +++ /dev/null @@ -1,4336 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * AMD SVM support - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Yaniv Kamay <yaniv@qumranet.com> - * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
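kvm_pmu_cpuid_update() above sizes the virtual PMU from CPUID leaf 0x0A: the version and the general-purpose counter count and width come from EAX, the available-events bit vector from EBX, and the fixed counter count and width from EDX. The sketch below performs the same unpacking with made-up register values (a hypothetical version-2 PMU with 4 general-purpose and 3 fixed 48-bit counters); the variable names are ours.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t eax = 0x07300402;  /* hypothetical CPUID.0AH:EAX */
    uint32_t ebx = 0x00000000;  /* 0 => every architectural event available */
    uint32_t edx = 0x00000603;  /* hypothetical CPUID.0AH:EDX */

    unsigned version     =  eax        & 0xff;   /* PMU version                */
    unsigned nr_gp       = (eax >>  8) & 0xff;   /* # general-purpose counters */
    unsigned gp_width    = (eax >> 16) & 0xff;   /* GP counter bit width       */
    unsigned bitvec_len  = (eax >> 24) & 0xff;   /* length of EBX bit vector   */
    unsigned nr_fixed    =  edx        & 0x1f;   /* # fixed counters           */
    unsigned fixed_width = (edx >>  5) & 0xff;   /* fixed counter bit width    */

    uint64_t gp_mask      = ((uint64_t)1 << gp_width) - 1;
    uint64_t fixed_mask   = ((uint64_t)1 << fixed_width) - 1;
    uint32_t avail_events = ~ebx & ((1u << bitvec_len) - 1);

    printf("PMU v%u: %u GP counters (mask %#llx), %u fixed (mask %#llx)\n",
           version, nr_gp, (unsigned long long)gp_mask,
           nr_fixed, (unsigned long long)fixed_mask);
    printf("available architectural events bitmap: %#x\n", avail_events);
    return 0;
}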
- * - */ -#include <linux/kvm_host.h> - -#include "irq.h" -#include "mmu.h" -#include "kvm_cache_regs.h" -#include "x86.h" - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/ftrace_event.h> -#include <linux/slab.h> - -#include <asm/perf_event.h> -#include <asm/tlbflush.h> -#include <asm/desc.h> -#include <asm/kvm_para.h> - -#include <asm/virtext.h> -#include "trace.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -#define IOPM_ALLOC_ORDER 2 -#define MSRPM_ALLOC_ORDER 1 - -#define SEG_TYPE_LDT 2 -#define SEG_TYPE_BUSY_TSS16 3 - -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - -#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ -#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ -#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ - -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - -static bool erratum_383_found __read_mostly; - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct nested_state { - struct vmcb *hsave; - u64 hsave_msr; - u64 vm_cr_msr; - u64 vmcb; - - /* These are the merged vectors */ - u32 *msrpm; - - /* gpa pointers to the real vectors */ - u64 vmcb_msrpm; - u64 vmcb_iopm; - - /* A VMEXIT is required but not yet emulated */ - bool exit_required; - - /* cache for intercepts of the guest */ - u32 intercept_cr; - u32 intercept_dr; - u32 intercept_exceptions; - u64 intercept; - - /* Nested Paging related state */ - u64 nested_cr3; -}; - -#define MSRPM_OFFSETS 16 -static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; - -/* - * Set osvw_len to higher value when updated Revision Guides - * are published and we know what the new status bits are - */ -static uint64_t osvw_len = 4, osvw_status; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - uint64_t sysenter_esp; - uint64_t sysenter_eip; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - struct { - u16 fs; - u16 gs; - u16 ldt; - u64 gs_base; - } host; - - u32 *msrpm; - - ulong nmi_iret_rip; - - struct nested_state nested; - - bool nmi_singlestep; - - unsigned int3_injected; - unsigned long int3_rip; - u32 apf_reason; - - u64 tsc_ratio; -}; - -static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL - -#define MSR_INVALID 0xffffffffU - -static struct svm_direct_access_msrs { - u32 index; /* Index of the MSR */ - bool always; /* True if intercept is always on */ -} direct_access_msrs[] = { - { .index = MSR_STAR, .always = true }, - { .index = MSR_IA32_SYSENTER_CS, .always = true }, -#ifdef CONFIG_X86_64 - { .index = MSR_GS_BASE, .always = true }, - { .index = MSR_FS_BASE, .always = true }, - { 
.index = MSR_KERNEL_GS_BASE, .always = true }, - { .index = MSR_LSTAR, .always = true }, - { .index = MSR_CSTAR, .always = true }, - { .index = MSR_SYSCALL_MASK, .always = true }, -#endif - { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, - { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, - { .index = MSR_IA32_LASTINTFROMIP, .always = false }, - { .index = MSR_IA32_LASTINTTOIP, .always = false }, - { .index = MSR_INVALID, .always = false }, -}; - -/* enable NPT for AMD64 and X86 with PAE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) -static bool npt_enabled = true; -#else -static bool npt_enabled; -#endif - -/* allow nested paging (virtualized MMU) for all guests */ -static int npt = true; -module_param(npt, int, S_IRUGO); - -/* allow nested virtualization in KVM/SVM */ -static int nested = true; -module_param(nested, int, S_IRUGO); - -static void svm_flush_tlb(struct kvm_vcpu *vcpu); -static void svm_complete_interrupts(struct vcpu_svm *svm); - -static int nested_svm_exit_handled(struct vcpu_svm *svm); -static int nested_svm_intercept(struct vcpu_svm *svm); -static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code); -static u64 __scale_tsc(u64 ratio, u64 tsc); - -enum { - VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, - pause filter count */ - VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ - VMCB_ASID, /* ASID */ - VMCB_INTR, /* int_ctl, int_vector */ - VMCB_NPT, /* npt_en, nCR3, gPAT */ - VMCB_CR, /* CR0, CR3, CR4, EFER */ - VMCB_DR, /* DR6, DR7 */ - VMCB_DT, /* GDT, IDT */ - VMCB_SEG, /* CS, DS, SS, ES, CPL */ - VMCB_CR2, /* CR2 only */ - VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ - VMCB_DIRTY_MAX, -}; - -/* TPR and CR2 are always written before VMRUN */ -#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) - -static inline void mark_all_dirty(struct vmcb *vmcb) -{ - vmcb->control.clean = 0; -} - -static inline void mark_all_clean(struct vmcb *vmcb) -{ - vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) - & ~VMCB_ALWAYS_DIRTY_MASK; -} - -static inline void mark_dirty(struct vmcb *vmcb, int bit) -{ - vmcb->control.clean &= ~(1 << bit); -} - -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_svm, vcpu); -} - -static void recalc_intercepts(struct vcpu_svm *svm) -{ - struct vmcb_control_area *c, *h; - struct nested_state *g; - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - if (!is_guest_mode(&svm->vcpu)) - return; - - c = &svm->vmcb->control; - h = &svm->nested.hsave->control; - g = &svm->nested; - - c->intercept_cr = h->intercept_cr | g->intercept_cr; - c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; - c->intercept = h->intercept | g->intercept; -} - -static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) -{ - if (is_guest_mode(&svm->vcpu)) - return svm->nested.hsave; - else - return svm->vmcb; -} - -static inline void set_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_cr &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = 
get_host_vmcb(svm); - - return vmcb->control.intercept_cr & (1U << bit); -} - -static inline void set_dr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_dr &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline void set_exception_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions |= (1U << bit); - - recalc_intercepts(svm); -} - -static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept_exceptions &= ~(1U << bit); - - recalc_intercepts(svm); -} - -static inline void set_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept |= (1ULL << bit); - - recalc_intercepts(svm); -} - -static inline void clr_intercept(struct vcpu_svm *svm, int bit) -{ - struct vmcb *vmcb = get_host_vmcb(svm); - - vmcb->control.intercept &= ~(1ULL << bit); - - recalc_intercepts(svm); -} - -static inline void enable_gif(struct vcpu_svm *svm) -{ - svm->vcpu.arch.hflags |= HF_GIF_MASK; -} - -static inline void disable_gif(struct vcpu_svm *svm) -{ - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; -} - -static inline bool gif_set(struct vcpu_svm *svm) -{ - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); -} - -static unsigned long iopm_base; - -struct kvm_ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - -struct svm_cpu_data { - int cpu; - - u64 asid_generation; - u32 max_asid; - u32 next_asid; - struct kvm_ldttss_desc *tss_desc; - - struct page *save_area; -}; - -static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); - -struct svm_init_data { - int cpu; - int r; -}; - -static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; - -#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) -#define MSRS_RANGE_SIZE 2048 -#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) - -static u32 svm_msrpm_offset(u32 msr) -{ - u32 offset; - int i; - - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr < msrpm_ranges[i] || - msr >= msrpm_ranges[i] + MSRS_IN_RANGE) - continue; - - offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ - offset += (i * MSRS_RANGE_SIZE); /* add range offset */ - - /* Now we have the u8 offset - but need the u32 offset */ - return offset / 4; - } - - /* MSR not in any range */ - return MSR_INVALID; -} - -#define MAX_INST_SIZE 15 - -static inline void clgi(void) -{ - asm volatile (__ex(SVM_CLGI)); -} - -static inline void stgi(void) -{ - asm volatile (__ex(SVM_STGI)); -} - -static inline void invlpga(unsigned long addr, u32 asid) -{ - asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); -} - -static int get_npt_level(void) -{ -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif -} - -static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; - - to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); -} - -static int is_external_interrupt(u32 info) -{ - info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; - return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); 
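svm_msrpm_offset() above locates an MSR in the MSR permission map: two intercept bits per MSR (read, then write), so four MSRs per byte, with separate 2 KiB regions for the MSR ranges starting at 0x00000000, 0xC0000000 and 0xC0010000. A minimal sketch of that index arithmetic follows; MSR_STAR (0xC0000081) is used as the example and the helper name msrpm_locate is ours, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define MSRS_RANGE_BYTES 2048u
#define MSRS_PER_RANGE   (MSRS_RANGE_BYTES * 8u / 2u)   /* 8192 MSRs per range */

static const uint32_t ranges[] = { 0x00000000, 0xc0000000, 0xc0010000 };

/* Return 0 and fill in the u32 index into the map plus the read/write bit
 * numbers inside that u32, or return -1 if the MSR is not covered at all. */
static int msrpm_locate(uint32_t msr, unsigned *u32_index,
                        unsigned *read_bit, unsigned *write_bit)
{
    for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
        if (msr < ranges[i] || msr >= ranges[i] + MSRS_PER_RANGE)
            continue;
        unsigned byte = (msr - ranges[i]) / 4 + i * MSRS_RANGE_BYTES;
        *u32_index = byte / 4;            /* map is accessed as an array of u32 */
        *read_bit  = 2 * (msr & 0x0f);    /* 16 MSRs per u32, 2 bits each       */
        *write_bit = *read_bit + 1;
        return 0;
    }
    return -1;
}

int main(void)
{
    unsigned idx, rb, wb;
    uint32_t msr = 0xc0000081;            /* MSR_STAR, one of the always-direct MSRs */

    if (msrpm_locate(msr, &idx, &rb, &wb) == 0)
        printf("MSR %#x -> msrpm[%u], read bit %u, write bit %u\n",
               (unsigned)msr, idx, rb, wb);
    return 0;
}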
-} - -static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u32 ret = 0; - - if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; -} - -static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (mask == 0) - svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; - else - svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK; - -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (svm->vmcb->control.next_rip != 0) - svm->next_rip = svm->vmcb->control.next_rip; - - if (!svm->next_rip) { - if (emulate_instruction(vcpu, EMULTYPE_SKIP) != - EMULATE_DONE) - printk(KERN_DEBUG "%s: NOP\n", __func__); - return; - } - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); - - kvm_rip_write(vcpu, svm->next_rip); - svm_set_interrupt_shadow(vcpu, 0); -} - -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* - * If we are within a nested VM we'd better #VMEXIT and let the guest - * handle the exception - */ - if (!reinject && - nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - - if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { - unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - skip_emulated_instruction(&svm->vcpu); - rip = kvm_rip_read(&svm->vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; -} - -static void svm_init_erratum_383(void) -{ - u32 low, high; - int err; - u64 val; - - if (!cpu_has_amd_erratum(amd_erratum_383)) - return; - - /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) - return; - - val |= (1ULL << 47); - - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); - - erratum_383_found = true; -} - -static void svm_init_osvw(struct kvm_vcpu *vcpu) -{ - /* - * Guests should see errata 400 and 415 as fixed (assuming that - * HLT and IO instructions are intercepted). - */ - vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; - vcpu->arch.osvw.status = osvw_status & ~(6ULL); - - /* - * By increasing VCPU's osvw.length to 3 we are telling the guest that - * all osvw.status bits inside that length, including bit 0 (which is - * reserved for erratum 298), are valid. However, if host processor's - * osvw_len is 0 then osvw_status[0] carries no information. We need to - * be conservative here and therefore we tell the guest that erratum 298 - * is present (because we really don't know). 
- */ - if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) - vcpu->arch.osvw.status |= 1; -} - -static int has_svm(void) -{ - const char *msg; - - if (!cpu_has_svm(&msg)) { - printk(KERN_INFO "has_svm: %s\n", msg); - return 0; - } - - return 1; -} - -static void svm_hardware_disable(void *garbage) -{ - /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - - cpu_svm_disable(); - - amd_pmu_disable_virt(); -} - -static int svm_hardware_enable(void *garbage) -{ - - struct svm_cpu_data *sd; - uint64_t efer; - struct desc_ptr gdt_descr; - struct desc_struct *gdt; - int me = raw_smp_processor_id(); - - rdmsrl(MSR_EFER, efer); - if (efer & EFER_SVME) - return -EBUSY; - - if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); - return -EINVAL; - } - sd = per_cpu(svm_data, me); - - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); - return -EINVAL; - } - - sd->asid_generation = 1; - sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; - sd->next_asid = sd->max_asid + 1; - - native_store_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.address; - sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - - wrmsrl(MSR_EFER, efer | EFER_SVME); - - wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); - - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; - } - - - /* - * Get OSVW bits. - * - * Note that it is possible to have a system with mixed processor - * revisions and therefore different OSVW bits. If bits are not the same - * on different processors then choose the worst case (i.e. if erratum - * is present on one processor and not on another then assume that the - * erratum is present everywhere). 
- */ - if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; - int err; - - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); - if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); - - if (err) - osvw_status = osvw_len = 0; - else { - if (len < osvw_len) - osvw_len = len; - osvw_status |= status; - osvw_status &= (1ULL << osvw_len) - 1; - } - } else - osvw_status = osvw_len = 0; - - svm_init_erratum_383(); - - amd_pmu_enable_virt(); - - return 0; -} - -static void svm_cpu_uninit(int cpu) -{ - struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id()); - - if (!sd) - return; - - per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(sd->save_area); - kfree(sd); -} - -static int svm_cpu_init(int cpu) -{ - struct svm_cpu_data *sd; - int r; - - sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!sd) - return -ENOMEM; - sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); - r = -ENOMEM; - if (!sd->save_area) - goto err_1; - - per_cpu(svm_data, cpu) = sd; - - return 0; - -err_1: - kfree(sd); - return r; - -} - -static bool valid_msr_intercept(u32 index) -{ - int i; - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) - if (direct_access_msrs[i].index == index) - return true; - - return false; -} - -static void set_msr_interception(u32 *msrpm, unsigned msr, - int read, int write) -{ - u8 bit_read, bit_write; - unsigned long tmp; - u32 offset; - - /* - * If this warning triggers extend the direct_access_msrs list at the - * beginning of the file - */ - WARN_ON(!valid_msr_intercept(msr)); - - offset = svm_msrpm_offset(msr); - bit_read = 2 * (msr & 0x0f); - bit_write = 2 * (msr & 0x0f) + 1; - tmp = msrpm[offset]; - - BUG_ON(offset == MSR_INVALID); - - read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); - write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); - - msrpm[offset] = tmp; -} - -static void svm_vcpu_init_msrpm(u32 *msrpm) -{ - int i; - - memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - if (!direct_access_msrs[i].always) - continue; - - set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); - } -} - -static void add_msr_offset(u32 offset) -{ - int i; - - for (i = 0; i < MSRPM_OFFSETS; ++i) { - - /* Offset already in list? */ - if (msrpm_offsets[i] == offset) - return; - - /* Slot used by another offset? */ - if (msrpm_offsets[i] != MSR_INVALID) - continue; - - /* Add offset to list */ - msrpm_offsets[i] = offset; - - return; - } - - /* - * If this BUG triggers the msrpm_offsets table has an overflow. Just - * increase MSRPM_OFFSETS in this case. 
- */ - BUG(); -} - -static void init_msrpm_offsets(void) -{ - int i; - - memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); - - for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { - u32 offset; - - offset = svm_msrpm_offset(direct_access_msrs[i].index); - BUG_ON(offset == MSR_INVALID); - - add_msr_offset(offset); - } -} - -static void svm_enable_lbrv(struct vcpu_svm *svm) -{ - u32 *msrpm = svm->msrpm; - - svm->vmcb->control.lbr_ctl = 1; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); -} - -static void svm_disable_lbrv(struct vcpu_svm *svm) -{ - u32 *msrpm = svm->msrpm; - - svm->vmcb->control.lbr_ctl = 0; - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); - set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - - iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 max; - - kvm_has_tsc_control = true; - - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines and a value of 0 is used to disable - * tsc-scaling for the vcpu. 
- */ - max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); - - kvm_max_guest_tsc_khz = max; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - if (npt_enabled && !npt) { - printk(KERN_INFO "kvm: Nested Paging disabled\n"); - npt_enabled = false; - } - - if (npt_enabled) { - printk(KERN_INFO "kvm: Nested Paging enabled\n"); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - return 0; - -err: - __free_pages(iopm_pages, IOPM_ALLOC_ORDER); - iopm_base = 0; - return r; -} - -static __exit void svm_hardware_unsetup(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - svm_cpu_uninit(cpu); - - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = 0; -} - -static void init_seg(struct vmcb_seg *seg) -{ - seg->selector = 0; - seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | - SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ - seg->limit = 0xffff; - seg->base = 0; -} - -static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) -{ - seg->selector = 0; - seg->attrib = SVM_SELECTOR_P_MASK | type; - seg->limit = 0xffff; - seg->base = 0; -} - -static u64 __scale_tsc(u64 ratio, u64 tsc) -{ - u64 mult, frac, _tsc; - - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - - _tsc = tsc; - _tsc *= mult; - _tsc += (tsc >> 32) * frac; - _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return _tsc; -} - -static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 _tsc = tsc; - - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(svm->tsc_ratio, tsc); - - return _tsc; -} - -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 ratio; - u64 khz; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - svm->tsc_ratio = TSC_RATIO_DEFAULT; - return; - } - - /* TSC scaling supported? 
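__scale_tsc() above multiplies a 64-bit TSC value by the AMD TSC-ratio format, a fixed-point number whose integer part sits above bit 32 and whose fraction fills the low 32 bits (TSC_RATIO_DEFAULT, 0x0100000000, is 1.0), and svm_set_tsc_khz() builds that ratio as (user_tsc_khz << 32) / tsc_khz. Below is a user-space sketch of the same split multiplication, with illustrative host and guest frequencies; the function name scale_tsc and the sample values are ours.

#include <stdint.h>
#include <stdio.h>

/* Multiply a 64-bit TSC value by a fixed-point ratio (integer part above bit
 * 32, 32-bit fraction) using 32-bit pieces so the intermediate products do not
 * overflow 64-bit arithmetic. */
static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
{
    uint64_t mult = ratio >> 32;              /* integer part    */
    uint64_t frac = ratio & 0xffffffffULL;    /* fractional part */
    uint64_t out;

    out  = tsc * mult;
    out += (tsc >> 32) * frac;
    out += ((tsc & 0xffffffffULL) * frac) >> 32;
    return out;
}

int main(void)
{
    uint64_t host_khz  = 2800000;   /* 2.8 GHz host, illustrative  */
    uint64_t guest_khz = 2100000;   /* guest configured to 2.1 GHz */

    /* ratio = guest_khz / host_khz in the fixed-point format above */
    uint64_t ratio = (guest_khz << 32) / host_khz;

    uint64_t tsc = 123456789012ULL;
    printf("ratio = %#llx, scaled tsc = %llu (expected ~%llu)\n",
           (unsigned long long)ratio,
           (unsigned long long)scale_tsc(ratio, tsc),
           (unsigned long long)(tsc * guest_khz / host_khz));
    return 0;
}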
*/ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); - return; - } - - khz = user_tsc_khz; - - /* TSC scaling required - calculate ratio */ - ratio = khz << 32; - do_div(ratio, tsc_khz); - - if (ratio == 0 || ratio & TSC_RATIO_RSVD) { - WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return; - } - svm->tsc_ratio = ratio; -} - -static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - - if (is_guest_mode(vcpu)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = offset; - } - - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; - - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - WARN_ON(adjustment < 0); - if (host) - adjustment = svm_scale_tsc(vcpu, adjustment); - - svm->vmcb->control.tsc_offset += adjustment; - if (is_guest_mode(vcpu)) - svm->nested.hsave->control.tsc_offset += adjustment; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); -} - -static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - u64 tsc; - - tsc = svm_scale_tsc(vcpu, native_read_tsc()); - - return target_tsc - tsc; -} - -static void init_vmcb(struct vcpu_svm *svm) -{ - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; - - svm->vcpu.fpu_active = 1; - svm->vcpu.arch.hflags = 0; - - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR3_READ); - set_cr_intercept(svm, INTERCEPT_CR4_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - set_cr_intercept(svm, INTERCEPT_CR3_WRITE); - set_cr_intercept(svm, INTERCEPT_CR4_WRITE); - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); - - set_dr_intercept(svm, INTERCEPT_DR0_READ); - set_dr_intercept(svm, INTERCEPT_DR1_READ); - set_dr_intercept(svm, INTERCEPT_DR2_READ); - set_dr_intercept(svm, INTERCEPT_DR3_READ); - set_dr_intercept(svm, INTERCEPT_DR4_READ); - set_dr_intercept(svm, INTERCEPT_DR5_READ); - set_dr_intercept(svm, INTERCEPT_DR6_READ); - set_dr_intercept(svm, INTERCEPT_DR7_READ); - - set_dr_intercept(svm, INTERCEPT_DR0_WRITE); - set_dr_intercept(svm, INTERCEPT_DR1_WRITE); - set_dr_intercept(svm, INTERCEPT_DR2_WRITE); - set_dr_intercept(svm, INTERCEPT_DR3_WRITE); - set_dr_intercept(svm, INTERCEPT_DR4_WRITE); - set_dr_intercept(svm, INTERCEPT_DR5_WRITE); - set_dr_intercept(svm, INTERCEPT_DR6_WRITE); - set_dr_intercept(svm, INTERCEPT_DR7_WRITE); - - set_exception_intercept(svm, PF_VECTOR); - set_exception_intercept(svm, UD_VECTOR); - set_exception_intercept(svm, MC_VECTOR); - - set_intercept(svm, INTERCEPT_INTR); - set_intercept(svm, INTERCEPT_NMI); - set_intercept(svm, INTERCEPT_SMI); - set_intercept(svm, INTERCEPT_SELECTIVE_CR0); - set_intercept(svm, INTERCEPT_RDPMC); - set_intercept(svm, INTERCEPT_CPUID); - set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_HLT); - set_intercept(svm, INTERCEPT_INVLPG); - set_intercept(svm, INTERCEPT_INVLPGA); - set_intercept(svm, INTERCEPT_IOIO_PROT); - set_intercept(svm, INTERCEPT_MSR_PROT); - set_intercept(svm, INTERCEPT_TASK_SWITCH); - set_intercept(svm, INTERCEPT_SHUTDOWN); - set_intercept(svm, INTERCEPT_VMRUN); - set_intercept(svm, INTERCEPT_VMMCALL); - 
set_intercept(svm, INTERCEPT_VMLOAD); - set_intercept(svm, INTERCEPT_VMSAVE); - set_intercept(svm, INTERCEPT_STGI); - set_intercept(svm, INTERCEPT_CLGI); - set_intercept(svm, INTERCEPT_SKINIT); - set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); - set_intercept(svm, INTERCEPT_XSETBV); - - control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = __pa(svm->msrpm); - control->int_ctl = V_INTR_MASKING_MASK; - - init_seg(&save->es); - init_seg(&save->ss); - init_seg(&save->ds); - init_seg(&save->fs); - init_seg(&save->gs); - - save->cs.selector = 0xf000; - /* Executable/Readable Code Segment */ - save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK | - SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; - save->cs.limit = 0xffff; - /* - * cs.base should really be 0xffff0000, but vmx can't handle that, so - * be consistent with it. - * - * Replace when we have real mode working for vmx. - */ - save->cs.base = 0xf0000; - - save->gdtr.limit = 0xffff; - save->idtr.limit = 0xffff; - - init_sys_seg(&save->ldtr, SEG_TYPE_LDT); - init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - - svm_set_efer(&svm->vcpu, 0); - save->dr6 = 0xffff0ff0; - save->dr7 = 0x400; - kvm_set_rflags(&svm->vcpu, 2); - save->rip = 0x0000fff0; - svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * This is the guest-visible cr0 value. - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - */ - svm->vcpu.arch.cr0 = 0; - (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ - - if (npt_enabled) { - /* Setup VMCB for Nested Paging */ - control->nested_ctl = 1; - clr_intercept(svm, INTERCEPT_INVLPG); - clr_exception_intercept(svm, PF_VECTOR); - clr_cr_intercept(svm, INTERCEPT_CR3_READ); - clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = 0x0007040600070406ULL; - save->cr3 = 0; - save->cr4 = 0; - } - svm->asid_generation = 0; - - svm->nested.vmcb = 0; - svm->vcpu.arch.hflags = 0; - - if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - control->pause_filter_count = 3000; - set_intercept(svm, INTERCEPT_PAUSE); - } - - mark_all_dirty(svm->vmcb); - - enable_gif(svm); -} - -static int svm_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - init_vmcb(svm); - - if (!kvm_vcpu_is_bsp(vcpu)) { - kvm_rip_write(vcpu, 0); - svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; - svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; - } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; - - return 0; -} - -static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) -{ - struct vcpu_svm *svm; - struct page *page; - struct page *msrpm_pages; - struct page *hsave_page; - struct page *nested_msrpm_pages; - int err; - - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!svm) { - err = -ENOMEM; - goto out; - } - - svm->tsc_ratio = TSC_RATIO_DEFAULT; - - err = kvm_vcpu_init(&svm->vcpu, kvm, id); - if (err) - goto free_svm; - - err = -ENOMEM; - page = alloc_page(GFP_KERNEL); - if (!page) - goto uninit; - - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if (!msrpm_pages) - goto free_page1; - - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); - if (!nested_msrpm_pages) - goto free_page2; - - hsave_page = alloc_page(GFP_KERNEL); - if (!hsave_page) - goto free_page3; - - svm->nested.hsave = page_address(hsave_page); - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); - - 
svm->nested.msrpm = page_address(nested_msrpm_pages); - svm_vcpu_init_msrpm(svm->nested.msrpm); - - svm->vmcb = page_address(page); - clear_page(svm->vmcb); - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; - svm->asid_generation = 0; - init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; - - svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; - - svm_init_osvw(&svm->vcpu); - - return &svm->vcpu; - -free_page4: - __free_page(hsave_page); -free_page3: - __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); -free_page2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); -free_page1: - __free_page(page); -uninit: - kvm_vcpu_uninit(&svm->vcpu); -free_svm: - kmem_cache_free(kvm_vcpu_cache, svm); -out: - return ERR_PTR(err); -} - -static void svm_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->nested.hsave)); - __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, svm); -} - -static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int i; - - if (unlikely(cpu != vcpu->cpu)) { - svm->asid_generation = 0; - mark_all_dirty(svm->vmcb); - } - -#ifdef CONFIG_X86_64 - rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base); -#endif - savesegment(fs, svm->host.fs); - savesegment(gs, svm->host.gs); - svm->host.ldt = kvm_read_ldt(); - - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { - __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; - wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); - } -} - -static void svm_vcpu_put(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int i; - - ++vcpu->stat.host_state_reload; - kvm_load_ldt(svm->host.ldt); -#ifdef CONFIG_X86_64 - loadsegment(fs, svm->host.fs); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); - load_gs_index(svm->host.gs); -#else -#ifdef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif - for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) - wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); -} - -static void svm_update_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int cpl; - - if (!is_protmode(vcpu)) - cpl = 0; - else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) - cpl = 3; - else - cpl = svm->vmcb->save.cs.selector & 0x3; - - svm->vmcb->save.cpl = cpl; -} - -static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) -{ - return to_svm(vcpu)->vmcb->save.rflags; -} - -static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; - - to_svm(vcpu)->vmcb->save.rflags = rflags; - if ((old_rflags ^ rflags) & X86_EFLAGS_VM) - svm_update_cpl(vcpu); -} - -static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) -{ - switch (reg) { - case VCPU_EXREG_PDPTR: - BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); - break; - default: - BUG(); - } -} - -static void svm_set_vintr(struct vcpu_svm *svm) -{ - set_intercept(svm, INTERCEPT_VINTR); -} - -static void svm_clear_vintr(struct vcpu_svm *svm) -{ - 
clr_intercept(svm, INTERCEPT_VINTR); -} - -static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) -{ - struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; - - switch (seg) { - case VCPU_SREG_CS: return &save->cs; - case VCPU_SREG_DS: return &save->ds; - case VCPU_SREG_ES: return &save->es; - case VCPU_SREG_FS: return &save->fs; - case VCPU_SREG_GS: return &save->gs; - case VCPU_SREG_SS: return &save->ss; - case VCPU_SREG_TR: return &save->tr; - case VCPU_SREG_LDTR: return &save->ldtr; - } - BUG(); - return NULL; -} - -static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct vmcb_seg *s = svm_seg(vcpu, seg); - - return s->base; -} - -static void svm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vmcb_seg *s = svm_seg(vcpu, seg); - - var->base = s->base; - var->limit = s->limit; - var->selector = s->selector; - var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; - var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; - var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; - var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; - var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; - var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; - var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - - /* - * AMD's VMCB does not have an explicit unusable field, so emulate it - * for cross vendor migration purposes by "not present" - */ - var->unusable = !var->present || (var->type == 0); - - switch (seg) { - case VCPU_SREG_CS: - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - var->g = s->limit > 0xfffff; - break; - case VCPU_SREG_TR: - /* - * Work around a bug where the busy flag in the tr selector - * isn't exposed - */ - var->type |= 0x2; - break; - case VCPU_SREG_DS: - case VCPU_SREG_ES: - case VCPU_SREG_FS: - case VCPU_SREG_GS: - /* - * The accessed bit must always be set in the segment - * descriptor cache, although it can be cleared in the - * descriptor, the cached bit always remains at 1. Since - * Intel has a check on this, set it here to support - * cross-vendor migration. - */ - if (!var->unusable) - var->type |= 0x1; - break; - case VCPU_SREG_SS: - /* - * On AMD CPUs sometimes the DB bit in the segment - * descriptor is left as 1, although the whole segment has - * been made unusable. Clear it here to pass an Intel VMX - * entry check when cross vendor migrating. 
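svm_get_segment()/svm_set_segment() above translate between struct kvm_segment and the VMCB's packed segment attrib field (type in bits 3:0, then S, DPL, P, AVL, L, DB and G), which is why the per-segment quirks for CS, TR, DS/ES/FS/GS and SS are patched up around that conversion. The round-trip sketch below uses the bit positions the SVM_SELECTOR_*_SHIFT macros imply; the struct, helper names and the example segment (a hypothetical 64-bit ring-0 code segment) are illustrative.

#include <stdint.h>
#include <stdio.h>

struct seg {
    unsigned type, s, dpl, present, avl, l, db, g;
};

/* Pack descriptor attributes into the 12-bit VMCB attrib layout:
 * type[3:0], S[4], DPL[6:5], P[7], AVL[8], L[9], DB[10], G[11]. */
static uint16_t pack_attrib(const struct seg *v)
{
    return  (v->type     & 0xf)
          | ((v->s       & 1) << 4)
          | ((v->dpl     & 3) << 5)
          | ((v->present & 1) << 7)
          | ((v->avl     & 1) << 8)
          | ((v->l       & 1) << 9)
          | ((v->db      & 1) << 10)
          | ((v->g       & 1) << 11);
}

static struct seg unpack_attrib(uint16_t a)
{
    struct seg v = {
        .type    =  a        & 0xf,
        .s       = (a >> 4)  & 1,
        .dpl     = (a >> 5)  & 3,
        .present = (a >> 7)  & 1,
        .avl     = (a >> 8)  & 1,
        .l       = (a >> 9)  & 1,
        .db      = (a >> 10) & 1,
        .g       = (a >> 11) & 1,
    };
    return v;
}

int main(void)
{
    /* 64-bit ring-0 code segment: type 0xb (execute/read, accessed), S=1, L=1, G=1 */
    struct seg cs = { .type = 0xb, .s = 1, .dpl = 0, .present = 1,
                      .avl = 0, .l = 1, .db = 0, .g = 1 };
    uint16_t a = pack_attrib(&cs);
    struct seg back = unpack_attrib(a);

    printf("attrib = %#x, round-trip dpl=%u l=%u g=%u\n",
           (unsigned)a, back.dpl, back.l, back.g);
    return 0;
}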
- */ - if (var->unusable) - var->db = 0; - break; - } -} - -static int svm_get_cpl(struct kvm_vcpu *vcpu) -{ - struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; - - return save->cpl; -} - -static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - dt->size = svm->vmcb->save.idtr.limit; - dt->address = svm->vmcb->save.idtr.base; -} - -static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.idtr.limit = dt->size; - svm->vmcb->save.idtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); -} - -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - dt->size = svm->vmcb->save.gdtr.limit; - dt->address = svm->vmcb->save.gdtr.base; -} - -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.gdtr.limit = dt->size; - svm->vmcb->save.gdtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); -} - -static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ -} - -static void svm_decache_cr3(struct kvm_vcpu *vcpu) -{ -} - -static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ -} - -static void update_cr0_intercept(struct vcpu_svm *svm) -{ - ulong gcr0 = svm->vcpu.arch.cr0; - u64 *hcr0 = &svm->vmcb->save.cr0; - - if (!svm->vcpu.fpu_active) - *hcr0 |= SVM_CR0_SELECTIVE_MASK; - else - *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) - | (gcr0 & SVM_CR0_SELECTIVE_MASK); - - mark_dirty(svm->vmcb, VMCB_CR); - - if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { - clr_cr_intercept(svm, INTERCEPT_CR0_READ); - clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); - } else { - set_cr_intercept(svm, INTERCEPT_CR0_READ); - set_cr_intercept(svm, INTERCEPT_CR0_WRITE); - } -} - -static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - struct vcpu_svm *svm = to_svm(vcpu); - -#ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { - vcpu->arch.efer |= EFER_LMA; - svm->vmcb->save.efer |= EFER_LMA | EFER_LME; - } - - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { - vcpu->arch.efer &= ~EFER_LMA; - svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); - } - } -#endif - vcpu->arch.cr0 = cr0; - - if (!npt_enabled) - cr0 |= X86_CR0_PG | X86_CR0_WP; - - if (!vcpu->fpu_active) - cr0 |= X86_CR0_TS; - /* - * re-enable caching here because the QEMU bios - * does not do it - this results in some delay at - * reboot - */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); - svm->vmcb->save.cr0 = cr0; - mark_dirty(svm->vmcb, VMCB_CR); - update_cr0_intercept(svm); -} - -static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; - unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; - - if (cr4 & X86_CR4_VMXE) - return 1; - - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); - - vcpu->arch.cr4 = cr4; - if (!npt_enabled) - cr4 |= X86_CR4_PAE; - cr4 |= host_cr4_mce; - to_svm(vcpu)->vmcb->save.cr4 = cr4; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); - return 0; -} - -static void svm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_seg *s = svm_seg(vcpu, seg); - - s->base = var->base; - s->limit = var->limit; - s->selector = var->selector; - if (var->unusable) - s->attrib = 0; - else { - s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); - s->attrib |= (var->s & 1) << 
SVM_SELECTOR_S_SHIFT; - s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; - s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT; - s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; - s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; - s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; - s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; - } - if (seg == VCPU_SREG_CS) - svm_update_cpl(vcpu); - - mark_dirty(svm->vmcb, VMCB_SEG); -} - -static void update_db_intercept(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, DB_VECTOR); - clr_exception_intercept(svm, BP_VECTOR); - - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - set_exception_intercept(svm, BP_VECTOR); - } else - vcpu->guest_debug = 0; -} - -static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; - else - svm->vmcb->save.dr7 = vcpu->arch.dr7; - - mark_dirty(svm->vmcb, VMCB_DR); - - update_db_intercept(vcpu); -} - -static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) -{ - if (sd->next_asid > sd->max_asid) { - ++sd->asid_generation; - sd->next_asid = 1; - svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; - } - - svm->asid_generation = sd->asid_generation; - svm->vmcb->control.asid = sd->next_asid++; - - mark_dirty(svm->vmcb, VMCB_ASID); -} - -static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.dr7 = value; - mark_dirty(svm->vmcb, VMCB_DR); -} - -static int pf_interception(struct vcpu_svm *svm) -{ - u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; - int r = 1; - - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; - - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } - return r; -} - -static int db_interception(struct vcpu_svm *svm) -{ - struct kvm_run *kvm_run = svm->vcpu.run; - - if (!(svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && - !svm->nmi_singlestep) { - kvm_queue_exception(&svm->vcpu, DB_VECTOR); - return 1; - } - - if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(&svm->vcpu); - } - - if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { - kvm_run->exit_reason = KVM_EXIT_DEBUG; - kvm_run->debug.arch.pc = - svm->vmcb->save.cs.base + svm->vmcb->save.rip; - kvm_run->debug.arch.exception = DB_VECTOR; - return 0; - } - - return 1; -} - -static 
int bp_interception(struct vcpu_svm *svm) -{ - struct kvm_run *kvm_run = svm->vcpu.run; - - kvm_run->exit_reason = KVM_EXIT_DEBUG; - kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; - kvm_run->debug.arch.exception = BP_VECTOR; - return 0; -} - -static int ud_interception(struct vcpu_svm *svm) -{ - int er; - - er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static void svm_fpu_activate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - clr_exception_intercept(svm, NM_VECTOR); - - svm->vcpu.fpu_active = 1; - update_cr0_intercept(svm); -} - -static int nm_interception(struct vcpu_svm *svm) -{ - svm_fpu_activate(&svm->vcpu); - return 1; -} - -static bool is_erratum_383(void) -{ - int err, i; - u64 value; - - if (!erratum_383_found) - return false; - - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) - return false; - - /* Bit 62 may or may not be set for this mce */ - value &= ~(1ULL << 62); - - if (value != 0xb600000000010015ULL) - return false; - - /* Clear MCi_STATUS registers */ - for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; - - value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); - } - - /* Flush tlb to evict multi-match entries */ - __flush_tlb_all(); - - return true; -} - -static void svm_handle_mce(struct vcpu_svm *svm) -{ - if (is_erratum_383()) { - /* - * Erratum 383 triggered. Guest state is corrupt so kill the - * guest. - */ - pr_err("KVM: Guest triggered AMD Erratum 383\n"); - - kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); - - return; - } - - /* - * On an #MC intercept the MCE handler is not called automatically in - * the host. So do it by hand here. - */ - asm volatile ( - "int $0x12\n"); - /* not sure if we ever come back to this point */ - - return; -} - -static int mc_interception(struct vcpu_svm *svm) -{ - return 1; -} - -static int shutdown_interception(struct vcpu_svm *svm) -{ - struct kvm_run *kvm_run = svm->vcpu.run; - - /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. - */ - clear_page(svm->vmcb); - init_vmcb(svm); - - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; - return 0; -} - -static int io_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? 
*/ - int size, in, string; - unsigned port; - - ++svm->vcpu.stat.io_exits; - string = (io_info & SVM_IOIO_STR_MASK) != 0; - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; - - port = io_info >> 16; - size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); - - return kvm_fast_pio_out(vcpu, size, port); -} - -static int nmi_interception(struct vcpu_svm *svm) -{ - return 1; -} - -static int intr_interception(struct vcpu_svm *svm) -{ - ++svm->vcpu.stat.irq_exits; - return 1; -} - -static int nop_on_interception(struct vcpu_svm *svm) -{ - return 1; -} - -static int halt_interception(struct vcpu_svm *svm) -{ - svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; - skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_halt(&svm->vcpu); -} - -static int vmmcall_interception(struct vcpu_svm *svm) -{ - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - kvm_emulate_hypercall(&svm->vcpu); - return 1; -} - -static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->nested.nested_cr3; -} - -static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 cr3 = svm->nested.nested_cr3; - u64 pdpte; - int ret; - - ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); - if (ret) - return 0; - return pdpte; -} - -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = root; - mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu); -} - -static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - - nested_svm_vmexit(svm); -} - -static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) -{ - int r; - - r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); - - vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; - vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; - vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(); - vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - - return r; -} - -static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) -{ - vcpu->arch.walk_mmu = &vcpu->arch.mmu; -} - -static int nested_svm_check_permissions(struct vcpu_svm *svm) -{ - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - - if (svm->vmcb->save.cpl) { - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } - - return 0; -} - -static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code) -{ - int vmexit; - - if (!is_guest_mode(&svm->vcpu)) - return 0; - - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; - - 
return vmexit; -} - -/* This function returns true if it is save to enable the irq window */ -static inline bool nested_svm_intr(struct vcpu_svm *svm) -{ - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return false; - - /* - * if vmexit was already requested (by intercepted exception - * for instance) do not overwrite it with "external interrupt" - * vmexit. - */ - if (svm->nested.exit_required) - return false; - - svm->vmcb->control.exit_code = SVM_EXIT_INTR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - if (svm->nested.intercept & 1ULL) { - /* - * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemtion disabled. A - * #vmexit emulation might sleep. Only signal request for - * the #vmexit here. - */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return false; - } - - return true; -} - -/* This function returns true if it is save to enable the nmi window */ -static inline bool nested_svm_nmi(struct vcpu_svm *svm) -{ - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) - return true; - - svm->vmcb->control.exit_code = SVM_EXIT_NMI; - svm->nested.exit_required = true; - - return false; -} - -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) -{ - struct page *page; - - might_sleep(); - - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) - goto error; - - *_page = page; - - return kmap(page); - -error: - kvm_release_page_clean(page); - kvm_inject_gp(&svm->vcpu, 0); - - return NULL; -} - -static void nested_svm_unmap(struct page *page) -{ - kunmap(page); - kvm_release_page_dirty(page); -} - -static int nested_svm_intercept_ioio(struct vcpu_svm *svm) -{ - unsigned port; - u8 val, bit; - u64 gpa; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) - return NESTED_EXIT_HOST; - - port = svm->vmcb->control.exit_info_1 >> 16; - gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; - - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); - - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; -} - -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) -{ - u32 offset, msr, value; - int write, mask; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return NESTED_EXIT_HOST; - - msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - offset = svm_msrpm_offset(msr); - write = svm->vmcb->control.exit_info_1 & 1; - mask = 1 << ((2 * (msr & 0xf)) + write); - - if (offset == MSR_INVALID) - return NESTED_EXIT_DONE; - - /* Offset is in 32 bit units but need in 8 bit units */ - offset *= 4; - - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) - return NESTED_EXIT_DONE; - - return (value & mask) ? 
NESTED_EXIT_DONE : NESTED_EXIT_HOST; -} - -static int nested_svm_exit_special(struct vcpu_svm *svm) -{ - u32 exit_code = svm->vmcb->control.exit_code; - - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: - return NESTED_EXIT_HOST; - case SVM_EXIT_NPF: - /* For now we are always handling NPFs when using them */ - if (npt_enabled) - return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) - return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + NM_VECTOR: - nm_interception(svm); - break; - default: - break; - } - - return NESTED_EXIT_CONTINUE; -} - -/* - * If this function returns true, this #vmexit was already handled - */ -static int nested_svm_intercept(struct vcpu_svm *svm) -{ - u32 exit_code = svm->vmcb->control.exit_code; - int vmexit = NESTED_EXIT_HOST; - - switch (exit_code) { - case SVM_EXIT_MSR: - vmexit = nested_svm_exit_handled_msr(svm); - break; - case SVM_EXIT_IOIO: - vmexit = nested_svm_intercept_ioio(svm); - break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0); - if (svm->nested.intercept_cr & bit) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0); - if (svm->nested.intercept_dr & bit) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { - u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_ERR: { - vmexit = NESTED_EXIT_DONE; - break; - } - default: { - u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); - if (svm->nested.intercept & exit_bits) - vmexit = NESTED_EXIT_DONE; - } - } - - return vmexit; -} - -static int nested_svm_exit_handled(struct vcpu_svm *svm) -{ - int vmexit; - - vmexit = nested_svm_intercept(svm); - - if (vmexit == NESTED_EXIT_DONE) - nested_svm_vmexit(svm); - - return vmexit; -} - -static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) -{ - struct vmcb_control_area *dst = &dst_vmcb->control; - struct vmcb_control_area *from = &from_vmcb->control; - - dst->intercept_cr = from->intercept_cr; - dst->intercept_dr = from->intercept_dr; - dst->intercept_exceptions = from->intercept_exceptions; - dst->intercept = from->intercept; - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - dst->asid = from->asid; - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->lbr_ctl = from->lbr_ctl; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = 
svm->vmcb; - struct page *page; - - trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, - vmcb->control.exit_info_1, - vmcb->control.exit_info_2, - vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); - if (!nested_vmcb) - return 1; - - /* Exit Guest-Mode */ - leave_guest_mode(&svm->vcpu); - svm->nested.vmcb = 0; - - /* Give the current vmcb to the guest */ - disable_gif(svm); - - nested_vmcb->save.es = vmcb->save.es; - nested_vmcb->save.cs = vmcb->save.cs; - nested_vmcb->save.ss = vmcb->save.ss; - nested_vmcb->save.ds = vmcb->save.ds; - nested_vmcb->save.gdtr = vmcb->save.gdtr; - nested_vmcb->save.idtr = vmcb->save.idtr; - nested_vmcb->save.efer = svm->vcpu.arch.efer; - nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); - nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); - nested_vmcb->save.cr2 = vmcb->save.cr2; - nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); - nested_vmcb->save.rip = vmcb->save.rip; - nested_vmcb->save.rsp = vmcb->save.rsp; - nested_vmcb->save.rax = vmcb->save.rax; - nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = vmcb->save.dr6; - nested_vmcb->save.cpl = vmcb->save.cpl; - - nested_vmcb->control.int_ctl = vmcb->control.int_ctl; - nested_vmcb->control.int_vector = vmcb->control.int_vector; - nested_vmcb->control.int_state = vmcb->control.int_state; - nested_vmcb->control.exit_code = vmcb->control.exit_code; - nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; - nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; - nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; - nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; - nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; - - /* - * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have - * to make sure that we do not lose injected events. So check event_inj - * here and copy it to exit_int_info if it is valid. - * Exit_int_info and event_inj can't be both valid because the case - * below only happens on a VMRUN instruction intercept which has - * no valid exit_int_info set. 
- */ - if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { - struct vmcb_control_area *nc = &nested_vmcb->control; - - nc->exit_int_info = vmcb->control.event_inj; - nc->exit_int_info_err = vmcb->control.event_inj_err; - } - - nested_vmcb->control.tlb_ctl = 0; - nested_vmcb->control.event_inj = 0; - nested_vmcb->control.event_inj_err = 0; - - /* We always set V_INTR_MASKING and remember the old value in hflags */ - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - - /* Restore the original control entries */ - copy_vmcb_control_area(vmcb, hsave); - - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - svm->nested.nested_cr3 = 0; - - /* Restore selected save entries */ - svm->vmcb->save.es = hsave->save.es; - svm->vmcb->save.cs = hsave->save.cs; - svm->vmcb->save.ss = hsave->save.ss; - svm->vmcb->save.ds = hsave->save.ds; - svm->vmcb->save.gdtr = hsave->save.gdtr; - svm->vmcb->save.idtr = hsave->save.idtr; - kvm_set_rflags(&svm->vcpu, hsave->save.rflags); - svm_set_efer(&svm->vcpu, hsave->save.efer); - svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); - svm_set_cr4(&svm->vcpu, hsave->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); - svm->vmcb->save.dr7 = 0; - svm->vmcb->save.cpl = 0; - svm->vmcb->control.exit_int_info = 0; - - mark_all_dirty(svm->vmcb); - - nested_svm_unmap(page); - - nested_svm_uninit_mmu_context(&svm->vcpu); - kvm_mmu_reset_context(&svm->vcpu); - kvm_mmu_load(&svm->vcpu); - - return 0; -} - -static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) -{ - /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. 
It is omptimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits - */ - int i; - - if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return true; - - for (i = 0; i < MSRPM_OFFSETS; i++) { - u32 value, p; - u64 offset; - - if (msrpm_offsets[i] == 0xffffffff) - break; - - p = msrpm_offsets[i]; - offset = svm->nested.vmcb_msrpm + (p * 4); - - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) - return false; - - svm->nested.msrpm[p] = svm->msrpm[p] | value; - } - - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); - - return true; -} - -static bool nested_vmcb_checks(struct vmcb *vmcb) -{ - if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) - return false; - - if (vmcb->control.asid == 0) - return false; - - if (vmcb->control.nested_ctl && !npt_enabled) - return false; - - return true; -} - -static bool nested_svm_vmrun(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; - struct vmcb *hsave = svm->nested.hsave; - struct vmcb *vmcb = svm->vmcb; - struct page *page; - u64 vmcb_gpa; - - vmcb_gpa = svm->vmcb->save.rax; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return false; - - if (!nested_vmcb_checks(nested_vmcb)) { - nested_vmcb->control.exit_code = SVM_EXIT_ERR; - nested_vmcb->control.exit_code_hi = 0; - nested_vmcb->control.exit_info_1 = 0; - nested_vmcb->control.exit_info_2 = 0; - - nested_svm_unmap(page); - - return false; - } - - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, - nested_vmcb->save.rip, - nested_vmcb->control.int_ctl, - nested_vmcb->control.event_inj, - nested_vmcb->control.nested_ctl); - - trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff, - nested_vmcb->control.intercept_cr >> 16, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept); - - /* Clear internal status */ - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - /* - * Save the old vmcb, so we don't need to pick what we save, but can - * restore everything when a VMEXIT occurs - */ - hsave->save.es = vmcb->save.es; - hsave->save.cs = vmcb->save.cs; - hsave->save.ss = vmcb->save.ss; - hsave->save.ds = vmcb->save.ds; - hsave->save.gdtr = vmcb->save.gdtr; - hsave->save.idtr = vmcb->save.idtr; - hsave->save.efer = svm->vcpu.arch.efer; - hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = kvm_get_rflags(&svm->vcpu); - hsave->save.rip = kvm_rip_read(&svm->vcpu); - hsave->save.rsp = vmcb->save.rsp; - hsave->save.rax = vmcb->save.rax; - if (npt_enabled) - hsave->save.cr3 = vmcb->save.cr3; - else - hsave->save.cr3 = kvm_read_cr3(&svm->vcpu); - - copy_vmcb_control_area(hsave, vmcb); - - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - - if (nested_vmcb->control.nested_ctl) { - kvm_mmu_unload(&svm->vcpu); - svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; - nested_svm_init_mmu_context(&svm->vcpu); - } - - /* Load the nested guest state */ - svm->vmcb->save.es = nested_vmcb->save.es; - svm->vmcb->save.cs = nested_vmcb->save.cs; - svm->vmcb->save.ss = nested_vmcb->save.ss; - svm->vmcb->save.ds = nested_vmcb->save.ds; - svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; - svm->vmcb->save.idtr = nested_vmcb->save.idtr; - kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); - svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); - svm_set_cr0(&svm->vcpu, 
nested_vmcb->save.cr0); - svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = nested_vmcb->save.cr3; - svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); - kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); - kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); - - /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = nested_vmcb->save.rax; - svm->vmcb->save.rsp = nested_vmcb->save.rsp; - svm->vmcb->save.rip = nested_vmcb->save.rip; - svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vmcb->save.dr6 = nested_vmcb->save.dr6; - svm->vmcb->save.cpl = nested_vmcb->save.cpl; - - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; - svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; - - /* cache intercepts */ - svm->nested.intercept_cr = nested_vmcb->control.intercept_cr; - svm->nested.intercept_dr = nested_vmcb->control.intercept_dr; - svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; - svm->nested.intercept = nested_vmcb->control.intercept; - - svm_flush_tlb(&svm->vcpu); - svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; - if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) - svm->vcpu.arch.hflags |= HF_VINTR_MASK; - else - svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { - /* We only want the cr8 intercept bits of the guest */ - clr_cr_intercept(svm, INTERCEPT_CR8_READ); - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - } - - /* We don't want to see VMMCALLs from a nested guest */ - clr_intercept(svm, INTERCEPT_VMMCALL); - - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; - svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; - svm->vmcb->control.int_state = nested_vmcb->control.int_state; - svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; - svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; - svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - - nested_svm_unmap(page); - - /* Enter Guest-Mode */ - enter_guest_mode(&svm->vcpu); - - /* - * Merge guest and host intercepts - must be called with vcpu in - * guest-mode to take affect here - */ - recalc_intercepts(svm); - - svm->nested.vmcb = vmcb_gpa; - - enable_gif(svm); - - mark_all_dirty(svm->vmcb); - - return true; -} - -static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) -{ - to_vmcb->save.fs = from_vmcb->save.fs; - to_vmcb->save.gs = from_vmcb->save.gs; - to_vmcb->save.tr = from_vmcb->save.tr; - to_vmcb->save.ldtr = from_vmcb->save.ldtr; - to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base; - to_vmcb->save.star = from_vmcb->save.star; - to_vmcb->save.lstar = from_vmcb->save.lstar; - to_vmcb->save.cstar = from_vmcb->save.cstar; - to_vmcb->save.sfmask = from_vmcb->save.sfmask; - to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; - to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; - to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; -} - -static int vmload_interception(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; - struct page *page; - - if 
(nested_svm_check_permissions(svm)) - return 1; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return 1; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - - nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(page); - - return 1; -} - -static int vmsave_interception(struct vcpu_svm *svm) -{ - struct vmcb *nested_vmcb; - struct page *page; - - if (nested_svm_check_permissions(svm)) - return 1; - - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); - if (!nested_vmcb) - return 1; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - - nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(page); - - return 1; -} - -static int vmrun_interception(struct vcpu_svm *svm) -{ - if (nested_svm_check_permissions(svm)) - return 1; - - /* Save rip after vmrun instruction */ - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); - - if (!nested_svm_vmrun(svm)) - return 1; - - if (!nested_svm_vmrun_msrpm(svm)) - goto failed; - - return 1; - -failed: - - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - - return 1; -} - -static int stgi_interception(struct vcpu_svm *svm) -{ - if (nested_svm_check_permissions(svm)) - return 1; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - - enable_gif(svm); - - return 1; -} - -static int clgi_interception(struct vcpu_svm *svm) -{ - if (nested_svm_check_permissions(svm)) - return 1; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - - disable_gif(svm); - - /* After a CLGI no interrupts should come */ - svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - - mark_dirty(svm->vmcb, VMCB_INTR); - - return 1; -} - -static int invlpga_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - - trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX], - vcpu->arch.regs[VCPU_REGS_RAX]); - - /* Let's treat INVLPGA the same as INVLPG (can be optimized!) 
*/ - kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - return 1; -} - -static int skinit_interception(struct vcpu_svm *svm) -{ - trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]); - - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static int xsetbv_interception(struct vcpu_svm *svm) -{ - u64 new_bv = kvm_read_edx_eax(&svm->vcpu); - u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - - if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - } - - return 1; -} - -static int invalid_op_interception(struct vcpu_svm *svm) -{ - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; -} - -static int task_switch_interception(struct vcpu_svm *svm) -{ - u16 tss_selector; - int reason; - int int_type = svm->vmcb->control.exit_int_info & - SVM_EXITINTINFO_TYPE_MASK; - int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; - uint32_t type = - svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; - uint32_t idt_v = - svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; - bool has_error_code = false; - u32 error_code = 0; - - tss_selector = (u16)svm->vmcb->control.exit_info_1; - - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) - reason = TASK_SWITCH_IRET; - else if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) - reason = TASK_SWITCH_JMP; - else if (idt_v) - reason = TASK_SWITCH_GATE; - else - reason = TASK_SWITCH_CALL; - - if (reason == TASK_SWITCH_GATE) { - switch (type) { - case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = false; - break; - case SVM_EXITINTINFO_TYPE_EXEPT: - if (svm->vmcb->control.exit_info_2 & - (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { - has_error_code = true; - error_code = - (u32)svm->vmcb->control.exit_info_2; - } - kvm_clear_exception_queue(&svm->vcpu); - break; - case SVM_EXITINTINFO_TYPE_INTR: - kvm_clear_interrupt_queue(&svm->vcpu); - break; - default: - break; - } - } - - if (reason != TASK_SWITCH_GATE || - int_type == SVM_EXITINTINFO_TYPE_SOFT || - (int_type == SVM_EXITINTINFO_TYPE_EXEPT && - (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) - skip_emulated_instruction(&svm->vcpu); - - if (int_type != SVM_EXITINTINFO_TYPE_SOFT) - int_vec = -1; - - if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; - } - return 1; -} - -static int cpuid_interception(struct vcpu_svm *svm) -{ - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; -} - -static int iret_interception(struct vcpu_svm *svm) -{ - ++svm->vcpu.stat.nmi_window_exits; - clr_intercept(svm, INTERCEPT_IRET); - svm->vcpu.arch.hflags |= HF_IRET_MASK; - svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); - return 1; -} - -static int invlpg_interception(struct vcpu_svm *svm) -{ - if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; - - kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); - return 1; -} - -static int emulate_on_interception(struct vcpu_svm *svm) -{ - return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 
-} - -static int rdpmc_interception(struct vcpu_svm *svm) -{ - int err; - - if (!static_cpu_has(X86_FEATURE_NRIPS)) - return emulate_on_interception(svm); - - err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; -} - -bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) -{ - unsigned long cr0 = svm->vcpu.arch.cr0; - bool ret = false; - u64 intercept; - - intercept = svm->nested.intercept; - - if (!is_guest_mode(&svm->vcpu) || - (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) - return false; - - cr0 &= ~SVM_CR0_SELECTIVE_MASK; - val &= ~SVM_CR0_SELECTIVE_MASK; - - if (cr0 ^ val) { - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); - } - - return ret; -} - -#define CR_VALID (1ULL << 63) - -static int cr_interception(struct vcpu_svm *svm) -{ - int reg, cr; - unsigned long val; - int err; - - if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); - - if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) - return emulate_on_interception(svm); - - reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; - cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; - - err = 0; - if (cr >= 16) { /* mov to cr */ - cr -= 16; - val = kvm_register_read(&svm->vcpu, reg); - switch (cr) { - case 0: - if (!check_selective_cr0_intercepted(svm, val)) - err = kvm_set_cr0(&svm->vcpu, val); - else - return 1; - - break; - case 3: - err = kvm_set_cr3(&svm->vcpu, val); - break; - case 4: - err = kvm_set_cr4(&svm->vcpu, val); - break; - case 8: - err = kvm_set_cr8(&svm->vcpu, val); - break; - default: - WARN(1, "unhandled write to CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - } else { /* mov from cr */ - switch (cr) { - case 0: - val = kvm_read_cr0(&svm->vcpu); - break; - case 2: - val = svm->vcpu.arch.cr2; - break; - case 3: - val = kvm_read_cr3(&svm->vcpu); - break; - case 4: - val = kvm_read_cr4(&svm->vcpu); - break; - case 8: - val = kvm_get_cr8(&svm->vcpu); - break; - default: - WARN(1, "unhandled read from CR%d", cr); - kvm_queue_exception(&svm->vcpu, UD_VECTOR); - return 1; - } - kvm_register_write(&svm->vcpu, reg, val); - } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; -} - -static int dr_interception(struct vcpu_svm *svm) -{ - int reg, dr; - unsigned long val; - int err; - - if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) - return emulate_on_interception(svm); - - reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; - dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; - - if (dr >= 16) { /* mov to DRn */ - val = kvm_register_read(&svm->vcpu, reg); - kvm_set_dr(&svm->vcpu, dr - 16, val); - } else { - err = kvm_get_dr(&svm->vcpu, dr, &val); - if (!err) - kvm_register_write(&svm->vcpu, reg, val); - } - - skip_emulated_instruction(&svm->vcpu); - - return 1; -} - -static int cr8_write_interception(struct vcpu_svm *svm) -{ - struct kvm_run *kvm_run = svm->vcpu.run; - int r; - - u8 cr8_prev = kvm_get_cr8(&svm->vcpu); - /* instruction emulation calls kvm_set_cr8() */ - r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) { - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - return r; - } - if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) - return r; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; - return 0; -} - -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) -{ - struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); -} - -static 
int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - switch (ecx) { - case MSR_IA32_TSC: { - *data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); - - break; - } - case MSR_STAR: - *data = svm->vmcb->save.star; - break; -#ifdef CONFIG_X86_64 - case MSR_LSTAR: - *data = svm->vmcb->save.lstar; - break; - case MSR_CSTAR: - *data = svm->vmcb->save.cstar; - break; - case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; - break; - case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; - break; -#endif - case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; - break; - case MSR_IA32_SYSENTER_EIP: - *data = svm->sysenter_eip; - break; - case MSR_IA32_SYSENTER_ESP: - *data = svm->sysenter_esp; - break; - /* - * Nobody will change the following 5 values in the VMCB so we can - * safely return them on rdmsr. They will always be 0 until LBRV is - * implemented. - */ - case MSR_IA32_DEBUGCTLMSR: - *data = svm->vmcb->save.dbgctl; - break; - case MSR_IA32_LASTBRANCHFROMIP: - *data = svm->vmcb->save.br_from; - break; - case MSR_IA32_LASTBRANCHTOIP: - *data = svm->vmcb->save.br_to; - break; - case MSR_IA32_LASTINTFROMIP: - *data = svm->vmcb->save.last_excp_from; - break; - case MSR_IA32_LASTINTTOIP: - *data = svm->vmcb->save.last_excp_to; - break; - case MSR_VM_HSAVE_PA: - *data = svm->nested.hsave_msr; - break; - case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; - break; - case MSR_IA32_UCODE_REV: - *data = 0x01000065; - break; - default: - return kvm_get_msr_common(vcpu, ecx, data); - } - return 0; -} - -static int rdmsr_interception(struct vcpu_svm *svm) -{ - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data; - - if (svm_get_msr(&svm->vcpu, ecx, &data)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(&svm->vcpu, 0); - } else { - trace_kvm_msr_read(ecx, data); - - svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; - svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - skip_emulated_instruction(&svm->vcpu); - } - return 1; -} - -static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int svm_dis, chg_mask; - - if (data & ~SVM_VM_CR_VALID_MASK) - return 1; - - chg_mask = SVM_VM_CR_VALID_MASK; - - if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) - chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); - - svm->nested.vm_cr_msr &= ~chg_mask; - svm->nested.vm_cr_msr |= (data & chg_mask); - - svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; - - /* check for svm_disable while efer.svme is set */ - if (svm_dis && (vcpu->arch.efer & EFER_SVME)) - return 1; - - return 0; -} - -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - switch (ecx) { - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); - break; - case MSR_STAR: - svm->vmcb->save.star = data; - break; -#ifdef CONFIG_X86_64 - case MSR_LSTAR: - svm->vmcb->save.lstar = data; - break; - case MSR_CSTAR: - svm->vmcb->save.cstar = data; - break; - case MSR_KERNEL_GS_BASE: - svm->vmcb->save.kernel_gs_base = data; - break; - case MSR_SYSCALL_MASK: - svm->vmcb->save.sfmask = data; - break; -#endif - case MSR_IA32_SYSENTER_CS: - svm->vmcb->save.sysenter_cs = data; - break; - case MSR_IA32_SYSENTER_EIP: - svm->sysenter_eip = data; - svm->vmcb->save.sysenter_eip = data; - break; - case MSR_IA32_SYSENTER_ESP: - svm->sysenter_esp = data; - svm->vmcb->save.sysenter_esp = data; - 
break; - case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); - break; - } - if (data & DEBUGCTL_RESERVED_BITS) - return 1; - - svm->vmcb->save.dbgctl = data; - mark_dirty(svm->vmcb, VMCB_LBR); - if (data & (1ULL<<0)) - svm_enable_lbrv(svm); - else - svm_disable_lbrv(svm); - break; - case MSR_VM_HSAVE_PA: - svm->nested.hsave_msr = data; - break; - case MSR_VM_CR: - return svm_set_vm_cr(vcpu, data); - case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); - break; - default: - return kvm_set_msr_common(vcpu, ecx, data); - } - return 0; -} - -static int wrmsr_interception(struct vcpu_svm *svm) -{ - u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(&svm->vcpu, 0); - } else { - trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(&svm->vcpu); - } - return 1; -} - -static int msr_interception(struct vcpu_svm *svm) -{ - if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm); - else - return rdmsr_interception(svm); -} - -static int interrupt_window_interception(struct vcpu_svm *svm) -{ - struct kvm_run *kvm_run = svm->vcpu.run; - - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - ++svm->vcpu.stat.irq_window_exits; - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - - return 1; -} - -static int pause_interception(struct vcpu_svm *svm) -{ - kvm_vcpu_on_spin(&(svm->vcpu)); - return 1; -} - -static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { - [SVM_EXIT_READ_CR0] = cr_interception, - [SVM_EXIT_READ_CR3] = cr_interception, - [SVM_EXIT_READ_CR4] = cr_interception, - [SVM_EXIT_READ_CR8] = cr_interception, - [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr_interception, - [SVM_EXIT_WRITE_CR3] = cr_interception, - [SVM_EXIT_WRITE_CR4] = cr_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = dr_interception, - [SVM_EXIT_READ_DR1] = dr_interception, - [SVM_EXIT_READ_DR2] = dr_interception, - [SVM_EXIT_READ_DR3] = dr_interception, - [SVM_EXIT_READ_DR4] = dr_interception, - [SVM_EXIT_READ_DR5] = dr_interception, - [SVM_EXIT_READ_DR6] = dr_interception, - [SVM_EXIT_READ_DR7] = dr_interception, - [SVM_EXIT_WRITE_DR0] = dr_interception, - [SVM_EXIT_WRITE_DR1] = dr_interception, - [SVM_EXIT_WRITE_DR2] = dr_interception, - [SVM_EXIT_WRITE_DR3] = dr_interception, - [SVM_EXIT_WRITE_DR4] = dr_interception, - [SVM_EXIT_WRITE_DR5] = dr_interception, - [SVM_EXIT_WRITE_DR6] = dr_interception, - [SVM_EXIT_WRITE_DR7] = dr_interception, - [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, - [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, - [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, - [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, - [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = intr_interception, - [SVM_EXIT_NMI] = 
nmi_interception, - [SVM_EXIT_SMI] = nop_on_interception, - [SVM_EXIT_INIT] = nop_on_interception, - [SVM_EXIT_VINTR] = interrupt_window_interception, - [SVM_EXIT_RDPMC] = rdpmc_interception, - [SVM_EXIT_CPUID] = cpuid_interception, - [SVM_EXIT_IRET] = iret_interception, - [SVM_EXIT_INVD] = emulate_on_interception, - [SVM_EXIT_PAUSE] = pause_interception, - [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = invlpg_interception, - [SVM_EXIT_INVLPGA] = invlpga_interception, - [SVM_EXIT_IOIO] = io_interception, - [SVM_EXIT_MSR] = msr_interception, - [SVM_EXIT_TASK_SWITCH] = task_switch_interception, - [SVM_EXIT_SHUTDOWN] = shutdown_interception, - [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = vmmcall_interception, - [SVM_EXIT_VMLOAD] = vmload_interception, - [SVM_EXIT_VMSAVE] = vmsave_interception, - [SVM_EXIT_STGI] = stgi_interception, - [SVM_EXIT_CLGI] = clgi_interception, - [SVM_EXIT_SKINIT] = skinit_interception, - [SVM_EXIT_WBINVD] = emulate_on_interception, - [SVM_EXIT_MONITOR] = invalid_op_interception, - [SVM_EXIT_MWAIT] = invalid_op_interception, - [SVM_EXIT_XSETBV] = xsetbv_interception, - [SVM_EXIT_NPF] = pf_interception, -}; - -static void dump_vmcb(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; - - pr_err("VMCB Control Area:\n"); - pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); - pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); - pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); - pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); - pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); - pr_err("%-20s%016llx\n", "intercepts:", control->intercept); - pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); - pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); - pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); - pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); - pr_err("%-20s%d\n", "asid:", control->asid); - pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); - pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); - pr_err("%-20s%08x\n", "int_vector:", control->int_vector); - pr_err("%-20s%08x\n", "int_state:", control->int_state); - pr_err("%-20s%08x\n", "exit_code:", control->exit_code); - pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); - pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); - pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); - pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); - pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); - pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); - pr_err("%-20s%08x\n", "event_inj:", control->event_inj); - pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); - pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); - pr_err("VMCB State Save Area:\n"); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "es:", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "cs:", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "ss:", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: 
%016llx\n", - "ds:", - save->ds.selector, save->ds.attrib, - save->ds.limit, save->ds.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "fs:", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "gs:", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "gdtr:", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "ldtr:", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "idtr:", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", - "tr:", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); - pr_err("%-15s %016llx %-13s %016llx\n", - "cr0:", save->cr0, "cr2:", save->cr2); - pr_err("%-15s %016llx %-13s %016llx\n", - "cr3:", save->cr3, "cr4:", save->cr4); - pr_err("%-15s %016llx %-13s %016llx\n", - "dr6:", save->dr6, "dr7:", save->dr7); - pr_err("%-15s %016llx %-13s %016llx\n", - "rip:", save->rip, "rflags:", save->rflags); - pr_err("%-15s %016llx %-13s %016llx\n", - "rsp:", save->rsp, "rax:", save->rax); - pr_err("%-15s %016llx %-13s %016llx\n", - "star:", save->star, "lstar:", save->lstar); - pr_err("%-15s %016llx %-13s %016llx\n", - "cstar:", save->cstar, "sfmask:", save->sfmask); - pr_err("%-15s %016llx %-13s %016llx\n", - "kernel_gs_base:", save->kernel_gs_base, - "sysenter_cs:", save->sysenter_cs); - pr_err("%-15s %016llx %-13s %016llx\n", - "sysenter_esp:", save->sysenter_esp, - "sysenter_eip:", save->sysenter_eip); - pr_err("%-15s %016llx %-13s %016llx\n", - "gpat:", save->g_pat, "dbgctl:", save->dbgctl); - pr_err("%-15s %016llx %-13s %016llx\n", - "br_from:", save->br_from, "br_to:", save->br_to); - pr_err("%-15s %016llx %-13s %016llx\n", - "excp_from:", save->last_excp_from, - "excp_to:", save->last_excp_to); -} - -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; - - *info1 = control->exit_info_1; - *info2 = control->exit_info_2; -} - -static int handle_exit(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_run *kvm_run = vcpu->run; - u32 exit_code = svm->vmcb->control.exit_code; - - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; - - if (unlikely(svm->nested.exit_required)) { - nested_svm_vmexit(svm); - svm->nested.exit_required = false; - - return 1; - } - - if (is_guest_mode(vcpu)) { - int vmexit; - - trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, - svm->vmcb->control.exit_info_1, - svm->vmcb->control.exit_info_2, - svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err, - KVM_ISA_SVM); - - vmexit = nested_svm_exit_special(svm); - - if (vmexit == NESTED_EXIT_CONTINUE) - vmexit = nested_svm_exit_handled(svm); - - if (vmexit == NESTED_EXIT_DONE) - return 1; - } - - svm_complete_interrupts(svm); - - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason - = svm->vmcb->control.exit_code; - pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); - dump_vmcb(vcpu); - 
return 0; - } - - if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && - exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) - printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " - "exit_code 0x%x\n", - __func__, svm->vmcb->control.exit_int_info, - exit_code); - - if (exit_code >= ARRAY_SIZE(svm_exit_handlers) - || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; - } - - return svm_exit_handlers[exit_code](svm); -} - -static void reload_tss(struct kvm_vcpu *vcpu) -{ - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - sd->tss_desc->type = 9; /* available 32/64-bit TSS */ - load_TR_desc(); -} - -static void pre_svm_run(struct vcpu_svm *svm) -{ - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - - /* FIXME: handle wraparound of asid_generation */ - if (svm->asid_generation != sd->asid_generation) - new_asid(svm, sd); -} - -static void svm_inject_nmi(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; - vcpu->arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); - ++vcpu->stat.nmi_injections; -} - -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) -{ - struct vmcb_control_area *control; - - control = &svm->vmcb->control; - control->int_vector = irq; - control->int_ctl &= ~V_INTR_PRIO_MASK; - control->int_ctl |= V_IRQ_MASK | - ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); -} - -static void svm_set_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - BUG_ON(!(gif_set(svm))); - - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); - ++vcpu->stat.irq_injections; - - svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; -} - -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - - if (irr == -1) - return; - - if (tpr >= irr) - set_cr_intercept(svm, INTERCEPT_CR8_WRITE); -} - -static int svm_nmi_allowed(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int ret; - ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); - ret = ret && gif_set(svm) && nested_svm_nmi(svm); - - return ret; -} - -static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); -} - -static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (masked) { - svm->vcpu.arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); - } else { - svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - clr_intercept(svm, INTERCEPT_IRET); - } -} - -static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; - int ret; - - if (!gif_set(svm) || - (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) - return 0; - - ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); - - if (is_guest_mode(vcpu)) - return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); - - return ret; -} - -static void 
enable_irq_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* - * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes - * 1, because that's a separate STGI/VMRUN intercept. The next time we - * get that intercept, this function will be called again though and - * we'll get the vintr intercept. - */ - if (gif_set(svm) && nested_svm_intr(svm)) { - svm_set_vintr(svm); - svm_inject_irq(svm, 0x0); - } -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) - == HF_NMI_MASK) - return; /* IRET will cause a vm exit */ - - /* - * Something prevents NMI from being injected. Single-step over the - * possible problem (IRET or exception injection or interrupt shadow) - */ - svm->nmi_singlestep = true; - svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_intercept(vcpu); -} - -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static void svm_flush_tlb(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) - svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - else - svm->asid_generation--; -} - -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) -{ -} - -static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - - if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { - int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; - kvm_set_cr8(vcpu, cr8); - } -} - -static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 cr8; - - if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK)) - return; - - cr8 = kvm_get_cr8(vcpu); - svm->vmcb->control.int_ctl &= ~V_TPR_MASK; - svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; -} - -static void svm_complete_interrupts(struct vcpu_svm *svm) -{ - u8 vector; - int type; - u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; - - svm->int3_injected = 0; - - /* - * If we've made progress since setting HF_IRET_MASK, we've - * executed an IRET and can allow NMI injection. - */ - if ((svm->vcpu.arch.hflags & HF_IRET_MASK) - && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { - svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - } - - svm->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&svm->vcpu); - kvm_clear_interrupt_queue(&svm->vcpu); - - if (!(exitintinfo & SVM_EXITINTINFO_VALID)) - return; - - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - - vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; - type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; - - switch (type) { - case SVM_EXITINTINFO_TYPE_NMI: - svm->vcpu.arch.nmi_injected = true; - break; - case SVM_EXITINTINFO_TYPE_EXEPT: - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. 
- */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) - kvm_rip_write(&svm->vcpu, - kvm_rip_read(&svm->vcpu) - - int3_injected); - break; - } - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(&svm->vcpu, vector, err); - - } else - kvm_requeue_exception(&svm->vcpu, vector); - break; - case SVM_EXITINTINFO_TYPE_INTR: - kvm_queue_interrupt(&svm->vcpu, vector, false); - break; - default: - break; - } -} - -static void svm_cancel_injection(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - - control->exit_int_info = control->event_inj; - control->exit_int_info_err = control->event_inj_err; - control->event_inj = 0; - svm_complete_interrupts(svm); -} - -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif - -static void svm_vcpu_run(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - - /* - * A vmexit emulation is required before the vcpu can be executed - * again. - */ - if (unlikely(svm->nested.exit_required)) - return; - - pre_svm_run(svm); - - sync_lapic_to_cr8(vcpu); - - svm->vmcb->save.cr2 = vcpu->arch.cr2; - - clgi(); - - local_irq_enable(); - - asm volatile ( - "push %%"R"bp; \n\t" - "mov %c[rbx](%[svm]), %%"R"bx \n\t" - "mov %c[rcx](%[svm]), %%"R"cx \n\t" - "mov %c[rdx](%[svm]), %%"R"dx \n\t" - "mov %c[rsi](%[svm]), %%"R"si \n\t" - "mov %c[rdi](%[svm]), %%"R"di \n\t" - "mov %c[rbp](%[svm]), %%"R"bp \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%[svm]), %%r8 \n\t" - "mov %c[r9](%[svm]), %%r9 \n\t" - "mov %c[r10](%[svm]), %%r10 \n\t" - "mov %c[r11](%[svm]), %%r11 \n\t" - "mov %c[r12](%[svm]), %%r12 \n\t" - "mov %c[r13](%[svm]), %%r13 \n\t" - "mov %c[r14](%[svm]), %%r14 \n\t" - "mov %c[r15](%[svm]), %%r15 \n\t" -#endif - - /* Enter guest mode */ - "push %%"R"ax \n\t" - "mov %c[vmcb](%[svm]), %%"R"ax \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%"R"ax \n\t" - - /* Save guest registers, load host registers */ - "mov %%"R"bx, %c[rbx](%[svm]) \n\t" - "mov %%"R"cx, %c[rcx](%[svm]) \n\t" - "mov %%"R"dx, %c[rdx](%[svm]) \n\t" - "mov %%"R"si, %c[rsi](%[svm]) \n\t" - "mov %%"R"di, %c[rdi](%[svm]) \n\t" - "mov %%"R"bp, %c[rbp](%[svm]) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%[svm]) \n\t" - "mov %%r9, %c[r9](%[svm]) \n\t" - "mov %%r10, %c[r10](%[svm]) \n\t" - "mov %%r11, %c[r11](%[svm]) \n\t" - "mov %%r12, %c[r12](%[svm]) \n\t" - "mov %%r13, %c[r13](%[svm]) \n\t" - "mov %%r14, %c[r14](%[svm]) \n\t" - "mov %%r15, %c[r15](%[svm]) \n\t" -#endif - "pop %%"R"bp" - : - : [svm]"a"(svm), - [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), - [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP])) -#ifdef CONFIG_X86_64 - , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct 
vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) -#endif - : "cc", "memory" - , R"bx", R"cx", R"dx", R"si", R"di" -#ifdef CONFIG_X86_64 - , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#endif - ); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif - - reload_tss(vcpu); - - local_irq_disable(); - - vcpu->arch.cr2 = svm->vmcb->save.cr2; - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - - trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); - - if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_handle_nmi(&svm->vcpu); - - stgi(); - - /* Any pending NMI will happen here */ - - if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_after_handle_nmi(&svm->vcpu); - - sync_cr8_to_lapic(vcpu); - - svm->next_rip = 0; - - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; - - /* if exit due to PF check for async PF */ - if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); - - if (npt_enabled) { - vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); - vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); - } - - /* - * We need to handle MC intercepts here before the vcpu has a chance to - * change the physical cpu - */ - if (unlikely(svm->vmcb->control.exit_code == - SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(svm); - - mark_all_clean(svm->vmcb); -} - -#undef R - -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->save.cr3 = root; - mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); -} - -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = root; - mark_dirty(svm->vmcb, VMCB_NPT); - - /* Also sync guest cr3 here in case we live migrate */ - svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); - mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu); -} - -static int is_disabled(void) -{ - u64 vm_cr; - - rdmsrl(MSR_VM_CR, vm_cr); - if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) - return 1; - - return 0; -} - -static void -svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xd9; -} - -static void svm_check_processor_compat(void *rtn) -{ - *(int *)rtn = 0; -} - -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - -static void svm_cpuid_update(struct kvm_vcpu *vcpu) -{ -} - -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - switch (func) { - case 0x80000001: - if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ - break; - case 0x8000000A: - entry->eax = 1; /* SVM revision 1 */ - entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper - ASID emulation to nested 
SVM */ - entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= SVM_FEATURE_NRIP; - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - entry->edx |= SVM_FEATURE_NPT; - - break; - } -} - -static int svm_get_lpage_level(void) -{ - return PT_PDPE_LEVEL; -} - -static bool svm_rdtscp_supported(void) -{ - return false; -} - -static bool svm_has_wbinvd_exit(void) -{ - return true; -} - -static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - set_exception_intercept(svm, NM_VECTOR); - update_cr0_intercept(svm); -} - -#define PRE_EX(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_PRE_EXCEPT, } -#define POST_EX(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_POST_EXCEPT, } -#define POST_MEM(exit) { .exit_code = (exit), \ - .stage = X86_ICPT_POST_MEMACCESS, } - -static struct __x86_intercept { - u32 exit_code; - enum x86_intercept_stage stage; -} x86_intercept_map[] = { - [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), - [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), - [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), - [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), - [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), - [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), - [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), - [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), - [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), - [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), - [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), - [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), - [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), - [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), - [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), - [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), - [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), - [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), - [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), - [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), - [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), - [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), - [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), - [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), - [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), - [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), - [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), - [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), - [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), - [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), - [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), - [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), - [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), - [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), - [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), - [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), - [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), - [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), - [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), - [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), - [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), - [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), - [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), - [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), - [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), - [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), -}; - -#undef PRE_EX -#undef POST_EX -#undef POST_MEM - -static int 
svm_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage) -{ - struct vcpu_svm *svm = to_svm(vcpu); - int vmexit, ret = X86EMUL_CONTINUE; - struct __x86_intercept icpt_info; - struct vmcb *vmcb = svm->vmcb; - - if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) - goto out; - - icpt_info = x86_intercept_map[info->intercept]; - - if (stage != icpt_info.stage) - goto out; - - switch (icpt_info.exit_code) { - case SVM_EXIT_READ_CR0: - if (info->intercept == x86_intercept_cr_read) - icpt_info.exit_code += info->modrm_reg; - break; - case SVM_EXIT_WRITE_CR0: { - unsigned long cr0, val; - u64 intercept; - - if (info->intercept == x86_intercept_cr_write) - icpt_info.exit_code += info->modrm_reg; - - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) - break; - - intercept = svm->nested.intercept; - - if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) - break; - - cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; - val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; - - if (info->intercept == x86_intercept_lmsw) { - cr0 &= 0xfUL; - val &= 0xfUL; - /* lmsw can't clear PE - catch this here */ - if (cr0 & X86_CR0_PE) - val |= X86_CR0_PE; - } - - if (cr0 ^ val) - icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; - - break; - } - case SVM_EXIT_READ_DR0: - case SVM_EXIT_WRITE_DR0: - icpt_info.exit_code += info->modrm_reg; - break; - case SVM_EXIT_MSR: - if (info->intercept == x86_intercept_wrmsr) - vmcb->control.exit_info_1 = 1; - else - vmcb->control.exit_info_1 = 0; - break; - case SVM_EXIT_PAUSE: - /* - * We get this for the NOP opcode, but PAUSE is REP NOP, - * so check for the REP prefix here - */ - if (info->rep_prefix != REPE_PREFIX) - goto out; - case SVM_EXIT_IOIO: { - u64 exit_info; - u32 bytes; - - exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; - - if (info->intercept == x86_intercept_in || - info->intercept == x86_intercept_ins) { - exit_info |= SVM_IOIO_TYPE_MASK; - bytes = info->src_bytes; - } else { - bytes = info->dst_bytes; - } - - if (info->intercept == x86_intercept_outs || - info->intercept == x86_intercept_ins) - exit_info |= SVM_IOIO_STR_MASK; - - if (info->rep_prefix) - exit_info |= SVM_IOIO_REP_MASK; - - bytes = min(bytes, 4u); - - exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; - - exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); - - vmcb->control.exit_info_1 = exit_info; - vmcb->control.exit_info_2 = info->next_rip; - - break; - } - default: - break; - } - - vmcb->control.next_rip = info->next_rip; - vmcb->control.exit_code = icpt_info.exit_code; - vmexit = nested_svm_exit_handled(svm); - - ret = (vmexit == NESTED_EXIT_DONE) ? 
X86EMUL_INTERCEPTED - : X86EMUL_CONTINUE; - -out: - return ret; -} - -static struct kvm_x86_ops svm_x86_ops = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, - .hardware_setup = svm_hardware_setup, - .hardware_unsetup = svm_hardware_unsetup, - .check_processor_compatibility = svm_check_processor_compat, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, - - .vcpu_create = svm_create_vcpu, - .vcpu_free = svm_free_vcpu, - .vcpu_reset = svm_vcpu_reset, - - .prepare_guest_switch = svm_prepare_guest_switch, - .vcpu_load = svm_vcpu_load, - .vcpu_put = svm_vcpu_put, - - .set_guest_debug = svm_guest_debug, - .get_msr = svm_get_msr, - .set_msr = svm_set_msr, - .get_segment_base = svm_get_segment_base, - .get_segment = svm_get_segment, - .set_segment = svm_set_segment, - .get_cpl = svm_get_cpl, - .get_cs_db_l_bits = kvm_get_cs_db_l_bits, - .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr3 = svm_decache_cr3, - .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, - .set_cr0 = svm_set_cr0, - .set_cr3 = svm_set_cr3, - .set_cr4 = svm_set_cr4, - .set_efer = svm_set_efer, - .get_idt = svm_get_idt, - .set_idt = svm_set_idt, - .get_gdt = svm_get_gdt, - .set_gdt = svm_set_gdt, - .set_dr7 = svm_set_dr7, - .cache_reg = svm_cache_reg, - .get_rflags = svm_get_rflags, - .set_rflags = svm_set_rflags, - .fpu_activate = svm_fpu_activate, - .fpu_deactivate = svm_fpu_deactivate, - - .tlb_flush = svm_flush_tlb, - - .run = svm_vcpu_run, - .handle_exit = handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, - .set_interrupt_shadow = svm_set_interrupt_shadow, - .get_interrupt_shadow = svm_get_interrupt_shadow, - .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, - .queue_exception = svm_queue_exception, - .cancel_injection = svm_cancel_injection, - .interrupt_allowed = svm_interrupt_allowed, - .nmi_allowed = svm_nmi_allowed, - .get_nmi_mask = svm_get_nmi_mask, - .set_nmi_mask = svm_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, - - .set_tss_addr = svm_set_tss_addr, - .get_tdp_level = get_npt_level, - .get_mt_mask = svm_get_mt_mask, - - .get_exit_info = svm_get_exit_info, - - .get_lpage_level = svm_get_lpage_level, - - .cpuid_update = svm_cpuid_update, - - .rdtscp_supported = svm_rdtscp_supported, - - .set_supported_cpuid = svm_set_supported_cpuid, - - .has_wbinvd_exit = svm_has_wbinvd_exit, - - .set_tsc_khz = svm_set_tsc_khz, - .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, - .compute_tsc_offset = svm_compute_tsc_offset, - .read_l1_tsc = svm_read_l1_tsc, - - .set_tdp_cr3 = set_tdp_cr3, - - .check_intercept = svm_check_intercept, -}; - -static int __init svm_init(void) -{ - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); -} - -static void __exit svm_exit(void) -{ - kvm_exit(); -} - -module_init(svm_init) -module_exit(svm_exit) diff --git a/ANDROID_3.4.5/arch/x86/kvm/timer.c b/ANDROID_3.4.5/arch/x86/kvm/timer.c deleted file mode 100644 index 6b85cc64..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/timer.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. 
- * - * timer support - * - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/hrtimer.h> -#include <linux/atomic.h> -#include "kvm_timer.h" - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - struct kvm_vcpu *vcpu = ktimer->vcpu; - wait_queue_head_t *q = &vcpu->wq; - - /* - * There is a race window between reading and incrementing, but we do - * not care about potentially losing timer events in the !reinject - * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked - * in vcpu_enter_guest. - */ - if (ktimer->reinject || !atomic_read(&ktimer->pending)) { - atomic_inc(&ktimer->pending); - /* FIXME: this code should not know anything about vcpus */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - } - - if (waitqueue_active(q)) - wake_up_interruptible(q); - - if (ktimer->t_ops->is_periodic(ktimer)) { - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - return HRTIMER_RESTART; - } else - return HRTIMER_NORESTART; -} diff --git a/ANDROID_3.4.5/arch/x86/kvm/trace.h b/ANDROID_3.4.5/arch/x86/kvm/trace.h deleted file mode 100644 index 911d2641..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/trace.h +++ /dev/null @@ -1,830 +0,0 @@ -#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KVM_H - -#include <linux/tracepoint.h> -#include <asm/vmx.h> -#include <asm/svm.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* - * Tracepoint for guest mode entry. - */ -TRACE_EVENT(kvm_entry, - TP_PROTO(unsigned int vcpu_id), - TP_ARGS(vcpu_id), - - TP_STRUCT__entry( - __field( unsigned int, vcpu_id ) - ), - - TP_fast_assign( - __entry->vcpu_id = vcpu_id; - ), - - TP_printk("vcpu %u", __entry->vcpu_id) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hypercall, - TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3), - TP_ARGS(nr, a0, a1, a2, a3), - - TP_STRUCT__entry( - __field( unsigned long, nr ) - __field( unsigned long, a0 ) - __field( unsigned long, a1 ) - __field( unsigned long, a2 ) - __field( unsigned long, a3 ) - ), - - TP_fast_assign( - __entry->nr = nr; - __entry->a0 = a0; - __entry->a1 = a1; - __entry->a2 = a2; - __entry->a3 = a3; - ), - - TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", - __entry->nr, __entry->a0, __entry->a1, __entry->a2, - __entry->a3) -); - -/* - * Tracepoint for hypercall. - */ -TRACE_EVENT(kvm_hv_hypercall, - TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx, - __u64 ingpa, __u64 outgpa), - TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), - - TP_STRUCT__entry( - __field( __u16, rep_cnt ) - __field( __u16, rep_idx ) - __field( __u64, ingpa ) - __field( __u64, outgpa ) - __field( __u16, code ) - __field( bool, fast ) - ), - - TP_fast_assign( - __entry->rep_cnt = rep_cnt; - __entry->rep_idx = rep_idx; - __entry->ingpa = ingpa; - __entry->outgpa = outgpa; - __entry->code = code; - __entry->fast = fast; - ), - - TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", - __entry->code, __entry->fast ? "fast" : "slow", - __entry->rep_cnt, __entry->rep_idx, __entry->ingpa, - __entry->outgpa) -); - -/* - * Tracepoint for PIO. 
- */ -TRACE_EVENT(kvm_pio, - TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count), - TP_ARGS(rw, port, size, count), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, port ) - __field( unsigned int, size ) - __field( unsigned int, count ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->port = port; - __entry->size = size; - __entry->count = count; - ), - - TP_printk("pio_%s at 0x%x size %d count %d", - __entry->rw ? "write" : "read", - __entry->port, __entry->size, __entry->count) -); - -/* - * Tracepoint for cpuid. - */ -TRACE_EVENT(kvm_cpuid, - TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, - unsigned long rcx, unsigned long rdx), - TP_ARGS(function, rax, rbx, rcx, rdx), - - TP_STRUCT__entry( - __field( unsigned int, function ) - __field( unsigned long, rax ) - __field( unsigned long, rbx ) - __field( unsigned long, rcx ) - __field( unsigned long, rdx ) - ), - - TP_fast_assign( - __entry->function = function; - __entry->rax = rax; - __entry->rbx = rbx; - __entry->rcx = rcx; - __entry->rdx = rdx; - ), - - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", - __entry->function, __entry->rax, - __entry->rbx, __entry->rcx, __entry->rdx) -); - -#define AREG(x) { APIC_##x, "APIC_" #x } - -#define kvm_trace_symbol_apic \ - AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ - AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ - AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ - AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ - AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ - AREG(ECTRL) -/* - * Tracepoint for apic access. - */ -TRACE_EVENT(kvm_apic, - TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), - TP_ARGS(rw, reg, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, reg ) - __field( unsigned int, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->reg = reg; - __entry->val = val; - ), - - TP_printk("apic_%s %s = 0x%x", - __entry->rw ? 
"write" : "read", - __print_symbolic(__entry->reg, kvm_trace_symbol_apic), - __entry->val) -); - -#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) -#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) - -#define KVM_ISA_VMX 1 -#define KVM_ISA_SVM 2 - -#define VMX_EXIT_REASONS \ - { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ - { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ - { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ - { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ - { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ - { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ - { EXIT_REASON_CPUID, "CPUID" }, \ - { EXIT_REASON_HLT, "HLT" }, \ - { EXIT_REASON_INVLPG, "INVLPG" }, \ - { EXIT_REASON_RDPMC, "RDPMC" }, \ - { EXIT_REASON_RDTSC, "RDTSC" }, \ - { EXIT_REASON_VMCALL, "VMCALL" }, \ - { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ - { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ - { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ - { EXIT_REASON_VMPTRST, "VMPTRST" }, \ - { EXIT_REASON_VMREAD, "VMREAD" }, \ - { EXIT_REASON_VMRESUME, "VMRESUME" }, \ - { EXIT_REASON_VMWRITE, "VMWRITE" }, \ - { EXIT_REASON_VMOFF, "VMOFF" }, \ - { EXIT_REASON_VMON, "VMON" }, \ - { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ - { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ - { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ - { EXIT_REASON_MSR_READ, "MSR_READ" }, \ - { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ - { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ - { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ - { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ - { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ - { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ - { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ - { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ - { EXIT_REASON_WBINVD, "WBINVD" } - -#define SVM_EXIT_REASONS \ - { SVM_EXIT_READ_CR0, "read_cr0" }, \ - { SVM_EXIT_READ_CR3, "read_cr3" }, \ - { SVM_EXIT_READ_CR4, "read_cr4" }, \ - { SVM_EXIT_READ_CR8, "read_cr8" }, \ - { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ - { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ - { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ - { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ - { SVM_EXIT_READ_DR0, "read_dr0" }, \ - { SVM_EXIT_READ_DR1, "read_dr1" }, \ - { SVM_EXIT_READ_DR2, "read_dr2" }, \ - { SVM_EXIT_READ_DR3, "read_dr3" }, \ - { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ - { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ - { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ - { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ - { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ - { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ - { SVM_EXIT_INTR, "interrupt" }, \ - { SVM_EXIT_NMI, "nmi" }, \ - { SVM_EXIT_SMI, "smi" }, \ - { SVM_EXIT_INIT, "init" }, \ - { SVM_EXIT_VINTR, "vintr" }, \ - { SVM_EXIT_CPUID, "cpuid" }, \ - { SVM_EXIT_INVD, "invd" }, \ - { SVM_EXIT_HLT, "hlt" }, \ - { SVM_EXIT_INVLPG, "invlpg" }, \ - { SVM_EXIT_INVLPGA, "invlpga" }, \ - { SVM_EXIT_IOIO, "io" }, \ - { SVM_EXIT_MSR, "msr" }, \ - { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ - { SVM_EXIT_SHUTDOWN, "shutdown" }, \ - { SVM_EXIT_VMRUN, "vmrun" }, \ - { SVM_EXIT_VMMCALL, "hypercall" }, \ - { SVM_EXIT_VMLOAD, "vmload" }, \ - { 
SVM_EXIT_VMSAVE, "vmsave" }, \ - { SVM_EXIT_STGI, "stgi" }, \ - { SVM_EXIT_CLGI, "clgi" }, \ - { SVM_EXIT_SKINIT, "skinit" }, \ - { SVM_EXIT_WBINVD, "wbinvd" }, \ - { SVM_EXIT_MONITOR, "monitor" }, \ - { SVM_EXIT_MWAIT, "mwait" }, \ - { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } - -/* - * Tracepoint for kvm guest exit: - */ -TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), - TP_ARGS(exit_reason, vcpu, isa), - - TP_STRUCT__entry( - __field( unsigned int, exit_reason ) - __field( unsigned long, guest_rip ) - __field( u32, isa ) - __field( u64, info1 ) - __field( u64, info2 ) - ), - - TP_fast_assign( - __entry->exit_reason = exit_reason; - __entry->guest_rip = kvm_rip_read(vcpu); - __entry->isa = isa; - kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, - &__entry->info2); - ), - - TP_printk("reason %s rip 0x%lx info %llx %llx", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) -); - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), - - TP_STRUCT__entry( - __field( unsigned int, irq ) - ), - - TP_fast_assign( - __entry->irq = irq; - ), - - TP_printk("irq %u", __entry->irq) -); - -#define EXS(x) { x##_VECTOR, "#" #x } - -#define kvm_trace_sym_exc \ - EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) - -/* - * Tracepoint for kvm interrupt injection: - */ -TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), - - TP_STRUCT__entry( - __field( u8, exception ) - __field( u8, has_error ) - __field( u32, error_code ) - ), - - TP_fast_assign( - __entry->exception = exception; - __entry->has_error = has_error; - __entry->error_code = error_code; - ), - - TP_printk("%s (0x%x)", - __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) -); - -/* - * Tracepoint for page fault. - */ -TRACE_EVENT(kvm_page_fault, - TP_PROTO(unsigned long fault_address, unsigned int error_code), - TP_ARGS(fault_address, error_code), - - TP_STRUCT__entry( - __field( unsigned long, fault_address ) - __field( unsigned int, error_code ) - ), - - TP_fast_assign( - __entry->fault_address = fault_address; - __entry->error_code = error_code; - ), - - TP_printk("address %lx error_code %x", - __entry->fault_address, __entry->error_code) -); - -/* - * Tracepoint for guest MSR access. - */ -TRACE_EVENT(kvm_msr, - TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception), - TP_ARGS(write, ecx, data, exception), - - TP_STRUCT__entry( - __field( unsigned, write ) - __field( u32, ecx ) - __field( u64, data ) - __field( u8, exception ) - ), - - TP_fast_assign( - __entry->write = write; - __entry->ecx = ecx; - __entry->data = data; - __entry->exception = exception; - ), - - TP_printk("msr_%s %x = 0x%llx%s", - __entry->write ? "write" : "read", - __entry->ecx, __entry->data, - __entry->exception ? 
" (#GP)" : "") -); - -#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false) -#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false) -#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) -#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) - -/* - * Tracepoint for guest CR access. - */ -TRACE_EVENT(kvm_cr, - TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), - TP_ARGS(rw, cr, val), - - TP_STRUCT__entry( - __field( unsigned int, rw ) - __field( unsigned int, cr ) - __field( unsigned long, val ) - ), - - TP_fast_assign( - __entry->rw = rw; - __entry->cr = cr; - __entry->val = val; - ), - - TP_printk("cr_%s %x = 0x%lx", - __entry->rw ? "write" : "read", - __entry->cr, __entry->val) -); - -#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) -#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) - -TRACE_EVENT(kvm_pic_set_irq, - TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), - TP_ARGS(chip, pin, elcr, imr, coalesced), - - TP_STRUCT__entry( - __field( __u8, chip ) - __field( __u8, pin ) - __field( __u8, elcr ) - __field( __u8, imr ) - __field( bool, coalesced ) - ), - - TP_fast_assign( - __entry->chip = chip; - __entry->pin = pin; - __entry->elcr = elcr; - __entry->imr = imr; - __entry->coalesced = coalesced; - ), - - TP_printk("chip %u pin %u (%s%s)%s", - __entry->chip, __entry->pin, - (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", - (__entry->imr & (1 << __entry->pin)) ? "|masked":"", - __entry->coalesced ? " (coalesced)" : "") -); - -#define kvm_apic_dst_shorthand \ - {0x0, "dst"}, \ - {0x1, "self"}, \ - {0x2, "all"}, \ - {0x3, "all-but-self"} - -TRACE_EVENT(kvm_apic_ipi, - TP_PROTO(__u32 icr_low, __u32 dest_id), - TP_ARGS(icr_low, dest_id), - - TP_STRUCT__entry( - __field( __u32, icr_low ) - __field( __u32, dest_id ) - ), - - TP_fast_assign( - __entry->icr_low = icr_low; - __entry->dest_id = dest_id; - ), - - TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", - __entry->dest_id, (u8)__entry->icr_low, - __print_symbolic((__entry->icr_low >> 8 & 0x7), - kvm_deliver_mode), - (__entry->icr_low & (1<<11)) ? "logical" : "physical", - (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", - (__entry->icr_low & (1<<15)) ? "level" : "edge", - __print_symbolic((__entry->icr_low >> 18 & 0x3), - kvm_apic_dst_shorthand)) -); - -TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), - TP_ARGS(apicid, dm, tm, vec, coalesced), - - TP_STRUCT__entry( - __field( __u32, apicid ) - __field( __u16, dm ) - __field( __u8, tm ) - __field( __u8, vec ) - __field( bool, coalesced ) - ), - - TP_fast_assign( - __entry->apicid = apicid; - __entry->dm = dm; - __entry->tm = tm; - __entry->vec = vec; - __entry->coalesced = coalesced; - ), - - TP_printk("apicid %x vec %u (%s|%s)%s", - __entry->apicid, __entry->vec, - __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), - __entry->tm ? "level" : "edge", - __entry->coalesced ? 
" (coalesced)" : "") -); - -/* - * Tracepoint for nested VMRUN - */ -TRACE_EVENT(kvm_nested_vmrun, - TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl, - __u32 event_inj, bool npt), - TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u64, vmcb ) - __field( __u64, nested_rip ) - __field( __u32, int_ctl ) - __field( __u32, event_inj ) - __field( bool, npt ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->vmcb = vmcb; - __entry->nested_rip = nested_rip; - __entry->int_ctl = int_ctl; - __entry->event_inj = event_inj; - __entry->npt = npt; - ), - - TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s", - __entry->rip, __entry->vmcb, __entry->nested_rip, - __entry->int_ctl, __entry->event_inj, - __entry->npt ? "on" : "off") -); - -TRACE_EVENT(kvm_nested_intercepts, - TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), - TP_ARGS(cr_read, cr_write, exceptions, intercept), - - TP_STRUCT__entry( - __field( __u16, cr_read ) - __field( __u16, cr_write ) - __field( __u32, exceptions ) - __field( __u64, intercept ) - ), - - TP_fast_assign( - __entry->cr_read = cr_read; - __entry->cr_write = cr_write; - __entry->exceptions = exceptions; - __entry->intercept = intercept; - ), - - TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", - __entry->cr_read, __entry->cr_write, __entry->exceptions, - __entry->intercept) -); -/* - * Tracepoint for #VMEXIT while nested - */ -TRACE_EVENT(kvm_nested_vmexit, - TP_PROTO(__u64 rip, __u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for #VMEXIT reinjected to the guest - */ -TRACE_EVENT(kvm_nested_vmexit_inject, - TP_PROTO(__u32 exit_code, - __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), - TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err, isa), - - TP_STRUCT__entry( - __field( __u32, exit_code ) - __field( __u64, exit_info1 ) - __field( __u64, exit_info2 ) - __field( __u32, exit_int_info ) - __field( __u32, exit_int_info_err ) - __field( __u32, isa ) - ), - - TP_fast_assign( - __entry->exit_code = exit_code; - __entry->exit_info1 = exit_info1; - __entry->exit_info2 = exit_info2; - __entry->exit_int_info = exit_int_info; - __entry->exit_int_info_err = exit_int_info_err; - __entry->isa = isa; - ), - - TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_nested_intr_vmexit, - TP_PROTO(__u64 rip), - TP_ARGS(rip), - - TP_STRUCT__entry( - __field( __u64, rip ) - ), - - TP_fast_assign( - __entry->rip = rip - ), - - TP_printk("rip: 0x%016llx", __entry->rip) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), - TP_ARGS(rip, asid, address), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->asid = asid; - __entry->address = address; - ), - - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", - __entry->rip, __entry->asid, __entry->address) -); - -/* - * Tracepoint for nested #vmexit because of interrupt pending - */ -TRACE_EVENT(kvm_skinit, - TP_PROTO(__u64 rip, __u32 slb), - TP_ARGS(rip, slb), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, slb ) - ), - - TP_fast_assign( - __entry->rip = rip; - __entry->slb = slb; - ), - - TP_printk("rip: 0x%016llx slb: 0x%08x", - __entry->rip, __entry->slb) -); - -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - -#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) -#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) -#define KVM_EMUL_INSN_F_CS_D (1 << 2) -#define KVM_EMUL_INSN_F_CS_L (1 << 3) - -#define kvm_trace_symbol_emul_flags \ - { 0, "real" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ - { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ - { KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L, "prot64" } - -#define kei_decode_mode(mode) ({ \ - u8 flags = 0xff; \ - switch (mode) { \ - case X86EMUL_MODE_REAL: \ - flags = 0; \ - break; \ - case X86EMUL_MODE_VM86: \ - flags = KVM_EMUL_INSN_F_EFL_VM; \ - break; \ - case X86EMUL_MODE_PROT16: \ - flags = KVM_EMUL_INSN_F_CR0_PE; \ - break; \ - case X86EMUL_MODE_PROT32: \ - flags = 
KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_D; \ - break; \ - case X86EMUL_MODE_PROT64: \ - flags = KVM_EMUL_INSN_F_CR0_PE \ - | KVM_EMUL_INSN_F_CS_L; \ - break; \ - } \ - flags; \ - }) - -TRACE_EVENT(kvm_emulate_insn, - TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), - TP_ARGS(vcpu, failed), - - TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) - ), - - TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; - __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt._eip - - vcpu->arch.emulate_ctxt.fetch.start; - memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.fetch.data, - 15); - __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); - __entry->failed = failed; - ), - - TP_printk("%x:%llx:%s (%s)%s", - __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), - __print_symbolic(__entry->flags, - kvm_trace_symbol_emul_flags), - __entry->failed ? " failed" : "" - ) - ); - -#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) -#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) - -TRACE_EVENT( - vcpu_match_mmio, - TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match), - TP_ARGS(gva, gpa, write, gpa_match), - - TP_STRUCT__entry( - __field(gva_t, gva) - __field(gpa_t, gpa) - __field(bool, write) - __field(bool, gpa_match) - ), - - TP_fast_assign( - __entry->gva = gva; - __entry->gpa = gpa; - __entry->write = write; - __entry->gpa_match = gpa_match - ), - - TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa, - __entry->write ? "Write" : "Read", - __entry->gpa_match ? "GPA" : "GVA") -); -#endif /* _TRACE_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH arch/x86/kvm -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/ANDROID_3.4.5/arch/x86/kvm/tss.h b/ANDROID_3.4.5/arch/x86/kvm/tss.h deleted file mode 100644 index 622aa10f..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/tss.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef __TSS_SEGMENT_H -#define __TSS_SEGMENT_H - -struct tss_segment_32 { - u32 prev_task_link; - u32 esp0; - u32 ss0; - u32 esp1; - u32 ss1; - u32 esp2; - u32 ss2; - u32 cr3; - u32 eip; - u32 eflags; - u32 eax; - u32 ecx; - u32 edx; - u32 ebx; - u32 esp; - u32 ebp; - u32 esi; - u32 edi; - u32 es; - u32 cs; - u32 ss; - u32 ds; - u32 fs; - u32 gs; - u32 ldt_selector; - u16 t; - u16 io_map; -}; - -struct tss_segment_16 { - u16 prev_task_link; - u16 sp0; - u16 ss0; - u16 sp1; - u16 ss1; - u16 sp2; - u16 ss2; - u16 ip; - u16 flag; - u16 ax; - u16 cx; - u16 dx; - u16 bx; - u16 sp; - u16 bp; - u16 si; - u16 di; - u16 es; - u16 cs; - u16 ss; - u16 ds; - u16 ldt; -}; - -#endif diff --git a/ANDROID_3.4.5/arch/x86/kvm/vmx.c b/ANDROID_3.4.5/arch/x86/kvm/vmx.c deleted file mode 100644 index 4ff0ab9b..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/vmx.c +++ /dev/null @@ -1,7272 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * This module enables machines with Intel VT-x extensions to run virtual - * machines without emulation or binary translation. - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
- * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include "irq.h" -#include "mmu.h" -#include "cpuid.h" - -#include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/moduleparam.h> -#include <linux/ftrace_event.h> -#include <linux/slab.h> -#include <linux/tboot.h> -#include "kvm_cache_regs.h" -#include "x86.h" - -#include <asm/io.h> -#include <asm/desc.h> -#include <asm/vmx.h> -#include <asm/virtext.h> -#include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> -#include <asm/perf_event.h> - -#include "trace.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) - -MODULE_AUTHOR("Qumranet"); -MODULE_LICENSE("GPL"); - -static bool __read_mostly enable_vpid = 1; -module_param_named(vpid, enable_vpid, bool, 0444); - -static bool __read_mostly flexpriority_enabled = 1; -module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); - -static bool __read_mostly enable_ept = 1; -module_param_named(ept, enable_ept, bool, S_IRUGO); - -static bool __read_mostly enable_unrestricted_guest = 1; -module_param_named(unrestricted_guest, - enable_unrestricted_guest, bool, S_IRUGO); - -static bool __read_mostly emulate_invalid_guest_state = 0; -module_param(emulate_invalid_guest_state, bool, S_IRUGO); - -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - -static bool __read_mostly fasteoi = 1; -module_param(fasteoi, bool, S_IRUGO); - -/* - * If nested=1, nested virtualization is supported, i.e., guests may use - * VMX and be a hypervisor for its own guests. If nested=0, guests may not - * use VMX instructions. - */ -static bool __read_mostly nested = 0; -module_param(nested, bool, S_IRUGO); - -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE) -#define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_CR4_GUEST_OWNED_BITS \ - (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT) - -#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) -#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) - -#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) - -/* - * These 2 parameters are used to config the controls for Pause-Loop Exiting: - * ple_gap: upper bound on the amount of time between two successive - * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually smaller than 128 cycles. - * ple_window: upper bound on the amount of time a guest is allowed to execute - * in a PAUSE loop. Tests indicate that most spinlocks are held for - * less than 2^12 cycles - * Time is measured based on a counter that runs at the same rate as the TSC, - * refer SDM volume 3b section 21.6.13 & 22.1.3. 
- */ -#define KVM_VMX_DEFAULT_PLE_GAP 128 -#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 -static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; -module_param(ple_gap, int, S_IRUGO); - -static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; -module_param(ple_window, int, S_IRUGO); - -#define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 - -struct vmcs { - u32 revision_id; - u32 abort; - char data[0]; -}; - -/* - * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also - * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs - * loaded on this CPU (so we can clear them if the CPU goes down). - */ -struct loaded_vmcs { - struct vmcs *vmcs; - int cpu; - int launched; - struct list_head loaded_vmcss_on_cpu_link; -}; - -struct shared_msr_entry { - unsigned index; - u64 data; - u64 mask; -}; - -/* - * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a - * single nested guest (L2), hence the name vmcs12. Any VMX implementation has - * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is - * stored in guest memory specified by VMPTRLD, but is opaque to the guest, - * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. - * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the - * underlying hardware which will be used to run L2. - * This structure is packed to ensure that its layout is identical across - * machines (necessary for live migration). - * If there are changes in this struct, VMCS12_REVISION must be changed. - */ -typedef u64 natural_width; -struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with the - * following two fields. Then follow implementation-specific data. - */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 ept_pointer; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_ia32_perf_global_ctrl; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 host_ia32_perf_global_ctrl; - u64 padding64[8]; /* room for future expansion */ - /* - * To allow migration of L1 (complete with its L2 guests) between - * machines of different natural widths (32 or 64 bit), we cannot have - * unsigned long fields with no explict size. We use u64 (aliased - * natural_width) instead. Luckily, x86 is little-endian. 
- */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ - u16 virtual_processor_id; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; -}; - -/* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. - */ -#define VMCS12_REVISION 0x11e57ed0 - -/* - * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region - * and any VMCS region. 
Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. - */ -#define VMCS12_SIZE 0x1000 - -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - -/* - * The nested_vmx structure is part of vcpu_vmx, and holds information we need - * for correct emulation of VMX (i.e., nested VMX) on this vcpu. - */ -struct nested_vmx { - /* Has the level1 guest done vmxon? */ - bool vmxon; - - /* The guest-physical address of the current VMCS L1 keeps for L2 */ - gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; - - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; - u64 vmcs01_tsc_offset; - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. - */ - struct page *apic_access_page; -}; - -struct vcpu_vmx { - struct kvm_vcpu vcpu; - unsigned long host_rsp; - u8 fail; - u8 cpl; - bool nmi_known_unmasked; - u32 exit_intr_info; - u32 idt_vectoring_info; - ulong rflags; - struct shared_msr_entry *guest_msrs; - int nmsrs; - int save_nmsrs; -#ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; - u64 msr_guest_kernel_gs_base; -#endif - /* - * loaded_vmcs points to the VMCS currently used in this vcpu. For a - * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. - */ - struct loaded_vmcs vmcs01; - struct loaded_vmcs *loaded_vmcs; - bool __launched; /* temporary, used in vmx_vcpu_run */ - struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; - } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; - int gs_ldt_reload_needed; - int fs_reload_needed; - } host_state; - struct { - int vm86_active; - ulong save_rflags; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; - struct { - u32 bitmask; /* 4 bits per segment (1 bit per field) */ - struct kvm_save_segment seg[8]; - } segment_cache; - int vpid; - bool emulation_required; - - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; - u32 exit_reason; - - bool rdtscp_enabled; - - /* Support for a guest hypervisor (nested VMX) */ - struct nested_vmx nested; -}; - -enum segment_cache_field { - SEG_FIELD_SEL = 0, - SEG_FIELD_BASE = 1, - SEG_FIELD_LIMIT = 2, - SEG_FIELD_AR = 3, - - SEG_FIELD_NR = 4 -}; - -static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -{ - return container_of(vcpu, struct vcpu_vmx, vcpu); -} - -#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [number] = VMCS12_OFFSET(name) -#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ - [number##_HIGH] = VMCS12_OFFSET(name)+4 - -static unsigned short vmcs_field_to_offset_table[] = { - FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id), - FIELD(GUEST_ES_SELECTOR, guest_es_selector), - FIELD(GUEST_CS_SELECTOR, guest_cs_selector), - FIELD(GUEST_SS_SELECTOR, guest_ss_selector), - FIELD(GUEST_DS_SELECTOR, guest_ds_selector), - FIELD(GUEST_FS_SELECTOR, guest_fs_selector), - FIELD(GUEST_GS_SELECTOR, 
guest_gs_selector), - FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector), - FIELD(GUEST_TR_SELECTOR, guest_tr_selector), - FIELD(HOST_ES_SELECTOR, host_es_selector), - FIELD(HOST_CS_SELECTOR, host_cs_selector), - FIELD(HOST_SS_SELECTOR, host_ss_selector), - FIELD(HOST_DS_SELECTOR, host_ds_selector), - FIELD(HOST_FS_SELECTOR, host_fs_selector), - FIELD(HOST_GS_SELECTOR, host_gs_selector), - FIELD(HOST_TR_SELECTOR, host_tr_selector), - FIELD64(IO_BITMAP_A, io_bitmap_a), - FIELD64(IO_BITMAP_B, io_bitmap_b), - FIELD64(MSR_BITMAP, msr_bitmap), - FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr), - FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr), - FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), - FIELD64(TSC_OFFSET, tsc_offset), - FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), - FIELD64(APIC_ACCESS_ADDR, apic_access_addr), - FIELD64(EPT_POINTER, ept_pointer), - FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), - FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), - FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl), - FIELD64(GUEST_IA32_PAT, guest_ia32_pat), - FIELD64(GUEST_IA32_EFER, guest_ia32_efer), - FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl), - FIELD64(GUEST_PDPTR0, guest_pdptr0), - FIELD64(GUEST_PDPTR1, guest_pdptr1), - FIELD64(GUEST_PDPTR2, guest_pdptr2), - FIELD64(GUEST_PDPTR3, guest_pdptr3), - FIELD64(HOST_IA32_PAT, host_ia32_pat), - FIELD64(HOST_IA32_EFER, host_ia32_efer), - FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl), - FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control), - FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control), - FIELD(EXCEPTION_BITMAP, exception_bitmap), - FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask), - FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match), - FIELD(CR3_TARGET_COUNT, cr3_target_count), - FIELD(VM_EXIT_CONTROLS, vm_exit_controls), - FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count), - FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count), - FIELD(VM_ENTRY_CONTROLS, vm_entry_controls), - FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count), - FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field), - FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code), - FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len), - FIELD(TPR_THRESHOLD, tpr_threshold), - FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control), - FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error), - FIELD(VM_EXIT_REASON, vm_exit_reason), - FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info), - FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code), - FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field), - FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code), - FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len), - FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info), - FIELD(GUEST_ES_LIMIT, guest_es_limit), - FIELD(GUEST_CS_LIMIT, guest_cs_limit), - FIELD(GUEST_SS_LIMIT, guest_ss_limit), - FIELD(GUEST_DS_LIMIT, guest_ds_limit), - FIELD(GUEST_FS_LIMIT, guest_fs_limit), - FIELD(GUEST_GS_LIMIT, guest_gs_limit), - FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit), - FIELD(GUEST_TR_LIMIT, guest_tr_limit), - FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit), - FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit), - FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes), - FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes), - FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes), - FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes), - FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes), - FIELD(GUEST_GS_AR_BYTES, 
guest_gs_ar_bytes), - FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes), - FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes), - FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info), - FIELD(GUEST_ACTIVITY_STATE, guest_activity_state), - FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs), - FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs), - FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask), - FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask), - FIELD(CR0_READ_SHADOW, cr0_read_shadow), - FIELD(CR4_READ_SHADOW, cr4_read_shadow), - FIELD(CR3_TARGET_VALUE0, cr3_target_value0), - FIELD(CR3_TARGET_VALUE1, cr3_target_value1), - FIELD(CR3_TARGET_VALUE2, cr3_target_value2), - FIELD(CR3_TARGET_VALUE3, cr3_target_value3), - FIELD(EXIT_QUALIFICATION, exit_qualification), - FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address), - FIELD(GUEST_CR0, guest_cr0), - FIELD(GUEST_CR3, guest_cr3), - FIELD(GUEST_CR4, guest_cr4), - FIELD(GUEST_ES_BASE, guest_es_base), - FIELD(GUEST_CS_BASE, guest_cs_base), - FIELD(GUEST_SS_BASE, guest_ss_base), - FIELD(GUEST_DS_BASE, guest_ds_base), - FIELD(GUEST_FS_BASE, guest_fs_base), - FIELD(GUEST_GS_BASE, guest_gs_base), - FIELD(GUEST_LDTR_BASE, guest_ldtr_base), - FIELD(GUEST_TR_BASE, guest_tr_base), - FIELD(GUEST_GDTR_BASE, guest_gdtr_base), - FIELD(GUEST_IDTR_BASE, guest_idtr_base), - FIELD(GUEST_DR7, guest_dr7), - FIELD(GUEST_RSP, guest_rsp), - FIELD(GUEST_RIP, guest_rip), - FIELD(GUEST_RFLAGS, guest_rflags), - FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), - FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), - FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), - FIELD(HOST_CR0, host_cr0), - FIELD(HOST_CR3, host_cr3), - FIELD(HOST_CR4, host_cr4), - FIELD(HOST_FS_BASE, host_fs_base), - FIELD(HOST_GS_BASE, host_gs_base), - FIELD(HOST_TR_BASE, host_tr_base), - FIELD(HOST_GDTR_BASE, host_gdtr_base), - FIELD(HOST_IDTR_BASE, host_idtr_base), - FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp), - FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), - FIELD(HOST_RSP, host_rsp), - FIELD(HOST_RIP, host_rip), -}; -static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table); - -static inline short vmcs_field_to_offset(unsigned long field) -{ - if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0) - return -1; - return vmcs_field_to_offset_table[field]; -} - -static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->nested.current_vmcs12; -} - -static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) -{ - struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); - return NULL; - } - return page; -} - -static void nested_release_page(struct page *page) -{ - kvm_release_page_dirty(page); -} - -static void nested_release_page_clean(struct page *page) -{ - kvm_release_page_clean(page); -} - -static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); - -static DEFINE_PER_CPU(struct vmcs *, vmxarea); -static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -/* - * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed - * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
- */ -static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -static DEFINE_PER_CPU(struct desc_ptr, host_gdt); - -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long *vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; - -static bool cpu_has_load_ia32_efer; -static bool cpu_has_load_perf_global_ctrl; - -static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); -static DEFINE_SPINLOCK(vmx_vpid_lock); - -static struct vmcs_config { - int size; - int order; - u32 revision_id; - u32 pin_based_exec_ctrl; - u32 cpu_based_exec_ctrl; - u32 cpu_based_2nd_exec_ctrl; - u32 vmexit_ctrl; - u32 vmentry_ctrl; -} vmcs_config; - -static struct vmx_capability { - u32 ept; - u32 vpid; -} vmx_capability; - -#define VMX_SEGMENT_FIELD(seg) \ - [VCPU_SREG_##seg] = { \ - .selector = GUEST_##seg##_SELECTOR, \ - .base = GUEST_##seg##_BASE, \ - .limit = GUEST_##seg##_LIMIT, \ - .ar_bytes = GUEST_##seg##_AR_BYTES, \ - } - -static struct kvm_vmx_segment_field { - unsigned selector; - unsigned base; - unsigned limit; - unsigned ar_bytes; -} kvm_vmx_segment_fields[] = { - VMX_SEGMENT_FIELD(CS), - VMX_SEGMENT_FIELD(DS), - VMX_SEGMENT_FIELD(ES), - VMX_SEGMENT_FIELD(FS), - VMX_SEGMENT_FIELD(GS), - VMX_SEGMENT_FIELD(SS), - VMX_SEGMENT_FIELD(TR), - VMX_SEGMENT_FIELD(LDTR), -}; - -static u64 host_efer; - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu); - -/* - * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it - * away by decrementing the array size. - */ -static const u32 vmx_msr_index[] = { -#ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, -#endif - MSR_EFER, MSR_TSC_AUX, MSR_STAR, -}; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) - -static inline bool is_page_fault(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline bool is_no_device(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline bool is_invalid_opcode(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline bool is_external_interrupt(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); -} - -static inline bool is_machine_check(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); -} - -static inline bool cpu_has_vmx_msr_bitmap(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; -} - -static inline bool cpu_has_vmx_tpr_shadow(void) -{ - return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; -} - -static inline bool vm_need_tpr_shadow(struct kvm *kvm) -{ - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); -} - -static inline bool cpu_has_secondary_exec_ctrls(void) -{ - return vmcs_config.cpu_based_exec_ctrl & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; -} - -static inline bool cpu_has_vmx_virtualize_apic_accesses(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; -} - -static inline bool 
cpu_has_vmx_flexpriority(void) -{ - return cpu_has_vmx_tpr_shadow() && - cpu_has_vmx_virtualize_apic_accesses(); -} - -static inline bool cpu_has_vmx_ept_execute_only(void) -{ - return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; -} - -static inline bool cpu_has_vmx_eptp_uncacheable(void) -{ - return vmx_capability.ept & VMX_EPTP_UC_BIT; -} - -static inline bool cpu_has_vmx_eptp_writeback(void) -{ - return vmx_capability.ept & VMX_EPTP_WB_BIT; -} - -static inline bool cpu_has_vmx_ept_2m_page(void) -{ - return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_1g_page(void) -{ - return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; -} - -static inline bool cpu_has_vmx_ept_4levels(void) -{ - return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; -} - -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; -} - -static inline bool cpu_has_vmx_invept_context(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invept_global(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; -} - -static inline bool cpu_has_vmx_invvpid_single(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_invvpid_global(void) -{ - return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; -} - -static inline bool cpu_has_vmx_ept(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_EPT; -} - -static inline bool cpu_has_vmx_unrestricted_guest(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_UNRESTRICTED_GUEST; -} - -static inline bool cpu_has_vmx_ple(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_PAUSE_LOOP_EXITING; -} - -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) -{ - return flexpriority_enabled && irqchip_in_kernel(kvm); -} - -static inline bool cpu_has_vmx_vpid(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_ENABLE_VPID; -} - -static inline bool cpu_has_vmx_rdtscp(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_RDTSCP; -} - -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - -static inline bool cpu_has_vmx_wbinvd_exit(void) -{ - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_WBINVD_EXITING; -} - -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - -static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) -{ - return vmcs12->cpu_based_vm_exec_control & bit; -} - -static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit) -{ - return (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - (vmcs12->secondary_vm_exec_control & bit); -} - -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12, - struct kvm_vcpu *vcpu) -{ - return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS; -} - -static inline bool is_exception(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); -} - -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu); -static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification); - -static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - for (i = 0; i < vmx->nmsrs; ++i) - if 
(vmx_msr_index[vmx->guest_msrs[i].index] == msr) - return i; - return -1; -} - -static inline void __invvpid(int ext, u16 vpid, gva_t gva) -{ - struct { - u64 vpid : 16; - u64 rsvd : 48; - u64 gva; - } operand = { vpid, 0, gva }; - - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); -} - -static inline void __invept(int ext, u64 eptp, gpa_t gpa) -{ - struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); -} - -static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) -{ - int i; - - i = __find_msr_index(vmx, msr); - if (i >= 0) - return &vmx->guest_msrs[i]; - return NULL; -} - -static void vmcs_clear(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - u8 error; - - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); -} - -static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - -static void vmcs_load(struct vmcs *vmcs) -{ - u64 phys_addr = __pa(vmcs); - u8 error; - - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); -} - -static void __loaded_vmcs_clear(void *arg) -{ - struct loaded_vmcs *loaded_vmcs = arg; - int cpu = raw_smp_processor_id(); - - if (loaded_vmcs->cpu != cpu) - return; /* vcpu migration can race with cpu offline */ - if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) - per_cpu(current_vmcs, cpu) = NULL; - list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); - loaded_vmcs_init(loaded_vmcs); -} - -static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) -{ - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); -} - -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) -{ - if (vmx->vpid == 0) - return; - - if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); -} - -static inline void vpid_sync_vcpu_global(void) -{ - if (cpu_has_vmx_invvpid_global()) - __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); -} - -static inline void vpid_sync_context(struct vcpu_vmx *vmx) -{ - if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); - else - vpid_sync_vcpu_global(); -} - -static inline void ept_sync_global(void) -{ - if (cpu_has_vmx_invept_global()) - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); -} - -static inline void ept_sync_context(u64 eptp) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); - else - ept_sync_global(); - } -} - -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - -static __always_inline unsigned long vmcs_readl(unsigned long field) -{ - unsigned long value; - - asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") - : "=a"(value) : "d"(field) : "cc"); - return value; -} - -static __always_inline u16 vmcs_read16(unsigned long field) -{ - return 
vmcs_readl(field); -} - -static __always_inline u32 vmcs_read32(unsigned long field) -{ - return vmcs_readl(field); -} - -static __always_inline u64 vmcs_read64(unsigned long field) -{ -#ifdef CONFIG_X86_64 - return vmcs_readl(field); -#else - return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32); -#endif -} - -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} - -static void vmcs_writel(unsigned long field, unsigned long value) -{ - u8 error; - - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); - if (unlikely(error)) - vmwrite_error(field, value); -} - -static void vmcs_write16(unsigned long field, u16 value) -{ - vmcs_writel(field, value); -} - -static void vmcs_write32(unsigned long field, u32 value) -{ - vmcs_writel(field, value); -} - -static void vmcs_write64(unsigned long field, u64 value) -{ - vmcs_writel(field, value); -#ifndef CONFIG_X86_64 - asm volatile (""); - vmcs_writel(field+1, value >> 32); -#endif -} - -static void vmcs_clear_bits(unsigned long field, u32 mask) -{ - vmcs_writel(field, vmcs_readl(field) & ~mask); -} - -static void vmcs_set_bits(unsigned long field, u32 mask) -{ - vmcs_writel(field, vmcs_readl(field) | mask); -} - -static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} - -static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, - unsigned field) -{ - bool ret; - u32 mask = 1 << (seg * SEG_FIELD_NR + field); - - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); - vmx->segment_cache.bitmask = 0; - } - ret = vmx->segment_cache.bitmask & mask; - vmx->segment_cache.bitmask |= mask; - return ret; -} - -static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) -{ - u16 *p = &vmx->segment_cache.seg[seg].selector; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) - *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); - return *p; -} - -static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) -{ - ulong *p = &vmx->segment_cache.seg[seg].base; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) - *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); - return *p; -} - -static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].limit; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); - return *p; -} - -static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) -{ - u32 *p = &vmx->segment_cache.seg[seg].ar; - - if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) - *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); - return *p; -} - -static void update_exception_bitmap(struct kvm_vcpu *vcpu) -{ - u32 eb; - - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); - if ((vcpu->guest_debug & - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == - (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) - eb |= 1u << BP_VECTOR; - if (to_vmx(vcpu)->rmode.vm86_active) - eb = ~0; - if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ - if (vcpu->fpu_active) - eb &= ~(1u << NM_VECTOR); - - /* When we are running a nested L2 guest and L1 specified for it a - * certain 
exception bitmap, we must trap the same exceptions and pass - * them to L1. When running L2, we will only handle the exceptions - * specified above if L1 did not want them. - */ - if (is_guest_mode(vcpu)) - eb |= get_vmcs12(vcpu)->exception_bitmap; - - vmcs_write32(EXCEPTION_BITMAP, eb); -} - -static void clear_atomic_switch_msr_special(unsigned long entry, - unsigned long exit) -{ - vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); - vmcs_clear_bits(VM_EXIT_CONTROLS, exit); -} - -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) -{ - unsigned i; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - clear_atomic_switch_msr_special( - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - return; - } - break; - } - - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) - return; - --m->nr; - m->guest[i] = m->guest[m->nr]; - m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); -} - -static void add_atomic_switch_msr_special(unsigned long entry, - unsigned long exit, unsigned long guest_val_vmcs, - unsigned long host_val_vmcs, u64 guest_val, u64 host_val) -{ - vmcs_write64(guest_val_vmcs, guest_val); - vmcs_write64(host_val_vmcs, host_val); - vmcs_set_bits(VM_ENTRY_CONTROLS, entry); - vmcs_set_bits(VM_EXIT_CONTROLS, exit); -} - -static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val) -{ - unsigned i; - struct msr_autoload *m = &vmx->msr_autoload; - - switch (msr) { - case MSR_EFER: - if (cpu_has_load_ia32_efer) { - add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER, - VM_EXIT_LOAD_IA32_EFER, - GUEST_IA32_EFER, - HOST_IA32_EFER, - guest_val, host_val); - return; - } - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (cpu_has_load_perf_global_ctrl) { - add_atomic_switch_msr_special( - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, - GUEST_IA32_PERF_GLOBAL_CTRL, - HOST_IA32_PERF_GLOBAL_CTRL, - guest_val, host_val); - return; - } - break; - } - - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == NR_AUTOLOAD_MSRS) { - printk_once(KERN_WARNING"Not enough mst switch entries. " - "Can't add msr %x\n", msr); - return; - } else if (i == m->nr) { - ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); - } - - m->guest[i].index = msr; - m->guest[i].value = guest_val; - m->host[i].index = msr; - m->host[i].value = host_val; -} - -static void reload_tss(void) -{ - /* - * VT restores TR but not its size. Useless. 
- */ - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); - struct desc_struct *descs; - - descs = (void *)gdt->address; - descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ - load_TR_desc(); -} - -static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) -{ - u64 guest_efer; - u64 ignore_bits; - - guest_efer = vmx->vcpu.arch.efer; - - /* - * NX is emulated; LMA and LME handled by hardware; SCE meaninless - * outside long mode - */ - ignore_bits = EFER_NX | EFER_SCE; -#ifdef CONFIG_X86_64 - ignore_bits |= EFER_LMA | EFER_LME; - /* SCE is meaningful only in long mode on Intel */ - if (guest_efer & EFER_LMA) - ignore_bits &= ~(u64)EFER_SCE; -#endif - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; - - clear_atomic_switch_msr(vmx, MSR_EFER); - /* On ept, can't emulate nx, and must switch nx atomically */ - if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { - guest_efer = vmx->vcpu.arch.efer; - if (!(guest_efer & EFER_LMA)) - guest_efer &= ~EFER_LME; - add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); - return false; - } - - return true; -} - -static unsigned long segment_base(u16 selector) -{ - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); - struct desc_struct *d; - unsigned long table_base; - unsigned long v; - - if (!(selector & ~3)) - return 0; - - table_base = gdt->address; - - if (selector & 4) { /* from ldt */ - u16 ldt_selector = kvm_read_ldt(); - - if (!(ldt_selector & ~3)) - return 0; - - table_base = segment_base(ldt_selector); - } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif - return v; -} - -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - -static void vmx_save_host_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int i; - - if (vmx->host_state.loaded) - return; - - vmx->host_state.loaded = 1; - /* - * Set host fs and gs selectors. Unfortunately, 22.2.3 does not - * allow segment selectors with cpl > 0 or ti == 1. 
- */ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - savesegment(fs, vmx->host_state.fs_sel); - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } - savesegment(gs, vmx->host_state.gs_sel); - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } - -#ifdef CONFIG_X86_64 - vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); - vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif - -#ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#endif - for (i = 0; i < vmx->save_nmsrs; ++i) - kvm_set_shared_msr(vmx->guest_msrs[i].index, - vmx->guest_msrs[i].data, - vmx->guest_msrs[i].mask); -} - -static void __vmx_load_host_state(struct vcpu_vmx *vmx) -{ - if (!vmx->host_state.loaded) - return; - - ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); -#ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); -#else - loadsegment(gs, vmx->host_state.gs_sel); -#endif - } - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); - reload_tss(); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); -#endif - if (user_has_fpu()) - clts(); - load_gdt(&__get_cpu_var(host_gdt)); -} - -static void vmx_load_host_state(struct vcpu_vmx *vmx) -{ - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); -} - -/* - * Switches to specified vcpu, until a matching vcpu_put(), but assumes - * vcpu mutex is already taken. - */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (vmx->loaded_vmcs->cpu != cpu) - loaded_vmcs_clear(vmx->loaded_vmcs); - - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } - - if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = &__get_cpu_var(host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - local_irq_disable(); - list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, - &per_cpu(loaded_vmcss_on_cpu, cpu)); - local_irq_enable(); - - /* - * Linux uses per-cpu TSS and GDT, so set these when switching - * processors. 
- */ - vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ - - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - vmx->loaded_vmcs->cpu = cpu; - } -} - -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) -{ - __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } -} - -static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -{ - ulong cr0; - - if (vcpu->fpu_active) - return; - vcpu->fpu_active = 1; - cr0 = vmcs_readl(GUEST_CR0); - cr0 &= ~(X86_CR0_TS | X86_CR0_MP); - cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); - vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; - if (is_guest_mode(vcpu)) - vcpu->arch.cr0_guest_owned_bits &= - ~get_vmcs12(vcpu)->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); - -/* - * Return the cr0 value that a nested guest would read. This is a combination - * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by - * its hypervisor (cr0_read_shadow). - */ -static inline unsigned long nested_read_cr0(struct vmcs12 *fields) -{ - return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | - (fields->cr0_read_shadow & fields->cr0_guest_host_mask); -} -static inline unsigned long nested_read_cr4(struct vmcs12 *fields) -{ - return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | - (fields->cr4_read_shadow & fields->cr4_guest_host_mask); -} - -static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) -{ - /* Note that there is no vcpu->fpu_active = 0 here. The caller must - * set this *before* calling this function. - */ - vmx_decache_cr0_guest_bits(vcpu); - vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = 0; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (is_guest_mode(vcpu)) { - /* - * L1's specified read shadow might not contain the TS bit, - * so now that we turned on shadowing of this bit, we need to - * set this bit of the shadow. Like in nested_vmx_run we need - * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet - * up-to-date here because we just decached cr0.TS (and we'll - * only update vmcs12->guest_cr0 on nested exit). 
- */ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | - (vcpu->arch.cr0 & X86_CR0_TS); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - } else - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); -} - -static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags, save_rflags; - - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - } - to_vmx(vcpu)->rflags = rflags; - } - return to_vmx(vcpu)->rflags; -} - -static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; - rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - } - vmcs_writel(GUEST_RFLAGS, rflags); -} - -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) -{ - u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - int ret = 0; - - if (interruptibility & GUEST_INTR_STATE_STI) - ret |= KVM_X86_SHADOW_INT_STI; - if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= KVM_X86_SHADOW_INT_MOV_SS; - - return ret & mask; -} - -static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) -{ - u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - u32 interruptibility = interruptibility_old; - - interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - - if (mask & KVM_X86_SHADOW_INT_MOV_SS) - interruptibility |= GUEST_INTR_STATE_MOV_SS; - else if (mask & KVM_X86_SHADOW_INT_STI) - interruptibility |= GUEST_INTR_STATE_STI; - - if ((interruptibility != interruptibility_old)) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); -} - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) -{ - unsigned long rip; - - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); - - /* skipping an emulated instruction also counts */ - vmx_set_interrupt_shadow(vcpu, 0); -} - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - * This function assumes it is called with the exit reason in vmcs02 being - * a #PF exception (this is the only case in which KVM injects a #PF when L2 - * is running). - */ -static int nested_pf_handled(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. 
*/ - if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) - return 0; - - nested_vmx_vmexit(vcpu); - return 1; -} - -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 intr_info = nr | INTR_INFO_VALID_MASK; - - if (nr == PF_VECTOR && is_guest_mode(vcpu) && - nested_pf_handled(vcpu)) - return; - - if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); - intr_info |= INTR_INFO_DELIVER_CODE_MASK; - } - - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (kvm_exception_is_soft(nr)) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - - if (kvm_exception_is_soft(nr)) { - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - intr_info |= INTR_TYPE_SOFT_EXCEPTION; - } else - intr_info |= INTR_TYPE_HARD_EXCEPTION; - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); -} - -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -/* - * Swap MSR entry in host/guest MSR entry array. - */ -static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) -{ - struct shared_msr_entry tmp; - - tmp = vmx->guest_msrs[to]; - vmx->guest_msrs[to] = vmx->guest_msrs[from]; - vmx->guest_msrs[from] = tmp; -} - -/* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. - */ -static void setup_msrs(struct vcpu_vmx *vmx) -{ - int save_nmsrs, index; - unsigned long *msr_bitmap; - - save_nmsrs = 0; -#ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_LSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_CSTAR); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) - move_msr_up(vmx, index, save_nmsrs++); - /* - * MSR_STAR is only needed on long mode guests, and only - * if efer.sce is enabled. - */ - index = __find_msr_index(vmx, MSR_STAR); - if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) - move_msr_up(vmx, index, save_nmsrs++); - } -#endif - index = __find_msr_index(vmx, MSR_EFER); - if (index >= 0 && update_transition_efer(vmx, index)) - move_msr_up(vmx, index, save_nmsrs++); - - vmx->save_nmsrs = save_nmsrs; - - if (cpu_has_vmx_msr_bitmap()) { - if (is_long_mode(&vmx->vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); - } -} - -/* - * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 - */ -static u64 guest_read_tsc(void) -{ - u64 host_tsc, tsc_offset; - - rdtscll(host_tsc); - tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; -} - -/* - * Like guest_read_tsc, but always returns L1's notion of the timestamp - * counter, even if a nested guest (L2) is currently running. - */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) -{ - u64 host_tsc, tsc_offset; - - rdtscll(host_tsc); - tsc_offset = is_guest_mode(vcpu) ? 
- to_vmx(vcpu)->nested.vmcs01_tsc_offset : - vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; -} - -/* - * Engage any workarounds for mis-matched TSC rates. Currently limited to - * software catchup for faster rates on slower CPUs. - */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - if (!scale) - return; - - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); -} - -/* - * writes 'offset' into guest's timestamp counter offset register - */ -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) -{ - if (is_guest_mode(vcpu)) { - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - struct vmcs12 *vmcs12; - to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; - /* recalculate vmcs02.TSC_OFFSET: */ - vmcs12 = get_vmcs12(vcpu); - vmcs_write64(TSC_OFFSET, offset + - (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? - vmcs12->tsc_offset : 0)); - } else { - vmcs_write64(TSC_OFFSET, offset); - } -} - -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) -{ - u64 offset = vmcs_read64(TSC_OFFSET); - vmcs_write64(TSC_OFFSET, offset + adjustment); - if (is_guest_mode(vcpu)) { - /* Even when running L2, the adjustment needs to apply to L1 */ - to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; - } -} - -static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - return target_tsc - native_read_tsc(); -} - -static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); -} - -/* - * nested_vmx_allowed() checks whether a guest should be allowed to use VMX - * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for - * all guests if the "nested" module option is off, and can also be disabled - * for a single guest by disabling its VMX cpuid bit. - */ -static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) -{ - return nested && guest_cpuid_has_vmx(vcpu); -} - -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - * TODO: allow these variables to be modified (downgraded) by module options - * or other means. 
- */ -static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; -static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; -static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; -static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; -static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; -static __init void nested_vmx_setup_ctls_msrs(void) -{ - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_handled() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ - /* - * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is - * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. - */ - nested_vmx_pinbased_ctls_low = 0x16 ; - nested_vmx_pinbased_ctls_high = 0x16 | - PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | - PIN_BASED_VIRTUAL_NMIS; - - /* exit controls */ - nested_vmx_exit_ctls_low = 0; - /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */ -#ifdef CONFIG_X86_64 - nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE; -#else - nested_vmx_exit_ctls_high = 0; -#endif - - /* entry controls */ - rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - nested_vmx_entry_ctls_low = 0; - nested_vmx_entry_ctls_high &= - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE; - - /* cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = 0; - nested_vmx_procbased_ctls_high &= - CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | - CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | - CPU_BASED_RDPMC_EXITING | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - /* - * We can allow some features even when not supported by the - * hardware. For example, L1 can specify an MSR bitmap - and we - * can use it to avoid exits to L1 - even when L0 runs L2 - * without MSR bitmaps. - */ - nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; - - /* secondary cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); - nested_vmx_secondary_ctls_low = 0; - nested_vmx_secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; -} - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. 
- */ - return ((control & high) | low) == control; -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - -/* - * If we allow our guest to use VMX instructions (i.e., nested VMX), we should - * also let it use VMX-specific MSRs. - * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a - * VMX-specific MSR, or 0 when we haven't (and the caller should handle it - * like all other MSRs). - */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC && - msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) { - /* - * According to the spec, processors which do not support VMX - * should throw a #GP(0) when VMX capability MSRs are read. - */ - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - - switch (msr_index) { - case MSR_IA32_FEATURE_CONTROL: - *pdata = 0; - break; - case MSR_IA32_VMX_BASIC: - /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. - */ - *pdata = VMCS12_REVISION | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - break; - case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - case MSR_IA32_VMX_PINBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low, - nested_vmx_pinbased_ctls_high); - break; - case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - case MSR_IA32_VMX_PROCBASED_CTLS: - *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, - nested_vmx_procbased_ctls_high); - break; - case MSR_IA32_VMX_TRUE_EXIT_CTLS: - case MSR_IA32_VMX_EXIT_CTLS: - *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, - nested_vmx_exit_ctls_high); - break; - case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - case MSR_IA32_VMX_ENTRY_CTLS: - *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, - nested_vmx_entry_ctls_high); - break; - case MSR_IA32_VMX_MISC: - *pdata = 0; - break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). - * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE - case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; - break; - case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; - break; - case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; - break; - case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; - break; - case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x1f; - break; - case MSR_IA32_VMX_PROCBASED_CTLS2: - *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, - nested_vmx_secondary_ctls_high); - break; - case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested ept or nested vpid */ - *pdata = 0; - break; - default: - return 0; - } - - return 1; -} - -static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - if (!nested_vmx_allowed(vcpu)) - return 0; - - if (msr_index == MSR_IA32_FEATURE_CONTROL) - /* TODO: the right thing. */ - return 1; - /* - * No need to treat VMX capability MSRs specially: If we don't handle - * them, handle_wrmsr will #GP(0), which is correct (they are readonly) - */ - return 0; -} - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. 
- */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - u64 data; - struct shared_msr_entry *msr; - - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); - break; - case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); - break; - case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); - data = to_vmx(vcpu)->msr_guest_kernel_gs_base; - break; -#endif - case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); - case MSR_IA32_TSC: - data = guest_read_tsc(); - break; - case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); - break; - case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); - break; - case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); - break; - case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) - return 1; - /* Otherwise falls through */ - default: - if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) - return 0; - msr = find_msr_entry(to_vmx(vcpu), msr_index); - if (msr) { - data = msr->data; - break; - } - return kvm_get_msr_common(vcpu, msr_index, pdata); - } - - *pdata = data; - return 0; -} - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr; - int ret = 0; - - switch (msr_index) { - case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); - break; -#ifdef CONFIG_X86_64 - case MSR_FS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_FS_BASE, data); - break; - case MSR_GS_BASE: - vmx_segment_cache_clear(vmx); - vmcs_writel(GUEST_GS_BASE, data); - break; - case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; - break; -#endif - case MSR_IA32_SYSENTER_CS: - vmcs_write32(GUEST_SYSENTER_CS, data); - break; - case MSR_IA32_SYSENTER_EIP: - vmcs_writel(GUEST_SYSENTER_EIP, data); - break; - case MSR_IA32_SYSENTER_ESP: - vmcs_writel(GUEST_SYSENTER_ESP, data); - break; - case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); - break; - case MSR_IA32_CR_PAT: - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_index, data); - break; - case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) - return 1; - /* Check reserved bit, higher 32 bits should be zero */ - if ((data >> 32) != 0) - return 1; - /* Otherwise falls through */ - default: - if (vmx_set_vmx_msr(vcpu, msr_index, data)) - break; - msr = find_msr_entry(vmx, msr_index); - if (msr) { - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_index, data); - } - - return ret; -} - -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) -{ - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - switch (reg) { - case VCPU_REGS_RSP: - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - break; - case VCPU_REGS_RIP: - vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); - break; - case VCPU_EXREG_PDPTR: - if (enable_ept) - ept_save_pdptrs(vcpu); - break; - default: - break; - } -} - -static void set_guest_debug(struct 
kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); - else - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - - update_exception_bitmap(vcpu); -} - -static __init int cpu_has_kvm_support(void) -{ - return cpu_has_vmx(); -} - -static __init int vmx_disabled_by_bios(void) -{ - u64 msr; - - rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - if (msr & FEATURE_CONTROL_LOCKED) { - /* launched w/ TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && tboot_enabled()) - return 1; - /* launched w/o TXT and VMX only enabled w/ TXT */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) - && !tboot_enabled()) { - printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - "activate TXT before enabling KVM\n"); - return 1; - } - /* launched w/o TXT and VMX disabled */ - if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) - && !tboot_enabled()) - return 1; - } - - return 0; -} - -static void kvm_cpu_vmxon(u64 addr) -{ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); -} - -static int hardware_enable(void *garbage) -{ - int cpu = raw_smp_processor_id(); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old, test_bits; - - if (read_cr4() & X86_CR4_VMXE) - return -EBUSY; - - INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); - rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - - test_bits = FEATURE_CONTROL_LOCKED; - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (tboot_enabled()) - test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; - - if ((old & test_bits) != test_bits) { - /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); - } - write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } - - store_gdt(&__get_cpu_var(host_gdt)); - - return 0; -} - -static void vmclear_local_loaded_vmcss(void) -{ - int cpu = raw_smp_processor_id(); - struct loaded_vmcs *v, *n; - - list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) - __loaded_vmcs_clear(v); -} - - -/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() - * tricks. - */ -static void kvm_cpu_vmxoff(void) -{ - asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); -} - -static void hardware_disable(void *garbage) -{ - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - write_cr4(read_cr4() & ~X86_CR4_VMXE); -} - -static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, - u32 msr, u32 *result) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 ctl = ctl_min | ctl_opt; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - - ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ - ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ - - /* Ensure minimum (required) set of control bits are supported. 
*/ - if (ctl_min & ~ctl) - return -EIO; - - *result = ctl; - return 0; -} - -static __init bool allow_1_setting(u32 msr, u32 ctl) -{ - u32 vmx_msr_low, vmx_msr_high; - - rdmsr(msr, vmx_msr_low, vmx_msr_high); - return vmx_msr_high & ctl; -} - -static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) -{ - u32 vmx_msr_low, vmx_msr_high; - u32 min, opt, min2, opt2; - u32 _pin_based_exec_control = 0; - u32 _cpu_based_exec_control = 0; - u32 _cpu_based_2nd_exec_control = 0; - u32 _vmexit_control = 0; - u32 _vmentry_control = 0; - - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_USE_IO_BITMAPS | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING | - CPU_BASED_RDPMC_EXITING; - - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; -#ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min2 = 0; - opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_RDTSCP; - if (adjust_vmx_controls(min2, opt2, - MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) - return -EIO; - } -#ifndef CONFIG_X86_64 - if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; -#endif - if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses and invlpg don't need to cause VM Exits when EPT - enabled */ - _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, - vmx_capability.ept, vmx_capability.vpid); - } - - min = 0; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) - return -EIO; - - min = 0; - opt = VM_ENTRY_LOAD_IA32_PAT; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) - return -EIO; - - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); - - /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) - return -EIO; - -#ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) - return -EIO; -#endif - - /* Require Write-Back (WB) memory type for VMCS accesses. 
*/ - if (((vmx_msr_high >> 18) & 15) != 6) - return -EIO; - - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); - vmcs_conf->revision_id = vmx_msr_low; - - vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; - vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; - vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; - vmcs_conf->vmexit_ctrl = _vmexit_control; - vmcs_conf->vmentry_ctrl = _vmentry_control; - - cpu_has_load_ia32_efer = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_EFER) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_EFER); - - cpu_has_load_perf_global_ctrl = - allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) - && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); - - /* - * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to arrata below it can't be used. Workaround is to use - * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. - * - * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] - * - * AAK155 (model 26) - * AAP115 (model 30) - * AAT100 (model 37) - * BC86,AAY89,BD102 (model 44) - * BA97 (model 46) - * - */ - if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: - case 30: - case 37: - case 44: - case 46: - cpu_has_load_perf_global_ctrl = false; - printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - break; - default: - break; - } - } - - return 0; -} - -static struct vmcs *alloc_vmcs_cpu(int cpu) -{ - int node = cpu_to_node(cpu); - struct page *pages; - struct vmcs *vmcs; - - pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); - if (!pages) - return NULL; - vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); - vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ - return vmcs; -} - -static struct vmcs *alloc_vmcs(void) -{ - return alloc_vmcs_cpu(raw_smp_processor_id()); -} - -static void free_vmcs(struct vmcs *vmcs) -{ - free_pages((unsigned long)vmcs, vmcs_config.order); -} - -/* - * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded - */ -static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) -{ - if (!loaded_vmcs->vmcs) - return; - loaded_vmcs_clear(loaded_vmcs); - free_vmcs(loaded_vmcs->vmcs); - loaded_vmcs->vmcs = NULL; -} - -static void free_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - free_vmcs(per_cpu(vmxarea, cpu)); - per_cpu(vmxarea, cpu) = NULL; - } -} - -static __init int alloc_kvm_area(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct vmcs *vmcs; - - vmcs = alloc_vmcs_cpu(cpu); - if (!vmcs) { - free_kvm_area(); - return -ENOMEM; - } - - per_cpu(vmxarea, cpu) = vmcs; - } - return 0; -} - -static __init int hardware_setup(void) -{ - if (setup_vmcs_config(&vmcs_config) < 0) - return -EIO; - - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - - if (!cpu_has_vmx_vpid()) - enable_vpid = 0; - - if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { - enable_ept = 0; - enable_unrestricted_guest = 0; - } - - if (!cpu_has_vmx_unrestricted_guest()) - enable_unrestricted_guest = 0; - - if (!cpu_has_vmx_flexpriority()) - flexpriority_enabled = 0; - - if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); - - if (!cpu_has_vmx_ple()) 
- ple_gap = 0; - - if (nested) - nested_vmx_setup_ctls_msrs(); - - return alloc_kvm_area(); -} - -static __exit void hardware_unsetup(void) -{ - free_kvm_area(); -} - -static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) { - vmcs_write16(sf->selector, save->selector); - vmcs_writel(sf->base, save->base); - vmcs_write32(sf->limit, save->limit); - vmcs_write32(sf->ar_bytes, save->ar); - } else { - u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK) - << AR_DPL_SHIFT; - vmcs_write32(sf->ar_bytes, 0x93 | dpl); - } -} - -static void enter_pmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vmx->emulation_required = 1; - vmx->rmode.vm86_active = 0; - - vmx_segment_cache_clear(vmx); - - vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); - vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); - - flags = vmcs_readl(GUEST_RFLAGS); - flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; - vmcs_writel(GUEST_RFLAGS, flags); - - vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | - (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); - - update_exception_bitmap(vcpu); - - if (emulate_invalid_guest_state) - return; - - fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); - - vmx_segment_cache_clear(vmx); - - vmcs_write16(GUEST_SS_SELECTOR, 0); - vmcs_write32(GUEST_SS_AR_BYTES, 0x93); - - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); -} - -static gva_t rmode_tss_base(struct kvm *kvm) -{ - if (!kvm->arch.tss_addr) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - gfn_t base_gfn; - - slots = kvm_memslots(kvm); - slot = id_to_memslot(slots, 0); - base_gfn = slot->base_gfn + slot->npages - 3; - - return base_gfn << PAGE_SHIFT; - } - return kvm->arch.tss_addr; -} - -static void fix_rmode_seg(int seg, struct kvm_save_segment *save) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - save->selector = vmcs_read16(sf->selector); - save->base = vmcs_readl(sf->base); - save->limit = vmcs_read32(sf->limit); - save->ar = vmcs_read32(sf->ar_bytes); - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xffff0); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not paragraph" - " aligned when entering protected mode (seg=%d)", - seg); -} - -static void enter_rmode(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (enable_unrestricted_guest) - return; - - vmx->emulation_required = 1; - vmx->rmode.vm86_active = 1; - - /* - * Very old userspace does not call KVM_SET_TSS_ADDR before entering - * vcpu. Call it here with phys address pointing 16M below 4G. 
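 * (0xfeffd000 plus the three TSS pages ends exactly at 0xff000000, i.e. 4G - 16M.)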
- */ - if (!vcpu->kvm->arch.tss_addr) { - printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " - "called before entering vcpu\n"); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - } - - vmx_segment_cache_clear(vmx); - - vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); - vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); - vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - - vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - - vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_rflags = flags; - - flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; - - vmcs_writel(GUEST_RFLAGS, flags); - vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); - update_exception_bitmap(vcpu); - - if (emulate_invalid_guest_state) - goto continue_rmode; - - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); - -continue_rmode: - kvm_mmu_reset_context(vcpu); -} - -static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); - - if (!msr) - return; - - /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). - */ - vmx_load_host_state(to_vmx(vcpu)); - vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) | - VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) & - ~VM_ENTRY_IA32E_MODE); - - msr->data = efer & ~EFER_LME; - } - setup_msrs(vmx); -} - -#ifdef CONFIG_X86_64 - -static void enter_lmode(struct kvm_vcpu *vcpu) -{ - u32 guest_tr_ar; - - vmx_segment_cache_clear(to_vmx(vcpu)); - - guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); - if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { - pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", - __func__); - vmcs_write32(GUEST_TR_AR_BYTES, - (guest_tr_ar & ~AR_TYPE_MASK) - | AR_TYPE_BUSY_64_TSS); - } - vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); -} - -static void exit_lmode(struct kvm_vcpu *vcpu) -{ - vmcs_write32(VM_ENTRY_CONTROLS, - vmcs_read32(VM_ENTRY_CONTROLS) - & ~VM_ENTRY_IA32E_MODE); - vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); -} - -#endif - -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ - vpid_sync_context(to_vmx(vcpu)); - if (enable_ept) { - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); - } -} - -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; - - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; -} - -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_ept && is_paging(vcpu)) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; - - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; -} - -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) - return; - - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); - } -} - -static void ept_save_pdptrs(struct kvm_vcpu *vcpu) -{ - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - } - - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); -} - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); - -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | - (CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & - ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING)); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} - -static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; - - if (enable_unrestricted_guest) - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) - | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - 
enter_pmode(vcpu); - - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); - -#ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) - enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) - exit_lmode(vcpu); - } -#endif - - if (enable_ept) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - - if (!vcpu->fpu_active) - hw_cr0 |= X86_CR0_TS | X86_CR0_MP; - - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); -} - -static u64 construct_eptp(unsigned long root_hpa) -{ - u64 eptp; - - /* TODO write the value reading from MSR */ - eptp = VMX_EPT_DEFAULT_MT | - VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - eptp |= (root_hpa & PAGE_MASK); - - return eptp; -} - -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - unsigned long guest_cr3; - u64 eptp; - - guest_cr3 = cr3; - if (enable_ept) { - eptp = construct_eptp(cr3); - vmcs_write64(EPT_POINTER, eptp); - guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : - vcpu->kvm->arch.ept_identity_map_addr; - ept_load_pdptrs(vcpu); - } - - vmx_flush_tlb(vcpu); - vmcs_writel(GUEST_CR3, guest_cr3); -} - -static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); - - if (cr4 & X86_CR4_VMXE) { - /* - * To use VMXON (and later other VMX instructions), a guest - * must first be able to turn on cr4.VMXE (see handle_vmon()). - * So basically the check on whether to allow nested VMX - * is here. - */ - if (!nested_vmx_allowed(vcpu)) - return 1; - } else if (to_vmx(vcpu)->nested.vmxon) - return 1; - - vcpu->arch.cr4 = cr4; - if (enable_ept) { - if (!is_paging(vcpu)) { - hw_cr4 &= ~X86_CR4_PAE; - hw_cr4 |= X86_CR4_PSE; - } else if (!(cr4 & X86_CR4_PAE)) { - hw_cr4 &= ~X86_CR4_PAE; - } - } - - vmcs_writel(CR4_READ_SHADOW, cr4); - vmcs_writel(GUEST_CR4, hw_cr4); - return 0; -} - -static void vmx_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_save_segment *save; - u32 ar; - - if (vmx->rmode.vm86_active - && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES - || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS) - && !emulate_invalid_guest_state) { - switch (seg) { - case VCPU_SREG_TR: save = &vmx->rmode.tr; break; - case VCPU_SREG_ES: save = &vmx->rmode.es; break; - case VCPU_SREG_DS: save = &vmx->rmode.ds; break; - case VCPU_SREG_FS: save = &vmx->rmode.fs; break; - case VCPU_SREG_GS: save = &vmx->rmode.gs; break; - default: BUG(); - } - var->selector = save->selector; - var->base = save->base; - var->limit = save->limit; - ar = save->ar; - if (seg == VCPU_SREG_TR - || var->selector == vmx_read_guest_seg_selector(vmx, seg)) - goto use_saved_rmode_seg; - } - var->base = vmx_read_guest_seg_base(vmx, seg); - var->limit = vmx_read_guest_seg_limit(vmx, seg); - var->selector = vmx_read_guest_seg_selector(vmx, seg); - ar = vmx_read_guest_seg_ar(vmx, seg); -use_saved_rmode_seg: - if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) - ar = 0; - var->type = ar & 15; - var->s = (ar >> 4) & 1; - var->dpl = (ar >> 5) & 3; - var->present = (ar >> 7) & 1; - var->avl = (ar >> 12) & 1; - var->l = (ar >> 13) & 1; - var->db = (ar >> 14) & 1; - var->g = (ar >> 15) & 1; - var->unusable = (ar >> 16) & 1; -} - -static u64 vmx_get_segment_base(struct kvm_vcpu 
*vcpu, int seg) -{ - struct kvm_segment s; - - if (to_vmx(vcpu)->rmode.vm86_active) { - vmx_get_segment(vcpu, &s, seg); - return s.base; - } - return vmx_read_guest_seg_base(to_vmx(vcpu), seg); -} - -static int __vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - if (!is_protmode(vcpu)) - return 0; - - if (!is_long_mode(vcpu) - && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ - return 3; - - return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; -} - -static int vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); - } - return to_vmx(vcpu)->cpl; -} - - -static u32 vmx_segment_access_rights(struct kvm_segment *var) -{ - u32 ar; - - if (var->unusable) - ar = 1 << 16; - else { - ar = var->type & 15; - ar |= (var->s & 1) << 4; - ar |= (var->dpl & 3) << 5; - ar |= (var->present & 1) << 7; - ar |= (var->avl & 1) << 12; - ar |= (var->l & 1) << 13; - ar |= (var->db & 1) << 14; - ar |= (var->g & 1) << 15; - } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; - - return ar; -} - -static void vmx_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; - - vmx_segment_cache_clear(vmx); - - if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { - vmcs_write16(sf->selector, var->selector); - vmx->rmode.tr.selector = var->selector; - vmx->rmode.tr.base = var->base; - vmx->rmode.tr.limit = var->limit; - vmx->rmode.tr.ar = vmx_segment_access_rights(var); - return; - } - vmcs_writel(sf->base, var->base); - vmcs_write32(sf->limit, var->limit); - vmcs_write16(sf->selector, var->selector); - if (vmx->rmode.vm86_active && var->s) { - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else - ar = vmx_segment_access_rights(var); - - /* - * Fix the "Accessed" bit in AR field of segment registers for older - * qemu binaries. - * IA32 arch specifies that at the time of processor reset the - * "Accessed" bit in the AR field of segment registers is 1. And qemu - * is setting it to 0 in the usedland code. This causes invalid guest - * state vmexit when "unrestricted guest" mode is turned on. - * Fix for this setup issue in cpu_reset is being pushed in the qemu - * tree. Newer qemu binaries with that qemu fix would not need this - * kvm hack. 
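 * (Bit 0 of the segment type is the accessed bit, hence the ar |= 0x1 below.)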
- */ - if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - ar |= 0x1; /* Accessed */ - - vmcs_write32(sf->ar_bytes, ar); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); -} - -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); - - *db = (ar >> 14) & 1; - *l = (ar >> 13) & 1; -} - -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_IDTR_LIMIT); - dt->address = vmcs_readl(GUEST_IDTR_BASE); -} - -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_IDTR_LIMIT, dt->size); - vmcs_writel(GUEST_IDTR_BASE, dt->address); -} - -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - dt->size = vmcs_read32(GUEST_GDTR_LIMIT); - dt->address = vmcs_readl(GUEST_GDTR_BASE); -} - -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) -{ - vmcs_write32(GUEST_GDTR_LIMIT, dt->size); - vmcs_writel(GUEST_GDTR_BASE, dt->address); -} - -static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - u32 ar; - - vmx_get_segment(vcpu, &var, seg); - ar = vmx_segment_access_rights(&var); - - if (var.base != (var.selector << 4)) - return false; - if (var.limit != 0xffff) - return false; - if (ar != 0xf3) - return false; - - return true; -} - -static bool code_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs; - unsigned int cs_rpl; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - cs_rpl = cs.selector & SELECTOR_RPL_MASK; - - if (cs.unusable) - return false; - if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) - return false; - if (!cs.s) - return false; - if (cs.type & AR_TYPE_WRITEABLE_MASK) { - if (cs.dpl > cs_rpl) - return false; - } else { - if (cs.dpl != cs_rpl) - return false; - } - if (!cs.present) - return false; - - /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ - return true; -} - -static bool stack_segment_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment ss; - unsigned int ss_rpl; - - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - ss_rpl = ss.selector & SELECTOR_RPL_MASK; - - if (ss.unusable) - return true; - if (ss.type != 3 && ss.type != 7) - return false; - if (!ss.s) - return false; - if (ss.dpl != ss_rpl) /* DPL != RPL */ - return false; - if (!ss.present) - return false; - - return true; -} - -static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment var; - unsigned int rpl; - - vmx_get_segment(vcpu, &var, seg); - rpl = var.selector & SELECTOR_RPL_MASK; - - if (var.unusable) - return true; - if (!var.s) - return false; - if (!var.present) - return false; - if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { - if (var.dpl < rpl) /* DPL < RPL */ - return false; - } - - /* TODO: Add other members to kvm_segment_field to allow checking for other access - * rights flags - */ - return true; -} - -static bool tr_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment tr; - - vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); - - if (tr.unusable) - return false; - if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ - return false; - if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ - return false; - if (!tr.present) - return false; - - return true; -} - -static bool ldtr_valid(struct kvm_vcpu *vcpu) -{ - struct kvm_segment ldtr; - - vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); - - if (ldtr.unusable) - return true; - if 
(ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ - return false; - if (ldtr.type != 2) - return false; - if (!ldtr.present) - return false; - - return true; -} - -static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ss; - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); - - return ((cs.selector & SELECTOR_RPL_MASK) == - (ss.selector & SELECTOR_RPL_MASK)); -} - -/* - * Check if guest state is valid. Returns true if valid, false if - * not. - * We assume that registers are always usable - */ -static bool guest_state_valid(struct kvm_vcpu *vcpu) -{ - /* real mode guest state checks */ - if (!is_protmode(vcpu)) { - if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - } else { - /* protected mode guest state checks */ - if (!cs_ss_rpl_check(vcpu)) - return false; - if (!code_segment_valid(vcpu)) - return false; - if (!stack_segment_valid(vcpu)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_DS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_ES)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_FS)) - return false; - if (!data_segment_valid(vcpu, VCPU_SREG_GS)) - return false; - if (!tr_valid(vcpu)) - return false; - if (!ldtr_valid(vcpu)) - return false; - } - /* TODO: - * - Add checks on RIP - * - Add checks on RFLAGS - */ - - return true; -} - -static int init_rmode_tss(struct kvm *kvm) -{ - gfn_t fn; - u16 data = 0; - int r, idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); - fn = rmode_tss_base(kvm) >> PAGE_SHIFT; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, - TSS_IOPB_BASE_OFFSET, sizeof(u16)); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); - if (r < 0) - goto out; - r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); - if (r < 0) - goto out; - data = ~0; - r = kvm_write_guest_page(kvm, fn, &data, - RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, - sizeof(u8)); - if (r < 0) - goto out; - - ret = 1; -out: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - -static int init_rmode_identity_map(struct kvm *kvm) -{ - int i, idx, r, ret; - pfn_t identity_map_pfn; - u32 tmp; - - if (!enable_ept) - return 1; - if (unlikely(!kvm->arch.ept_identity_pagetable)) { - printk(KERN_ERR "EPT: identity-mapping pagetable " - "haven't been allocated!\n"); - return 0; - } - if (likely(kvm->arch.ept_identity_pagetable_done)) - return 1; - ret = 0; - identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; - idx = srcu_read_lock(&kvm->srcu); - r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); - if (r < 0) - goto out; - /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { - tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | - _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); - r = kvm_write_guest_page(kvm, identity_map_pfn, - &tmp, i * sizeof(tmp), sizeof(tmp)); - if (r < 0) - goto out; - } - kvm->arch.ept_identity_pagetable_done = true; - ret = 1; -out: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - -static void seg_setup(int seg) -{ - struct kvm_vmx_segment_field *sf 
= &kvm_vmx_segment_fields[seg]; - unsigned int ar; - - vmcs_write16(sf->selector, 0); - vmcs_writel(sf->base, 0); - vmcs_write32(sf->limit, 0xffff); - if (enable_unrestricted_guest) { - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - } else - ar = 0xf3; - - vmcs_write32(sf->ar_bytes, ar); -} - -static int alloc_apic_access_page(struct kvm *kvm) -{ - struct kvm_userspace_memory_region kvm_userspace_mem; - int r = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page) - goto out; - kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - - kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static int alloc_identity_pagetable(struct kvm *kvm) -{ - struct kvm_userspace_memory_region kvm_userspace_mem; - int r = 0; - - mutex_lock(&kvm->slots_lock); - if (kvm->arch.ept_identity_pagetable) - goto out; - kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = - kvm->arch.ept_identity_map_addr; - kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - - kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void allocate_vpid(struct vcpu_vmx *vmx) -{ - int vpid; - - vmx->vpid = 0; - if (!enable_vpid) - return; - spin_lock(&vmx_vpid_lock); - vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; - __set_bit(vpid, vmx_vpid_bitmap); - } - spin_unlock(&vmx_vpid_lock); -} - -static void free_vpid(struct vcpu_vmx *vmx) -{ - if (!enable_vpid) - return; - spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); -} - -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ - } -} - -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) -{ - if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); -} - -/* - * Set up the vmcs's constant host-state fields, i.e., host-state fields that - * will not change in the lifetime of the guest. - * Note that host-state that does change is set elsewhere. E.g., host-state - * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 
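 * In practice that covers host CR0/CR3/CR4, the code/data/TR selectors, the
 * IDT base, HOST_RIP, the SYSENTER MSRs and, when supported, the host PAT.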
- */ -static void vmx_set_constant_host_state(void) -{ - u32 low32, high32; - unsigned long tmpl; - struct desc_ptr dt; - - vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ - - vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ - vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - - native_store_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - - asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); - vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ - - rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); - vmcs_write32(HOST_IA32_SYSENTER_CS, low32); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); - vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ - - if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { - rdmsr(MSR_IA32_CR_PAT, low32, high32); - vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); - } -} - -static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) -{ - vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; - if (enable_ept) - vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; - if (is_guest_mode(&vmx->vcpu)) - vmx->vcpu.arch.cr4_guest_owned_bits &= - ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; - vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); -} - -static u32 vmx_exec_control(struct vcpu_vmx *vmx) -{ - u32 exec_control = vmcs_config.cpu_based_exec_ctrl; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { - exec_control &= ~CPU_BASED_TPR_SHADOW; -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_STORE_EXITING | - CPU_BASED_CR8_LOAD_EXITING; -#endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; - return exec_control; -} - -static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) -{ - u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (vmx->vpid == 0) - exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) { - exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; - enable_unrestricted_guest = 0; - } - if (!enable_unrestricted_guest) - exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; - if (!ple_gap) - exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - return exec_control; -} - -static void ept_set_mmio_spte_mask(void) -{ - /* - * EPT Misconfigurations can be generated if the value of bits 2:0 - * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0xffull << 49) is set to quickly identify mmio - * spte. - */ - kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); -} - -/* - * Sets up the vmcs for emulated real mode. 
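 * This writes the control fields, bitmap pointers, host state and MSR autoload
 * areas; the guest register state itself is initialised later by vmx_vcpu_reset().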
- */ -static int vmx_vcpu_setup(struct vcpu_vmx *vmx) -{ -#ifdef CONFIG_X86_64 - unsigned long a; -#endif - int i; - - /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy)); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ - - /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); - - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - - if (cpu_has_secondary_exec_ctrls()) { - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx_secondary_exec_control(vmx)); - } - - if (ple_gap) { - vmcs_write32(PLE_GAP, ple_gap); - vmcs_write32(PLE_WINDOW, ple_window); - } - - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - - vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ - vmx_set_constant_host_state(); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else - vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ - vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif - - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - u32 msr_low, msr_high; - u64 host_pat; - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } - - for (i = 0; i < NR_VMX_MSR; ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* 22.2.1, 20.8.1 */ - vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); - - vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); - set_cr4_guest_host_mask(vmx); - - kvm_write_tsc(&vmx->vcpu, 0); - - return 0; -} - -static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 msr; - int ret; - - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - - vmx->rmode.vm86_active = 0; - - vmx->soft_vnmi_blocked = 0; - - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(&vmx->vcpu, 0); - msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_bsp(&vmx->vcpu)) - msr |= MSR_IA32_APICBASE_BSP; - kvm_set_apic_base(&vmx->vcpu, msr); - - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; - - vmx_segment_cache_clear(vmx); - - seg_setup(VCPU_SREG_CS); - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 
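 * So the BSP starts with selector 0xf000 and base 0x000f0000 instead of the
 * architectural reset base 0xffff0000, and APs start from their SIPI vector.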
- */ - if (kvm_vcpu_is_bsp(&vmx->vcpu)) { - vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { - vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); - vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); - } - - seg_setup(VCPU_SREG_DS); - seg_setup(VCPU_SREG_ES); - seg_setup(VCPU_SREG_FS); - seg_setup(VCPU_SREG_GS); - seg_setup(VCPU_SREG_SS); - - vmcs_write16(GUEST_TR_SELECTOR, 0); - vmcs_writel(GUEST_TR_BASE, 0); - vmcs_write32(GUEST_TR_LIMIT, 0xffff); - vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); - - vmcs_write16(GUEST_LDTR_SELECTOR, 0); - vmcs_writel(GUEST_LDTR_BASE, 0); - vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - - vmcs_writel(GUEST_RFLAGS, 0x02); - if (kvm_vcpu_is_bsp(&vmx->vcpu)) - kvm_rip_write(vcpu, 0xfff0); - else - kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); - - vmcs_writel(GUEST_DR7, 0x400); - - vmcs_writel(GUEST_GDTR_BASE, 0); - vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); - - vmcs_writel(GUEST_IDTR_BASE, 0); - vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); - vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - - setup_msrs(vmx); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - - if (cpu_has_vmx_tpr_shadow()) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vmx->vcpu.arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - - if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); - - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - - vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - vmx_set_cr4(&vmx->vcpu, 0); - vmx_set_efer(&vmx->vcpu, 0); - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); - - vpid_sync_context(vmx); - - ret = 0; - - /* HACK: Don't enable emulation on guest boot/reset */ - vmx->emulation_required = 0; - -out: - return ret; -} - -/* - * In nested virtualization, check if L1 asked to exit on external interrupts. - * For most existing hypervisors, this will always return true. - */ -static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) -{ - return get_vmcs12(vcpu)->pin_based_vm_exec_control & - PIN_BASED_EXT_INTR_MASK; -} - -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - /* - * We get here if vmx_interrupt_allowed() said we can't - * inject to L1 now because L2 must run. Ask L2 to exit - * right after entry, so we can inject to L1 more promptly. 
- */ - kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); - return; - } - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - if (!cpu_has_virtual_nmis()) { - enable_irq_window(vcpu); - return; - } - - if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { - enable_irq_window(vcpu); - return; - } - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void vmx_inject_irq(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - uint32_t intr; - int irq = vcpu->arch.interrupt.nr; - - trace_kvm_inj_virq(irq); - - ++vcpu->stat.irq_injections; - if (vmx->rmode.vm86_active) { - int inc_eip = 0; - if (vcpu->arch.interrupt.soft) - inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - intr = irq | INTR_INFO_VALID_MASK; - if (vcpu->arch.interrupt.soft) { - intr |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmx->vcpu.arch.event_exit_inst_len); - } else - intr |= INTR_TYPE_EXT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); -} - -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu)) - return; - - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. 
- */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; - if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); -} - -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) -{ - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI - | GUEST_INTR_STATE_NMI)); -} - -static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) -{ - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; - if (to_vmx(vcpu)->nmi_known_unmasked) - return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; -} - -static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } -} - -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (to_vmx(vcpu)->nested.nested_run_pending || - (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK)) - return 0; - nested_vmx_vmexit(vcpu); - vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; - vmcs12->vm_exit_intr_info = 0; - /* fall through to normal code, but now in L1, not L2 */ - } - - return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); -} - -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - int ret; - struct kvm_userspace_memory_region tss_mem = { - .slot = TSS_PRIVATE_MEMSLOT, - .guest_phys_addr = addr, - .memory_size = PAGE_SIZE * 3, - .flags = 0, - }; - - ret = kvm_set_memory_region(kvm, &tss_mem, 0); - if (ret) - return ret; - kvm->arch.tss_addr = addr; - if (!init_rmode_tss(kvm)) - return -ENOMEM; - - return 0; -} - -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) -{ - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) - return 1; - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ - switch (vec) { - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return 0; - kvm_queue_exception(vcpu, vec); - return 1; - case BP_VECTOR: - /* - * Update instruction length as we may reinject the exception - * from user space while in guest debugging mode. 
- */ - to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return 0; - /* fall through */ - case DE_VECTOR: - case OF_VECTOR: - case BR_VECTOR: - case UD_VECTOR: - case DF_VECTOR: - case SS_VECTOR: - case GP_VECTOR: - case MF_VECTOR: - kvm_queue_exception(vcpu, vec); - return 1; - } - return 0; -} - -/* - * Trigger machine check on the host. We assume all the MSRs are already set up - * by the CPU and that we still run on the same CPU as the MCE occurred on. - * We pass a fake environment to the machine check handler because we want - * the guest to be always treated like user space, no matter what context - * it used internally. - */ -static void kvm_machine_check(void) -{ -#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) - struct pt_regs regs = { - .cs = 3, /* Fake ring 3 no matter what the guest ran on */ - .flags = X86_EFLAGS_IF, - }; - - do_machine_check(®s, 0); -#endif -} - -static int handle_machine_check(struct kvm_vcpu *vcpu) -{ - /* already handled by vcpu_run */ - return 1; -} - -static int handle_exception(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_run *kvm_run = vcpu->run; - u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; - u32 vect_info; - enum emulation_result er; - - vect_info = vmx->idt_vectoring_info; - intr_info = vmx->exit_intr_info; - - if (is_machine_check(intr_info)) - return handle_machine_check(vcpu); - - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) - return 1; /* already handled by vmx_vcpu_run() */ - - if (is_no_device(intr_info)) { - vmx_fpu_activate(vcpu); - return 1; - } - - if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - error_code = 0; - if (intr_info & INTR_INFO_DELIVER_CODE_MASK) - error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); - cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); - } - - if (vmx->rmode.vm86_active && - handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); - } - return 1; - } - - ex_no = intr_info & INTR_INFO_VECTOR_MASK; - switch (ex_no) { - case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); - if (!(vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 = dr6 | DR6_FIXED_1; - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; - kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); - /* fall through */ - case BP_VECTOR: - /* - * Update instruction length as we may reinject #BP from - * user space while in guest debugging mode. Reading it for - * #DB as well causes no harm, it is not used in that case. 
- */ - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; - kvm_run->debug.arch.exception = ex_no; - break; - default: - kvm_run->exit_reason = KVM_EXIT_EXCEPTION; - kvm_run->ex.exception = ex_no; - kvm_run->ex.error_code = error_code; - break; - } - return 0; -} - -static int handle_external_interrupt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.irq_exits; - return 1; -} - -static int handle_triple_fault(struct kvm_vcpu *vcpu) -{ - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - return 0; -} - -static int handle_io(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int size, in, string; - unsigned port; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - string = (exit_qualification & 16) != 0; - in = (exit_qualification & 8) != 0; - - ++vcpu->stat.io_exits; - - if (string || in) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; - - port = exit_qualification >> 16; - size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - - return kvm_fast_pio_out(vcpu, size, port); -} - -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) -{ - /* - * Patch in the VMCALL instruction: - */ - hypercall[0] = 0x0f; - hypercall[1] = 0x01; - hypercall[2] = 0xc1; -} - -/* called to set cr0 as approriate for a mov-to-cr0 exit. */ -static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) - return 1; - - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 changed cr0 in a way that did not change - * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), - * but did change L0 shadowed bits. This can currently happen - * with the TS bit: L0 may want to leave TS on (for lazy fpu - * loading) while pretending to allow the guest to change it. - */ - if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) | - (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits))) - return 1; - vmcs_writel(CR0_READ_SHADOW, val); - return 0; - } else - return kvm_set_cr0(vcpu, val); -} - -static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) -{ - if (is_guest_mode(vcpu)) { - if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) | - (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits))) - return 1; - vmcs_writel(CR4_READ_SHADOW, val); - return 0; - } else - return kvm_set_cr4(vcpu, val); -} - -/* called to set cr0 as approriate for clts instruction exit. */ -static void handle_clts(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) { - /* - * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS - * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, - * just pretend it's off (also in arch.cr0 for fpu_activate). 
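 * That means clearing TS only in CR0_READ_SHADOW and vcpu->arch.cr0, while
 * GUEST_CR0.TS stays set.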
- */ - vmcs_writel(CR0_READ_SHADOW, - vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); - vcpu->arch.cr0 &= ~X86_CR0_TS; - } else - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); -} - -static int handle_cr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification, val; - int cr; - int reg; - int err; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - cr = exit_qualification & 15; - reg = (exit_qualification >> 8) & 15; - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); - trace_kvm_cr_write(cr, val); - switch (cr) { - case 0: - err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; - case 3: - err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; - case 4: - err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; - case 8: { - u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = kvm_register_read(vcpu, reg); - err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) - return 1; - if (cr8_prev <= cr8) - return 1; - vcpu->run->exit_reason = KVM_EXIT_SET_TPR; - return 0; - } - }; - break; - case 2: /* clts */ - handle_clts(vcpu); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); - vmx_fpu_activate(vcpu); - return 1; - case 1: /*mov from cr*/ - switch (cr) { - case 3: - val = kvm_read_cr3(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; - case 8: - val = kvm_get_cr8(vcpu); - kvm_register_write(vcpu, reg, val); - trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; - } - break; - case 3: /* lmsw */ - val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; - trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); - kvm_lmsw(vcpu, val); - - skip_emulated_instruction(vcpu); - return 1; - default: - break; - } - vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", - (int)(exit_qualification >> 4) & 3, cr); - return 0; -} - -static int handle_dr(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - int dr, reg; - - /* Do not handle if the CPL > 0, will trigger GP on re-entry */ - if (!kvm_require_cpl(vcpu, 0)) - return 1; - dr = vmcs_readl(GUEST_DR7); - if (dr & DR7_GD) { - /* - * As the vm-exit takes precedence over the debug trap, we - * need to emulate the latter, either for the host or the - * guest debugging itself. 
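 * If host userspace owns the hardware breakpoints we report KVM_EXIT_DEBUG;
 * otherwise the #DB is queued back to the guest with DR6.BD set and DR7.GD cleared.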
- */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; - vcpu->run->debug.arch.dr7 = dr; - vcpu->run->debug.arch.pc = - vmcs_readl(GUEST_CS_BASE) + - vmcs_readl(GUEST_RIP); - vcpu->run->debug.arch.exception = DB_VECTOR; - vcpu->run->exit_reason = KVM_EXIT_DEBUG; - return 0; - } else { - vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - kvm_queue_exception(vcpu, DB_VECTOR); - return 1; - } - } - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - dr = exit_qualification & DEBUG_REG_ACCESS_NUM; - reg = DEBUG_REG_ACCESS_REG(exit_qualification); - if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - if (!kvm_get_dr(vcpu, dr, &val)) - kvm_register_write(vcpu, reg, val); - } else - kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); - skip_emulated_instruction(vcpu); - return 1; -} - -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) -{ - vmcs_writel(GUEST_DR7, val); -} - -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - kvm_emulate_cpuid(vcpu); - return 1; -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; - - if (vmx_get_msr(vcpu, ecx, &data)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_read(ecx, data); - - /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) - | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - - if (vmx_set_msr(vcpu, ecx, data) != 0) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_EVENT, vcpu); - return 1; -} - -static int handle_interrupt_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - /* clear pending irq */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; -} - -static int handle_halt(struct kvm_vcpu *vcpu) -{ - skip_emulated_instruction(vcpu); - return kvm_emulate_halt(vcpu); -} - -static int handle_vmcall(struct kvm_vcpu *vcpu) -{ - skip_emulated_instruction(vcpu); - kvm_emulate_hypercall(vcpu); - return 1; -} - -static int handle_invd(struct kvm_vcpu *vcpu) -{ - return emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_invlpg(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_rdpmc(struct kvm_vcpu *vcpu) -{ - int err; - - err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; -} - -static int 
handle_wbinvd(struct kvm_vcpu *vcpu) -{ - skip_emulated_instruction(vcpu); - kvm_emulate_wbinvd(vcpu); - return 1; -} - -static int handle_xsetbv(struct kvm_vcpu *vcpu) -{ - u64 new_bv = kvm_read_edx_eax(vcpu); - u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); - - if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); - return 1; -} - -static int handle_apic_access(struct kvm_vcpu *vcpu) -{ - if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int access_type, offset; - - access_type = exit_qualification & APIC_ACCESS_TYPE; - offset = exit_qualification & APIC_ACCESS_OFFSET; - /* - * Sane guest uses MOV to write EOI, with written value - * not cared. So make a short-circuit here by avoiding - * heavy instruction emulation. - */ - if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && - (offset == APIC_EOI)) { - kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; - } - } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; -} - -static int handle_task_switch(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qualification; - bool has_error_code = false; - u32 error_code = 0; - u16 tss_selector; - int reason, type, idt_v, idt_index; - - idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); - idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); - type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - reason = (u32)exit_qualification >> 30; - if (reason == TASK_SWITCH_GATE && idt_v) { - switch (type) { - case INTR_TYPE_NMI_INTR: - vcpu->arch.nmi_injected = false; - vmx_set_nmi_mask(vcpu, true); - break; - case INTR_TYPE_EXT_INTR: - case INTR_TYPE_SOFT_INTR: - kvm_clear_interrupt_queue(vcpu); - break; - case INTR_TYPE_HARD_EXCEPTION: - if (vmx->idt_vectoring_info & - VECTORING_INFO_DELIVER_CODE_MASK) { - has_error_code = true; - error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - } - /* fall through */ - case INTR_TYPE_SOFT_EXCEPTION: - kvm_clear_exception_queue(vcpu); - break; - default: - break; - } - } - tss_selector = exit_qualification; - - if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && - type != INTR_TYPE_EXT_INTR && - type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); - - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } - - /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); - - /* - * TODO: What about debug traps on tss switch? - * Are we supposed to inject them and update dr6? 
- */ - - return 1; -} - -static int handle_ept_violation(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification; - gpa_t gpa; - int gla_validity; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { - printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; - } - - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); -} - -static u64 ept_rsvd_mask(u64 spte, int level) -{ - int i; - u64 mask = 0; - - for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) - mask |= (1ULL << i); - - if (level > 2) - /* bits 7:3 reserved */ - mask |= 0xf8; - else if (level == 2) { - if (spte & (1ULL << 7)) - /* 2MB ref, bits 20:12 reserved */ - mask |= 0x1ff000; - else - /* bits 6:3 reserved */ - mask |= 0x78; - } - - return mask; -} - -static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, - int level) -{ - printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); - - /* 010b (write-only) */ - WARN_ON((spte & 0x7) == 0x2); - - /* 110b (write/execute) */ - WARN_ON((spte & 0x7) == 0x6); - - /* 100b (execute-only) and value not supported by logical processor */ - if (!cpu_has_vmx_ept_execute_only()) - WARN_ON((spte & 0x7) == 0x4); - - /* not 000b */ - if ((spte & 0x7)) { - u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); - - if (rsvd_bits != 0) { - printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", - __func__, rsvd_bits); - WARN_ON(1); - } - - if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { - u64 ept_mem_type = (spte & 0x38) >> 3; - - if (ept_mem_type == 2 || ept_mem_type == 3 || - ept_mem_type == 7) { - printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", - __func__, ept_mem_type); - WARN_ON(1); - } - } - } -} - -static int handle_ept_misconfig(struct kvm_vcpu *vcpu) -{ - u64 sptes[4]; - int nr_sptes, i, ret; - gpa_t gpa; - - gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - - ret = handle_mmio_page_fault_common(vcpu, gpa, true); - if (likely(ret == 1)) - return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == - EMULATE_DONE; - if (unlikely(!ret)) - return 1; - - /* It is the real ept misconfig */ - printk(KERN_ERR "EPT: Misconfiguration.\n"); - printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); - - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); - - for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) - ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); - - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; - - return 0; -} - -static int handle_nmi_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - /* clear pending NMI */ - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - ++vcpu->stat.nmi_window_exits; - kvm_make_request(KVM_REQ_EVENT, vcpu); - 
- return 1; -} - -static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; - u32 cpu_exec_ctrl; - bool intr_window_requested; - - cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - - while (!guest_state_valid(vcpu)) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) - return handle_interrupt_window(&vmx->vcpu); - - err = emulate_instruction(vcpu, 0); - - if (err == EMULATE_DO_MMIO) { - ret = 0; - goto out; - } - - if (err != EMULATE_DONE) - return 0; - - if (signal_pending(current)) - goto out; - if (need_resched()) - schedule(); - } - - vmx->emulation_required = 0; -out: - return ret; -} - -/* - * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE - * exiting, so only get here on cpu with PAUSE-Loop-Exiting. - */ -static int handle_pause(struct kvm_vcpu *vcpu) -{ - skip_emulated_instruction(vcpu); - kvm_vcpu_on_spin(vcpu); - - return 1; -} - -static int handle_invalid_op(struct kvm_vcpu *vcpu) -{ - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; -} - -/* - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. - * We could reuse a single VMCS for all the L2 guests, but we also want the - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this - * allows keeping them loaded on the processor, and in the future will allow - * optimizations where prepare_vmcs02 doesn't need to set all the fields on - * every entry if they never change. - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. - * - * The following functions allocate and free a vmcs02 in this pool. - */ - -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmx->nested.current_vmptr) { - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { - /* Recycle the least recently used VMCS. */ - item = list_entry(vmx->nested.vmcs02_pool.prev, - struct vmcs02_list, list); - item->vmptr = vmx->nested.current_vmptr; - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - /* Create a new VMCS */ - item = (struct vmcs02_list *) - kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); - if (!item) - return NULL; - item->vmcs02.vmcs = alloc_vmcs(); - if (!item->vmcs02.vmcs) { - kfree(item); - return NULL; - } - loaded_vmcs_init(&item->vmcs02); - item->vmptr = vmx->nested.current_vmptr; - list_add(&(item->list), &(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num++; - return &item->vmcs02; -} - -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmptr) { - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - return; - } -} - -/* - * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. 
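The vmcs02 pool above is a small most-recently-used cache: a lookup hit moves the entry to the head of the list, and once VMCS02_POOL_SIZE entries exist the tail (the least recently used vmcs02) is recycled for the new vmcs12 pointer. A minimal, self-contained sketch of that policy follows; the array-based pool and every name in it are illustrative only and do not come from the file being removed.

#include <stdio.h>
#include <string.h>

#define POOL_SIZE 4	/* stand-in for VMCS02_POOL_SIZE */

static unsigned long long pool[POOL_SIZE];	/* vmptr keys, index 0 = most recently used */
static int pool_used;

/* Look up vmptr: on a hit, move it to the front; on a miss, append if there
 * is room, otherwise recycle the least recently used slot (the tail). */
static int pool_get(unsigned long long vmptr)
{
	int i;

	for (i = 0; i < pool_used; i++) {
		if (pool[i] == vmptr) {
			memmove(&pool[1], &pool[0], i * sizeof(pool[0]));
			pool[0] = vmptr;
			return i;		/* hit */
		}
	}
	if (pool_used < POOL_SIZE)
		pool_used++;
	memmove(&pool[1], &pool[0], (pool_used - 1) * sizeof(pool[0]));
	pool[0] = vmptr;
	return -1;			/* miss: fresh or recycled entry */
}

int main(void)
{
	pool_get(0x1000);
	pool_get(0x2000);
	printf("hit at slot %d\n", pool_get(0x1000));	/* prints 1 */
	return 0;
}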
These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. - */ -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item, *n; - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); -} - -/* - * Emulate the VMXON instruction. - * Currently, we just remember that VMX is active, and do not save or even - * inspect the argument to VMXON (the so-called "VMXON pointer") because we - * do not currently need to store anything in that guest-allocated memory - * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their - * argument is different from the VMXON pointer (which the spec says they do). - */ -static int handle_vmon(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. We test these now: - */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - - vmx->nested.vmxon = true; - - skip_emulated_instruction(vcpu); - return 1; -} - -/* - * Intel's VMX Instruction Reference specifies a common set of prerequisites - * for running VMX instructions (except VMXON, whose prerequisites are - * slightly different). It also specifies what exception to inject otherwise. - */ -static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - - return 1; -} - -/* - * Free whatever needs to be freed from vmx->nested when L1 goes down, or - * just stops using VMX. 
- */ -static void free_nested(struct vcpu_vmx *vmx) -{ - if (!vmx->nested.vmxon) - return; - vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } - /* Unpin physical memory we referred to in current vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; - } - - nested_free_all_saved_vmcss(vmx); -} - -/* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) -{ - if (!nested_vmx_check_permission(vcpu)) - return 1; - free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); - return 1; -} - -/* - * Decode the memory-address operand of a vmx instruction, as recorded on an - * exit caused by such an instruction (run by a guest hypervisor). - * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. - */ -static int get_vmx_mem_address(struct kvm_vcpu *vcpu, - unsigned long exit_qualification, - u32 vmx_instruction_info, gva_t *ret) -{ - /* - * According to Vol. 3B, "Information for VM Exits Due to Instruction - * Execution", on an exit, vmx_instruction_info holds most of the - * addressing components of the operand. Only the displacement part - * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). - * For how an actual address is calculated from all these components, - * refer to Vol. 1, "Operand Addressing". - */ - int scaling = vmx_instruction_info & 3; - int addr_size = (vmx_instruction_info >> 7) & 7; - bool is_reg = vmx_instruction_info & (1u << 10); - int seg_reg = (vmx_instruction_info >> 15) & 7; - int index_reg = (vmx_instruction_info >> 18) & 0xf; - bool index_is_valid = !(vmx_instruction_info & (1u << 22)); - int base_reg = (vmx_instruction_info >> 23) & 0xf; - bool base_is_valid = !(vmx_instruction_info & (1u << 27)); - - if (is_reg) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - /* Addr = segment_base + offset */ - /* offset = base + [index * scale] + displacement */ - *ret = vmx_get_segment_base(vcpu, seg_reg); - if (base_is_valid) - *ret += kvm_register_read(vcpu, base_reg); - if (index_is_valid) - *ret += kvm_register_read(vcpu, index_reg)<<scaling; - *ret += exit_qualification; /* holds the displacement */ - - if (addr_size == 1) /* 32 bit */ - *ret &= 0xffffffff; - - /* - * TODO: throw #GP (and return 1) in various cases that the VM* - * instructions require it - e.g., offset beyond segment limit, - * unusable or unreadable/unwritable segment, non-canonical 64-bit - * address, and so on. Currently these are not checked. - */ - return 0; -} - -/* - * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". 
- */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); -} - -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) -{ - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_CF); -} - -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) -{ - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) - & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | - X86_EFLAGS_SF | X86_EFLAGS_OF)) - | X86_EFLAGS_ZF); - get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; -} - -/* Emulate the VMCLEAR instruction */ -static int handle_vmclear(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; - gpa_t vmptr; - struct vmcs12 *vmcs12; - struct page *page; - struct x86_exception e; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - - if (vmptr == vmx->nested.current_vmptr) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } - - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - /* - * For accurate processor emulation, VMCLEAR beyond available - * physical memory should do nothing at all. However, it is - * possible that a nested vmx bug, not a guest hypervisor bug, - * resulted in this case, so let's shut down before doing any - * more damage: - */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 1; - } - vmcs12 = kmap(page); - vmcs12->launch_state = 0; - kunmap(page); - nested_release_page(page); - - nested_free_vmcs02(vmx, vmptr); - - skip_emulated_instruction(vcpu); - nested_vmx_succeed(vcpu); - return 1; -} - -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - -/* Emulate the VMLAUNCH instruction */ -static int handle_vmlaunch(struct kvm_vcpu *vcpu) -{ - return nested_vmx_run(vcpu, true); -} - -/* Emulate the VMRESUME instruction */ -static int handle_vmresume(struct kvm_vcpu *vcpu) -{ - - return nested_vmx_run(vcpu, false); -} - -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - -/* - * Read a vmcs12 field. Since these can have varying lengths and we return - * one type, we chose the biggest type (u64) and zero-extend the return value - * to that size. 
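The enum and the two helpers just above rely on a VMCS field number encoding its own width and access type: bit 0 distinguishes the 32-bit high half of a 64-bit field, bits 14:13 select the width, and a value of 1 in bits 11:10 marks a read-only field. The standalone sketch below restates only that decoding; the helper names and the sample field value are illustrative and are not taken from the surrounding code.

#include <stdio.h>

enum field_width { W_U16 = 0, W_U64 = 1, W_U32 = 2, W_NATURAL = 3 };

/* Width lives in bits 14:13 of the field number; a set bit 0 means the
 * *_HIGH half of a 64-bit field, which is always accessed as 32 bits. */
static int field_width(unsigned long field)
{
	if (field & 0x1)
		return W_U32;
	return (field >> 13) & 0x3;
}

/* A value of 1 in bits 11:10 marks a read-only field. */
static int field_readonly(unsigned long field)
{
	return ((field >> 10) & 0x3) == 1;
}

int main(void)
{
	unsigned long field = 0x4002;	/* arbitrary illustrative encoding */

	printf("width code %d, read-only %d\n",
	       field_width(field), field_readonly(field));
	return 0;
}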
Note that the caller, handle_vmread, might need to use only - * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of - * 64-bit fields are to be returned). - */ -static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu, - unsigned long field, u64 *ret) -{ - short offset = vmcs_field_to_offset(field); - char *p; - - if (offset < 0) - return 0; - - p = ((char *)(get_vmcs12(vcpu))) + offset; - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - *ret = *((natural_width *)p); - return 1; - case VMCS_FIELD_TYPE_U16: - *ret = *((u16 *)p); - return 1; - case VMCS_FIELD_TYPE_U32: - *ret = *((u32 *)p); - return 1; - case VMCS_FIELD_TYPE_U64: - *ret = *((u64 *)p); - return 1; - default: - return 0; /* can never happen. */ - } -} - -/* - * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was - * used before) all generate the same failure when it is missing. - */ -static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) { - nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 0; - } - return 1; -} - -static int handle_vmread(struct kvm_vcpu *vcpu) -{ - unsigned long field; - u64 field_value; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t gva = 0; - - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) - return 1; - - /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - /* Read the field, zero-extended to a u64 field_value */ - if (!vmcs12_read_any(vcpu, field, &field_value)) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - /* - * Now copy part of this value to register or memory, as requested. - * Note that the number of bits actually copied is 32 or 64 depending - * on the guest's mode (32 or 64 bit), not on the given field's length. - */ - if (vmx_instruction_info & (1u << 10)) { - kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), - field_value); - } else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) - return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ - kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); - } - - nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; -} - - -static int handle_vmwrite(struct kvm_vcpu *vcpu) -{ - unsigned long field; - gva_t gva; - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - char *p; - short offset; - /* The value to write might be 32 or 64 bits, depending on L1's long - * mode, and eventually we need to write that into a field of several - * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the approriate number of - * bits into the vmcs12 field. 
- */ - u64 field_value = 0; - struct x86_exception e; - - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) - return 1; - - if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_read(vcpu, - (((vmx_instruction_info) >> 3) & 0xf)); - else { - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &gva)) - return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - } - - - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); - if (vmcs_field_readonly(field)) { - nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - - offset = vmcs_field_to_offset(field); - if (offset < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - p = ((char *) get_vmcs12(vcpu)) + offset; - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - *(u16 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U32: - *(u32 *)p = field_value; - break; - case VMCS_FIELD_TYPE_U64: - *(u64 *)p = field_value; - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - *(natural_width *)p = field_value; - break; - default: - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; - } - - nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; -} - -/* Emulate the VMPTRLD instruction */ -static int handle_vmptrld(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - gva_t gva; - gpa_t vmptr; - struct x86_exception e; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), - vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) - return 1; - - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, - sizeof(vmptr), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - - if (!IS_ALIGNED(vmptr, PAGE_SIZE)) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; - } - - if (vmx->nested.current_vmptr != vmptr) { - struct vmcs12 *new_vmcs12; - struct page *page; - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { - nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; - } - new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { - kunmap(page); - nested_release_page_clean(page); - nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; - } - if (vmx->nested.current_vmptr != -1ull) { - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); - } - - vmx->nested.current_vmptr = vmptr; - vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; - } - - nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; -} - -/* Emulate the VMPTRST instruction */ -static int handle_vmptrst(struct kvm_vcpu *vcpu) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t vmcs_gva; - struct x86_exception e; - - if (!nested_vmx_check_permission(vcpu)) - return 1; - - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, &vmcs_gva)) - return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ - if 
(kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, - (void *)&to_vmx(vcpu)->nested.current_vmptr, - sizeof(u64), &e)) { - kvm_inject_page_fault(vcpu, &e); - return 1; - } - nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; -} - -/* - * The exit handlers return 1 if the exit was handled fully and guest execution - * may resume. Otherwise they set the kvm_run parameter to indicate what needs - * to be done to userspace and return 0. - */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - [EXIT_REASON_EXCEPTION_NMI] = handle_exception, - [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, - [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, - [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, - [EXIT_REASON_IO_INSTRUCTION] = handle_io, - [EXIT_REASON_CR_ACCESS] = handle_cr, - [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, - [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, - [EXIT_REASON_INVD] = handle_invd, - [EXIT_REASON_INVLPG] = handle_invlpg, - [EXIT_REASON_RDPMC] = handle_rdpmc, - [EXIT_REASON_VMCALL] = handle_vmcall, - [EXIT_REASON_VMCLEAR] = handle_vmclear, - [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - [EXIT_REASON_VMPTRLD] = handle_vmptrld, - [EXIT_REASON_VMPTRST] = handle_vmptrst, - [EXIT_REASON_VMREAD] = handle_vmread, - [EXIT_REASON_VMRESUME] = handle_vmresume, - [EXIT_REASON_VMWRITE] = handle_vmwrite, - [EXIT_REASON_VMOFF] = handle_vmoff, - [EXIT_REASON_VMON] = handle_vmon, - [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, - [EXIT_REASON_APIC_ACCESS] = handle_apic_access, - [EXIT_REASON_WBINVD] = handle_wbinvd, - [EXIT_REASON_XSETBV] = handle_xsetbv, - [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, - [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, - [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, - [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op, - [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op, -}; - -static const int kvm_vmx_max_exit_handlers = - ARRAY_SIZE(kvm_vmx_exit_handlers); - -/* - * Return 1 if we should exit from L2 to L1 to handle an MSR access access, - * rather than handle it ourselves in L0. I.e., check whether L1 expressed - * disinterest in the current event (read or write a specific MSR) by using an - * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. - */ -static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, u32 exit_reason) -{ - u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; - gpa_t bitmap; - - if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) - return 1; - - /* - * The MSR_BITMAP page is divided into four 1024-byte bitmaps, - * for the four combinations of read/write and low/high MSR numbers. 
- * First we need to figure out which of the four to use: - */ - bitmap = vmcs12->msr_bitmap; - if (exit_reason == EXIT_REASON_MSR_WRITE) - bitmap += 2048; - if (msr_index >= 0xc0000000) { - msr_index -= 0xc0000000; - bitmap += 1024; - } - - /* Then read the msr_index'th bit from this bitmap: */ - if (msr_index < 1024*8) { - unsigned char b; - kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1); - return 1 & (b >> (msr_index & 7)); - } else - return 1; /* let L1 handle the wrong parameter */ -} - -/* - * Return 1 if we should exit from L2 to L1 to handle a CR access exit, - * rather than handle it ourselves in L0. I.e., check if L1 wanted to - * intercept (via guest_host_mask etc.) the current event. - */ -static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) -{ - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - int cr = exit_qualification & 15; - int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_read(vcpu, reg); - - switch ((exit_qualification >> 4) & 3) { - case 0: /* mov to cr */ - switch (cr) { - case 0: - if (vmcs12->cr0_guest_host_mask & - (val ^ vmcs12->cr0_read_shadow)) - return 1; - break; - case 3: - if ((vmcs12->cr3_target_count >= 1 && - vmcs12->cr3_target_value0 == val) || - (vmcs12->cr3_target_count >= 2 && - vmcs12->cr3_target_value1 == val) || - (vmcs12->cr3_target_count >= 3 && - vmcs12->cr3_target_value2 == val) || - (vmcs12->cr3_target_count >= 4 && - vmcs12->cr3_target_value3 == val)) - return 0; - if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) - return 1; - break; - case 4: - if (vmcs12->cr4_guest_host_mask & - (vmcs12->cr4_read_shadow ^ val)) - return 1; - break; - case 8: - if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) - return 1; - break; - } - break; - case 2: /* clts */ - if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && - (vmcs12->cr0_read_shadow & X86_CR0_TS)) - return 1; - break; - case 1: /* mov from cr */ - switch (cr) { - case 3: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR3_STORE_EXITING) - return 1; - break; - case 8: - if (vmcs12->cpu_based_vm_exec_control & - CPU_BASED_CR8_STORE_EXITING) - return 1; - break; - } - break; - case 3: /* lmsw */ - /* - * lmsw can change bits 1..3 of cr0, and only set bit 0 of - * cr0. Other attempted changes are ignored, with no exit. - */ - if (vmcs12->cr0_guest_host_mask & 0xe & - (val ^ vmcs12->cr0_read_shadow)) - return 1; - if ((vmcs12->cr0_guest_host_mask & 0x1) && - !(vmcs12->cr0_read_shadow & 0x1) && - (val & 0x1)) - return 1; - break; - } - return 0; -} - -/* - * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we - * should handle it ourselves in L0 (and then continue L2). Only call this - * when in is_guest_mode (L2). 
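The bitmap walk above boils down to a fixed layout: the 4 KB page holds four 1024-byte bitmaps, with reads of low MSRs at offset 0, reads of the 0xc0000000-based range at 1024, and the corresponding write bitmaps at 2048 and 3072, one bit per MSR within each. A small self-contained sketch of just that offset arithmetic follows; the function and variable names are illustrative and not from the file.

#include <stdio.h>

/* Compute the byte offset into the 4 KB MSR bitmap page and the bit position
 * for a given MSR and access direction: read-low at 0, read-high at 1024,
 * write-low at 2048, write-high at 3072. Returns -1 for MSRs beyond the
 * 1024*8 covered by each bitmap. */
static int msr_bitmap_slot(unsigned int msr, int is_write,
			   unsigned int *byte, unsigned int *bit)
{
	unsigned int base = is_write ? 2048 : 0;

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;
		base += 1024;
	}
	if (msr >= 1024 * 8)
		return -1;

	*byte = base + msr / 8;
	*bit = msr & 7;
	return 0;
}

int main(void)
{
	unsigned int byte, bit;

	/* MSR 0xc0000080 (EFER) on a write access, as an example */
	if (!msr_bitmap_slot(0xc0000080, 1, &byte, &bit))
		printf("byte %u, bit %u\n", byte, bit);	/* byte 3088, bit 0 */
	return 0;
}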
- */ -static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) -{ - u32 exit_reason = vmcs_read32(VM_EXIT_REASON); - u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (vmx->nested.nested_run_pending) - return 0; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return 1; - } - - switch (exit_reason) { - case EXIT_REASON_EXCEPTION_NMI: - if (!is_exception(intr_info)) - return 0; - else if (is_page_fault(intr_info)) - return enable_ept; - return vmcs12->exception_bitmap & - (1u << (intr_info & INTR_INFO_VECTOR_MASK)); - case EXIT_REASON_EXTERNAL_INTERRUPT: - return 0; - case EXIT_REASON_TRIPLE_FAULT: - return 1; - case EXIT_REASON_PENDING_INTERRUPT: - case EXIT_REASON_NMI_WINDOW: - /* - * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit - * (aka Interrupt Window Exiting) only when L1 turned it on, - * so if we got a PENDING_INTERRUPT exit, this must be for L1. - * Same for NMI Window Exiting. - */ - return 1; - case EXIT_REASON_TASK_SWITCH: - return 1; - case EXIT_REASON_CPUID: - return 1; - case EXIT_REASON_HLT: - return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); - case EXIT_REASON_INVD: - return 1; - case EXIT_REASON_INVLPG: - return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); - case EXIT_REASON_RDPMC: - return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); - case EXIT_REASON_RDTSC: - return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); - case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: - case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: - case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - /* - * VMX instructions trap unconditionally. This allows L1 to - * emulate them for its L2 guest, i.e., allows 3-level nesting! - */ - return 1; - case EXIT_REASON_CR_ACCESS: - return nested_vmx_exit_handled_cr(vcpu, vmcs12); - case EXIT_REASON_DR_ACCESS: - return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); - case EXIT_REASON_IO_INSTRUCTION: - /* TODO: support IO bitmaps */ - return 1; - case EXIT_REASON_MSR_READ: - case EXIT_REASON_MSR_WRITE: - return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); - case EXIT_REASON_INVALID_STATE: - return 1; - case EXIT_REASON_MWAIT_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); - case EXIT_REASON_MONITOR_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); - case EXIT_REASON_PAUSE_INSTRUCTION: - return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_PAUSE_LOOP_EXITING); - case EXIT_REASON_MCE_DURING_VMENTRY: - return 0; - case EXIT_REASON_TPR_BELOW_THRESHOLD: - return 1; - case EXIT_REASON_APIC_ACCESS: - return nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); - case EXIT_REASON_EPT_VIOLATION: - case EXIT_REASON_EPT_MISCONFIG: - return 0; - case EXIT_REASON_WBINVD: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); - case EXIT_REASON_XSETBV: - return 1; - default: - return 1; - } -} - -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) -{ - *info1 = vmcs_readl(EXIT_QUALIFICATION); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); -} - -/* - * The guest has exited. See if we can fix it or if we need userspace - * assistance. 
- */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exit_reason = vmx->exit_reason; - u32 vectoring_info = vmx->idt_vectoring_info; - - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required && emulate_invalid_guest_state) - return handle_invalid_guest_state(vcpu); - - /* - * the KVM_REQ_EVENT optimization bit is only on for one entry, and if - * we did not inject a still-pending event to L1 now because of - * nested_run_pending, we need to re-enable this bit. - */ - if (vmx->nested.nested_run_pending) - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || - exit_reason == EXIT_REASON_VMRESUME)) - vmx->nested.nested_run_pending = 1; - else - vmx->nested.nested_run_pending = 0; - - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu); - return 1; - } - - if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = exit_reason; - return 0; - } - - if (unlikely(vmx->fail)) { - vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; - vcpu->run->fail_entry.hardware_entry_failure_reason - = vmcs_read32(VM_INSTRUCTION_ERROR); - return 0; - } - - if ((vectoring_info & VECTORING_INFO_VALID_MASK) && - (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); - - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu), vcpu)))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. 
- */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - - if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; - } - return 0; -} - -static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) -{ - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); -} - -static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info; - - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) - return; - - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; - - /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) - kvm_machine_check(); - - /* We need to handle NMIs before interrupts are enabled */ - if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - kvm_before_handle_nmi(&vmx->vcpu); - asm("int $2"); - kvm_after_handle_nmi(&vmx->vcpu); - } -} - -static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) -{ - u32 exit_intr_info; - bool unblock_nmi; - u8 vector; - bool idtv_info_valid; - - idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); -} - -static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, - u32 idt_vectoring_info, - int instr_len_field, - int error_code_field) -{ - u8 vector; - int type; - bool idtv_info_valid; - - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; - - vmx->vcpu.arch.nmi_injected = false; - kvm_clear_exception_queue(&vmx->vcpu); - kvm_clear_interrupt_queue(&vmx->vcpu); - - if (!idtv_info_valid) - return; - - kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); - - vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; - type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; - - switch (type) { - case INTR_TYPE_NMI_INTR: - vmx->vcpu.arch.nmi_injected = true; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Clear bit "block by NMI" before VM entry if a NMI - * delivery faulted. 
- */ - vmx_set_nmi_mask(&vmx->vcpu, false); - break; - case INTR_TYPE_SOFT_EXCEPTION: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_queue_exception_e(&vmx->vcpu, vector, err); - } else - kvm_queue_exception(&vmx->vcpu, vector); - break; - case INTR_TYPE_SOFT_INTR: - vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(instr_len_field); - /* fall through */ - case INTR_TYPE_EXT_INTR: - kvm_queue_interrupt(&vmx->vcpu, vector, - type == INTR_TYPE_SOFT_INTR); - break; - default: - break; - } -} - -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) -{ - if (is_guest_mode(&vmx->vcpu)) - return; - __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_ERROR_CODE); -} - -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) - return; - __vmx_complete_interrupts(to_vmx(vcpu), - vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); -} - -static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) -{ - int i, nr_msrs; - struct perf_guest_switch_msr *msrs; - - msrs = perf_guest_get_msrs(&nr_msrs); - - if (!msrs) - return; - - for (i = 0; i < nr_msrs; i++) - if (msrs[i].host == msrs[i].guest) - clear_atomic_switch_msr(vmx, msrs[i].msr); - else - add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host); -} - -#ifdef CONFIG_X86_64 -#define R "r" -#define Q "q" -#else -#define R "e" -#define Q "l" -#endif - -static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_VALID_MASK) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->idt_vectoring_info_field); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_exit_instruction_len); - if (vmcs12->idt_vectoring_info_field & - VECTORING_INFO_DELIVER_CODE_MASK) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->idt_vectoring_error_code); - } - } - - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); - - /* Don't enter VMX if guest state is invalid, let the exit handler - start emulation until we arrive back to a valid state */ - if (vmx->emulation_required && emulate_invalid_guest_state) - return; - - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) - vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - /* When single-stepping over STI and MOV SS, we must clear the - * corresponding interruptibility bits in the guest state. Otherwise - * vmentry fails as it then expects bit 14 (BS) in pending debug - * exceptions being set, but that's not correct for the guest debugging - * case. 
*/ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vmx_set_interrupt_shadow(vcpu, 0); - - atomic_switch_perf_msrs(vmx); - - vmx->__launched = vmx->loaded_vmcs->launched; - asm( - /* Store host registers */ - "push %%"R"dx; push %%"R"bp;" - "push %%"R"cx \n\t" /* placeholder for guest rcx */ - "push %%"R"cx \n\t" - "cmp %%"R"sp, %c[host_rsp](%0) \n\t" - "je 1f \n\t" - "mov %%"R"sp, %c[host_rsp](%0) \n\t" - __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" - "1: \n\t" - /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%cr2, %%"R"dx \n\t" - "cmp %%"R"ax, %%"R"dx \n\t" - "je 2f \n\t" - "mov %%"R"ax, %%cr2 \n\t" - "2: \n\t" - /* Check if vmlaunch of vmresume is needed */ - "cmpl $0, %c[launched](%0) \n\t" - /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%"R"ax \n\t" - "mov %c[rbx](%0), %%"R"bx \n\t" - "mov %c[rdx](%0), %%"R"dx \n\t" - "mov %c[rsi](%0), %%"R"si \n\t" - "mov %c[rdi](%0), %%"R"di \n\t" - "mov %c[rbp](%0), %%"R"bp \n\t" -#ifdef CONFIG_X86_64 - "mov %c[r8](%0), %%r8 \n\t" - "mov %c[r9](%0), %%r9 \n\t" - "mov %c[r10](%0), %%r10 \n\t" - "mov %c[r11](%0), %%r11 \n\t" - "mov %c[r12](%0), %%r12 \n\t" - "mov %c[r13](%0), %%r13 \n\t" - "mov %c[r14](%0), %%r14 \n\t" - "mov %c[r15](%0), %%r15 \n\t" -#endif - "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ - - /* Enter guest mode */ - "jne .Llaunched \n\t" - __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " - /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%"R"sp) \n\t" - "pop %0 \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" - "mov %%"R"bx, %c[rbx](%0) \n\t" - "pop"Q" %c[rcx](%0) \n\t" - "mov %%"R"dx, %c[rdx](%0) \n\t" - "mov %%"R"si, %c[rsi](%0) \n\t" - "mov %%"R"di, %c[rdi](%0) \n\t" - "mov %%"R"bp, %c[rbp](%0) \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%0) \n\t" - "mov %%r9, %c[r9](%0) \n\t" - "mov %%r10, %c[r10](%0) \n\t" - "mov %%r11, %c[r11](%0) \n\t" - "mov %%r12, %c[r12](%0) \n\t" - "mov %%r13, %c[r13](%0) \n\t" - "mov %%r14, %c[r14](%0) \n\t" - "mov %%r15, %c[r15](%0) \n\t" -#endif - "mov %%cr2, %%"R"ax \n\t" - "mov %%"R"ax, %c[cr2](%0) \n\t" - - "pop %%"R"bp; pop %%"R"dx \n\t" - "setbe %c[fail](%0) \n\t" - : : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), -#ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), -#endif - 
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), - [wordsize]"i"(sizeof(ulong)) - : "cc", "memory" - , R"ax", R"bx", R"di", R"si" -#ifdef CONFIG_X86_64 - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#endif - ); - - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_CPL) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); - vcpu->arch.regs_dirty = 0; - - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - if (is_guest_mode(vcpu)) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info; - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - vmcs12->vm_exit_instruction_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - } - } - - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); - - vmx_complete_atomic_exit(vmx); - vmx_recover_nmi_blocking(vmx); - vmx_complete_interrupts(vmx); -} - -#undef R -#undef Q - -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - free_vpid(vmx); - free_nested(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); - kfree(vmx->guest_msrs); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vmx); -} - -static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) -{ - int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - int cpu; - - if (!vmx) - return ERR_PTR(-ENOMEM); - - allocate_vpid(vmx); - - err = kvm_vcpu_init(&vmx->vcpu, kvm, id); - if (err) - goto free_vcpu; - - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - err = -ENOMEM; - if (!vmx->guest_msrs) { - goto uninit_vcpu; - } - - vmx->loaded_vmcs = &vmx->vmcs01; - vmx->loaded_vmcs->vmcs = alloc_vmcs(); - if (!vmx->loaded_vmcs->vmcs) - goto free_msrs; - if (!vmm_exclusive) - kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); - loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); - - cpu = get_cpu(); - vmx_vcpu_load(&vmx->vcpu, cpu); - vmx->vcpu.cpu = cpu; - err = vmx_vcpu_setup(vmx); - vmx_vcpu_put(&vmx->vcpu); - put_cpu(); - if (err) - goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) - err = alloc_apic_access_page(kvm); - if (err) - goto free_vmcs; - - if (enable_ept) { - if (!kvm->arch.ept_identity_map_addr) - kvm->arch.ept_identity_map_addr = - VMX_EPT_IDENTITY_PAGETABLE_ADDR; - err = -ENOMEM; - if (alloc_identity_pagetable(kvm) != 0) - goto free_vmcs; - if (!init_rmode_identity_map(kvm)) - goto free_vmcs; - } - - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - - return &vmx->vcpu; - -free_vmcs: - free_vmcs(vmx->loaded_vmcs->vmcs); -free_msrs: - kfree(vmx->guest_msrs); -uninit_vcpu: - kvm_vcpu_uninit(&vmx->vcpu); -free_vcpu: - free_vpid(vmx); - kmem_cache_free(kvm_vcpu_cache, vmx); - return ERR_PTR(err); -} - -static void __init vmx_check_processor_compat(void *rtn) -{ - struct vmcs_config vmcs_conf; - - *(int *)rtn = 0; - if (setup_vmcs_config(&vmcs_conf) < 0) - *(int *)rtn = -EIO; - if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { - printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", - smp_processor_id()); - *(int *)rtn = -EIO; - } -} - -static int get_ept_level(void) -{ - return VMX_EPT_DEFAULT_GAW + 1; -} - -static 
u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - u64 ret; - - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR - */ - if (is_mmio) - ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) - ret = kvm_get_guest_memory_type(vcpu, gfn) << - VMX_EPT_MT_EPTE_SHIFT; - else - ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IPAT_BIT; - - return ret; -} - -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; - - vmx->rdtscp_enabled = false; - if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } - } - } -} - -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - if (func == 1 && nested) - entry->ecx |= bit(X86_FEATURE_VMX); -} - -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. 
- */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; - - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); - vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state); - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_DR7, vmcs12->guest_dr7); - vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - vmcs_write64(VMCS_LINK_POINTER, -1ull); - - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - (vmcs_config.pin_based_exec_ctrl | - vmcs12->pin_based_vm_exec_control)); - - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. 
However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - * - * A problem with this approach (when !enable_ept) is that L1 may be - * injected with more page faults than it asked for. This could have - * caused problems, but in practice existing hypervisors don't care. - * To fix this, we will need to emulate the PFEC checking (on the L1 - * page tables), using walk_addr(), when injecting PFs to L1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - - if (cpu_has_secondary_exec_ctrls()) { - u32 exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; - /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - if (nested_cpu_has(vmcs12, - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) - exec_control |= vmcs12->secondary_vm_exec_control; - - if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ - if (!vmx->nested.apic_access_page) - exec_control &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - else - vmcs_write64(APIC_ACCESS_ADDR, - page_to_phys(vmx->nested.apic_access_page)); - } - - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - } - - - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. - */ - vmx_set_constant_host_state(); - - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. - */ - vmx->host_rsp = 0; - - exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - exec_control &= ~CPU_BASED_TPR_SHADOW; - exec_control |= vmcs12->cpu_based_vm_exec_control; - /* - * Merging of IO and MSR bitmaps not currently supported. - * Rather, exit every time. 
- */ - exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; - exec_control |= CPU_BASED_UNCOND_IO_EXITING; - - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); - - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the - * bitwise-or of what L1 wants to trap for L2, and what we want to - * trap. Note that CR0.TS also needs updating - we do this later. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - - /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */ - vmcs_write32(VM_EXIT_CONTROLS, - vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl); - vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls | - (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); - else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - - - set_cr4_guest_host_mask(vmx); - - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) - vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); - else - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); - - if (enable_vpid) { - /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. - */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); - } - - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - - /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified - * TS bit (for lazy fpu) and bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we - * have more bits than L1 expected. - */ - vmx_set_cr0(vcpu, vmcs12->guest_cr0); - vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); - - vmx_set_cr4(vcpu, vmcs12->guest_cr4); - vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); - kvm_mmu_reset_context(vcpu); - - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); -} - -/* - * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 - * for running an L2 nested guest. 
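The CR0_READ_SHADOW/CR0_GUEST_HOST_MASK writes above rely on the architectural read-shadow rule: when the guest reads CR0, bits covered by the guest/host mask come back from the read shadow, while unmasked bits come from the real GUEST_CR0. A small standalone illustration of that rule (the values are made up):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* What a MOV-from-CR0 returns to the guest under VMX: bits covered by the
 * guest/host mask read from the shadow, the rest from the real GUEST_CR0. */
static uint64_t observed_cr0(uint64_t guest_cr0, uint64_t mask, uint64_t shadow)
{
	return (guest_cr0 & ~mask) | (shadow & mask);
}

int main(void)
{
	/* Example: the host owns CR0.TS (bit 3); shadow says TS=0, real value has TS=1. */
	uint64_t mask = 1u << 3, shadow = 0x80000031, real = 0x80000039;

	printf("guest sees CR0 = %#" PRIx64 "\n", observed_cr0(real, mask, shadow));
	return 0;
}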
- */ -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct loaded_vmcs *vmcs02; - - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) - return 1; - - skip_emulated_instruction(vcpu); - vmcs12 = get_vmcs12(vcpu); - - /* - * The nested entry process starts with enforcing various prerequisites - * on vmcs12 as required by the Intel SDM, and act appropriately when - * they fail: As the SDM explains, some conditions should cause the - * instruction to fail, while others will cause the instruction to seem - * to succeed, but return an EXIT_REASON_INVALID_STATE. - * To speed up the normal (success) code path, we should avoid checking - * for misconfigurations which will anyway be caught by the processor - * when using the merged vmcs02. - */ - if (vmcs12->launch_state == launch) { - nested_vmx_failValid(vcpu, - launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS - : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; - } - - if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { - /*TODO: Also verify bits beyond physical address width are 0*/ - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { - /*TODO: Also verify bits beyond physical address width are 0*/ - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - if (vmcs12->vm_entry_msr_load_count > 0 || - vmcs12->vm_exit_msr_load_count > 0 || - vmcs12->vm_exit_msr_store_count > 0) { - pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", - __func__); - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || - !vmx_control_verify(vmcs12->pin_based_vm_exec_control, - nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || - !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || - !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) - { - nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { - nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; - } - - if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; - } - if (vmcs12->vmcs_link_pointer != -1ull) { - nested_vmx_entry_failure(vcpu, vmcs12, - EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); - return 1; - } - - /* - * We're finally done with prerequisite checking, and can start with - * the nested entry. 
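The vmx_control_verify() calls above apply the usual VMX capability-MSR convention: the "low" word lists bits that must be 1 in the control field and the "high" word lists bits that are allowed to be 1. A hedged standalone restatement of that check, not the kernel helper itself:

#include <stdint.h>
#include <stdio.h>

/* A control value is acceptable when every bit required by 'low' is set and
 * no bit outside 'high' is set. */
static int control_ok(uint32_t control, uint32_t low, uint32_t high)
{
	return (control & low) == low && (control & ~high) == 0;
}

int main(void)
{
	/* Hypothetical capability: bit 0 mandatory, bits 0-3 permitted. */
	printf("%d\n", control_ok(0x3, 0x1, 0xf));  /* 1: valid */
	printf("%d\n", control_ok(0x2, 0x1, 0xf));  /* 0: required bit missing */
	printf("%d\n", control_ok(0x11, 0x1, 0xf)); /* 0: unsupported bit set */
	return 0;
}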
- */ - - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - - enter_guest_mode(vcpu); - - vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); - - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - - vmcs12->launch_state = 1; - - prepare_vmcs02(vcpu, vmcs12); - - /* - * Note no nested_vmx_succeed or nested_vmx_fail here. At this point - * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet - * returned as far as L1 is concerned. It will only return (and set - * the success flag) when L2 exits (see nested_vmx_vmexit()). - */ - return 1; -} - -/* - * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date - * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK). - * This function returns the new value we should put in vmcs12.guest_cr0. - * It's not enough to just return the vmcs02 GUEST_CR0. Rather, - * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now - * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 - * didn't trap the bit, because if L1 did, so would L0). - * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have - * been modified by L2, and L1 knows it. So just leave the old value of - * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 - * isn't relevant, because if L0 traps this bit it can set it to anything. - * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have - * changed these bits, and therefore they need to be updated, but L0 - * didn't necessarily allow them to be changed in GUEST_CR0 - and rather - * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. - */ -static inline unsigned long -vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | - /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | - vcpu->arch.cr0_guest_owned_bits)); -} - -static inline unsigned long -vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - return - /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | - /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | - /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | - vcpu->arch.cr4_guest_owned_bits)); -} - -/* - * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits - * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), - * and this function updates it to reflect the changes to the guest state while - * L2 was running (and perhaps made some exits which were handled directly by L0 - * without going back to L1), and to reflect the exit reason. - * Note that we do not have to copy here all VMCS fields, just those that - * could have changed by the L2 guest or the exit - i.e., the guest-state and - * exit-information fields only. Other fields are modified by L1 with VMWRITE, - * which already writes to vmcs12 directly. 
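The three-way merge in vmcs12_guest_cr0() above can be read as plain bit arithmetic: bits the guest owns outright come from the hardware GUEST_CR0, bits L1 traps keep L1's last value, and bits only L0 traps come from the hardware read shadow. A standalone sketch with a worked example (all names and values are local to the sketch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t merge_guest_cr0(uint64_t hw_guest_cr0,   /* vmcs02 GUEST_CR0 */
				uint64_t hw_read_shadow, /* vmcs02 CR0_READ_SHADOW */
				uint64_t l1_guest_cr0,   /* vmcs12->guest_cr0 */
				uint64_t l1_mask,        /* vmcs12->cr0_guest_host_mask */
				uint64_t guest_owned)    /* bits neither L0 nor L1 trap */
{
	return (hw_guest_cr0 & guest_owned) |               /* 1: written by L2 */
	       (l1_guest_cr0 & l1_mask) |                   /* 2: trapped by L1 */
	       (hw_read_shadow & ~(l1_mask | guest_owned)); /* 3: trapped by L0 only */
}

int main(void)
{
	/* Toy layout: bit 0 guest-owned, bit 1 trapped by L1, bit 2 trapped by L0. */
	uint64_t cr0 = merge_guest_cr0(/*hw_guest_cr0*/   0x1,
				       /*hw_read_shadow*/ 0x4,
				       /*l1_guest_cr0*/   0x2,
				       /*l1_mask*/        0x2,
				       /*guest_owned*/    0x1);

	printf("vmcs12.guest_cr0 = %#" PRIx64 "\n", cr0); /* 0x7 */
	return 0;
}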
- */ -void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - /* update guest state fields: */ - vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); - vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); - vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); - vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); - - vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); - vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); - vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); - vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); - vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); - vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); - vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); - vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); - vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); - vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); - vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); - vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); - vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); - vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); - vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); - vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); - vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); - vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); - vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); - vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); - vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); - vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); - vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); - vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); - vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); - vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); - vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); - vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); - vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); - vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); - vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); - vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); - vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); - vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); - vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); - vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); - - vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE); - vmcs12->guest_interruptibility_info = - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - vmcs12->guest_pending_dbg_exceptions = - vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - - /* TODO: These cannot have changed unless we have MSR bitmaps and - * the relevant bit asks not to trap the change */ - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT) - vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); - vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); - vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); - vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - - /* update exit information fields: */ - - vmcs12->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); - vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - 
vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); - vmcs12->idt_vectoring_info_field = - vmcs_read32(IDT_VECTORING_INFO_FIELD); - vmcs12->idt_vectoring_error_code = - vmcs_read32(IDT_VECTORING_ERROR_CODE); - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - - /* clear vm-entry fields which are to be cleared on exit */ - if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; -} - -/* - * A part of what we need to when the nested L2 guest exits and we want to - * run its L1 parent, is to reset L1's guest state to the host state specified - * in vmcs12. - * This function is to be called not only on normal nested exit, but also on - * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry - * Failures During or After Loading Guest State"). - * This function should be called when the active VMCS is L1's (vmcs01). - */ -void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->host_ia32_efer; - if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - vmx_set_efer(vcpu, vcpu->arch.efer); - - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); - kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); - /* - * Note that calling vmx_set_cr0 is important, even if cr0 hasn't - * actually changed, because it depends on the current state of - * fpu_active (which may have changed). - * Note that vmx_set_cr0 refers to efer set above. - */ - kvm_set_cr0(vcpu, vmcs12->host_cr0); - /* - * If we did fpu_activate()/fpu_deactivate() during L2's run, we need - * to apply the same changes to L1's vmcs. We just set cr0 correctly, - * but we also need to update cr0_guest_host_mask and exception_bitmap. - */ - update_exception_bitmap(vcpu); - vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); - vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - - /* - * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 - * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); - */ - vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); - - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); - - if (enable_vpid) { - /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1. 
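On the host-state load above, EFER.LMA/LME are not taken from vmcs12 verbatim; they are forced to agree with the "host address-space size" exit control. A minimal sketch of that rule (the bit positions are the architectural ones, the helper and DEMO_ names are local to the sketch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_EFER_LME (1ull << 8)
#define DEMO_EFER_LMA (1ull << 10)

static uint64_t efer_after_exit(uint64_t vmcs12_host_efer, uint64_t current_efer,
				int load_efer_ctl, int host_64bit_ctl)
{
	/* VM_EXIT_LOAD_IA32_EFER decides which base value to start from. */
	uint64_t efer = load_efer_ctl ? vmcs12_host_efer : current_efer;

	/* VM_EXIT_HOST_ADDR_SPACE_SIZE then dictates the long-mode bits. */
	if (host_64bit_ctl)
		efer |= DEMO_EFER_LMA | DEMO_EFER_LME;
	else
		efer &= ~(DEMO_EFER_LMA | DEMO_EFER_LME);
	return efer;
}

int main(void)
{
	printf("%#" PRIx64 "\n", efer_after_exit(0x500, 0xd01, 1, 1));
	return 0;
}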
- */ - vmx_flush_tlb(vcpu); - } - - - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector); - - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) - vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); -} - -/* - * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 - * and modify vmcs12 to make it see what it would expect to see there if - * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) - */ -static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12); - - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - - /* if no vmcs02 cache requested, remove the one we used */ - if (VMCS02_POOL_SIZE == 0) - nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - - load_vmcs12_host_state(vcpu, vmcs12); - - /* Update TSC_OFFSET if TSC was changed while L2 ran */ - vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); - - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - - /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = 0; - } - - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. - */ - if (unlikely(vmx->fail)) { - vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); - } else - nested_vmx_succeed(vcpu); -} - -/* - * L1's failure to enter L2 is a subset of a normal exit, as explained in - * 23.7 "VM-entry failures during or after loading guest state" (this also - * lists the acceptable exit-reason and exit-qualification parameters). - * It should only be called before L2 actually succeeded to run, and when - * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 
- */ -static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification) -{ - load_vmcs12_host_state(vcpu, vmcs12); - vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = qualification; - nested_vmx_succeed(vcpu); -} - -static int vmx_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage) -{ - return X86EMUL_CONTINUE; -} - -static struct kvm_x86_ops vmx_x86_ops = { - .cpu_has_kvm_support = cpu_has_kvm_support, - .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = hardware_setup, - .hardware_unsetup = hardware_unsetup, - .check_processor_compatibility = vmx_check_processor_compat, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, - - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_guest_switch = vmx_save_host_state, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .set_guest_debug = set_guest_debug, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, - .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .fpu_activate = vmx_fpu_activate, - .fpu_deactivate = vmx_fpu_deactivate, - - .tlb_flush = vmx_flush_tlb, - - .run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, - - .set_tss_addr = vmx_set_tss_addr, - .get_tdp_level = get_ept_level, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .get_lpage_level = vmx_get_lpage_level, - - .cpuid_update = vmx_cpuid_update, - - .rdtscp_supported = vmx_rdtscp_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .set_tsc_khz = vmx_set_tsc_khz, - .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, - .compute_tsc_offset = vmx_compute_tsc_offset, - .read_l1_tsc = vmx_read_l1_tsc, - - .set_tdp_cr3 = vmx_set_cr3, - - .check_intercept = vmx_check_intercept, -}; - -static int __init vmx_init(void) -{ - int r, i; - - rdmsrl_safe(MSR_EFER, &host_efer); - - for (i = 0; i < NR_VMX_MSR; ++i) - kvm_define_shared_msr(i, vmx_msr_index[i]); - - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if 
(!vmx_io_bitmap_a) - return -ENOMEM; - - vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; - goto out; - } - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) { - r = -ENOMEM; - goto out1; - } - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) { - r = -ENOMEM; - goto out2; - } - - /* - * Allow direct access to the PC debug port (it is often used for I/O - * delays, but the vmexits simply slow things down). - */ - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - clear_bit(0x80, vmx_io_bitmap_a); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - - set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); - if (r) - goto out3; - - vmx_disable_intercept_for_msr(MSR_FS_BASE, false); - vmx_disable_intercept_for_msr(MSR_GS_BASE, false); - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - - if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); - - return 0; - -out3: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); -out: - free_page((unsigned long)vmx_io_bitmap_a); - return r; -} - -static void __exit vmx_exit(void) -{ - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - - kvm_exit(); -} - -module_init(vmx_init) -module_exit(vmx_exit) diff --git a/ANDROID_3.4.5/arch/x86/kvm/x86.c b/ANDROID_3.4.5/arch/x86/kvm/x86.c deleted file mode 100644 index 185a2b82..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/x86.c +++ /dev/null @@ -1,6607 +0,0 @@ -/* - * Kernel-based Virtual Machine driver for Linux - * - * derived from drivers/kvm/kvm_main.c - * - * Copyright (C) 2006 Qumranet, Inc. - * Copyright (C) 2008 Qumranet, Inc. - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Authors: - * Avi Kivity <avi@qumranet.com> - * Yaniv Kamay <yaniv@qumranet.com> - * Amit Shah <amit.shah@qumranet.com> - * Ben-Ami Yassour <benami@il.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. 
- * - */ - -#include <linux/kvm_host.h> -#include "irq.h" -#include "mmu.h" -#include "i8254.h" -#include "tss.h" -#include "kvm_cache_regs.h" -#include "x86.h" -#include "cpuid.h" - -#include <linux/clocksource.h> -#include <linux/interrupt.h> -#include <linux/kvm.h> -#include <linux/fs.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/mman.h> -#include <linux/highmem.h> -#include <linux/iommu.h> -#include <linux/intel-iommu.h> -#include <linux/cpufreq.h> -#include <linux/user-return-notifier.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <linux/perf_event.h> -#include <linux/uaccess.h> -#include <linux/hash.h> -#include <linux/pci.h> -#include <trace/events/kvm.h> - -#define CREATE_TRACE_POINTS -#include "trace.h" - -#include <asm/debugreg.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <asm/mtrr.h> -#include <asm/mce.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> /* Ugh! */ -#include <asm/xcr.h> -#include <asm/pvclock.h> -#include <asm/div64.h> - -#define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) - -#define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) - -/* EFER defaults: - * - enable syscall per default because its emulated by KVM - * - enable LME and LMA per default on 64 bit KVM - */ -#ifdef CONFIG_X86_64 -static -u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); -#else -static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); -#endif - -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU - -static void update_cr8_intercept(struct kvm_vcpu *vcpu); -static void process_nmi(struct kvm_vcpu *vcpu); - -struct kvm_x86_ops *kvm_x86_ops; -EXPORT_SYMBOL_GPL(kvm_x86_ops); - -static bool ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); - -bool kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); - -/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ -static u32 tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); - -#define KVM_NR_SHARED_MSRS 16 - -struct kvm_shared_msrs_global { - int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; -}; - -struct kvm_shared_msrs { - struct user_return_notifier urn; - bool registered; - struct kvm_shared_msr_values { - u64 host; - u64 curr; - } values[KVM_NR_SHARED_MSRS]; -}; - -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", 
VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, - { NULL } -}; - -u64 __read_mostly host_xcr0; - -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); - -static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) -{ - int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) - vcpu->arch.apf.gfns[i] = ~0; -} - -static void kvm_on_user_return(struct user_return_notifier *urn) -{ - unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; - - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; - if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); - values->curr = values->host; - } - } - locals->registered = false; - user_return_notifier_unregister(urn); -} - -static void shared_msr_update(unsigned slot, u32 msr) -{ - struct kvm_shared_msrs *smsr; - u64 value; - - smsr = &__get_cpu_var(shared_msrs); - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - -void kvm_define_shared_msr(unsigned slot, u32 msr) -{ - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; - shared_msrs_global.msrs[slot] = msr; - /* we need ensured the shared_msr_global have been updated */ - smp_wmb(); -} -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); - -static void kvm_shared_msr_cpu_online(void) -{ - unsigned i; - - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); -} - -void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) -{ - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - - if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; - smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; - } -} -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); - -static void drop_user_return_notifiers(void *ignore) -{ - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); - - if (smsr->registered) - kvm_on_user_return(&smsr->urn); -} - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) -{ - if (irqchip_in_kernel(vcpu->kvm)) - return vcpu->arch.apic_base; - else - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_get_apic_base); - -void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data) -{ - /* TODO: reserve bits check */ - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_base(vcpu, data); - else - vcpu->arch.apic_base = data; -} -EXPORT_SYMBOL_GPL(kvm_set_apic_base); - -#define EXCPT_BENIGN 0 -#define EXCPT_CONTRIBUTORY 1 -#define EXCPT_PF 2 - -static int 
exception_class(int vector) -{ - switch (vector) { - case PF_VECTOR: - return EXCPT_PF; - case DE_VECTOR: - case TS_VECTOR: - case NP_VECTOR: - case SS_VECTOR: - case GP_VECTOR: - return EXCPT_CONTRIBUTORY; - default: - break; - } - return EXCPT_BENIGN; -} - -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool reinject) -{ - u32 prev_nr; - int class1, class2; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - if (!vcpu->arch.exception.pending) { - queue: - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = has_error; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; - vcpu->arch.exception.reinject = reinject; - return; - } - - /* to check exception */ - prev_nr = vcpu->arch.exception.nr; - if (prev_nr == DF_VECTOR) { - /* triple fault -> shutdown */ - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return; - } - class1 = exception_class(prev_nr); - class2 = exception_class(nr); - if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) - || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { - /* generate double fault per SDM Table 5-5 */ - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - } else - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - goto queue; -} - -void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false); -} -EXPORT_SYMBOL_GPL(kvm_queue_exception); - -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); - -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) -{ - if (err) - kvm_inject_gp(vcpu, 0); - else - kvm_x86_ops->skip_emulated_instruction(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); - -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) -{ - ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); -} -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); - -void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) -{ - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu.inject_page_fault(vcpu, fault); -} - -void kvm_inject_nmi(struct kvm_vcpu *vcpu) -{ - atomic_inc(&vcpu->arch.nmi_queued); - kvm_make_request(KVM_REQ_NMI, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_inject_nmi); - -void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false); -} -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); - -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - -/* - * Checks if cpl <= required_cpl; if true, return true. Otherwise queue - * a #GP and return false. - */ -bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) -{ - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) - return true; - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return false; -} -EXPORT_SYMBOL_GPL(kvm_require_cpl); - -/* - * This function will be used to read from the physical memory of the currently - * running guest. 
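exception_class() and the double-fault escalation in kvm_multiple_exception() above mirror the SDM's benign/contributory/page-fault table. A self-contained restatement with two example queries (the vector numbers are the architectural ones; everything else is local to the sketch):

#include <stdio.h>

enum excpt_class { BENIGN, CONTRIBUTORY, PAGE_FAULT };

/* Classify a vector the way exception_class() does:
 * #DE=0, #TS=10, #NP=11, #SS=12, #GP=13 are contributory, #PF=14 is special. */
static enum excpt_class classify(int vector)
{
	switch (vector) {
	case 14:
		return PAGE_FAULT;
	case 0: case 10: case 11: case 12: case 13:
		return CONTRIBUTORY;
	default:
		return BENIGN;
	}
}

/* Does delivering 'second' while 'first' is still pending escalate to #DF? */
static int escalates_to_df(int first, int second)
{
	enum excpt_class c1 = classify(first), c2 = classify(second);

	return (c1 == CONTRIBUTORY && c2 == CONTRIBUTORY) ||
	       (c1 == PAGE_FAULT && c2 != BENIGN);
}

int main(void)
{
	printf("#PF then #GP -> #DF? %d\n", escalates_to_df(14, 13)); /* 1 */
	printf("#GP then #PF -> #DF? %d\n", escalates_to_df(13, 14)); /* 0: serial */
	return 0;
}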
The difference to kvm_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - -int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, - void *data, int offset, int len, u32 access) -{ - return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, - data, offset, len, access); -} - -/* - * Load the pae pdptrs. Return true is they are all valid. - */ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) -{ - gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; - int i; - int ret; - u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } - for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && - (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { - ret = 0; - goto out; - } - } - ret = 1; - - memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); -out: - - return ret; -} -EXPORT_SYMBOL_GPL(load_pdptrs); - -static bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - bool changed = true; - int offset; - gfn_t gfn; - int r; - - if (is_long_mode(vcpu) || !is_pae(vcpu)) - return false; - - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) - return true; - - gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); - r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), - PFERR_USER_MASK | PFERR_WRITE_MASK); - if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -out: - - return changed; -} - -int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; - - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) - return 1; -#endif - - cr0 &= ~CR0_RESERVED_BITS; - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return 1; - - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return 1; - - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { -#ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; - - if (!is_pae(vcpu)) - return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; - } else -#endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) - return 1; - } - - kvm_x86_ops->set_cr0(vcpu, cr0); - - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } - - if ((cr0 ^ old_cr0) & update_bits) - kvm_mmu_reset_context(vcpu); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_cr0); - -void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned 
long msw) -{ - (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); -} -EXPORT_SYMBOL_GPL(kvm_lmsw); - -int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) -{ - u64 xcr0; - - /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ - if (index != XCR_XFEATURE_ENABLED_MASK) - return 1; - xcr0 = xcr; - if (kvm_x86_ops->get_cpl(vcpu) != 0) - return 1; - if (!(xcr0 & XSTATE_FP)) - return 1; - if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) - return 1; - if (xcr0 & ~host_xcr0) - return 1; - vcpu->arch.xcr0 = xcr0; - vcpu->guest_xcr0_loaded = 0; - return 0; -} - -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) -{ - if (__kvm_set_xcr(vcpu, index, xcr)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_xcr); - -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; - if (cr4 & CR4_RESERVED_BITS) - return 1; - - if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) - return 1; - - if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) - return 1; - - if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) - return 1; - - if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) - return 1; - } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) - return 1; - - if (kvm_x86_ops->set_cr4(vcpu, cr4)) - return 1; - - if ((cr4 ^ old_cr4) & pdptr_bits) - kvm_mmu_reset_context(vcpu); - - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) - kvm_update_cpuid(vcpu); - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_cr4); - -int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); - return 0; - } - - if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; - } else { - if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) - return 1; - if (is_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) - return 1; - } - /* - * We don't check reserved bits in nonpae mode, because - * this isn't enforced, and VMware depends on this. - */ - } - - /* - * Does the new cr3 value map to physical memory? (Note, we - * catch an invalid cr3 even in real-mode, because it would - * cause trouble later on when we turn on paging anyway.) - * - * A real CPU would silently accept an invalid cr3 and would - * attempt to use it - with largely undefined (and often hard - * to debug) behavior on the guest side. - */ - if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - return 1; - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - vcpu->arch.mmu.new_cr3(vcpu); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_cr3); - -int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - if (cr8 & CR8_RESERVED_BITS) - return 1; - if (irqchip_in_kernel(vcpu->kvm)) - kvm_lapic_set_tpr(vcpu, cr8); - else - vcpu->arch.cr8 = cr8; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_cr8); - -unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) -{ - if (irqchip_in_kernel(vcpu->kvm)) - return kvm_lapic_get_cr8(vcpu); - else - return vcpu->arch.cr8; -} -EXPORT_SYMBOL_GPL(kvm_get_cr8); - -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - switch (dr) { - case 0 ... 
3: - vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = val; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ - /* fall through */ - case 6: - if (val & 0xffffffff00000000ULL) - return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; /* #UD */ - /* fall through */ - default: /* 7 */ - if (val & 0xffffffff00000000ULL) - return -1; /* #GP */ - vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); - } - break; - } - - return 0; -} - -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - int res; - - res = __kvm_set_dr(vcpu, dr, val); - if (res > 0) - kvm_queue_exception(vcpu, UD_VECTOR); - else if (res < 0) - kvm_inject_gp(vcpu, 0); - - return res; -} -EXPORT_SYMBOL_GPL(kvm_set_dr); - -static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - switch (dr) { - case 0 ... 3: - *val = vcpu->arch.db[dr]; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; - /* fall through */ - case 6: - *val = vcpu->arch.dr6; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return 1; - /* fall through */ - default: /* 7 */ - *val = vcpu->arch.dr7; - break; - } - - return 0; -} - -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) -{ - if (_kvm_get_dr(vcpu, dr, val)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_dr); - -bool kvm_rdpmc(struct kvm_vcpu *vcpu) -{ - u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); - u64 data; - int err; - - err = kvm_pmu_read_pmc(vcpu, ecx, &data); - if (err) - return err; - kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); - kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); - return err; -} -EXPORT_SYMBOL_GPL(kvm_rdpmc); - -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * This list is modified at module load time to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. 
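The debug-register paths above encode three rules: DR4/DR5 alias DR6/DR7 unless CR4.DE turns them into #UD, writes with any of the upper 32 bits set take #GP, and the stored value is masked into its writable ("volatile") bits plus the architecturally fixed-to-1 bits. A standalone sketch of that validation; the DEMO_ mask values are illustrative, not the exact kernel constants:

#include <stdint.h>
#include <stdio.h>

#define DEMO_DR6_FIXED_1  0xffff0ff0u
#define DEMO_DR6_VOLATILE 0x0000e00fu
#define DEMO_DR7_FIXED_1  0x00000400u
#define DEMO_DR7_VOLATILE 0xffff23ffu

/* Returns 0 on success, 1 for #UD, -1 for #GP, following the convention of
 * __kvm_set_dr() above. */
static int demo_set_dr(int dr, uint64_t val, int cr4_de, uint64_t db[4],
		       uint64_t *dr6, uint64_t *dr7)
{
	switch (dr) {
	case 0: case 1: case 2: case 3:
		db[dr] = val;
		return 0;
	case 4:                         /* alias of DR6 unless CR4.DE */
		if (cr4_de)
			return 1;
		/* fall through */
	case 6:
		if (val >> 32)
			return -1;      /* upper half is reserved */
		*dr6 = (val & DEMO_DR6_VOLATILE) | DEMO_DR6_FIXED_1;
		return 0;
	case 5:                         /* alias of DR7 unless CR4.DE */
		if (cr4_de)
			return 1;
		/* fall through */
	default:
		if (val >> 32)
			return -1;
		*dr7 = (val & DEMO_DR7_VOLATILE) | DEMO_DR7_FIXED_1;
		return 0;
	}
}

int main(void)
{
	uint64_t db[4] = {0}, dr6 = 0, dr7 = 0;

	printf("DR4 write, CR4.DE=1: %d (#UD)\n", demo_set_dr(4, 0, 1, db, &dr6, &dr7));
	printf("DR7 write of 1<<35:  %d (#GP)\n", demo_set_dr(7, 1ull << 35, 0, db, &dr6, &dr7));
	return 0;
}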
- */ - -#define KVM_SAVE_MSRS_BEGIN 9 -static u32 msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA -}; - -static unsigned num_msrs_to_save; - -static u32 emulated_msrs[] = { - MSR_IA32_TSCDEADLINE, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, -}; - -static int set_efer(struct kvm_vcpu *vcpu, u64 efer) -{ - u64 old_efer = vcpu->arch.efer; - - if (efer & efer_reserved_bits) - return 1; - - if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) - return 1; - - if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) - return 1; - } - - if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) - return 1; - } - - efer &= ~EFER_LMA; - efer |= vcpu->arch.efer & EFER_LMA; - - kvm_x86_ops->set_efer(vcpu, efer); - - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - - /* Update reserved bits */ - if ((efer ^ old_efer) & EFER_NX) - kvm_mmu_reset_context(vcpu); - - return 0; -} - -void kvm_enable_efer_bits(u64 mask) -{ - efer_reserved_bits &= ~mask; -} -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - - -/* - * Writes msr value into into the appropriate "register". - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) -{ - return kvm_x86_ops->set_msr(vcpu, msr_index, data); -} - -/* - * Adapt set_msr() to msr_io()'s calling convention - */ -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) -{ - return kvm_set_msr(vcpu, index, *data); -} - -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) -{ - int version; - int r; - struct pvclock_wall_clock wc; - struct timespec boot; - - if (!wall_clock) - return; - - r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); - if (r) - return; - - if (version & 1) - ++version; /* first time write, random junk */ - - ++version; - - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); - - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. 
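kvm_write_wall_clock() above uses the usual pvclock publication protocol: the version field is made odd while the payload is being rewritten and even again afterwards, so a guest reader retries on an odd or changed version. A minimal single-threaded sketch of writer and reader (a real implementation also needs memory barriers):

#include <stdint.h>
#include <stdio.h>

struct demo_wall_clock { uint32_t version; uint32_t sec; uint32_t nsec; };

static void publish(struct demo_wall_clock *shared, uint32_t sec, uint32_t nsec)
{
	if (shared->version & 1)
		++shared->version;   /* first write ever: the junk looked "odd" */
	++shared->version;           /* now odd: update in progress */
	shared->sec = sec;
	shared->nsec = nsec;
	++shared->version;           /* even again: update visible */
}

static int read_consistent(const struct demo_wall_clock *shared,
			   uint32_t *sec, uint32_t *nsec)
{
	uint32_t v = shared->version;

	if (v & 1)
		return 0;            /* writer is mid-update, retry */
	*sec = shared->sec;
	*nsec = shared->nsec;
	return shared->version == v; /* changed underneath us? retry */
}

int main(void)
{
	struct demo_wall_clock wc = {0};
	uint32_t s = 0, ns = 0;

	publish(&wc, 1234, 500);
	printf("ok=%d sec=%u nsec=%u version=%u\n",
	       read_consistent(&wc, &s, &ns), s, ns, wc.version);
	return 0;
}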
- */ - getboottime(&boot); - - wc.sec = boot.tv_sec; - wc.nsec = boot.tv_nsec; - wc.version = version; - - kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); - - version++; - kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); -} - -static uint32_t div_frac(uint32_t dividend, uint32_t divisor) -{ - uint32_t quotient, remainder; - - /* Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" */ - __asm__ ( "divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor) ); - return quotient; -} - -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, - s8 *pshift, u32 *pmultiplier) -{ - uint64_t scaled64; - int32_t shift = 0; - uint64_t tps64; - uint32_t tps32; - - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { - tps64 >>= 1; - shift--; - } - - tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { - if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) - scaled64 >>= 1; - else - tps32 <<= 1; - shift++; - } - - *pshift = shift; - *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); -} - -static inline u64 get_kernel_ns(void) -{ - struct timespec ts; - - WARN_ON(preemptible()); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); - return timespec_to_ns(&ts); -} - -static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); -unsigned long max_tsc_khz; - -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} - -static u32 adjust_tsc_khz(u32 khz, s32 ppm) -{ - u64 v = (u64)khz * (1000000 + ppm); - do_div(v, 1000000); - return v; -} - -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ - u32 thresh_lo, thresh_hi; - int use_scaling = 0; - - /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.virtual_tsc_shift, - &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = this_tsc_khz; - - /* - * Compute the variation in TSC rate which is acceptable - * within the range of tolerance and decide if the - * rate being applied is within that bounds of the hardware - * rate. If so, no scaling or compensation need be done. 
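div_frac() above computes (dividend << 32) / divisor with a 32-bit divl; the same value can be computed portably with 64-bit arithmetic, and adjust_tsc_khz() is a plain parts-per-million scaling. A standalone sketch that also evaluates the tolerance window this comment describes (the host frequency and guest rate are made-up numbers):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Portable equivalent of the divl-based div_frac():
 * (dividend << 32) / divisor, i.e. a 0.32 fixed-point fraction. */
static uint32_t div_frac_portable(uint32_t dividend, uint32_t divisor)
{
	return (uint32_t)(((uint64_t)dividend << 32) / divisor);
}

/* Same arithmetic as adjust_tsc_khz(): scale a kHz rate by +/- ppm. */
static uint32_t adjust_khz(uint32_t khz, int32_t ppm)
{
	return (uint32_t)((uint64_t)khz * (1000000 + ppm) / 1000000);
}

int main(void)
{
	uint32_t host_khz = 2400000, guest_khz = 2399500; /* hypothetical rates */
	int32_t ppm = 250;
	uint32_t lo = adjust_khz(host_khz, -ppm);
	uint32_t hi = adjust_khz(host_khz, ppm);

	printf("tolerance window: [%u, %u] kHz\n", lo, hi);
	printf("guest %u kHz needs scaling? %s\n", guest_khz,
	       (guest_khz < lo || guest_khz > hi) ? "yes" : "no");
	printf("1/3 as a 32-bit fraction: %#" PRIx32 "\n", div_frac_portable(1, 3));
	return 0;
}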
- */ - thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); - thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); - use_scaling = 1; - } - kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); -} - -static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) -{ - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, - vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.this_tsc_write; - return tsc; -} - -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - u64 offset, ns, elapsed; - unsigned long flags; - s64 usdiff; - - raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); - ns = get_kernel_ns(); - elapsed = ns - kvm->arch.last_tsc_nsec; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("idivl %2; xor %%edx, %%edx" - : "=A"(usdiff) - : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * - * For a reliable TSC, we can match TSC offsets, and for an unstable - * TSC, we add elapsed time in this computation. We could let the - * compensation code attempt to catch up if we fall behind, but - * it's better to try to match offsets from the beginning. - */ - if (usdiff < USEC_PER_SEC && - vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { - offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); - } else { - u64 delta = nsec_to_cycles(vcpu, elapsed); - data += delta; - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); - } - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computaion in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - pr_debug("kvm: new tsc generation %u, clock %llu\n", - kvm->arch.cur_tsc_generation, data); - } - - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. 
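The matching decision above boils down to: convert the difference between the new and the previous TSC write into microseconds of virtual time, subtract the real elapsed microseconds, and treat anything under one second as a synchronization attempt (the kernel additionally requires the virtual TSC frequency to match the previous write, which is omitted here). A standalone sketch with hypothetical values:

#include <stdint.h>
#include <stdio.h>

static int is_sync_attempt(uint64_t new_tsc, uint64_t last_tsc,
			   uint32_t virtual_tsc_khz, uint64_t elapsed_ns)
{
	int64_t usdiff = (int64_t)(new_tsc - last_tsc);

	usdiff = usdiff * 1000 / (int32_t)virtual_tsc_khz; /* cycles -> usec */
	usdiff -= (int64_t)(elapsed_ns / 1000);            /* minus real elapsed usec */
	if (usdiff < 0)
		usdiff = -usdiff;
	return usdiff < 1000000;                           /* within one second */
}

int main(void)
{
	/* Two writes ~2 ms apart against a 1 GHz (1,000,000 kHz) virtual TSC. */
	printf("sync attempt? %d\n",
	       is_sync_attempt(5002000000ull, 5000000000ull, 1000000, 2000000ull));
	return 0;
}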
- */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - /* Reset of TSC must disable overshoot protection below */ - vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -} - -EXPORT_SYMBOL_GPL(kvm_write_tsc); - -static int kvm_guest_time_update(struct kvm_vcpu *v) -{ - unsigned long flags; - struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; - unsigned long this_tsc_khz; - s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; - - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); - kernel_ns = get_kernel_ns(); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { - local_irq_restore(flags); - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; - } - - /* - * We may have to catch up the TSC to match elapsed wall clock - * time for two reasons, even if kvmclock is used. - * 1) CPU could have been running below the maximum TSC rate - * 2) Broken TSC compensation resets the base at each VCPU - * entry to avoid unknown leaps of TSC even when running - * again on the same CPU. This may cause apparent elapsed - * time to disappear, and the guest to stand still or run - * very slowly. - */ - if (vcpu->tsc_catchup) { - u64 tsc = compute_guest_tsc(v, kernel_ns); - if (tsc > tsc_timestamp) { - adjust_tsc_offset_guest(v, tsc - tsc_timestamp); - tsc_timestamp = tsc; - } - } - - local_irq_restore(flags); - - if (!vcpu->time_page) - return 0; - - /* - * Time as measured by the TSC may go backwards when resetting the base - * tsc_timestamp. The reason for this is that the TSC resolution is - * higher than the resolution of the other clock scales. Thus, many - * possible measurments of the TSC correspond to one measurement of any - * other clock, and so a spread of values is possible. This is not a - * problem for the computation of the nanosecond clock; with TSC rates - * around 1GHZ, there can only be a few cycles which correspond to one - * nanosecond value, and any path through this code will inevitably - * take longer than that. However, with the kernel_ns value itself, - * the precision may be much lower, down to HZ granularity. If the - * first sampling of TSC against kernel_ns ends in the low part of the - * range, and the second in the high end of the range, we can get: - * - * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new - * - * As the sampling errors potentially range in the thousands of cycles, - * it is possible such a time value has already been observed by the - * guest. To protect against this, we must compute the system time as - * observed by the guest and ensure the new system time is greater. 
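The guard described above amounts to: rescale the last TSC value the guest has already seen into nanoseconds, add the previously published kernel_ns, and never publish anything smaller. A standalone sketch of that clamp; scale_delta() is a hedged restatement of pvclock-style scaling (using a GCC/Clang 128-bit extension), not the kernel function:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* nanoseconds = (delta << shift) * mul >> 32, negative shift meaning right shift. */
static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

/* Never let the published system time go backwards relative to what the
 * guest could already have computed from its last TSC reading. */
static uint64_t clamp_kernel_ns(uint64_t kernel_ns, uint64_t last_guest_tsc,
				uint64_t last_tsc_timestamp, uint64_t last_kernel_ns,
				uint32_t mul, int8_t shift)
{
	uint64_t max_ns = last_kernel_ns +
		scale_delta(last_guest_tsc - last_tsc_timestamp, mul, shift);

	return kernel_ns > max_ns ? kernel_ns : max_ns;
}

int main(void)
{
	/* Hypothetical 2 GHz guest clock: 0.5 ns per cycle => mul = 2^31, shift 0. */
	uint32_t mul = 0x80000000u;
	uint64_t ns = clamp_kernel_ns(/*kernel_ns*/        1000400,
				      /*last_guest_tsc*/    2001000,
				      /*last_tsc_timestamp*/2000000,
				      /*last_kernel_ns*/    1000000, mul, 0);

	printf("published ns = %" PRIu64 "\n", ns); /* 1000500, not 1000400 */
	return 0;
}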
- */ - max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp) { - max_kernel_ns = vcpu->last_guest_tsc - - vcpu->hv_clock.tsc_timestamp; - max_kernel_ns = pvclock_scale_delta(max_kernel_ns, - vcpu->hv_clock.tsc_to_system_mul, - vcpu->hv_clock.tsc_shift); - max_kernel_ns += vcpu->last_kernel_ns; - } - - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; - } - - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - - /* With all the info we got, fill in the values */ - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; - vcpu->last_kernel_ns = kernel_ns; - vcpu->last_guest_tsc = tsc_timestamp; - vcpu->hv_clock.flags = 0; - - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. - */ - vcpu->hv_clock.version += 2; - - shared_kaddr = kmap_atomic(vcpu->time_page); - - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - kunmap_atomic(shared_kaddr); - - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); - return 0; -} - -static bool msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return true; - case 0x2f8: - return true; - } - return false; -} - -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - -static bool valid_mtrr_type(unsigned t) -{ - return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ -} - -static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - int i; - - if (!msr_mtrr_valid(msr)) - return false; - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return false; - return valid_mtrr_type(data & 0xff); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8 ; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } - - /* variable MTRRs */ - return valid_mtrr_type(data & 0xff); -} - -static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!mtrr_valid(vcpu, msr, data)) - return 1; - - if (msr == MSR_MTRRdefType) { - vcpu->arch.mtrr_state.def_type = data; - vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 
*)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pt = data; - } - - kvm_mmu_reset_context(vcpu); - return 0; -} - -static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_MCG_STATUS: - vcpu->arch.mcg_status = data; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - if (data != 0 && data != ~(u64)0) - return -1; - vcpu->arch.mcg_ctl = data; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { - u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - vcpu->arch.mce_banks[offset] = data; - break; - } - return 1; - } - return 0; -} - -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - int r; - - r = -E2BIG; - if (page_num >= blob_size) - goto out; - r = -ENOMEM; - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; - } - if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; -} - -static bool kvm_hv_hypercall_enabled(struct kvm *kvm) -{ - return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; -} - -static bool kvm_hv_msr_partition_wide(u32 msr) -{ - bool r = false; - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - case HV_X64_MSR_HYPERCALL: - r = true; - break; - } - - return r; -} - -static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - kvm->arch.hv_guest_os_id = data; - /* setting guest os id to zero disables hypercall page */ - if (!kvm->arch.hv_guest_os_id) - kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; - break; - case HV_X64_MSR_HYPERCALL: { - u64 gfn; - unsigned long addr; - u8 instructions[4]; - - /* if guest os id is not set hypercall should remain disabled */ - if (!kvm->arch.hv_guest_os_id) - break; - if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { - kvm->arch.hv_hypercall = data; - break; - } - gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); - ((unsigned char *)instructions)[3] = 0xc3; /* ret */ - if (__copy_to_user((void __user *)addr, instructions, 4)) - return 1; - kvm->arch.hv_hypercall = data; - break; - } - default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - return 0; -} - -static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - switch (msr) { - case HV_X64_MSR_APIC_ASSIST_PAGE: { - unsigned long addr; - - if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) { - vcpu->arch.hv_vapic = data; - 
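/*
 * The partition-wide path above implements the usual Hyper-V enlightenment
 * handshake: the guest publishes a non-zero HV_X64_MSR_GUEST_OS_ID and then
 * writes the hypercall page GPA with bit 0 set, at which point the host
 * patches a vendor-specific VMCALL/VMMCALL stub plus a ret into that page.
 * A guest-side sketch, assuming the architectural MSR numbers from the
 * Hyper-V specification:
 *
 *   #include <stdint.h>
 *
 *   #define HV_MSR_GUEST_OS_ID 0x40000000
 *   #define HV_MSR_HYPERCALL   0x40000001
 *
 *   static inline void wrmsr(uint32_t msr, uint64_t val)
 *   {
 *           asm volatile("wrmsr" : : "c"(msr), "a"((uint32_t)val),
 *                        "d"((uint32_t)(val >> 32)));
 *   }
 *
 *   static void hv_enable_hypercall_page(uint64_t page_gpa)
 *   {
 *           wrmsr(HV_MSR_GUEST_OS_ID, 1);          // any non-zero ID works
 *           wrmsr(HV_MSR_HYPERCALL, page_gpa | 1); // bit 0 = enable
 *   }
 */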
break; - } - addr = gfn_to_hva(vcpu->kvm, data >> - HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT); - if (kvm_is_error_hva(addr)) - return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) - return 1; - vcpu->arch.hv_vapic = data; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); - default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); - return 1; - } - - return 0; -} - -static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) -{ - gpa_t gpa = data & ~0x3f; - - /* Bits 2:5 are resrved, Should be zero */ - if (data & 0x3c) - return 1; - - vcpu->arch.apf.msr_val = data; - - if (!(data & KVM_ASYNC_PF_ENABLED)) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - return 0; - } - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) - return 1; - - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); - kvm_async_pf_wakeup_all(vcpu); - return 0; -} - -static void kvmclock_reset(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } -} - -static void accumulate_steal_time(struct kvm_vcpu *vcpu) -{ - u64 delta; - - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - vcpu->arch.st.accum_steal = delta; -} - -static void record_steal_time(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) - return; - - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) - return; - - vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; - vcpu->arch.st.steal.version += 2; - vcpu->arch.st.accum_steal = 0; - - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, - &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); -} - -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - bool pr = false; - - switch (msr) { - case MSR_EFER: - return set_efer(vcpu, data); - case MSR_K7_HWCR: - data &= ~(u64)0x40; /* ignore flush filter disable */ - data &= ~(u64)0x100; /* ignore ignne emulation enable */ - data &= ~(u64)0x8; /* ignore TLB cache disable */ - if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); - return 1; - } - break; - case MSR_FAM10H_MMIO_CONF_BASE: - if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); - return 1; - } - break; - case MSR_AMD64_NB_CFG: - break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* Values other than LBR and BTF are vendor-specific, - thus reserved and should throw a #GP */ - return 1; - } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - break; - case 0x200 ... 0x2ff: - return set_msr_mtrr(vcpu, msr, data); - case MSR_IA32_APICBASE: - kvm_set_apic_base(vcpu, data); - break; - case APIC_BASE_MSR ... 
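/*
 * Both record_steal_time() above and the kvmclock update path use the same
 * seqcount-style convention: the version field in the shared structure is
 * odd while the host is updating it and is bumped to an even value once the
 * update is complete, so a guest re-reads until it sees a stable, even
 * version.  A reader sketch (src points at the shared structure, dst is a
 * local copy, barrier() stands in for a compiler barrier):
 *
 *   u32 ver;
 *
 *   do {
 *           ver = src->version;
 *           barrier();
 *           dst = *src;            // copy out the fields of interest
 *           barrier();
 *   } while ((ver & 1) || ver != src->version);
 */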
APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: - kvm_set_lapic_tscdeadline_msr(vcpu, data); - break; - case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; - break; - case MSR_KVM_WALL_CLOCK_NEW: - case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); - break; - case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - kvmclock_reset(vcpu); - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - if (!(data & 1)) - break; - - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } - break; - } - case MSR_KVM_ASYNC_PF_EN: - if (kvm_pv_enable_async_pf(vcpu, data)) - return 1; - break; - case MSR_KVM_STEAL_TIME: - - if (unlikely(!sched_info_on())) - return 1; - - if (data & KVM_STEAL_RESERVED_MASK) - return 1; - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, - data & KVM_STEAL_VALID_BITS)) - return 1; - - vcpu->arch.st.msr_val = data; - - if (!(data & KVM_MSR_ENABLED)) - break; - - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - - break; - - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return set_msr_mce(vcpu, msr, data); - - /* Performance counters are not protected by a CPUID bit, - * so we should check all of them in the generic path for the sake of - * cross vendor migration. - * Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - /* at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - pr = true; - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); - - if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - case MSR_K7_CLK_CTL: - /* - * Ignore all writes to this no longer documented MSR. - * Writes are only relevant for old K7 processors, - * all pre-dating SVM, but a recommended workaround from - * AMD for these chips. It is possible to speicify the - * affected processor models on the command line, hence - * the need to ignore the workaround. - */ - break; - case HV_X64_MSR_GUEST_OS_ID ... 
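/*
 * For the MSR_KVM_SYSTEM_TIME / MSR_KVM_SYSTEM_TIME_NEW case above, the
 * value the guest writes is the guest-physical address of a pvclock
 * structure with bit 0 acting as the enable flag; the remaining low bits
 * become time_offset within the mapped page.  A guest-kernel sketch, where
 * 0x4b564d01 is the new-style kvmclock MSR number as assumed from the
 * kvmclock ABI and wrmsr() is as in the Hyper-V sketch earlier:
 *
 *   #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 *
 *   static struct pvclock_vcpu_time_info ti __attribute__((aligned(32)));
 *
 *   static void kvmclock_enable(void)
 *   {
 *           // __pa(&ti): guest-physical address of ti; bit 0 = enable
 *           wrmsr(MSR_KVM_SYSTEM_TIME_NEW, __pa(&ti) | 1);
 *   }
 */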
HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = set_msr_hyperv_pw(vcpu, msr, data); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return set_msr_hyperv(vcpu, msr, data); - break; - case MSR_IA32_BBL_CR_CTL3: - /* Drop writes to this legacy MSR -- see rdmsr - * counterpart for further detail. - */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); - break; - case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has_osvw(vcpu)) - return 1; - vcpu->arch.osvw.length = data; - break; - case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has_osvw(vcpu)) - return 1; - vcpu->arch.osvw.status = data; - break; - default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_set_msr(vcpu, msr, data); - if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); - return 1; - } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); - break; - } - } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_msr_common); - - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) -{ - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); -} - -static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return 1; - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pdata = *pt; - } - - return 0; -} - -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - - switch (msr) { - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - data = 0; - break; - case MSR_IA32_MCG_CAP: - data = vcpu->arch.mcg_cap; - break; - case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) - return 1; - data = vcpu->arch.mcg_ctl; - break; - case MSR_IA32_MCG_STATUS: - data = vcpu->arch.mcg_status; - break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MC0_CTL + 4 * bank_num) { - u32 offset = msr - MSR_IA32_MC0_CTL; - data = vcpu->arch.mce_banks[offset]; - break; - } - return 1; - } - *pdata = data; - return 0; -} - -static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data = 0; - struct kvm *kvm = vcpu->kvm; - - switch (msr) { - case HV_X64_MSR_GUEST_OS_ID: - data = kvm->arch.hv_guest_os_id; - break; - case HV_X64_MSR_HYPERCALL: - data = kvm->arch.hv_hypercall; - break; - default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - - *pdata = data; - return 0; -} - -static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, 
u64 *pdata) -{ - u64 data = 0; - - switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - kvm_for_each_vcpu(r, v, vcpu->kvm) - if (v == vcpu) - data = r; - break; - } - case HV_X64_MSR_EOI: - return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); - case HV_X64_MSR_ICR: - return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); - case HV_X64_MSR_TPR: - return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); - case HV_X64_MSR_APIC_ASSIST_PAGE: - data = vcpu->arch.hv_vapic; - break; - default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); - return 1; - } - *pdata = data; - return 0; -} - -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 data; - - switch (msr) { - case MSR_IA32_PLATFORM_ID: - case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: - case MSR_IA32_LASTBRANCHFROMIP: - case MSR_IA32_LASTBRANCHTOIP: - case MSR_IA32_LASTINTFROMIP: - case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: - case MSR_K7_HWCR: - case MSR_VM_HSAVE_PA: - case MSR_K7_EVNTSEL0: - case MSR_K7_PERFCTR0: - case MSR_K8_INT_PENDING_MSG: - case MSR_AMD64_NB_CFG: - case MSR_FAM10H_MMIO_CONF_BASE: - data = 0; - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - data = 0; - break; - case MSR_IA32_UCODE_REV: - data = 0x100000000ULL; - break; - case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; - break; - case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr, pdata); - case 0xcd: /* fsb frequency */ - data = 3; - break; - /* - * MSR_EBC_FREQUENCY_ID - * Conservative value valid for even the basic CPU models. - * Models 0,1: 000 in bits 23:21 indicating a bus speed of - * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, - * and 266MHz for model 3, or 4. Set Core Clock - * Frequency to System Bus Frequency Ratio to 1 (bits - * 31:24) even though these are only valid for CPU - * models > 2, however guests may end up dividing or - * multiplying by zero otherwise. - */ - case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; - break; - case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); - break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr, pdata); - break; - case MSR_IA32_TSCDEADLINE: - data = kvm_get_lapic_tscdeadline_msr(vcpu); - break; - case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; - break; - case MSR_IA32_PERF_STATUS: - /* TSC increment by tick */ - data = 1000ULL; - /* CPU multiplier */ - data |= (((uint64_t)4ULL) << 40); - break; - case MSR_EFER: - data = vcpu->arch.efer; - break; - case MSR_KVM_WALL_CLOCK: - case MSR_KVM_WALL_CLOCK_NEW: - data = vcpu->kvm->arch.wall_clock; - break; - case MSR_KVM_SYSTEM_TIME: - case MSR_KVM_SYSTEM_TIME_NEW: - data = vcpu->arch.time; - break; - case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; - break; - case MSR_KVM_STEAL_TIME: - data = vcpu->arch.st.msr_val; - break; - case MSR_IA32_P5_MC_ADDR: - case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MCG_CAP: - case MSR_IA32_MCG_CTL: - case MSR_IA32_MCG_STATUS: - case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: - return get_msr_mce(vcpu, msr, pdata); - case MSR_K7_CLK_CTL: - /* - * Provide expected ramp-up count for K7. All other - * are set to zero, indicating minimum divisors for - * every field. - * - * This prevents guest kernels on AMD host with CPU - * type 6, model 8 and higher from exploding due to - * the rdmsr failing. 
- */ - data = 0x20000000; - break; - case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { - int r; - mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); - return r; - } else - return get_msr_hyperv(vcpu, msr, pdata); - break; - case MSR_IA32_BBL_CR_CTL3: - /* This legacy MSR exists but isn't fully documented in current - * silicon. It is however accessed by winxp in very narrow - * scenarios where it sets bit #19, itself documented as - * a "reserved" bit. Best effort attempt to source coherent - * read data here should the balance of the register be - * interpreted by the guest: - * - * L2 cache control register 3: 64GB range, 256KB size, - * enabled, latency 0x1, configured - */ - data = 0xbe702111; - break; - case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has_osvw(vcpu)) - return 1; - data = vcpu->arch.osvw.length; - break; - case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has_osvw(vcpu)) - return 1; - data = vcpu->arch.osvw.status; - break; - default: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; - } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); - data = 0; - } - break; - } - *pdata = data; - return 0; -} -EXPORT_SYMBOL_GPL(kvm_get_msr_common); - -/* - * Read or write a bunch of msrs. All parameters are kernel addresses. - * - * @return number of msrs set successfully. - */ -static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, - struct kvm_msr_entry *entries, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data)) -{ - int i, idx; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - for (i = 0; i < msrs->nmsrs; ++i) - if (do_msr(vcpu, entries[i].index, &entries[i].data)) - break; - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - return i; -} - -/* - * Read or write a bunch of msrs. Parameters are user addresses. - * - * @return number of msrs set successfully. 
- */ -static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, - int (*do_msr)(struct kvm_vcpu *vcpu, - unsigned index, u64 *data), - int writeback) -{ - struct kvm_msrs msrs; - struct kvm_msr_entry *entries; - int r, n; - unsigned size; - - r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) - goto out; - - r = -E2BIG; - if (msrs.nmsrs >= MAX_IO_MSRS) - goto out; - - size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = memdup_user(user_msrs->entries, size); - if (IS_ERR(entries)) { - r = PTR_ERR(entries); - goto out; - } - - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; - - r = -EFAULT; - if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; - -out_free: - kfree(entries); -out: - return r; -} - -int kvm_dev_ioctl_check_extension(long ext) -{ - int r; - - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_HLT: - case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_SET_TSS_ADDR: - case KVM_CAP_EXT_CPUID: - case KVM_CAP_CLOCKSOURCE: - case KVM_CAP_PIT: - case KVM_CAP_NOP_IO_DELAY: - case KVM_CAP_MP_STATE: - case KVM_CAP_SYNC_MMU: - case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: - case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_IRQFD: - case KVM_CAP_IOEVENTFD: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: - case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: - case KVM_CAP_ADJUST_CLOCK: - case KVM_CAP_VCPU_EVENTS: - case KVM_CAP_HYPERV: - case KVM_CAP_HYPERV_VAPIC: - case KVM_CAP_HYPERV_SPIN: - case KVM_CAP_PCI_SEGMENT: - case KVM_CAP_DEBUGREGS: - case KVM_CAP_X86_ROBUST_SINGLESTEP: - case KVM_CAP_XSAVE: - case KVM_CAP_ASYNC_PF: - case KVM_CAP_GET_TSC_KHZ: - case KVM_CAP_PCI_2_3: - r = 1; - break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; - case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); - break; - case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; - break; - case KVM_CAP_MAX_VCPUS: - r = KVM_MAX_VCPUS; - break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_MEMORY_SLOTS; - break; - case KVM_CAP_PV_MMU: /* obsolete */ - r = 0; - break; - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; - case KVM_CAP_MCE: - r = KVM_MAX_MCE_BANKS; - break; - case KVM_CAP_XCRS: - r = cpu_has_xsave; - break; - case KVM_CAP_TSC_CONTROL: - r = kvm_has_tsc_control; - break; - case KVM_CAP_TSC_DEADLINE_TIMER: - r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); - break; - default: - r = 0; - break; - } - return r; - -} - -long kvm_arch_dev_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - long r; - - switch (ioctl) { - case KVM_GET_MSR_INDEX_LIST: { - struct kvm_msr_list __user *user_msr_list = argp; - struct kvm_msr_list msr_list; - unsigned n; - - r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) - goto out; - n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) - goto out; - r = -E2BIG; - if (n < msr_list.nmsrs) - goto out; - r = -EFAULT; - if (copy_to_user(user_msr_list->indices, &msrs_to_save, - num_msrs_to_save * sizeof(u32))) - goto out; - if (copy_to_user(user_msr_list->indices + num_msrs_to_save, - &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) - goto out; - r = 0; - break; - } - case KVM_GET_SUPPORTED_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = 
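/*
 * msr_io() above backs the KVM_GET_MSRS/KVM_SET_MSRS vcpu ioctls: userspace
 * passes a struct kvm_msrs header followed by kvm_msr_entry slots and gets
 * back the number of entries actually processed.  A userspace sketch
 * (vcpu_fd is an already-created vcpu file descriptor; error handling
 * omitted):
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static uint64_t read_one_msr(int vcpu_fd, uint32_t index)
 *   {
 *           struct { struct kvm_msrs hdr; struct kvm_msr_entry e; } req;
 *
 *           memset(&req, 0, sizeof(req));
 *           req.hdr.nmsrs = 1;
 *           req.e.index = index;
 *           ioctl(vcpu_fd, KVM_GET_MSRS, &req); // returns # of MSRs read
 *           return req.e.data;
 *   }
 */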
-EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, - cpuid_arg->entries); - if (r) - goto out; - - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; - r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) - goto out; - r = 0; - break; - } - default: - r = -EINVAL; - } -out: - return r; -} - -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - -static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - return vcpu->kvm->arch.iommu_domain && - !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); -} - -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - /* Address WBINVD may be executed by guest */ - if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - wbinvd_ipi, NULL, 1); - } - - kvm_x86_ops->vcpu_load(vcpu, cpu); - - /* Apply any externally detected TSC adjustments (due to suspend) */ - if (unlikely(vcpu->arch.tsc_offset_adjustment)) { - adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); - vcpu->arch.tsc_offset_adjustment = 0; - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - } - - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; - if (tsc_delta < 0) - mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) { - u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, - vcpu->arch.last_guest_tsc); - kvm_x86_ops->write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; - } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); - vcpu->cpu = cpu; - } - - accumulate_steal_time(vcpu); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); -} - -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops->vcpu_put(vcpu); - kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); -} - -static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; -} - -static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) -{ - memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - kvm_apic_post_state_restore(vcpu); - update_cr8_intercept(vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, - struct kvm_interrupt *irq) -{ - if (irq->irq < 0 || irq->irq >= 256) - return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) - return -ENXIO; - - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 0; -} - -static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) -{ - kvm_inject_nmi(vcpu); - - return 0; -} - -static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, - struct kvm_tpr_access_ctl *tac) -{ - if (tac->flags) - return -EINVAL; - vcpu->arch.tpr_access_reporting = !!tac->enabled; - return 0; -} - -static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, - u64 mcg_cap) -{ - int r; - unsigned bank_num = mcg_cap & 0xff, bank; - - r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) - goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) - goto out; - r = 0; - vcpu->arch.mcg_cap = 
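/*
 * Userspace opts a vcpu into MCE emulation by taking the capability mask
 * reported by KVM_X86_GET_MCE_CAP_SUPPORTED (a /dev/kvm ioctl handled above
 * in kvm_arch_dev_ioctl), choosing a bank count in the low byte, and
 * handing the result to KVM_X86_SETUP_MCE.  A sketch (kvm_fd and vcpu_fd
 * are pre-opened descriptors; error handling omitted):
 *
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static void setup_mce(int kvm_fd, int vcpu_fd, unsigned int banks)
 *   {
 *           __u64 mcg_cap = 0;
 *
 *           ioctl(kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &mcg_cap);
 *           mcg_cap = (mcg_cap & ~0xffULL) | banks; // bank count in bits 7:0
 *           ioctl(vcpu_fd, KVM_X86_SETUP_MCE, &mcg_cap);
 *   }
 */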
mcg_cap; - /* Init IA32_MCG_CTL to all 1s */ - if (mcg_cap & MCG_CTL_P) - vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) - vcpu->arch.mce_banks[bank*4] = ~(u64)0; -out: - return r; -} - -static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, - struct kvm_x86_mce *mce) -{ - u64 mcg_cap = vcpu->arch.mcg_cap; - unsigned bank_num = mcg_cap & 0xff; - u64 *banks = vcpu->arch.mce_banks; - - if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) - return -EINVAL; - /* - * if IA32_MCG_CTL is not all 1s, the uncorrected error - * reporting is disabled - */ - if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && - vcpu->arch.mcg_ctl != ~(u64)0) - return 0; - banks += 4 * mce->bank; - /* - * if IA32_MCi_CTL is not all 1s, the uncorrected error - * reporting is disabled for the bank - */ - if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) - return 0; - if (mce->status & MCI_STATUS_UC) { - if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - return 0; - } - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - vcpu->arch.mcg_status = mce->mcg_status; - banks[1] = mce->status; - kvm_queue_exception(vcpu, MC_VECTOR); - } else if (!(banks[1] & MCI_STATUS_VAL) - || !(banks[1] & MCI_STATUS_UC)) { - if (banks[1] & MCI_STATUS_VAL) - mce->status |= MCI_STATUS_OVER; - banks[2] = mce->addr; - banks[3] = mce->misc; - banks[1] = mce->status; - } else - banks[1] |= MCI_STATUS_OVER; - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - process_nmi(vcpu); - events->exception.injected = - vcpu->arch.exception.pending && - !kvm_exception_is_soft(vcpu->arch.exception.nr); - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.pad = 0; - events->exception.error_code = vcpu->arch.exception.error_code; - - events->interrupt.injected = - vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; - events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); - - events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); - events->nmi.pad = 0; - - events->sipi_vector = vcpu->arch.sipi_vector; - - events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW); - memset(&events->reserved, 0, sizeof(events->reserved)); -} - -static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) - return -EINVAL; - - process_nmi(vcpu); - vcpu->arch.exception.pending = events->exception.injected; - vcpu->arch.exception.nr = events->exception.nr; - vcpu->arch.exception.has_error_code = events->exception.has_error_code; - vcpu->arch.exception.error_code = events->exception.error_code; - - vcpu->arch.interrupt.pending = events->interrupt.injected; - vcpu->arch.interrupt.nr = events->interrupt.nr; - vcpu->arch.interrupt.soft = events->interrupt.soft; - if (events->flags & 
KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, - events->interrupt.shadow); - - vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); - - if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) - vcpu->arch.sipi_vector = events->sipi_vector; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - dbgregs->dr6 = vcpu->arch.dr6; - dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); -} - -static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) -{ - if (dbgregs->flags) - return -EINVAL; - - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = dbgregs->dr6; - vcpu->arch.dr7 = dbgregs->dr7; - - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (cpu_has_xsave) - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->xsave, - xstate_size); - else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, - sizeof(struct i387_fxsave_struct)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XSTATE_FPSSE; - } -} - -static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - - if (cpu_has_xsave) - memcpy(&vcpu->arch.guest_fpu.state->xsave, - guest_xsave->region, xstate_size); - else { - if (xstate_bv & ~XSTATE_FPSSE) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, - guest_xsave->region, sizeof(struct i387_fxsave_struct)); - } - return 0; -} - -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) -{ - if (!cpu_has_xsave) { - guest_xcrs->nr_xcrs = 0; - return; - } - - guest_xcrs->nr_xcrs = 1; - guest_xcrs->flags = 0; - guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; - guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; -} - -static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) -{ - int i, r = 0; - - if (!cpu_has_xsave) - return -EINVAL; - - if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) - return -EINVAL; - - for (i = 0; i < guest_xcrs->nr_xcrs; i++) - /* Only support XCR0 currently */ - if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { - r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, - guest_xcrs->xcrs[0].value); - break; - } - if (r) - r = -EINVAL; - return r; -} - -long kvm_arch_vcpu_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm_vcpu *vcpu = filp->private_data; - void __user *argp = (void __user *)arg; - int r; - union { - struct kvm_lapic_state *lapic; - struct kvm_xsave *xsave; - struct kvm_xcrs *xcrs; - void *buffer; - } u; - - u.buffer = NULL; - switch (ioctl) { - case KVM_GET_LAPIC: { - r = -EINVAL; - if (!vcpu->arch.apic) - goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - - r = -ENOMEM; - if (!u.lapic) - goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) - goto out; - r = 0; - break; - } - case KVM_SET_LAPIC: { - 
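/*
 * kvm_vcpu_ioctl_x86_get_xsave() above returns the raw 4KiB XSAVE image in
 * struct kvm_xsave.region[]; the XSTATE_BV bitmap sits in the XSAVE header
 * at byte offset 512 (hence region[XSAVE_HDR_OFFSET / sizeof(u32)]).  A
 * userspace sketch that checks whether the guest FPU image carries AVX
 * state, i.e. bit 2 of XSTATE_BV (error handling omitted):
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static int guest_has_avx_state(int vcpu_fd)
 *   {
 *           struct kvm_xsave xs;
 *           uint64_t xstate_bv;
 *
 *           ioctl(vcpu_fd, KVM_GET_XSAVE, &xs);
 *           memcpy(&xstate_bv, (char *)xs.region + 512, sizeof(xstate_bv));
 *           return (xstate_bv >> 2) & 1;   // XSTATE bit 2 = YMM
 *   }
 */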
r = -EINVAL; - if (!vcpu->arch.apic) - goto out; - u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } - - r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; - break; - } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - - r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; - break; - } - case KVM_NMI: { - r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; - break; - } - case KVM_SET_CPUID: { - struct kvm_cpuid __user *cpuid_arg = argp; - struct kvm_cpuid cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_SET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - break; - } - case KVM_GET_CPUID2: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; - - r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) - goto out; - r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, - cpuid_arg->entries); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) - goto out; - r = 0; - break; - } - case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); - break; - case KVM_SET_MSRS: - r = msr_io(vcpu, argp, do_set_msr, 0); - break; - case KVM_TPR_ACCESS_REPORTING: { - struct kvm_tpr_access_ctl tac; - - r = -EFAULT; - if (copy_from_user(&tac, argp, sizeof tac)) - goto out; - r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) - goto out; - r = 0; - break; - }; - case KVM_SET_VAPIC_ADDR: { - struct kvm_vapic_addr va; - - r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) - goto out; - r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) - goto out; - r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); - break; - } - case KVM_X86_SETUP_MCE: { - u64 mcg_cap; - - r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) - goto out; - r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); - break; - } - case KVM_X86_SET_MCE: { - struct kvm_x86_mce mce; - - r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) - goto out; - r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - break; - } - case KVM_GET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); - - r = -EFAULT; - if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) - break; - r = 0; - break; - } - case KVM_SET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - - r = -EFAULT; - if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) - break; - - r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); - break; - } - case KVM_GET_DEBUGREGS: { - struct kvm_debugregs dbgregs; - - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); - - r = -EFAULT; - if (copy_to_user(argp, &dbgregs, - sizeof(struct kvm_debugregs))) - break; - r = 0; - break; - } - case KVM_SET_DEBUGREGS: { - struct kvm_debugregs dbgregs; - - r = -EFAULT; - if (copy_from_user(&dbgregs, argp, - sizeof(struct kvm_debugregs))) - break; - - r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); - break; - } - case KVM_GET_XSAVE: { - u.xsave = 
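/*
 * The KVM_SET_CPUID2 case above is normally fed straight from
 * KVM_GET_SUPPORTED_CPUID on /dev/kvm, so the guest only ever sees leaves
 * the host is willing to emulate.  A userspace sketch (the fixed buffer of
 * 100 entries is an arbitrary choice; error handling omitted):
 *
 *   #include <stdlib.h>
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static void copy_host_cpuid(int kvm_fd, int vcpu_fd)
 *   {
 *           struct kvm_cpuid2 *c;
 *
 *           c = calloc(1, sizeof(*c) + 100 * sizeof(struct kvm_cpuid_entry2));
 *           c->nent = 100;
 *           ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, c); // fills entries, sets nent
 *           ioctl(vcpu_fd, KVM_SET_CPUID2, c);
 *           free(c);
 *   }
 */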
kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); - r = -ENOMEM; - if (!u.xsave) - break; - - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); - - r = -EFAULT; - if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) - break; - r = 0; - break; - } - case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } - - r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); - break; - } - case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); - r = -ENOMEM; - if (!u.xcrs) - break; - - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); - - r = -EFAULT; - if (copy_to_user(argp, u.xcrs, - sizeof(struct kvm_xcrs))) - break; - r = 0; - break; - } - case KVM_SET_XCRS: { - u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } - - r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); - break; - } - case KVM_SET_TSC_KHZ: { - u32 user_tsc_khz; - - r = -EINVAL; - user_tsc_khz = (u32)arg; - - if (user_tsc_khz >= kvm_max_guest_tsc_khz) - goto out; - - if (user_tsc_khz == 0) - user_tsc_khz = tsc_khz; - - kvm_set_tsc_khz(vcpu, user_tsc_khz); - - r = 0; - goto out; - } - case KVM_GET_TSC_KHZ: { - r = vcpu->arch.virtual_tsc_khz; - goto out; - } - default: - r = -EINVAL; - } -out: - kfree(u.buffer); - return r; -} - -int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - -static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) -{ - int ret; - - if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); - return ret; -} - -static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, - u64 ident_addr) -{ - kvm->arch.ept_identity_map_addr = ident_addr; - return 0; -} - -static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) -{ - if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) - return -EINVAL; - - mutex_lock(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); - - kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); - kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - - spin_unlock(&kvm->mmu_lock); - mutex_unlock(&kvm->slots_lock); - return 0; -} - -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); - break; - case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic_irqchip(kvm)); - 
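/*
 * The KVM_SET_TSC_KHZ/KVM_GET_TSC_KHZ cases in kvm_arch_vcpu_ioctl() above
 * take and return the frequency as a plain integer rather than through a
 * pointer: the new rate is passed in the ioctl argument itself (rejected if
 * it exceeds kvm_max_guest_tsc_khz, with 0 meaning "use the host tsc_khz"),
 * and the current rate comes back as the ioctl return value.  A userspace
 * sketch:
 *
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static int scale_guest_tsc(int vcpu_fd, unsigned int khz)
 *   {
 *           if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, khz) < 0)
 *                   return -1;
 *           return ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0); // current rate in kHz
 *   }
 */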
return r; -} - -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int r = 0; - - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int r = 0; - - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int r = 0; - - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int r = 0, start = 0; - u32 prev_legacy, cur_legacy; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, - sizeof(kvm->arch.vpit->pit_state.channels)); - kvm->arch.vpit->pit_state.flags = ps->flags; - kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - if (!kvm->arch.vpit) - return -ENXIO; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return 0; -} - -/** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages - * - * We have two ways to find all sptes to protect: - * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - * checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. - * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. - * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. - */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - spin_lock(&kvm->mmu_lock); - - /* Not many dirty pages compared to # of shadow pages. 
*/ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; - - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; - - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - } - kvm_flush_remote_tlbs(kvm); - } else - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - - spin_unlock(&kvm->mmu_lock); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) -{ - int r; - struct kvm_memory_slot *memslot; - unsigned long n, nr_dirty_pages; - - mutex_lock(&kvm->slots_lock); - - r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) - goto out; - - memslot = id_to_memslot(kvm->memslots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - n = kvm_dirty_bitmap_bytes(memslot); - nr_dirty_pages = memslot->nr_dirty_pages; - - /* If nothing is dirty, don't bother messing with page tables. */ - if (nr_dirty_pages) { - struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap, *dirty_bitmap_head; - - dirty_bitmap = memslot->dirty_bitmap; - dirty_bitmap_head = memslot->dirty_bitmap_head; - if (dirty_bitmap == dirty_bitmap_head) - dirty_bitmap_head += n / sizeof(long); - memset(dirty_bitmap_head, 0, n); - - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); - if (!slots) - goto out; - - memslot = id_to_memslot(slots, log->slot); - memslot->nr_dirty_pages = 0; - memslot->dirty_bitmap = dirty_bitmap_head; - update_memslots(slots, NULL); - - old_slots = kvm->memslots; - rcu_assign_pointer(kvm->memslots, slots); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_slots); - - write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); - - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = -EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; - } - - r = 0; -out: - mutex_unlock(&kvm->slots_lock); - return r; -} - -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) -{ - struct kvm *kvm = filp->private_data; - void __user *argp = (void __user *)arg; - int r = -ENOTTY; - /* - * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be - * combined, not added together. 
- */ - union { - struct kvm_pit_state ps; - struct kvm_pit_state2 ps2; - struct kvm_pit_config pit_config; - } u; - - switch (ioctl) { - case KVM_SET_TSS_ADDR: - r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; - break; - case KVM_SET_IDENTITY_MAP_ADDR: { - u64 ident_addr; - - r = -EFAULT; - if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) - goto out; - r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; - break; - } - case KVM_SET_NR_MMU_PAGES: - r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; - break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; - case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - - mutex_lock(&kvm->lock); - r = -EEXIST; - if (kvm->arch.vpic) - goto create_irqchip_unlock; - r = -EINVAL; - if (atomic_read(&kvm->online_vcpus)) - goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_master); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_slave); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev_eclr); - mutex_unlock(&kvm->slots_lock); - kfree(vpic); - goto create_irqchip_unlock; - } - } else - goto create_irqchip_unlock; - smp_wmb(); - kvm->arch.vpic = vpic; - smp_wmb(); - r = kvm_setup_default_irq_routing(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); - kvm_ioapic_destroy(kvm); - kvm_destroy_pic(kvm); - mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); - } - create_irqchip_unlock: - mutex_unlock(&kvm->lock); - break; - } - case KVM_CREATE_PIT: - u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; - goto create_pit; - case KVM_CREATE_PIT2: - r = -EFAULT; - if (copy_from_user(&u.pit_config, argp, - sizeof(struct kvm_pit_config))) - goto out; - create_pit: - mutex_lock(&kvm->slots_lock); - r = -EEXIST; - if (kvm->arch.vpit) - goto create_pit_unlock; - r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); - if (kvm->arch.vpit) - r = 0; - create_pit_unlock: - mutex_unlock(&kvm->slots_lock); - break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } - case KVM_GET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip; - - chip = memdup_user(argp, sizeof(*chip)); - if (IS_ERR(chip)) { - r = PTR_ERR(chip); - goto out; - } - - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto get_irqchip_out; - r = kvm_vm_ioctl_get_irqchip(kvm, chip); - if (r) - goto get_irqchip_out; - r = -EFAULT; - if (copy_to_user(argp, chip, sizeof *chip)) - goto get_irqchip_out; - r = 0; - get_irqchip_out: - kfree(chip); - if (r) - goto out; - break; - } - case KVM_SET_IRQCHIP: { - /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip; - - chip = memdup_user(argp, sizeof(*chip)); - if (IS_ERR(chip)) { - r = PTR_ERR(chip); - goto out; - } - - r = -ENXIO; - if (!irqchip_in_kernel(kvm)) - goto set_irqchip_out; - r = kvm_vm_ioctl_set_irqchip(kvm, chip); - if (r) - goto 
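/*
 * The in-kernel irqchip above must be created while the VM still has zero
 * vcpus (the online_vcpus check), and only then can the PIT be created and
 * interrupt lines driven.  A typical userspace ordering (vm_fd comes from
 * KVM_CREATE_VM; error handling omitted):
 *
 *   #include <sys/ioctl.h>
 *   #include <linux/kvm.h>
 *
 *   static void setup_vm_irqs(int vm_fd)
 *   {
 *           struct kvm_pit_config pit = { .flags = KVM_PIT_SPEAKER_DUMMY };
 *           struct kvm_irq_level lvl = { .irq = 4, .level = 1 };
 *
 *           ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0); // before any KVM_CREATE_VCPU
 *           ioctl(vm_fd, KVM_CREATE_PIT2, &pit);
 *           ioctl(vm_fd, KVM_IRQ_LINE, &lvl);    // assert GSI 4
 *   }
 */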
set_irqchip_out; - r = 0; - set_irqchip_out: - kfree(chip); - if (r) - goto out; - break; - } - case KVM_GET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit(kvm, &u.ps); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT: { - r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof u.ps)) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; - break; - } - case KVM_GET_PIT2: { - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); - if (r) - goto out; - r = -EFAULT; - if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) - goto out; - r = 0; - break; - } - case KVM_SET_PIT2: { - r = -EFAULT; - if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) - goto out; - r = -ENXIO; - if (!kvm->arch.vpit) - goto out; - r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; - break; - } - case KVM_REINJECT_CONTROL: { - struct kvm_reinject_control control; - r = -EFAULT; - if (copy_from_user(&control, argp, sizeof(control))) - goto out; - r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; - break; - } - case KVM_XEN_HVM_CONFIG: { - r = -EFAULT; - if (copy_from_user(&kvm->arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) - goto out; - r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) - goto out; - r = 0; - break; - } - case KVM_SET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - s64 delta; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - local_irq_disable(); - now_ns = get_kernel_ns(); - delta = user_ns.clock - now_ns; - local_irq_enable(); - kvm->arch.kvmclock_offset = delta; - break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - local_irq_disable(); - now_ns = get_kernel_ns(); - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); - user_ns.flags = 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; - break; - } - - default: - ; - } -out: - return r; -} - -static void kvm_init_msr_list(void) -{ - u32 dummy[2]; - unsigned i, j; - - /* skip the first msrs in the list. 
KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) - continue; - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; - } - num_msrs_to_save = j; -} - -static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, - const void *v) -{ - int handled = 0; - int n; - - do { - n = min(len, 8); - if (!(vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) - break; - handled += n; - addr += n; - len -= n; - v += n; - } while (len); - - return handled; -} - -static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) -{ - int handled = 0; - int n; - - do { - n = min(len, 8); - if (!(vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) - break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); - handled += n; - addr += n; - len -= n; - v += n; - } while (len); - - return handled; -} - -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); -} - -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - gpa_t t_gpa; - struct x86_exception exception; - - BUG_ON(!mmu_is_nested(vcpu)); - - /* NPT walks are always user-walks */ - access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); - - return t_gpa; -} - -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); -} - - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); -} - -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
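/*
 * All of the gva_to_gpa helpers here build the page-fault error-code mask
 * the same way: a CPL-3 caller gets PFERR_USER_MASK, and the intent of the
 * access adds PFERR_FETCH_MASK or PFERR_WRITE_MASK on top, so the guest
 * page-table walk applies exactly the checks the guest instruction would
 * have seen.  In sketch form (mask values as defined by the x86 page-fault
 * error code):
 *
 *   u32 access = (cpl == 3) ? PFERR_USER_MASK : 0;   // bit 2
 *
 *   if (fetch)
 *           access |= PFERR_FETCH_MASK;              // bit 4
 *   if (write)
 *           access |= PFERR_WRITE_MASK;              // bit 1
 */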
PFERR_USER_MASK : 0; - access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); -} - -/* uses this to access any guest's mapped memory without checking CPL */ -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); -} - -static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, - struct x86_exception *exception) -{ - void *data = val; - int r = X86EMUL_CONTINUE; - - while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); - if (ret < 0) { - r = X86EMUL_IO_NEEDED; - goto out; - } - - bytes -= toread; - data += toread; - addr += toread; - } -out: - return r; -} - -/* used for instruction fetching */ -static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); -} - -int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, - exception); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); - -static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); -} - -int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - void *data = val; - int r = X86EMUL_CONTINUE; - - while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, - exception); - unsigned offset = addr & (PAGE_SIZE-1); - unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); - int ret; - - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; - ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); - if (ret < 0) { - r = X86EMUL_IO_NEEDED; - goto out; - } - - bytes -= towrite; - data += towrite; - addr += towrite; - } -out: - return r; -} -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); - -static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, - gpa_t *gpa, struct x86_exception *exception, - bool write) -{ - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
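/*
 * The read/write helpers above never copy across a guest page boundary in
 * one step: each iteration translates the current page and clamps the chunk
 * to what is left of that page.  A worked example of the clamping
 * arithmetic with 4KiB pages:
 *
 *   addr = 0x1ff8, bytes = 16
 *   offset = addr & (PAGE_SIZE - 1)          = 0xff8
 *   chunk  = min(bytes, PAGE_SIZE - offset)  = 8      // first page
 *   addr  += 8; bytes -= 8;                           // second iteration
 *   offset = 0, chunk = min(8, 4096)         = 8      // next page
 */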
PFERR_USER_MASK : 0; - - if (vcpu_match_mmio_gva(vcpu, gva) && - check_write_user_access(vcpu, write, access, - vcpu->arch.access)) { - *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | - (gva & (PAGE_SIZE - 1)); - trace_vcpu_match_mmio(gva, *gpa, write, false); - return 1; - } - - if (write) - access |= PFERR_WRITE_MASK; - - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); - - if (*gpa == UNMAPPED_GVA) - return -1; - - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; -} - -int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) -{ - int ret; - - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) - return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); - return 1; -} - -struct read_write_emulator_ops { - int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, - int bytes); - int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes); - int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, - int bytes, void *val); - int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes); - bool write; -}; - -static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) -{ - if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); - vcpu->mmio_read_completed = 0; - return 1; - } - - return 0; -} - -static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes) -{ - return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); -} - -static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes) -{ - return emulator_write_phys(vcpu, gpa, val, bytes); -} - -static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) -{ - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); - return vcpu_mmio_write(vcpu, gpa, bytes, val); -} - -static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes) -{ - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); - return X86EMUL_IO_NEEDED; -} - -static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, - void *val, int bytes) -{ - memcpy(vcpu->mmio_data, val, bytes); - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); - return X86EMUL_CONTINUE; -} - -static struct read_write_emulator_ops read_emultor = { - .read_write_prepare = read_prepare, - .read_write_emulate = read_emulate, - .read_write_mmio = vcpu_mmio_read, - .read_write_exit_mmio = read_exit_mmio, -}; - -static struct read_write_emulator_ops write_emultor = { - .read_write_emulate = write_emulate, - .read_write_mmio = write_mmio, - .read_write_exit_mmio = write_exit_mmio, - .write = true, -}; - -static int emulator_read_write_onepage(unsigned long addr, void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu, - struct read_write_emulator_ops *ops) -{ - gpa_t gpa; - int handled, ret; - bool write = ops->write; - - if (ops->read_write_prepare && - ops->read_write_prepare(vcpu, val, bytes)) - return X86EMUL_CONTINUE; - - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - /* For APIC access vmexit */ - if (ret) - goto mmio; - - if (ops->read_write_emulate(vcpu, gpa, val, bytes)) - return X86EMUL_CONTINUE; - -mmio: - /* - * Is this MMIO handled 
locally? - */ - handled = ops->read_write_mmio(vcpu, gpa, bytes, val); - if (handled == bytes) - return X86EMUL_CONTINUE; - - gpa += handled; - bytes -= handled; - val += handled; - - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; - vcpu->mmio_index = 0; - - return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); -} - -int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, - void *val, unsigned int bytes, - struct x86_exception *exception, - struct read_write_emulator_ops *ops) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - /* Crossing a page boundary? */ - if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; - - now = -addr & ~PAGE_MASK; - rc = emulator_read_write_onepage(addr, val, now, exception, - vcpu, ops); - - if (rc != X86EMUL_CONTINUE) - return rc; - addr += now; - val += now; - bytes -= now; - } - - return emulator_read_write_onepage(addr, val, bytes, exception, - vcpu, ops); -} - -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - return emulator_read_write(ctxt, addr, val, bytes, - exception, &read_emultor); -} - -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception) -{ - return emulator_read_write(ctxt, addr, (void *)val, bytes, - exception, &write_emultor); -} - -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif - -static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct x86_exception *exception) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; - struct page *page; - char *kaddr; - bool exchanged; - - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes > 8 || (bytes & (bytes - 1))) - goto emul_write; - - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; - - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); - goto emul_write; - } - - kaddr = kmap_atomic(page); - kaddr += offset_in_page(gpa); - switch (bytes) { - case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); - break; - case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); - break; - case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); - break; - case 8: - exchanged = CMPXCHG64(kaddr, old, new); - break; - default: - BUG(); - } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); - - if (!exchanged) - return X86EMUL_CMPXCHG_FAILED; - - kvm_mmu_pte_write(vcpu, gpa, new, bytes); - - return X86EMUL_CONTINUE; - -emul_write: - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - - return emulator_write_emulated(ctxt, addr, new, bytes, exception); -} - -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* 
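emulator_read_write above detects a page-crossing access with ((addr + bytes - 1) ^ addr) & PAGE_MASK and uses now = -addr & ~PAGE_MASK to compute the number of bytes up to the next page boundary, so each call into the one-page helper stays within a single page. The identity is easy to verify in isolation (a sketch assuming 4 KiB pages):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096ul
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long addr;

    for (addr = 0x1000; addr < 0x3000; addr += 123) {
        unsigned long now = -addr & ~PAGE_MASK;   /* bytes to next page boundary (0 if aligned) */
        unsigned long ref = (PAGE_SIZE - (addr & (PAGE_SIZE - 1))) % PAGE_SIZE;
        assert(now == ref);
    }

    /* Page-crossing test used before the split: */
    unsigned long a = 0x1ffc, bytes = 8;
    printf("crosses page: %s\n",
           (((a + bytes - 1) ^ a) & PAGE_MASK) ? "yes" : "no");
    return 0;
}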
TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - return r; -} - -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, - unsigned int count, bool in) -{ - trace_kvm_pio(!in, port, size, count); - - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; - return 1; - } - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - - return 0; -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int ret; - - if (vcpu->arch.pio.count) - goto data_avail; - - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - vcpu->arch.pio.count = 0; - return 1; - } - - return 0; -} - -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - memcpy(vcpu->arch.pio_data, val, size * count); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); -} - -static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - return kvm_x86_ops->get_segment_base(vcpu, seg); -} - -static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) -{ - kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); -} - -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) -{ - if (!need_emulate_wbinvd(vcpu)) - return X86EMUL_CONTINUE; - - if (kvm_x86_ops->has_wbinvd_exit()) { - int cpu = get_cpu(); - - cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); - put_cpu(); - cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } else - wbinvd(); - return X86EMUL_CONTINUE; -} -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); - -static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) -{ - kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); -} - -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) -{ - return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); -} - -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) -{ - - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); -} - -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - -static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long value; - - switch (cr) { - case 0: - value = kvm_read_cr0(vcpu); - break; - case 2: - value = vcpu->arch.cr2; - break; - case 3: - value = kvm_read_cr3(vcpu); - break; - case 4: - value = kvm_read_cr4(vcpu); - break; - case 8: - value = kvm_get_cr8(vcpu); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - return 0; - } - - return value; -} - -static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong 
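emulator_pio_in_out above first tries the in-kernel PIO bus; if nothing claims the port it fills in kvm_run->io and exits to userspace with KVM_EXIT_IO, with the repeated data items living in the shared kvm_run page at KVM_PIO_PAGE_OFFSET. A hedged sketch of what the userspace half of that exit typically looks like, meant to be called from a KVM_RUN loop (field names follow the public struct kvm_run; handle_inb()/handle_outb() are hypothetical device-model hooks):

#include <stdint.h>
#include <linux/kvm.h>   /* struct kvm_run, KVM_EXIT_IO, KVM_EXIT_IO_IN */

/* Hypothetical device model hooks. */
static uint8_t handle_inb(uint16_t port) { (void)port; return 0xff; }
static void    handle_outb(uint16_t port, uint8_t val) { (void)port; (void)val; }

/* Called when KVM_RUN returns with run->exit_reason == KVM_EXIT_IO.
 * The data for all io.count repetitions sits at run + io.data_offset. */
static void handle_pio_exit(struct kvm_run *run)
{
    uint8_t *data = (uint8_t *)run + run->io.data_offset;
    uint32_t i;

    if (run->io.size != 1)
        return;                                    /* sketch: byte-wide ports only */

    for (i = 0; i < run->io.count; i++, data += run->io.size) {
        if (run->io.direction == KVM_EXIT_IO_IN)
            *data = handle_inb(run->io.port);       /* guest reads this on resume */
        else
            handle_outb(run->io.port, *data);
    }
}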
val) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int res = 0; - - switch (cr) { - case 0: - res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - res = kvm_set_cr3(vcpu, val); - break; - case 4: - res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); - break; - case 8: - res = kvm_set_cr8(vcpu, val); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - res = -1; - } - - return res; -} - -static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) -{ - kvm_set_rflags(emul_to_vcpu(ctxt), val); -} - -static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) -{ - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); -} - -static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) -{ - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); -} - -static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) -{ - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); -} - -static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) -{ - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); -} - -static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) -{ - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); -} - -static unsigned long emulator_get_cached_segment_base( - struct x86_emulate_ctxt *ctxt, int seg) -{ - return get_segment_base(emul_to_vcpu(ctxt), seg); -} - -static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, - struct desc_struct *desc, u32 *base3, - int seg) -{ - struct kvm_segment var; - - kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); - *selector = var.selector; - - if (var.unusable) - return false; - - if (var.g) - var.limit >>= 12; - set_desc_limit(desc, var.limit); - set_desc_base(desc, (unsigned long)var.base); -#ifdef CONFIG_X86_64 - if (base3) - *base3 = var.base >> 32; -#endif - desc->type = var.type; - desc->s = var.s; - desc->dpl = var.dpl; - desc->p = var.present; - desc->avl = var.avl; - desc->l = var.l; - desc->d = var.db; - desc->g = var.g; - - return true; -} - -static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, - struct desc_struct *desc, u32 base3, - int seg) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - struct kvm_segment var; - - var.selector = selector; - var.base = get_desc_base(desc); -#ifdef CONFIG_X86_64 - var.base |= ((u64)base3) << 32; -#endif - var.limit = get_desc_limit(desc); - if (desc->g) - var.limit = (var.limit << 12) | 0xfff; - var.type = desc->type; - var.present = desc->p; - var.dpl = desc->dpl; - var.db = desc->d; - var.s = desc->s; - var.l = desc->l; - var.g = desc->g; - var.avl = desc->avl; - var.present = desc->p; - var.unusable = !var.present; - var.padding = 0; - - kvm_set_segment(vcpu, &var, seg); - return; -} - -static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 *pdata) -{ - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); -} - -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) -{ - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); -} - -static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, - u32 pmc, u64 *pdata) -{ - return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); -} - -static void emulator_halt(struct x86_emulate_ctxt *ctxt) -{ - emul_to_vcpu(ctxt)->arch.halt_request = 1; -} - -static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_disable(); - kvm_load_guest_fpu(emul_to_vcpu(ctxt)); - /* - * 
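emulator_get_segment and emulator_set_segment above translate between KVM's byte-granular kvm_segment and the packed descriptor form: when the granularity bit is set, the descriptor stores the limit in 4 KiB units, so reading shifts the byte limit right by 12 and writing shifts it back and fills the low 12 bits. A tiny illustration of that round trip (illustrative userspace code, not the kernel helpers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* byte limit -> 20-bit descriptor limit field (granularity bit set) */
static uint32_t to_desc_limit(uint32_t byte_limit)  { return byte_limit >> 12; }

/* descriptor limit field -> byte limit (granularity bit set): low 12 bits read as ones */
static uint32_t to_byte_limit(uint32_t desc_limit)  { return (desc_limit << 12) | 0xfff; }

int main(void)
{
    uint32_t flat  = 0xffffffff;            /* 4 GiB flat segment */
    uint32_t field = to_desc_limit(flat);   /* 0xfffff, fits in 20 bits */
    assert(to_byte_limit(field) == flat);
    printf("descriptor limit field = 0x%x\n", field);
    return 0;
}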
CR0.TS may reference the host fpu state, not the guest fpu state, - * so it may be clear at this point. - */ - clts(); -} - -static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) -{ - preempt_enable(); -} - -static int emulator_intercept(struct x86_emulate_ctxt *ctxt, - struct x86_instruction_info *info, - enum x86_intercept_stage stage) -{ - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); -} - -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) -{ - struct kvm_cpuid_entry2 *cpuid = NULL; - - if (eax && ecx) - cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), - *eax, *ecx); - - if (cpuid) { - *eax = cpuid->eax; - *ecx = cpuid->ecx; - if (ebx) - *ebx = cpuid->ebx; - if (edx) - *edx = cpuid->edx; - return true; - } - - return false; -} - -static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt_system, - .write_std = kvm_write_guest_virt_system, - .fetch = kvm_fetch_guest_virt, - .read_emulated = emulator_read_emulated, - .write_emulated = emulator_write_emulated, - .cmpxchg_emulated = emulator_cmpxchg_emulated, - .invlpg = emulator_invlpg, - .pio_in_emulated = emulator_pio_in_emulated, - .pio_out_emulated = emulator_pio_out_emulated, - .get_segment = emulator_get_segment, - .set_segment = emulator_set_segment, - .get_cached_segment_base = emulator_get_cached_segment_base, - .get_gdt = emulator_get_gdt, - .get_idt = emulator_get_idt, - .set_gdt = emulator_set_gdt, - .set_idt = emulator_set_idt, - .get_cr = emulator_get_cr, - .set_cr = emulator_set_cr, - .set_rflags = emulator_set_rflags, - .cpl = emulator_get_cpl, - .get_dr = emulator_get_dr, - .set_dr = emulator_set_dr, - .set_msr = emulator_set_msr, - .get_msr = emulator_get_msr, - .read_pmc = emulator_read_pmc, - .halt = emulator_halt, - .wbinvd = emulator_wbinvd, - .fix_hypercall = emulator_fix_hypercall, - .get_fpu = emulator_get_fpu, - .put_fpu = emulator_put_fpu, - .intercept = emulator_intercept, - .get_cpuid = emulator_get_cpuid, -}; - -static void cache_all_regs(struct kvm_vcpu *vcpu) -{ - kvm_register_read(vcpu, VCPU_REGS_RAX); - kvm_register_read(vcpu, VCPU_REGS_RSP); - kvm_register_read(vcpu, VCPU_REGS_RIP); - vcpu->arch.regs_dirty = ~0; -} - -static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. 
The same goes for mov ss - */ - if (!(int_shadow & mask)) - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); -} - -static void inject_emulated_exception(struct kvm_vcpu *vcpu) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - kvm_propagate_fault(vcpu, &ctxt->exception); - else if (ctxt->exception.error_code_valid) - kvm_queue_exception_e(vcpu, ctxt->exception.vector, - ctxt->exception.error_code); - else - kvm_queue_exception(vcpu, ctxt->exception.vector); -} - -static void init_decode_cache(struct x86_emulate_ctxt *ctxt, - const unsigned long *regs) -{ - memset(&ctxt->twobyte, 0, - (void *)&ctxt->regs - (void *)&ctxt->twobyte); - memcpy(ctxt->regs, regs, sizeof(ctxt->regs)); - - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; - ctxt->io_read.pos = 0; - ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; - ctxt->mem_read.end = 0; -} - -static void init_emulate_ctxt(struct kvm_vcpu *vcpu) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - int cs_db, cs_l; - - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - ctxt->eflags = kvm_get_rflags(vcpu); - ctxt->eip = kvm_rip_read(vcpu); - ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : - cs_l ? X86EMUL_MODE_PROT64 : - cs_db ? X86EMUL_MODE_PROT32 : - X86EMUL_MODE_PROT16; - ctxt->guest_mode = is_guest_mode(vcpu); - - init_decode_cache(ctxt, vcpu->arch.regs); - vcpu->arch.emulate_regs_need_sync_from_vcpu = false; -} - -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - int ret; - - init_emulate_ctxt(vcpu); - - ctxt->op_bytes = 2; - ctxt->ad_bytes = 2; - ctxt->_eip = ctxt->eip + inc_eip; - ret = emulate_int_real(ctxt, irq); - - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = 0; - else - vcpu->arch.interrupt.pending = false; - - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); - -static int handle_emulation_failure(struct kvm_vcpu *vcpu) -{ - int r = EMULATE_DONE; - - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - if (!is_guest_mode(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; - } - kvm_queue_exception(vcpu, UD_VECTOR); - - return r; -} - -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - - if (tdp_enabled) - return false; - - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. 
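init_emulate_ctxt above derives the emulator's operating mode from current guest state: real mode when CR0.PE is clear, VM86 when EFLAGS.VM is set, otherwise 16-, 32- or 64-bit protected mode according to CS.L and CS.D. The same decision tree written out as a standalone function (the mode names mirror the X86EMUL_MODE_* constants used in the code above):

#include <stdbool.h>
#include <stdio.h>

enum emul_mode { MODE_REAL, MODE_VM86, MODE_PROT16, MODE_PROT32, MODE_PROT64 };

static enum emul_mode pick_mode(bool protmode, bool eflags_vm, bool cs_l, bool cs_db)
{
    if (!protmode)
        return MODE_REAL;      /* CR0.PE = 0 */
    if (eflags_vm)
        return MODE_VM86;      /* EFLAGS.VM = 1 */
    if (cs_l)
        return MODE_PROT64;    /* 64-bit code segment */
    return cs_db ? MODE_PROT32 : MODE_PROT16;
}

int main(void)
{
    printf("%d\n", pick_mode(true, false, true, false));   /* MODE_PROT64 */
    printf("%d\n", pick_mode(true, false, false, true));   /* MODE_PROT32 */
    return 0;
}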
- */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; - - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ - - if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) - return true; - - return false; -} - -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - unsigned long cr2, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; - - /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. - */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - - if (!(emulation_type & EMULTYPE_RETRY)) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) - return false; - - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2; - - if (!vcpu->arch.mmu.direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return true; -} - -int x86_emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - int emulation_type, - void *insn, - int insn_len) -{ - int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - bool writeback = true; - - kvm_clear_exception_queue(vcpu); - - if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->perm_ok = false; - - ctxt->only_vendor_specific_insn - = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); - - trace_kvm_emulate_insn_start(vcpu); - ++vcpu->stat.insn_emulation; - if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) - return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2)) - return EMULATE_DONE; - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; - return handle_emulation_failure(vcpu); - } - } - - if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, ctxt->_eip); - return EMULATE_DONE; - } - - if (retry_instruction(ctxt, cr2, emulation_type)) - return EMULATE_DONE; - - /* this is needed for vmware backdoor interface to work since it - changes registers values during IO operation */ - if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { - vcpu->arch.emulate_regs_need_sync_from_vcpu = false; - memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs); - } - -restart: - r = x86_emulate_insn(ctxt); - - if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; - - if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) - return EMULATE_DONE; - - return handle_emulation_failure(vcpu); - } - - if (ctxt->have_exception) { - inject_emulated_exception(vcpu); - r = EMULATE_DONE; - } else if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - else - writeback = false; - r = EMULATE_DO_MMIO; - } else if (vcpu->mmio_needed) { - if 
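retry_instruction above guards against the infinite-loop case its comment describes by remembering the EIP and fault address of the last retried instruction and refusing a retry when the same pair shows up again. The bookkeeping reduces to a few lines; this is a simplified sketch of the idea, not the kernel function:

#include <stdbool.h>
#include <stdio.h>

struct retry_state {
    unsigned long last_eip;
    unsigned long last_addr;
};

/* Return true if we may zap the shadow page and re-execute, false if this
 * exact (eip, fault address) pair was already retried once. */
static bool may_retry(struct retry_state *s, unsigned long eip, unsigned long addr)
{
    if (eip == s->last_eip && addr == s->last_addr)
        return false;                 /* second time around: give up and emulate */
    s->last_eip  = eip;
    s->last_addr = addr;
    return true;
}

int main(void)
{
    struct retry_state s = { 0, 0 };
    printf("%d %d\n", may_retry(&s, 0x401000, 0x2000),    /* 1: first attempt */
                      may_retry(&s, 0x401000, 0x2000));   /* 0: would loop forever */
    return 0;
}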
(!vcpu->mmio_is_write) - writeback = false; - r = EMULATE_DO_MMIO; - } else if (r == EMULATION_RESTART) - goto restart; - else - r = EMULATE_DONE; - - if (writeback) { - toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); - vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_rip_write(vcpu, ctxt->eip); - } else - vcpu->arch.emulate_regs_need_sync_to_vcpu = true; - - return r; -} -EXPORT_SYMBOL_GPL(x86_emulate_instruction); - -int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) -{ - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); - /* do not return to emulator after return from userspace */ - vcpu->arch.pio.count = 0; - return ret; -} -EXPORT_SYMBOL_GPL(kvm_fast_pio_out); - -static void tsc_bad(void *info) -{ - __this_cpu_write(cpu_tsc_khz, 0); -} - -static void tsc_khz_changed(void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long khz = 0; - - if (data) - khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - khz = cpufreq_quick_get(raw_smp_processor_id()); - if (!khz) - khz = tsc_khz; - __this_cpu_write(cpu_tsc_khz, khz); -} - -static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i, send_ipi = 0; - - /* - * We allow guests to temporarily run on slowing clocks, - * provided we notify them after, or to run on accelerating - * clocks, provided we notify them before. Thus time never - * goes backwards. - * - * However, we have a problem. We can't atomically update - * the frequency of a given CPU from this function; it is - * merely a notifier, which can be called from any CPU. - * Changing the TSC frequency at arbitrary points in time - * requires a recomputation of local variables related to - * the TSC for each VCPU. We must flag these local variables - * to be updated and be sure the update takes place with the - * new frequency before any guests proceed. - * - * Unfortunately, the combination of hotplug CPU and frequency - * change creates an intractable locking scenario; the order - * of when these callouts happen is undefined with respect to - * CPU hotplug, and they can race with each other. As such, - * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is - * undefined; you can actually have a CPU frequency change take - * place in between the computation of X and the setting of the - * variable. To protect against this problem, all updates of - * the per_cpu tsc_khz variable are done in an interrupt - * protected IPI, and all callers wishing to update the value - * must wait for a synchronous IPI to complete (which is trivial - * if the caller is on the CPU already). This establishes the - * necessary total order on variable updates. - * - * Note that because a guest time update may take place - * anytime after the setting of the VCPU's request bit, the - * correct TSC value must be set before the request. However, - * to ensure the update actually makes it to any guest which - * starts running in hardware virtualization between the set - * and the acquisition of the spinlock, we must also ping the - * CPU after setting the request bit. 
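The cpufreq notifier described above acts on exactly two of the four (phase, direction) combinations: it updates the TSC rate before a frequency increase and after a decrease, so guest time can never appear to jump backwards. Stated as a predicate (a sketch; CPUFREQ_PRECHANGE and CPUFREQ_POSTCHANGE are the real cpufreq transition phases, modelled here as a local enum):

#include <stdbool.h>
#include <stdio.h>

enum phase { PRECHANGE, POSTCHANGE };

/* Should this notification trigger a kvmclock/TSC-rate update? */
static bool should_update(enum phase p, unsigned long old_khz, unsigned long new_khz)
{
    if (p == PRECHANGE && old_khz > new_khz)
        return false;   /* slowing down: wait for POSTCHANGE */
    if (p == POSTCHANGE && old_khz < new_khz)
        return false;   /* speeding up: already handled at PRECHANGE */
    return true;
}

int main(void)
{
    printf("%d\n", should_update(PRECHANGE, 1000, 2000));   /* 1: before a speed-up */
    printf("%d\n", should_update(POSTCHANGE, 2000, 1000));  /* 1: after a slow-down */
    printf("%d\n", should_update(PRECHANGE, 2000, 1000));   /* 0 */
    return 0;
}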
- * - */ - - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) - return 0; - if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) - return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - - raw_spin_lock(&kvm_lock); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu != freq->cpu) - continue; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (vcpu->cpu != smp_processor_id()) - send_ipi = 1; - } - } - raw_spin_unlock(&kvm_lock); - - if (freq->old < freq->new && send_ipi) { - /* - * We upscale the frequency. Must make the guest - * doesn't see old kvmclock values while running with - * the new frequency, otherwise we risk the guest sees - * time go backwards. - * - * In case we update the frequency for another cpu - * (which might be in guest context) send an interrupt - * to kick the cpu out of guest context. Next time - * guest context is entered kvmclock will be updated, - * so the guest will not see stale values. - */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - } - return 0; -} - -static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier -}; - -static int kvmclock_cpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, tsc_bad, NULL, 1); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block kvmclock_cpu_notifier_block = { - .notifier_call = kvmclock_cpu_notifier, - .priority = -INT_MAX -}; - -static void kvm_timer_init(void) -{ - int cpu; - - max_tsc_khz = tsc_khz; - register_hotcpu_notifier(&kvmclock_cpu_notifier_block); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; - memset(&policy, 0, sizeof(policy)); - cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; - put_cpu(); -#endif - cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - for_each_online_cpu(cpu) - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); -} - -static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); - -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, -}; - -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(current_vcpu, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); - -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) -{ - __this_cpu_write(current_vcpu, NULL); -} -EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); - -static void kvm_set_mmio_spte_mask(void) -{ - u64 mask; - int maxphyaddr = boot_cpu_data.x86_phys_bits; - - /* - * Set the reserved bits and the present bit of an 
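kvm_set_mmio_spte_mask, started above, builds a PTE pattern whose bits above the physical-address width are all ones (guaranteed reserved) plus the present bit, so MMIO accesses fault with PFERR.RSV set; when maxphyaddr is 52 there are no reserved bits left and the present bit is dropped instead. The arithmetic on its own, as a small sketch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Reserved-bits pattern used to mark MMIO sptes, as a function of the
 * CPU's physical address width (boot_cpu_data.x86_phys_bits). */
static uint64_t mmio_spte_mask(int maxphyaddr)
{
    uint64_t mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;

    mask |= 1ull;              /* present bit */
    if (maxphyaddr == 52)
        mask &= ~1ull;         /* no reserved bits available: clear present instead */
    return mask;
}

int main(void)
{
    int widths[] = { 36, 40, 46, 52 };
    for (int i = 0; i < 4; i++)
        printf("maxphyaddr=%d -> mask=%#018" PRIx64 "\n",
               widths[i], mmio_spte_mask(widths[i]));
    return 0;
}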
paging-structure - * entry to generate page fault with PFER.RSV = 1. - */ - mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr; - mask |= 1ull; - -#ifdef CONFIG_X86_64 - /* - * If reserved bit is not supported, clear the present bit to disable - * mmio page fault. - */ - if (maxphyaddr == 52) - mask &= ~1ull; -#endif - - kvm_mmu_set_mmio_spte_mask(mask); -} - -int kvm_arch_init(void *opaque) -{ - int r; - struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; - - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; - } - - if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; - } - if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; - } - - r = kvm_mmu_module_init(); - if (r) - goto out; - - kvm_set_mmio_spte_mask(); - kvm_init_msr_list(); - - kvm_x86_ops = ops; - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - - kvm_timer_init(); - - perf_register_guest_info_callbacks(&kvm_guest_cbs); - - if (cpu_has_xsave) - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - - return 0; - -out: - return r; -} - -void kvm_arch_exit(void) -{ - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); -} - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - return 1; - } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; - return 0; - } -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_hv_hypercall(struct kvm_vcpu *vcpu) -{ - u64 param, ingpa, outgpa, ret; - uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; - bool fast, longmode; - int cs_db, cs_l; - - /* - * hypercall generates UD from non zero cpl and real mode - * per HYPER-V spec - */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; - - if (!longmode) { - param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff); - ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff); - outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) | - (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff); - } -#ifdef CONFIG_X86_64 - else { - param = kvm_register_read(vcpu, VCPU_REGS_RCX); - ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX); - outgpa = kvm_register_read(vcpu, VCPU_REGS_R8); - } -#endif - - code = param & 0xffff; - fast = (param >> 16) & 0x1; - rep_cnt = (param >> 32) & 0xfff; - rep_idx = (param >> 48) & 0xfff; - - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); - - switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); - break; - default: - res = HV_STATUS_INVALID_HYPERCALL_CODE; - break; - } - - ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } 
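kvm_hv_hypercall above pulls the hypercall input value apart exactly as the code lays it out: the call code in bits 0-15, the "fast" flag in bit 16, the repetition count in bits 32-43 and the start index in bits 48-59; the result packs the status into the low word and the completed repetition count into bits 32-43. A self-contained pack/unpack of those fields (illustrative only):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct hv_call {
    uint16_t code;
    int      fast;
    uint16_t rep_cnt;
    uint16_t rep_idx;
};

static struct hv_call hv_unpack(uint64_t param)
{
    struct hv_call c;

    c.code    = param & 0xffff;          /* bits 0-15  */
    c.fast    = (param >> 16) & 0x1;     /* bit  16    */
    c.rep_cnt = (param >> 32) & 0xfff;   /* bits 32-43 */
    c.rep_idx = (param >> 48) & 0xfff;   /* bits 48-59 */
    return c;
}

static uint64_t hv_result(uint16_t status, uint16_t rep_done)
{
    return (uint64_t)status | (((uint64_t)rep_done & 0xfff) << 32);
}

int main(void)
{
    struct hv_call c = hv_unpack(0x0004000300010008ull);
    printf("code=%u fast=%d rep_cnt=%u rep_idx=%u\n",
           c.code, c.fast, c.rep_cnt, c.rep_idx);
    printf("result=%#" PRIx64 "\n", hv_result(0, c.rep_cnt));
    return 0;
}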
- - return 1; -} - -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; - - if (kvm_hv_hypercall_enabled(vcpu->kvm)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - - trace_kvm_hypercall(nr, a0, a1, a2, a3); - - if (!is_long_mode(vcpu)) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - if (kvm_x86_ops->get_cpl(vcpu) != 0) { - ret = -KVM_EPERM; - goto out; - } - - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - default: - ret = -KVM_ENOSYS; - break; - } -out: - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - ++vcpu->stat.hypercalls; - return r; -} -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); - -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - char instruction[3]; - unsigned long rip = kvm_rip_read(vcpu); - - /* - * Blow out the MMU to ensure that no other VCPU has an active mapping - * to ensure that the updated hypercall appears atomically across all - * VCPUs. - */ - kvm_mmu_zap_all(vcpu->kvm); - - kvm_x86_ops->patch_hypercall(vcpu, instruction); - - return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); -} - -/* - * Check if userspace requested an interrupt window, and that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) -{ - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); -} - -static void post_kvm_run_save(struct kvm_vcpu *vcpu) -{ - struct kvm_run *kvm_run = vcpu->run; - - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else - kvm_run->ready_for_interrupt_injection = - kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && - !kvm_event_needs_reinjection(vcpu); -} - -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int idx; - - if (!apic || !apic->vapic_addr) - return; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - srcu_read_unlock(&vcpu->kvm->srcu, idx); -} - -static void update_cr8_intercept(struct kvm_vcpu *vcpu) -{ - int max_irr, tpr; - - if (!kvm_x86_ops->update_cr8_intercept) - return; - - if (!vcpu->arch.apic) - return; - - if (!vcpu->arch.apic->vapic_addr) - max_irr = kvm_lapic_find_highest_irr(vcpu); - else - max_irr = -1; - - if (max_irr != -1) - max_irr >>= 4; - - tpr = kvm_lapic_get_cr8(vcpu); - - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); -} - -static void inject_pending_event(struct kvm_vcpu *vcpu) -{ - /* try to reinject previous events if any */ - if (vcpu->arch.exception.pending) { - 
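kvm_emulate_hypercall above implements the native KVM hypercall convention: the call number arrives in RAX and up to four arguments in RBX, RCX, RDX and RSI, everything is truncated to 32 bits outside long mode, a non-zero CPL gets -KVM_EPERM, and the return value goes back in RAX. A host-side dispatcher reduced to that skeleton; the register-snapshot struct and the error stand-ins are mine, not KVM types:

#include <stdint.h>
#include <stdio.h>

#define KVM_HC_VAPIC_POLL_IRQ 1      /* defined locally for the sketch, per kvm_para.h of this era */
#define EPERM_RET   ((uint64_t)-1)   /* stand-in for -KVM_EPERM */
#define ENOSYS_RET  ((uint64_t)-1000)/* stand-in for -KVM_ENOSYS */

struct hcall {                       /* hypothetical snapshot of the guest GPRs */
    uint64_t rax, rbx, rcx, rdx, rsi;
};

static uint64_t dispatch(struct hcall *r, int cpl, int long_mode)
{
    uint64_t nr = r->rax, a0 = r->rbx, a1 = r->rcx, a2 = r->rdx, a3 = r->rsi;

    if (!long_mode) {                /* 32-bit guests only pass 32-bit values */
        nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; a1 &= 0xFFFFFFFF;
        a2 &= 0xFFFFFFFF; a3 &= 0xFFFFFFFF;
    }
    if (cpl != 0)
        return EPERM_RET;            /* only ring 0 may issue hypercalls */

    switch (nr) {
    case KVM_HC_VAPIC_POLL_IRQ:
        return 0;
    default:
        (void)a0; (void)a1; (void)a2; (void)a3;
        return ENOSYS_RET;
    }
}

int main(void)
{
    struct hcall r = { .rax = KVM_HC_VAPIC_POLL_IRQ };
    printf("ret=%llu\n", (unsigned long long)dispatch(&r, 0, 1));
    return 0;
}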
trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); - return; - } - - if (vcpu->arch.nmi_injected) { - kvm_x86_ops->set_nmi(vcpu); - return; - } - - if (vcpu->arch.interrupt.pending) { - kvm_x86_ops->set_irq(vcpu); - return; - } - - /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending) { - if (kvm_x86_ops->nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); - } - } else if (kvm_cpu_has_interrupt(vcpu)) { - if (kvm_x86_ops->interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); - kvm_x86_ops->set_irq(vcpu); - } - } -} - -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} - -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_xcr0_loaded) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; - } -} - -static void process_nmi(struct kvm_vcpu *vcpu) -{ - unsigned limit = 2; - - /* - * x86 is limited to one NMI running, and one NMI pending after it. - * If an NMI is already in progress, limit further NMIs to just one. - * Otherwise, allow two (and we'll inject the first one immediately). - */ - if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) - limit = 1; - - vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); - vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - -static int vcpu_enter_guest(struct kvm_vcpu *vcpu) -{ - int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && - vcpu->run->request_interrupt_window; - bool req_immediate_exit = 0; - - if (vcpu->requests) { - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) - kvm_mmu_unload(vcpu); - if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) - __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { - r = kvm_guest_time_update(vcpu); - if (unlikely(r)) - goto out; - } - if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) - kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_x86_ops->tlb_flush(vcpu); - if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - r = 0; - goto out; - } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } - if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { - /* Page is swapped out. 
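process_nmi above collapses queued NMIs into at most two pending ones (one running plus one latched), or just one if an NMI is already being handled, matching how real hardware latches a single NMI while another is in progress. The clamping on its own, as a small sketch:

#include <stdio.h>

static unsigned pending_after(unsigned pending, unsigned queued, int nmi_in_progress)
{
    unsigned limit = nmi_in_progress ? 1 : 2;   /* one running + one latched at most */

    pending += queued;                          /* atomic_xchg(&nmi_queued, 0) in the original */
    return pending < limit ? pending : limit;
}

int main(void)
{
    printf("%u\n", pending_after(0, 5, 0));  /* 2: a burst collapses to two */
    printf("%u\n", pending_after(1, 3, 1));  /* 1: already handling an NMI */
    return 0;
}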
Do synthetic halt */ - vcpu->arch.apf.halted = true; - r = 1; - goto out; - } - if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) - record_steal_time(vcpu); - if (kvm_check_request(KVM_REQ_NMI, vcpu)) - process_nmi(vcpu); - req_immediate_exit = - kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); - if (kvm_check_request(KVM_REQ_PMU, vcpu)) - kvm_handle_pmu_event(vcpu); - if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_deliver_pmi(vcpu); - } - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - } - - preempt_disable(); - - kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); - kvm_load_guest_xcr0(vcpu); - - vcpu->mode = IN_GUEST_MODE; - - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. - */ - smp_mb(); - - local_irq_disable(); - - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests - || need_resched() || signal_pending(current)) { - vcpu->mode = OUTSIDE_GUEST_MODE; - smp_wmb(); - local_irq_enable(); - preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); - r = 1; - goto out; - } - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - - if (req_immediate_exit) - smp_send_reschedule(vcpu->cpu); - - kvm_guest_enter(); - - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.eff_db[0], 0); - set_debugreg(vcpu->arch.eff_db[1], 1); - set_debugreg(vcpu->arch.eff_db[2], 2); - set_debugreg(vcpu->arch.eff_db[3], 3); - } - - trace_kvm_entry(vcpu->vcpu_id); - kvm_x86_ops->run(vcpu); - - /* - * If the guest has used debug registers, at least dr7 - * will be disabled while returning to the host. - * If we don't have active breakpoints in the host, we don't - * care about the messed up debug address registers. But if - * we have some of them active, restore the old state. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); - - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); - - vcpu->mode = OUTSIDE_GUEST_MODE; - smp_wmb(); - local_irq_enable(); - - ++vcpu->stat.exits; - - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. 
- * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); - - preempt_enable(); - - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) { - unsigned long rip = kvm_rip_read(vcpu); - profile_hit(KVM_PROFILING, (void *)rip); - } - - if (unlikely(vcpu->arch.tsc_always_catchup)) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_lapic_sync_from_vapic(vcpu); - - r = kvm_x86_ops->handle_exit(vcpu); -out: - return r; -} - - -static int __vcpu_run(struct kvm_vcpu *vcpu) -{ - int r; - struct kvm *kvm = vcpu->kvm; - - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vapic_enter(vcpu); - - r = 1; - while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - r = vcpu_enter_guest(vcpu); - else { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) - { - switch(vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; - break; - case KVM_MP_STATE_SIPI_RECEIVED: - default: - r = -EINTR; - break; - } - } - } - - if (r <= 0) - break; - - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - if (kvm_cpu_has_pending_timer(vcpu)) - kvm_inject_pending_timer_irqs(vcpu); - - if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - } - - kvm_check_async_pf_completion(vcpu); - - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - } - if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_resched(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - } - } - - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - - vapic_exit(vcpu); - - return r; -} - -static int complete_mmio(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - int r; - - if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) - return 1; - - if (vcpu->mmio_needed) { - vcpu->mmio_needed = 0; - if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data + vcpu->mmio_index, - run->mmio.data, 8); - vcpu->mmio_index += 8; - if (vcpu->mmio_index < vcpu->mmio_size) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; - memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); - run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); - run->mmio.is_write = vcpu->mmio_is_write; - vcpu->mmio_needed = 1; - return 0; - } - if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; -} - -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) -{ - int r; - sigset_t sigsaved; - - if (!tsk_used_math(current) && init_fpu(current)) - return -ENOMEM; - - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - - if 
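complete_mmio above drives a large MMIO access through repeated exits to userspace: each KVM_EXIT_MMIO carries at most eight bytes, mmio_index tracks how far into the buffered access the vcpu is, and another exit is generated until the whole mmio_size has been transferred. A userspace-level model of that chunking (device_read() is a hypothetical backend, not the KVM API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical device backend providing up to 8 bytes per round trip. */
static void device_read(uint64_t addr, void *buf, unsigned len)
{
    memset(buf, (int)(addr & 0xff), len);
}

/* Emulate a size-byte MMIO read as a sequence of <= 8-byte rounds. */
static void mmio_read_in_rounds(uint64_t phys, uint8_t *data, unsigned size)
{
    unsigned index = 0;                                    /* vcpu->mmio_index */

    while (index < size) {
        unsigned len = size - index < 8 ? size - index : 8;
        device_read(phys + index, data + index, len);      /* one KVM_EXIT_MMIO round */
        index += 8;                                        /* advance in 8-byte steps */
    }
}

int main(void)
{
    uint8_t buf[16];
    mmio_read_in_rounds(0xfed00000ull, buf, sizeof(buf));  /* two rounds for 16 bytes */
    printf("%02x %02x\n", buf[0], buf[15]);
    return 0;
}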
(unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { - kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); - r = -EAGAIN; - goto out; - } - - /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { - if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { - r = -EINVAL; - goto out; - } - } - - r = complete_mmio(vcpu); - if (r <= 0) - goto out; - - r = __vcpu_run(vcpu); - -out: - post_kvm_run_save(vcpu); - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return r; -} - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { - /* - * We are here if userspace calls get_regs() in the middle of - * instruction emulation. Registers state needs to be copied - * back from emulation context to vcpu. Usrapace shouldn't do - * that usually, but some bad designed PV devices (vmware - * backdoor interface) need this to work - */ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); - vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - } - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); -#ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); -#endif - - regs->rip = kvm_rip_read(vcpu); - regs->rflags = kvm_get_rflags(vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - vcpu->arch.emulate_regs_need_sync_from_vcpu = true; - vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); -#ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); -#endif - - kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); - - vcpu->arch.exception.pending = false; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 0; -} - -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} 
-EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - struct desc_ptr dt; - - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.size; - sregs->idt.base = dt.address; - kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.size; - sregs->gdt.base = dt.address; - - sregs->cr0 = kvm_read_cr0(vcpu); - sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = kvm_read_cr3(vcpu); - sregs->cr4 = kvm_read_cr4(vcpu); - sregs->cr8 = kvm_get_cr8(vcpu); - sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); - - memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); - - if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft) - set_bit(vcpu->arch.interrupt.nr, - (unsigned long *)sregs->interrupt_bitmap); - - return 0; -} - -int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - mp_state->mp_state = vcpu->arch.mp_state; - return 0; -} - -int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, - struct kvm_mp_state *mp_state) -{ - vcpu->arch.mp_state = mp_state->mp_state; - kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; -} - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, - int reason, bool has_error_code, u32 error_code) -{ - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - int ret; - - init_emulate_ctxt(vcpu); - - ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, - has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; - - memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs); - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(kvm_task_switch); - -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - int mmu_reset_needed = 0; - int pending_vec, max_bits, idx; - struct desc_ptr dt; - - dt.size = sregs->idt.limit; - dt.address = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); - dt.size = sregs->gdt.limit; - dt.address = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); - - vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; - vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - - kvm_set_cr8(vcpu, sregs->cr8); - - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); - kvm_set_apic_base(vcpu, sregs->apic_base); - - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; - - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) - kvm_update_cpuid(vcpu); - - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); - mmu_reset_needed = 1; - } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); - - max_bits = (sizeof 
sregs->interrupt_bitmap) << 3; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, false); - pr_debug("Set back pending irq %d\n", pending_vec); - } - - kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - - kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - - update_cr8_intercept(vcpu); - - /* Older userspace won't unhalt the vcpu on reset. */ - if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && - sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - unsigned long rflags; - int i, r; - - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { - r = -EBUSY; - if (vcpu->arch.exception.pending) - goto out; - if (dbg->control & KVM_GUESTDBG_INJECT_DB) - kvm_queue_exception(vcpu, DB_VECTOR); - else - kvm_queue_exception(vcpu, BP_VECTOR); - } - - /* - * Read rflags as long as potentially injected trace flags are still - * filtered out. - */ - rflags = kvm_get_rflags(vcpu); - - vcpu->guest_debug = dbg->control; - if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) - vcpu->guest_debug = 0; - - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - for (i = 0; i < KVM_NR_DB_REGS; ++i) - vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; - vcpu->arch.switch_db_regs = - (dbg->arch.debugreg[7] & DR7_BP_EN_MASK); - } else { - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); - } - - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); - - /* - * Trigger an rflags update that will inject or remove the trace - * flags. - */ - kvm_set_rflags(vcpu, rflags); - - kvm_x86_ops->set_guest_debug(vcpu, dbg); - - r = 0; - -out: - - return r; -} - -/* - * Translate a guest virtual address to a guest physical address. 
- */ -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - unsigned long vaddr = tr->linear_address; - gpa_t gpa; - int idx; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; - tr->writeable = 1; - tr->usermode = 0; - - return 0; -} - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; - - memcpy(fpu->fpr, fxsave->st_space, 128); - fpu->fcw = fxsave->cwd; - fpu->fsw = fxsave->swd; - fpu->ftwx = fxsave->twd; - fpu->last_opcode = fxsave->fop; - fpu->last_ip = fxsave->rip; - fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - - return 0; -} - -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; - - memcpy(fxsave->st_space, fpu->fpr, 128); - fxsave->cwd = fpu->fcw; - fxsave->swd = fpu->fsw; - fxsave->twd = fpu->ftwx; - fxsave->fop = fpu->last_opcode; - fxsave->rip = fpu->last_ip; - fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - - return 0; -} - -int fx_init(struct kvm_vcpu *vcpu) -{ - int err; - - err = fpu_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - - fpu_finit(&vcpu->arch.guest_fpu); - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XSTATE_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; - - return 0; -} -EXPORT_SYMBOL_GPL(fx_init); - -static void fx_free(struct kvm_vcpu *vcpu) -{ - fpu_free(&vcpu->arch.guest_fpu); -} - -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->guest_fpu_loaded) - return; - - /* - * Restore all possible states in the guest, - * and assume host would use all available bits. - * Guest xcr0 would be loaded later. 
- */ - kvm_put_guest_xcr0(vcpu); - vcpu->guest_fpu_loaded = 1; - unlazy_fpu(current); - fpu_restore_checking(&vcpu->arch.guest_fpu); - trace_kvm_fpu(1); -} - -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - kvm_put_guest_xcr0(vcpu); - - if (!vcpu->guest_fpu_loaded) - return; - - vcpu->guest_fpu_loaded = 0; - fpu_save_init(&vcpu->arch.guest_fpu); - ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - trace_kvm_fpu(0); -} - -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) -{ - kvmclock_reset(vcpu); - - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); - kvm_x86_ops->vcpu_free(vcpu); -} - -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) -{ - if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); -} - -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - int r; - - vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); - r = kvm_arch_vcpu_reset(vcpu); - if (r == 0) - r = kvm_mmu_setup(vcpu); - vcpu_put(vcpu); - - return r; -} - -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu->arch.apf.msr_val = 0; - - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); - - fx_free(vcpu); - kvm_x86_ops->vcpu_free(vcpu); -} - -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) -{ - atomic_set(&vcpu->arch.nmi_queued, 0); - vcpu->arch.nmi_pending = 0; - vcpu->arch.nmi_injected = false; - - vcpu->arch.switch_db_regs = 0; - memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; - vcpu->arch.dr7 = DR7_FIXED_1; - - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; - vcpu->arch.st.msr_val = 0; - - kvmclock_reset(vcpu); - - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - vcpu->arch.apf.halted = false; - - kvm_pmu_reset(vcpu); - - return kvm_x86_ops->vcpu_reset(vcpu); -} - -int kvm_arch_hardware_enable(void *garbage) -{ - struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; - int ret; - u64 local_tsc; - u64 max_tsc = 0; - bool stable, backwards_tsc = false; - - kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(garbage); - if (ret != 0) - return ret; - - local_tsc = native_read_tsc(); - stable = !check_tsc_unstable(); - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!stable && vcpu->cpu == smp_processor_id()) - set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); - if (stable && vcpu->arch.last_host_tsc > local_tsc) { - backwards_tsc = true; - if (vcpu->arch.last_host_tsc > max_tsc) - max_tsc = vcpu->arch.last_host_tsc; - } - } - } - - /* - * Sometimes, even reliable TSCs go backwards. This happens on - * platforms that reset TSC during suspend or hibernate actions, but - * maintain synchronization. We must compensate. Fortunately, we can - * detect that condition here, which happens early in CPU bringup, - * before any KVM threads can be running. Unfortunately, we can't - * bring the TSCs fully up to date with real time, as we aren't yet far - * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot - * variables that haven't been updated yet. - * - * So we simply find the maximum observed TSC above, then record the - * adjustment to TSC in each VCPU. When the VCPU later gets loaded, - * the adjustment will be applied. 
Note that we accumulate - * adjustments, in case multiple suspend cycles happen before some VCPU - * gets a chance to run again. In the event that no KVM threads get a - * chance to run, we will miss the entire elapsed period, as we'll have - * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may - * lose cycle time. This isn't too big a deal, since the loss will be - * uniform across all VCPUs (not to mention the scenario is extremely - * unlikely). It is possible that a second hibernate recovery happens - * much faster than a first, causing the observed TSC here to be - * smaller; this would require additional padding adjustment, which is - * why we set last_host_tsc to the local tsc observed here. - * - * N.B. - this code below runs only on platforms with reliable TSC, - * as that is the only way backwards_tsc is set above. Also note - * that this runs for ALL vcpus, which is not a bug; all VCPUs should - * have the same delta_cyc adjustment applied if backwards_tsc - * is detected. Note further, this adjustment is only done once, - * as we reset last_host_tsc on all VCPUs to stop this from being - * called multiple times (one for each physical CPU bringup). - * - * Platforms with unreliable TSCs don't have to deal with this, they - * will be compensated by the logic in vcpu_load, which sets the TSC to - * catchup mode. This will catch up all VCPUs to real time, but cannot - * guarantee that they stay in perfect synchronization. - */ - if (backwards_tsc) { - u64 delta_cyc = max_tsc - local_tsc; - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.tsc_offset_adjustment += delta_cyc; - vcpu->arch.last_host_tsc = local_tsc; - } - - /* - * We have to disable TSC offset matching.. if you were - * booting a VM while issuing an S4 host suspend.... - * you may have some problem. Solving this issue is - * left as an exercise to the reader.
- */ - kvm->arch.last_tsc_nsec = 0; - kvm->arch.last_tsc_write = 0; - } - - } - return 0; -} - -void kvm_arch_hardware_disable(void *garbage) -{ - kvm_x86_ops->hardware_disable(garbage); - drop_user_return_notifiers(garbage); -} - -int kvm_arch_hardware_setup(void) -{ - return kvm_x86_ops->hardware_setup(); -} - -void kvm_arch_hardware_unsetup(void) -{ - kvm_x86_ops->hardware_unsetup(); -} - -void kvm_arch_check_processor_compat(void *rtn) -{ - kvm_x86_ops->check_processor_compatibility(rtn); -} - -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) -{ - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); -} - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct page *page; - struct kvm *kvm; - int r; - - BUG_ON(vcpu->kvm == NULL); - kvm = vcpu->kvm; - - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - - kvm_set_tsc_khz(vcpu, max_tsc_khz); - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail_free_pio_data; - - if (irqchip_in_kernel(kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) - goto fail_free_mce_banks; - - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); - - return 0; -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - int idx; - - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); - kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); -} - -int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) -{ - if (type) - return -EINVAL; - - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); - - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - - raw_spin_lock_init(&kvm->arch.tsc_write_lock); - - return 0; -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - /* - * Unpin any mmu pages first. 
- */ - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); - } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); -} - -void kvm_arch_sync_events(struct kvm *kvm) -{ - kvm_free_all_assigned_devices(kvm); - kvm_free_pit(kvm); -} - -void kvm_arch_destroy_vm(struct kvm *kvm) -{ - kvm_iommu_unmap_guest(kvm); - kfree(kvm->arch.vpic); - kfree(kvm->arch.vioapic); - kvm_free_vcpus(kvm); - if (kvm->arch.apic_access_page) - put_page(kvm->arch.apic_access_page); - if (kvm->arch.ept_identity_pagetable) - put_page(kvm->arch.ept_identity_pagetable); -} - -void kvm_arch_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); - free->arch.lpage_info[i] = NULL; - } - } -} - -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - int lpages; - int level = i + 2; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); - if (!slot->arch.lpage_info[i]) - goto out_free; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][0].write_count = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][lpages - 1].write_count = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i][j].write_count = 1; - } - } - - return 0; - -out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->arch.lpage_info[i]); - slot->arch.lpage_info[i] = NULL; - } - return -ENOMEM; -} - -int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - int user_alloc) -{ - int npages = memslot->npages; - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; - - /* Prevent internal slot pages from being moved by fork()/COW. */ - if (memslot->id >= KVM_MEMORY_SLOTS) - map_flags = MAP_SHARED | MAP_ANONYMOUS; - - /*To keep backward compatibility with older userspace, - *x86 needs to hanlde !user_alloc case. 
- */ - if (!user_alloc) { - if (npages && !old.rmap) { - unsigned long userspace_addr; - - userspace_addr = vm_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - map_flags, - 0); - - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); - - memslot->userspace_addr = userspace_addr; - } - } - - - return 0; -} - -void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) -{ - - int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; - - if (!user_alloc && !old.user_alloc && old.rmap && !npages) { - int ret; - - ret = vm_munmap(old.userspace_addr, - old.npages * PAGE_SIZE); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } - - if (!kvm->arch.n_requested_mmu_pages) - nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - - spin_lock(&kvm->mmu_lock); - if (nr_mmu_pages) - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - spin_unlock(&kvm->mmu_lock); -} - -void kvm_arch_flush_shadow(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); - kvm_reload_remote_mmus(kvm); -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) - || !list_empty_careful(&vcpu->async_pf.done) - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || atomic_read(&vcpu->arch.nmi_queued) || - (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)); -} - -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) - smp_send_reschedule(cpu); - put_cpu(); -} - -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->interrupt_allowed(vcpu); -} - -bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) -{ - unsigned long current_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); - - return current_rip == linear_rip; -} -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); - -unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) -{ - unsigned long rflags; - - rflags = kvm_x86_ops->get_rflags(vcpu); - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~X86_EFLAGS_TF; - return rflags; -} -EXPORT_SYMBOL_GPL(kvm_get_rflags); - -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) - rflags |= X86_EFLAGS_TF; - kvm_x86_ops->set_rflags(vcpu, rflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} -EXPORT_SYMBOL_GPL(kvm_set_rflags); - -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || - is_error_page(work->page)) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) - return; - - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); -} - -static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -{ - return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); -} - -static inline u32 kvm_async_pf_next_probe(u32 key) -{ - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); -} - 
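The two helpers just above hash a guest frame number into a small per-vcpu table and step to the next slot with wrap-around; the routines that follow use them to record, look up and drop gfns with async page faults in flight. A minimal stand-alone sketch of that open-addressed, linear-probing scheme in user-space C (TABLE_SIZE, hash_fn() and EMPTY are illustrative stand-ins for ASYNC_PF_PER_VCPU, hash_32() and the ~0 sentinel; this is a model, not the kernel code):

    /* Stand-alone model of the per-vcpu async-PF gfn table. */
    #include <stdint.h>
    #include <stdio.h>

    #define TABLE_SIZE 64u              /* must be a power of two */
    #define EMPTY      (~(uint64_t)0)   /* mirrors the ~0 "free slot" sentinel */

    static uint64_t table[TABLE_SIZE];

    static uint32_t hash_fn(uint64_t gfn)
    {
        /* multiplicative hash as a stand-in for hash_32() */
        return (uint32_t)((gfn * 2654435761u) & (TABLE_SIZE - 1));
    }

    static uint32_t next_probe(uint32_t key)
    {
        return (key + 1) & (TABLE_SIZE - 1);    /* wrap with a mask */
    }

    static void add_gfn(uint64_t gfn)
    {
        uint32_t key = hash_fn(gfn);

        /* probe to a free slot; a free slot always exists because the
         * table is sized to the maximum number of outstanding faults */
        while (table[key] != EMPTY)
            key = next_probe(key);
        table[key] = gfn;
    }

    static uint32_t gfn_slot(uint64_t gfn)
    {
        uint32_t key = hash_fn(gfn);
        uint32_t i;

        /* stop at the gfn or at an empty slot, bounded by the table size */
        for (i = 0; i < TABLE_SIZE && table[i ? key : key] != gfn && table[key] != EMPTY; i++)
            key = next_probe(key);
        return key;
    }

    static int find_gfn(uint64_t gfn)
    {
        return table[gfn_slot(gfn)] == gfn;
    }

    int main(void)
    {
        uint32_t i;

        for (i = 0; i < TABLE_SIZE; i++)
            table[i] = EMPTY;
        add_gfn(0x1234);
        add_gfn(0x1234 + TABLE_SIZE);   /* hashes to the same bucket: forces a probe */
        printf("found 0x1234: %d\n", find_gfn(0x1234));   /* 1 */
        printf("found 0x9999: %d\n", find_gfn(0x9999));   /* 0 */
        return 0;
    }

Because the table size is a power of two, the probe step can wrap with a mask instead of a modulo, which is the same trick the kernel helper plays with roundup_pow_of_two(ASYNC_PF_PER_VCPU).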
-static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 key = kvm_async_pf_hash_fn(gfn); - - while (vcpu->arch.apf.gfns[key] != ~0) - key = kvm_async_pf_next_probe(key); - - vcpu->arch.apf.gfns[key] = gfn; -} - -static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - int i; - u32 key = kvm_async_pf_hash_fn(gfn); - - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && - (vcpu->arch.apf.gfns[key] != gfn && - vcpu->arch.apf.gfns[key] != ~0); i++) - key = kvm_async_pf_next_probe(key); - - return key; -} - -bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn; -} - -static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u32 i, j, k; - - i = j = kvm_async_pf_gfn_slot(vcpu, gfn); - while (true) { - vcpu->arch.apf.gfns[i] = ~0; - do { - j = kvm_async_pf_next_probe(j); - if (vcpu->arch.apf.gfns[j] == ~0) - return; - k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); - /* - * k lies cyclically in ]i,j] - * | i.k.j | - * |....j i.k.| or |.k..j i...| - */ - } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); - vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; - i = j; - } -} - -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) -{ - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); -} - -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_not_present(work->arch.token, work->gva); - kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } -} - -void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, - struct kvm_async_pf *work) -{ - struct x86_exception fault; - - trace_kvm_async_pf_ready(work->arch.token, work->gva); - if (is_error_page(work->page)) - work->arch.token = ~0; /* broadcast wakeup */ - else - kvm_del_async_pf_gfn(vcpu, work->arch.gfn); - - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); - } - vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -} - -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) - return true; - else - return !kvm_event_needs_reinjection(vcpu) && - kvm_x86_ops->interrupt_allowed(vcpu); -} - -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); 
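kvm_del_async_pf_gfn() above removes an entry with the classic backward-shift fix-up for linear probing: after emptying a slot it walks forward and pulls back any entry whose home bucket does not lie cyclically in the interval just vacated, so later lookups never stop at a false empty slot. A sketch of the same loop, reusing table[], hash_fn(), next_probe(), gfn_slot() and EMPTY from the sketch above (again a model, not the kernel code):

    /* Backward-shift deletion for the open-addressed table above. */
    static void del_gfn(uint64_t gfn)
    {
        uint32_t i, j, k;

        i = j = gfn_slot(gfn);
        for (;;) {
            table[i] = EMPTY;               /* open the hole */
            do {
                j = next_probe(j);
                if (table[j] == EMPTY)      /* chain ends: nothing left to pull back */
                    return;
                k = hash_fn(table[j]);
                /*
                 * Keep scanning while the home bucket k of the entry at j
                 * lies cyclically in (i, j]; such an entry is still
                 * reachable from k without passing through the hole at i.
                 */
            } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
            table[i] = table[j];            /* pull the entry back into the hole */
            i = j;                          /* and repeat with the new hole at j */
        }
    }

Continuing the earlier example: after add_gfn(0x1234) and add_gfn(0x1234 + TABLE_SIZE) collide, del_gfn(0x1234) shifts the surviving entry back into its home bucket, so find_gfn(0x1234 + TABLE_SIZE) still returns 1.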
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); diff --git a/ANDROID_3.4.5/arch/x86/kvm/x86.h b/ANDROID_3.4.5/arch/x86/kvm/x86.h deleted file mode 100644 index cb80c293..00000000 --- a/ANDROID_3.4.5/arch/x86/kvm/x86.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef ARCH_X86_KVM_X86_H -#define ARCH_X86_KVM_X86_H - -#include <linux/kvm_host.h> -#include "kvm_cache_regs.h" - -static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) -{ - vcpu->arch.exception.pending = false; -} - -static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, - bool soft) -{ - vcpu->arch.interrupt.pending = true; - vcpu->arch.interrupt.soft = soft; - vcpu->arch.interrupt.nr = vector; -} - -static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) -{ - vcpu->arch.interrupt.pending = false; -} - -static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || - vcpu->arch.nmi_injected; -} - -static inline bool kvm_exception_is_soft(unsigned int nr) -{ - return (nr == BP_VECTOR) || (nr == OF_VECTOR); -} - -static inline bool is_protmode(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_PE); -} - -static inline int is_long_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return vcpu->arch.efer & EFER_LMA; -#else - return 0; -#endif -} - -static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; -} - -static inline int is_pae(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -} - -static inline int is_pse(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); -} - -static inline int is_paging(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); -} - -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - -static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, - gva_t gva, gfn_t gfn, unsigned access) -{ - vcpu->arch.mmio_gva = gva & PAGE_MASK; - vcpu->arch.access = access; - vcpu->arch.mmio_gfn = gfn; -} - -/* - * Clear the mmio cache info for the given gva, - * specially, if gva is ~0ul, we clear all mmio cache info. - */ -static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva) -{ - if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK)) - return; - - vcpu->arch.mmio_gva = 0; -} - -static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva) -{ - if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK)) - return true; - - return false; -} - -static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT) - return true; - - return false; -} - -void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); -void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); - -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); - -int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, unsigned int bytes, - struct x86_exception *exception); - -int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, - gva_t addr, void *val, unsigned int bytes, - struct x86_exception *exception); - -extern u64 host_xcr0; - -#endif |
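The x86.h header deleted above also carries a tiny fast path for emulated MMIO: vcpu_cache_mmio_info() remembers the last guest page that turned out to be MMIO, vcpu_match_mmio_gva()/vcpu_match_mmio_gpa() let the emulator skip the page-table walk when the next access hits the same page, and vcpu_clear_mmio_info() invalidates the entry. A stand-alone sketch of that one-entry cache, assuming a 4 KiB page size (the struct, names and main() driver are illustrative, not the kernel API):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((uint64_t)(1 << PAGE_SHIFT) - 1))

    struct mmio_cache {
        uint64_t gva;       /* page-aligned guest virtual address, 0 = empty */
        uint64_t gfn;       /* guest frame number backing that page */
        unsigned access;    /* access bits recorded with the translation */
    };

    static void cache_mmio_info(struct mmio_cache *c, uint64_t gva,
                                uint64_t gfn, unsigned access)
    {
        c->gva = gva & PAGE_MASK;
        c->gfn = gfn;
        c->access = access;
    }

    /* Clear the entry; passing ~0 clears unconditionally, as in the header. */
    static void clear_mmio_info(struct mmio_cache *c, uint64_t gva)
    {
        if (gva != ~(uint64_t)0 && c->gva != (gva & PAGE_MASK))
            return;
        c->gva = 0;
    }

    static int match_mmio_gva(const struct mmio_cache *c, uint64_t gva)
    {
        return c->gva && c->gva == (gva & PAGE_MASK);
    }

    int main(void)
    {
        struct mmio_cache c = { 0, 0, 0 };

        cache_mmio_info(&c, 0xfee00400, 0xfee00, 0x3);
        printf("same page hit:  %d\n", match_mmio_gva(&c, 0xfee00ff8));  /* 1 */
        printf("other page hit: %d\n", match_mmio_gva(&c, 0xfee01000));  /* 0 */
        clear_mmio_info(&c, ~(uint64_t)0);
        printf("after clear:    %d\n", match_mmio_gva(&c, 0xfee00400));  /* 0 */
        return 0;
    }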